Massive commit of all changes, and added files for the Monitor-server package.
author Stephen Soltesz <soltesz@cs.princeton.edu>
Wed, 30 Jul 2008 20:55:23 +0000 (20:55 +0000)
committer Stephen Soltesz <soltesz@cs.princeton.edu>
Wed, 30 Jul 2008 20:55:23 +0000 (20:55 +0000)
32 files changed:
automate.py [new file with mode: 0644]
automate_pl03.sh
bootman.py [moved from nodereboot.py with 97% similarity]
clean_policy.py [new file with mode: 0644]
const.py [new file with mode: 0644]
emailTxt.py
findbad.py
findbadpcu.py
getconf.py
getsshkeys.py
grouprins.py
monitor.py
monitor_policy.py
nodeaction.py [new file with mode: 0755]
nodebad.py [new file with mode: 0755]
nodegroups.py
nodehistory.py
nodeinfo.py
nodequery.py
nodesets.py [new file with mode: 0755]
pcubad.py [new file with mode: 0755]
pcuinfo.py [new file with mode: 0755]
plc.py
policy.py
reboot.py
rt.py
showlatlon.py [new file with mode: 0755]
sitebad.py [new file with mode: 0755]
siteinfo.py [new file with mode: 0755]
soltesz.py
template.py [new file with mode: 0644]
unified_model.py

diff --git a/automate.py b/automate.py
new file mode 100644 (file)
index 0000000..2c67ffe
--- /dev/null
@@ -0,0 +1,40 @@
+import csv
+from glob import glob
+import os
+import time
+
+def time_to_str(t):
+       return time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(t))
+
+def get_filelist_from_dir(dirname):
+       filelist = glob("%s/*.out" % dirname)
+
+       ret_list = []
+       for file in filelist:
+               ret_list.append(file)
+       return ret_list
+
+def get_hostlist_from_dir(dirname):
+       filelist = glob("%s/*.out" % dirname)
+
+       ret_list = []
+       for file in filelist:
+               ret_list.append([os.path.basename(file)[:-4], ''])
+       return ret_list
+
+def csv_to_hash(r):
+       ret = {}
+       for line in r:
+               (k,v) = (line[0], line[1])
+               if k not in ret:
+                       ret[k] = v
+               else:
+                       # multiple values for the same key
+                       if isinstance(ret[k], list):
+                               ret[k].append(v)
+                       else:
+                               ret[k] = [ret[k], v]
+       return ret
+
+def getcsv(file):
+       return csv_to_hash(csv.reader(open(file,'r')))
index 32a1a17..a7712b4 100755 (executable)
@@ -21,12 +21,13 @@ if [ -f $HOME/monitor/SKIP ] ; then
        fi 
 fi
 echo $$ > $HOME/monitor/SKIP
+
 #########################
 # 1. FINDBAD NODES 
 rm -f pdb/production.findbad2.pkl
 ./findbad.py --increment --cachenodes --debug=0 --dbname="findbad2" $DATE
 
-ps ax | grep BatchMode | grep -v grep | awk '{print $1}' | xargs kill
+ps ax | grep BatchMode | grep -v grep | awk '{print $1}' | xargs kill || :
 
 ########################
 # COPY to golf for diagnose.py and action.py
@@ -41,29 +42,38 @@ cp pdb/production.findbad2.pkl pdb/production.findbad.pkl
 # badcsv.txt
 ./printbadcsv.py  | grep -v loading | tr -d ' ' > badcsv.txt
 cp badcsv.txt /plc/data/var/www/html/monitor/
+./showlatlon.py | head -9 | awk 'BEGIN {print "<table>"} { print "<tr><td>", $0, "</td></tr>"} END{print "</table>"}'  | sed -e 's\|\</td><td>\g' > /plc/data/var/www/html/monitor/regions.html
 
 #########################
 # 2. FINDBAD PCUS
 rm -f pdb/production.findbadpcus2.pkl
 ./findbadpcu.py --increment --refresh --debug=0 --dbname=findbadpcus2 $DATE            
 
-./sitebad.py --increment
+./sitebad.py --increment || :
+./nodebad.py --increment || :
+./pcubad.py --increment || :
 
 # clean up stray 'locfg' processes that hang around inappropriately...
-ps ax | grep locfg | grep -v grep | awk '{print $1}' | xargs kill
+ps ax | grep locfg | grep -v grep | awk '{print $1}' | xargs kill || :
 
 # convert pkl to php serialize format.
 cp pdb/production.findbadpcus2.pkl pdb/production.findbadpcus.pkl
-./pkl2php.py -i findbadpcus2 -o findbadpcus
 
+./pkl2php.py -i findbadpcus2 -o findbadpcus
 ./pkl2php.py -i act_all -o act_all
 ./pkl2php.py -i plcdb_hn2lb -o plcdb_hn2lb
 ./pkl2php.py -i findbad -o findbadnodes
 ./pkl2php.py -i ad_dbTickets -o ad_dbTickets
 ./pkl2php.py -i idTickets -o idTickets
 
-for f in findbad act_all findbadpcus l_plcnodes; do 
+#for f in findbad act_all findbadpcus l_plcnodes; do 
+#for f in findbad act_all findbadpcus l_plcnodes site_persistflags ; do 
+for f in findbad act_all findbadpcus l_plcnodes site_persistflags node_persistflags pcu_persistflags ; do 
        cp pdb/production.$f.pkl archive-pdb/`date +%F-%H:%M`.production.$f.pkl
 done
 
+./grouprins.py --mail=1 --nodeselect 'state=DEBUG&&boot_state=dbg' \
+                                               --stopselect 'state=BOOT&&kernel=2.6.22.19-vs2.3.0.34.9.planetlab' \
+                                               --reboot || :
+
 rm -f $HOME/monitor/SKIP
similarity index 97%
rename from nodereboot.py
rename to bootman.py
index 3f9d6c6..ce9bb6e 100755 (executable)
@@ -494,7 +494,7 @@ def reboot(hostname, config=None, forced_action=None):
                        ('noinstall'    , 'notinstalled'),
                        ('bziperror'    , 'bzip2: Data integrity error when decompressing.'),
                        ('noblockdev'   , "No block devices detected."),
-                       ('downloadfail' , 'Unable to download main tarball /boot-alpha/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
+                       ('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
                        ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
                        ('hardwarerequirefail' , 'Hardware requirements not met'),
                        ('mkfsfail'         , 'while running: Running mkfs.ext2 -q  -m 0 -j /dev/planetlab/vservers failed'),
@@ -538,7 +538,9 @@ def reboot(hostname, config=None, forced_action=None):
        # restart_bootmanager_boot
        for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
                        "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
+                       "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",
                        "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
+                       "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
                        "bminit-cfg-auth-getplc-update-debug-done",
                        "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
                        "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
@@ -556,6 +558,7 @@ def reboot(hostname, config=None, forced_action=None):
                        "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
                        "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done",
                        "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
+                       "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
                        "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
                        "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
                        "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
@@ -660,7 +663,8 @@ def reboot(hostname, config=None, forced_action=None):
                        if conn.compare_and_repair_nodekeys():
                                # the keys either are in sync or were forced in sync.
                                # so try to reboot the node again.
-                               conn.restart_bootmanager('boot')
+                               conn.restart_bootmanager('rins')
+                               pass
                        else:
                                # there was some failure to synchronize the keys.
                                print "...Unable to repair node keys on %s" % node
@@ -713,8 +717,8 @@ def reboot(hostname, config=None, forced_action=None):
                        loginbase = plc.siteId(hostname)
                        m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
 
-                       #print "\tDisabling %s due to out-of-date BOOTCD" % hostname
-                       #conn.set_nodestate('disable')
+                       print "\tDisabling %s due to out-of-date BOOTCD" % hostname
+                       conn.set_nodestate('disable')
 
                elif sequences[s] == "broken_hardware_email":
                        # MAKE An ACTION record that this host has failed hardware.  May
diff --git a/clean_policy.py b/clean_policy.py
new file mode 100644 (file)
index 0000000..dba9b9b
--- /dev/null
@@ -0,0 +1,355 @@
+from config import config
+#print "policy"
+config = config()
+import soltesz
+import time
+import mailer
+from www.printbadnodes import cmpCategoryVal
+import sys
+import emailTxt
+import string
+
+from policy import get_ticket_id, print_stats, close_rt_backoff, reboot_node
+from rt import is_host_in_rt_tickets
+import plc
+
+# Time to enforce policy
+POLSLEEP = 7200
+
+# Where to email the summary
+SUMTO = "soltesz@cs.princeton.edu"
+
+from const import *
+
+from unified_model import *
+
+class MonitorMergeDiagnoseSendEscellate:
+       def __init__(self, hostname, act):
+               self.hostname = hostname
+               self.act = act
+               self.plcdb_hn2lb = None
+               if self.plcdb_hn2lb is None:
+                       self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")
+               self.loginbase = self.plcdb_hn2lb[self.hostname]
+               return
+
+       def getFBRecord(self):
+               fb = soltesz.dbLoad("findbad")
+               if self.hostname in fb['nodes']:
+                       fbnode = fb['nodes'][self.hostname]['values']
+               else:
+                       raise Exception("Hostname %s not in scan database"% self.hostname)
+               return fbnode
+
+       def getActionRecord(self):
+               # update ticket status
+               act_all = soltesz.dbLoad("act_all")
+               if self.hostname in act_all and len(act_all[self.hostname]) > 0:
+                       actnode = act_all[self.hostname][0]
+               else:
+                       actnode = None
+               del act_all
+               return actnode
+
+       def getKernel(self, unamestr):
+               s = unamestr.split()
+               if len(s) > 2:
+                       return s[2]
+               else:
+                       return ""
+
+       def mergeRecord(self, fbnode, actnode):
+               fbnode['kernel'] = self.getKernel(fbnode['kernel'])
+               fbnode['stage'] = "findbad"
+               fbnode['message'] = None
+               fbnode['args'] = None
+               fbnode['info'] = None
+               fbnode['log'] = None
+               fbnode['time'] = time.time()
+               fbnode['date_created'] = time.time()
+
+               if actnode is None:
+                       actnode = {} 
+                       actnode.update(fbnode)
+                       actnode['ticket_id'] = ""
+                       actnode['prev_category'] = "NORECORD" 
+               else:
+                       actnode['prev_category']= actnode['category']
+                       actnode['comonstats']   = fbnode['comonstats']
+                       actnode['category']             = fbnode['category']
+                       actnode['state']                = fbnode['state']
+                       actnode['kernel']               = fbnode['kernel']
+                       actnode['bootcd']               = fbnode['bootcd']
+                       actnode['plcnode']              = fbnode['plcnode']
+                       ticket = get_ticket_id(actnode)
+                       if ticket is None: actnode['ticket_id'] = ""
+                       actnode['rt'] = mailer.getTicketStatus(ticket)
+
+                       #for key in actnode.keys():
+                       #       print "%10s %s %s " % (key, "==", actnode[key])
+                       #print "----------------------------"
+
+               return actnode
+
+       def run(self):
+               fbnode = self.getFBRecord()
+               actnode= self.getActionRecord()
+               actrec = self.mergeRecord(fbnode, actnode)
+               record = Record(self.hostname, actrec)
+               diag   = self.diagnose(record)
+               if self.act and diag is not None:
+                       self.action(record,diag)
+       
+       def diagnose(self, record):
+
+               diag = PersistFlags(record.hostname, 60*60*24, db='persist_diagnose_flags')
+               # NOTE: change record stage based on RT status.
+               diag.setFlag('ResetStage')
+               if record.stageIswaitforever():
+                       ticket = record.data['rt']
+                       if 'new' in ticket['Status']:
+                               diag.setFlag('ResetStage')
+                               
+                       if 'resolved' in ticket['Status']:
+                               diag.setFlag('EndRecord')
+
+               # NOTE: take category, and prepare action
+               category = record.getCategory()
+               if category == "error":
+                       diag.setFlag('SendNodedown')
+                       record.data['message'] = emailTxt.mailtxt.newdown
+                       record.data['log'] = self.getDownLog(record)
+
+               elif category == "prod":
+                       state = diag.getState()
+                       if state == "boot":
+                               diag.setFlag('SendThankyou')
+                               record.data['message'] = emailTxt.mailtxt.newthankyou
+                               record.data['log'] = self.getThankyouLog(record)
+
+                       elif state == "debug":
+                               pass
+                       else:
+                               print "unknown state %s for host %s" % (state, self.hostname)
+               else:
+                       print "unknown category: %s" % category
+
+               if diag.getFlag('ResetStage'):
+                       print "resetting stage"
+                       record.reset_stage()
+
+               record = self.checkStageAndTime(diag,record)
+               if record:
+                       print "checkStageAndTime Returned Valid Record"
+                       site = PersistFlags(self.loginbase, 1, db='site_persistflags')
+
+                       if site.status is not "good":
+                               print "Setting site %s for 'squeeze'" % self.loginbase
+                               diag.setFlag('Squeeze')
+                       else:
+                               print "Setting site %s for 'backoff'" % self.loginbase
+                               diag.setFlag('BackOff')
+
+                       diag.save()
+                       return diag
+               else:
+                       print "checkStageAndTime Returned NULL Record"
+                       return None
+
+       def action(self, record, diag):
+               if record.improved() or diag.getFlag('EndRecord'):
+                       print "end record for %s" % self.hostname
+                       record.end_record()
+                       diag.setFlag('CloseRT')
+                       return None
+
+               if self.getSendEmailFlag(record): 
+                       print "sending email"
+                       message = record.getMessage(record.data['ticket_id'])
+                       message.reset()
+                       message.send(record.getContacts())
+                       if message.rt.ticket_id:
+                               print "setting record ticket_id"
+                               record.data['ticket_id'] = message.rt.ticket_id
+                       if diag.getFlag('CloseRT'):
+                               message.rt.closeTicket()
+               else:
+                       print "NOT sending email : %s %s" % (config.mail, record.data['rt'])
+
+               if record.data['takeaction'] and diag.getFlag('Squeeze'):
+                       print "taking action"
+                       record.takeAction()
+
+               print "saving act_all db"
+               self.add_and_save_act_all(record)
+
+               return
+
+       def getSendEmailFlag(self, record):
+               if not config.mail:
+                       return False
+
+               # resend if open & created longer than 30 days ago.
+               if  'rt' in record.data and \
+                       'Status' in record.data['rt'] and \
+                       "open" in record.data['rt']['Status'] and \
+                       record.data['rt']['Created'] < 60*60*24*30:
+                       return False
+
+               return True
+
+       def add_and_save_act_all(self, record):
+               self.act_all = soltesz.dbLoad("act_all")
+               self.act_all[self.hostname].insert(0,record.data)
+               soltesz.dbDump("act_all", self.act_all)
+               
+       def getDownLog(self, record):
+
+               record.data['args'] = {'nodename': self.hostname}
+               record.data['info'] = (self.hostname, Record.getStrDaysDown(record.data), "")
+
+               #for key in record.data.keys():
+               #       print "%10s %s %s " % (key, "==", record.data[key])
+
+               if record.data['ticket_id'] == "":
+                       log = "DOWN: %20s : %-40s == %20s %s" % \
+                               (self.loginbase, self.hostname, record.data['info'][1:], record.data['found_rt_ticket'])
+               else:
+                       log = "DOWN: %20s : %-40s == %20s %s" % \
+                               (self.loginbase, self.hostname, record.data['info'][1:], record.data['ticket_id'])
+               return log
+
+       def getThankyouLog(self, record):
+
+               record.data['args'] = {'nodename': self.hostname}
+               record.data['info'] = (self.hostname, record.data['prev_category'], record.data['category'])
+
+               if record.data['ticket_id'] == "":
+                       log = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
+                                               (self.loginbase, self.hostname, record.data['stage'], 
+                                                state, category, record.data['found_rt_ticket'])
+               else:
+                       log = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
+                                               (self.loginbase, self.hostname, record.data['stage'], 
+                                                state, category, record.data['ticket_id'])
+               return log
+
+       def checkStageAndTime(self, diag, record):
+               current_time = time.time()
+               delta = current_time - record.data['time']
+               if   'findbad' in record.data['stage']:
+                       # The node is bad, and there's no previous record of it.
+                       record.data['email'] = TECH
+                       record.data['action'] = ['noop']
+                       record.data['takeaction'] = False
+                       record.data['message'] = record.data['message'][0]
+                       record.data['stage'] = 'stage_actinoneweek'
+
+               elif 'reboot_node' in record.data['stage']:
+                       record.data['email'] = TECH
+                       record.data['action'] = ['noop']
+                       record.data['message'] = record.data['message'][0]
+                       record.data['stage'] = 'stage_actinoneweek'
+                       record.data['takeaction'] = False
+                       
+               elif 'improvement' in record.data['stage']:
+                       print "backing off of %s" % self.hostname
+                       record.data['action'] = ['close_rt']
+                       record.data['takeaction'] = True
+                       record.data['message'] = record.data['message'][0]
+                       record.data['stage'] = 'monitor-end-record'
+
+               elif 'actinoneweek' in record.data['stage']:
+                       if delta >= 7 * SPERDAY: 
+                               record.data['email'] = TECH | PI
+                               record.data['stage'] = 'stage_actintwoweeks'
+                               record.data['message'] = record.data['message'][1]
+                               record.data['action'] = ['nocreate' ]
+                               record.data['time'] = current_time              # reset clock for waitforever
+                               record.data['takeaction'] = True
+                       elif delta >= 3* SPERDAY and not 'second-mail-at-oneweek' in record.data:
+                               record.data['email'] = TECH 
+                               record.data['message'] = record.data['message'][0]
+                               record.data['action'] = ['sendmailagain-waitforoneweekaction' ]
+                               record.data['second-mail-at-oneweek'] = True
+                               record.data['takeaction'] = False
+                       else:
+                               record.data['message'] = None
+                               record.data['action'] = ['waitforoneweekaction' ]
+                               print "ignoring this record for: %s" % self.hostname
+                               return None                     # don't send if there's no action
+
+               elif 'actintwoweeks' in record.data['stage']:
+                       if delta >= 7 * SPERDAY:
+                               record.data['email'] = TECH | PI | USER
+                               record.data['stage'] = 'stage_waitforever'
+                               record.data['message'] = record.data['message'][2]
+                               record.data['action'] = ['suspendslices']
+                               record.data['time'] = current_time              # reset clock for waitforever
+                               record.data['takeaction'] = True
+                       elif delta >= 3* SPERDAY and not 'second-mail-at-twoweeks' in record.data:
+                               record.data['email'] = TECH | PI
+                               record.data['message'] = record.data['message'][1]
+                               record.data['action'] = ['sendmailagain-waitfortwoweeksaction' ]
+                               record.data['second-mail-at-twoweeks'] = True
+                               record.data['takeaction'] = False
+                       else:
+                               record.data['message'] = None
+                               record.data['action'] = ['waitfortwoweeksaction']
+                               return None                     # don't send if there's no action
+
+               elif 'ticket_waitforever' in record.data['stage']:
+                       record.data['email'] = TECH
+                       record.data['takeaction'] = True
+                       if 'first-found' not in record.data:
+                               record.data['first-found'] = True
+                               record.data['log'] += " firstfound"
+                               record.data['action'] = ['ticket_waitforever']
+                               record.data['message'] = None
+                               record.data['time'] = current_time
+                       else:
+                               if delta >= 7*SPERDAY:
+                                       record.data['action'] = ['ticket_waitforever']
+                                       record.data['message'] = None
+                                       record.data['time'] = current_time              # reset clock
+                               else:
+                                       record.data['action'] = ['ticket_waitforever']
+                                       record.data['message'] = None
+                                       return None
+
+               elif 'waitforever' in record.data['stage']:
+                       # more than 3 days since last action
+                       # TODO: send only on weekdays.
+                       # NOTE: expects that 'time' has been reset before entering waitforever stage
+                       record.data['takeaction'] = True
+                       if delta >= 3*SPERDAY:
+                               record.data['action'] = ['email-againwaitforever']
+                               record.data['message'] = record.data['message'][2]
+                               record.data['time'] = current_time              # reset clock
+                       else:
+                               record.data['action'] = ['waitforever']
+                               record.data['message'] = None
+                               return None                     # don't send if there's no action
+
+               else:
+                       # There is no action to be taken, possibly b/c the stage has
+                       # already been performed, but diagnose picked it up again.
+                       # two cases, 
+                       #       1. stage is unknown, or 
+                       #       2. delta is not big enough to bump it to the next stage.
+                       # TODO: figure out which. for now assume 2.
+                       print "UNKNOWN stage for %s; nothing done" % self.hostname
+                       record.data['action'] = ['unknown']
+                       record.data['message'] = record.data['message'][0]
+
+                       record.data['email'] = TECH
+                       record.data['action'] = ['noop']
+                       record.data['message'] = record.data['message'][0]
+                       record.data['stage'] = 'stage_actinoneweek'
+                       record.data['time'] = current_time              # reset clock
+                       record.data['takeaction'] = False
+
+               print "%s" % record.data['log'],
+               print "%15s" % record.data['action']
+               return record
+               
diff --git a/const.py b/const.py
new file mode 100644 (file)
index 0000000..9495c0a
--- /dev/null
+++ b/const.py
@@ -0,0 +1,25 @@
+
+# Contact roles.  The values are distinct bits so that roles can be OR-ed
+# together into one mask.
+TECH=1
+PI=2
+USER=4
+ADMIN=8
+
+# Address templates; each %s takes a single substitution
+# (presumably a site loginbase or slice name -- verify against callers).
+TECHEMAIL="tech-%s@sites.planet-lab.org"
+PIEMAIL="pi-%s@sites.planet-lab.org"
+SLICEMAIL="%s@slices.planet-lab.org"
+PLCEMAIL="support@planet-lab.org"
+
+# Time units and thresholds.  Every value below is a number of SECONDS;
+# the thresholds are written as a day count multiplied by SPERDAY.
+SPERMIN = 60
+SPERHOUR = 60*60
+SPERDAY = 86400
+PITHRESH = 7 * SPERDAY
+SLICETHRESH = 7 * SPERDAY
+# Time (5 days, in seconds) before attempting rins again
+RINSTHRESH = 5 * SPERDAY
+
+# Time (30 days, in seconds) before calling the node dead.
+DEADTHRESH = 30 * SPERDAY
+# Minimum number of nodes up before squeezing
+MINUP = 2
+
index 8a666c8..137e4a9 100644 (file)
@@ -163,6 +163,17 @@ If your node returns to normal operation after following these directions, then
 Thank you for your help,
   -- PlanetLab Central (support@planet-lab.org)
 """)
+       pcuthankyou_one=("""Thank you for correcting your PlanetLab node PCU - %(loginbase)s""",
+       """
+While monitoring your site, we noticed that the following PCU *improved* their states:
+
+%(hostname_list)s  
+Often, system administration is a thankless job, but not today. :-)
+
+Thank you!
+  -- PlanetLab Central (support@planet-lab.org)
+       """)
+
        thankyou=("""Thank you for helping maintain your PlanetLab nodes - %(loginbase)s""",
        """
 While monitoring your site, we noticed that the following nodes *improved*
@@ -196,7 +207,7 @@ Monitor restarted NM on the following machines:
 """As part of our machine monitoring and maintenance, we tried to use the PCU
 registered below, but could not for the following reason at the link below:
 
-       https://pl-virtual-03.cs.princeton.edu/cgi-bin/printbadpcus.php?id=%(pcu_id)s
+       https://monitor.planet-lab.org/cgi-bin/printbadpcus.php?id=%(pcu_id)s
 
 We need your help resolving this issue in two ways:  
 
@@ -228,7 +239,7 @@ Thank you very much for your help,
 registered below, and though it appears to succeed, we do not subsequently
 observe the associated nodes rebooting:
 
-    https://pl-virtual-03.cs.princeton.edu/cgi-bin/printbadpcus.php?id=%(pcu_id)s
+    https://monitor.planet-lab.org/cgi-bin/printbadpcus.php?id=%(pcu_id)s
 
 %(hostname_list)s
 
@@ -289,6 +300,7 @@ Thank you for your help,
        newbootcd=[newbootcd_one, newbootcd_two, newbootcd_three]
        newalphacd=[newalphacd_one, newalphacd_one, newalphacd_one]
        newthankyou=[thankyou,thankyou,thankyou]
+       pcuthankyou=[pcuthankyou_one,pcuthankyou_one,pcuthankyou_one]
        NMReset=[nmreset,nmreset,nmreset]
        pcutonodemapping=[pcutonodemapping_one, pcutonodemapping_one, pcutonodemapping_one]
        pcudown=[pcudown_one, pcudown_one, pcudown_one]
@@ -309,7 +321,74 @@ BootManager.log output follows:
 ---------------------------------------------------------
 %(bmlog)s
 """      )
+       donation_down_one=("""PlanetLab node donation setup: %(hostname)s""", 
+       """
+Hello,
+
+As part of PlanetLab node monitoring, we noticed the following node is registered in the PlanetLab database, but it is not completly setup and running.
+
+%(hostname_list)s 
+We are writing because we need your help completing the setup to ensure its full operation.
+
+You should have received directions for the complete configuration when you contacted the donation program coordinator at PlanetLab.  For review, or if you did not receive them, you can find the latest version here:
+
+    https://svn.planet-lab.org/wiki/DC7800Configuration
+
+It is essential that the AMT feature be configured to enable PlanetLab staff to remotely manage the machine.  The basic steps are:
+
+    Configure the DC7800 AMT feature  : https://www.planet-lab.org/AMT
+    Add a PCU to your site            : https://www.planet-lab.org/db/sites/pcu.php
+       Associate your node with the PCU  : Follow the 'My Site' link
+       Finally, download the Boot Image  : https://www.planet-lab.org/db/nodes/index.php?nodepattern=%(hostname)s
+       Burn Boot Image to media & Reboot your node
+
+You can confirm that your machine's PCU is correctly configured by visiting the AMT
+port using your browser, such as:
+
+    http://%(hostname)s:16992/
+
+If you need any clarification about the steps mentioned here, please feel free
+to contact us at PlanetLab Support (support@planet-lab.org).
+
+Thank you for your help,
+  -- PlanetLab Central (support@planet-lab.org)
+""")
+
+       donation_nopcu_one=("""PlanetLab node donation, PCU setup: %(hostname)s""", 
+"""
+Hello,
+
+As part of PlanetLab node monitoring, we noticed the following node was not completely setup at your site:
+
+%(hostname_list)s 
+We are writing because we need your help completing the setup to ensure its full operation.
+
+The DC7800 comes with a built-in remote management feature.  The PCU functionality on your node is not configured.  The result of this is that we are unable to remotely administer this machine.
+
+You should have received directions for the complete configuration when you contacted the donation program coordinator at PlanetLab.  For review, or if you did not receive them, you can find the latest version here:
+
+    https://svn.planet-lab.org/wiki/DC7800Configuration
+
+It is essential that the PCU be configured.  The basic steps are:
+
+    Configure the DC7800 AMT feature  : https://www.planet-lab.org/AMT
+    Add a PCU to your site            : https://www.planet-lab.org/db/sites/pcu.php
+       Associate your node with the PCU  : Follow the 'My Site' link
+
+You can confirm that your machine is correctly configured by visiting the AMT
+port using your browser, such as:
+
+    http://%(hostname)s:16992/
+
+If you need any clarification about the steps mentioned here, please feel free
+to contact us at PlanetLab Support (support@planet-lab.org).
+
+Thank you for your help,
+  -- PlanetLab Central (support@planet-lab.org)
+""")
 
+       donation_nopcu = [ donation_nopcu_one, donation_nopcu_one, donation_nopcu_one ]
+       donation_down = [ donation_down_one, donation_down_one, donation_down_one ]
        minimalhardware = ("""Hardware requirements not met on PlanetLab host %(hostname)s""", 
                                           """
 While trying to automatically recover this machine:
index 141f9ac..5b04398 100755 (executable)
@@ -8,9 +8,9 @@ import time
 
 # QUERY all nodes.
 COMON_COTOPURL= "http://summer.cs.princeton.edu/status/tabulator.cgi?" + \
-                                       "table=table_nodeview&" + \
-                                   "dumpcols='name,resptime,sshstatus,uptime,lastcotop,cpuspeed,memsize,disksize'&" + \
-                                   "formatcsv"
+                               "table=table_nodeview&" + \
+                               "dumpcols='name,resptime,sshstatus,uptime,lastcotop,cpuspeed,memsize,disksize'&" + \
+                               "formatcsv"
                                    #"formatcsv&" + \
                                        #"select='lastcotop!=0'"
 
@@ -26,7 +26,7 @@ import comon
 import threadpool
 import syncplcdb
 from nodequery import verify,query_to_dict,node_select
-
+import traceback
 import plc
 import auth
 api = plc.PLC(auth.auth, auth.plc)
@@ -72,7 +72,7 @@ EOF                   """)
                                '', 'princeton_comon' : '', 'princeton_comon_running' : '',
                                'princeton_comon_procs' : '', 'sshport' : None})
        except:
-               import traceback; print traceback.print_exc()
+               print traceback.print_exc()
                sys.exit(1)
 
        ### RUN SSH ######################
@@ -181,19 +181,19 @@ EOF                       """)
                values['comonstats'] = {'resptime':  '-1', 
                                                                'uptime':    '-1',
                                                                'sshstatus': '-1', 
-                                                               'lastcotop': '-1'}
+                                                               'lastcotop': '-1',
+                                                               'cpuspeed' : "null",
+                                                               'disksize' : 'null',
+                                                               'memsize'  : 'null'}
        # include output value
        ### GET PLC NODE ######################
        b_except = False
        plc_lock.acquire()
 
        try:
-               d_node = plc.getNodes({'hostname': nodename}, ['pcu_ids', 'site_id', 'last_contact', 'boot_state', 'nodegroup_ids'])
+               d_node = plc.getNodes({'hostname': nodename}, ['pcu_ids', 'site_id', 'date_created', 'last_updated', 'last_contact', 'boot_state', 'nodegroup_ids'])
        except:
                b_except = True
-               import traceback
-               b_except = True
-               import traceback
                traceback.print_exc()
 
        plc_lock.release()
@@ -208,13 +208,15 @@ EOF                       """)
                        values['pcu'] = "NOPCU"
                site_id = d_node[0]['site_id']
                last_contact = d_node[0]['last_contact']
-               nodegroups = d_node[0]['nodegroup_ids']
+               nodegroups = [ i['name'] for i in api.GetNodeGroups(d_node[0]['nodegroup_ids']) ]
                values['plcnode'] = {'status' : 'SUCCESS', 
                                                        'pcu_ids': pcu, 
                                                        'boot_state' : d_node[0]['boot_state'],
                                                        'site_id': site_id,
                                                        'nodegroups' : nodegroups,
-                                                       'last_contact': last_contact}
+                                                       'last_contact': last_contact,
+                                                       'date_created': d_node[0]['date_created'],
+                                                       'last_updated': d_node[0]['last_updated']}
        else:
                values['pcu']     = "UNKNOWN"
                values['plcnode'] = {'status' : "GN_FAILED"}
@@ -229,7 +231,6 @@ EOF                 """)
                                                        ['max_slices', 'slice_ids', 'node_ids', 'login_base'])
        except:
                b_except = True
-               import traceback
                traceback.print_exc()
 
        plc_lock.release()
@@ -389,7 +390,6 @@ if __name__ == '__main__':
        try:
                main()
        except Exception, err:
-               import traceback
                print traceback.print_exc()
                print "Exception: %s" % err
                print "Saving data... exitting."
index 017b4c4..399359a 100755 (executable)
@@ -8,6 +8,7 @@ import socket
 
     
 import signal
+import traceback
 
 #old_handler = signal.getsignal(signal.SIGCHLD)
 
@@ -78,7 +79,6 @@ def get_pcu(pcuname):
                                if i['pcu_id'] == pcuname:
                                        l_pcu = i
                except:
-                       import traceback
                        traceback.print_exc()
                        l_pcu = None
 
@@ -97,7 +97,6 @@ def get_nodes(node_ids):
                                if n['node_id'] in node_ids:
                                        l_node.append(n)
                except:
-                       import traceback
                        traceback.print_exc()
                        l_node = None
 
@@ -155,7 +154,6 @@ def get_plc_site_values(site_id):
                                        d_site = site
                                        break
                except:
-                       import traceback
                        traceback.print_exc()
                        values = None
 
@@ -194,7 +192,6 @@ def collectPingAndSSH(pcuname, cohash):
                                continue_probe = False
                except:
                        b_except = True
-                       import traceback
                        traceback.print_exc()
                        continue_probe = False
 
@@ -289,7 +286,6 @@ def collectPingAndSSH(pcuname, cohash):
                print values
                errors = values
                print "____________________________________"
-               import traceback
                errors['traceback'] = traceback.format_exc()
                print errors['traceback']
 
@@ -433,7 +429,6 @@ if __name__ == '__main__':
                main()
                time.sleep(1)
        except Exception, err:
-               import traceback
                traceback.print_exc()
                print "Exception: %s" % err
                print "Saving data... exitting."
index f40e108..31e035c 100755 (executable)
@@ -21,18 +21,18 @@ def getconf(hostname, force=False, media=None):
 
        args = {}
        if not media:
-               args['url_list']  = "   http://pl-virtual-03.cs.princeton.edu/bootcds/%s-partition.usb\n" % hostname
-               args['url_list'] += "   http://pl-virtual-03.cs.princeton.edu/bootcds/%s.iso" % hostname
+               args['url_list']  = "   http://monitor.planet-lab.org/bootcds/%s-partition.usb\n" % hostname
+               args['url_list'] += "   http://monitor.planet-lab.org/bootcds/%s.iso" % hostname
        else:
                if media == "usb":
-                       args['url_list']  = "   http://pl-virtual-03.cs.princeton.edu/bootcds/%s-partition.usb\n" % hostname
+                       args['url_list']  = "   http://monitor.planet-lab.org/bootcds/%s-partition.usb\n" % hostname
                elif media == "iso":
-                       args['url_list']  = "   http://pl-virtual-03.cs.princeton.edu/bootcds/%s.iso" % hostname
+                       args['url_list']  = "   http://monitor.planet-lab.org/bootcds/%s.iso" % hostname
                else:
-                       args['url_list']  = "   http://pl-virtual-03.cs.princeton.edu/bootcds/%s-partition.usb\n" % hostname
-                       args['url_list'] += "   http://pl-virtual-03.cs.princeton.edu/bootcds/%s.iso" % hostname
+                       args['url_list']  = "   http://monitor.planet-lab.org/bootcds/%s-partition.usb\n" % hostname
+                       args['url_list'] += "   http://monitor.planet-lab.org/bootcds/%s.iso" % hostname
                        
-       #print "http://pl-virtual-03.cs.princeton.edu/bootcds/%s.usb\n" % hostname
+       #print "http://monitor.planet-lab.org/bootcds/%s.usb\n" % hostname
 
        return args
 
index 0819abe..eee1fdc 100755 (executable)
@@ -8,7 +8,7 @@ import xml, xmlrpclib
 
 args = {}
 args['known_hosts'] =  os.environ['HOME'] + os.sep + ".ssh" + os.sep + "known_hosts"
-args['XMLRPC_SERVER'] = 'https://www.planet-lab.org/PLCAPI/'
+args['XMLRPC_SERVER'] = 'https://boot.planet-lab.org/PLCAPI/'
 
 class SSHKnownHosts:
        def __init__(self, args = args):
index a1e18d6..95d0fc5 100755 (executable)
@@ -17,15 +17,16 @@ import auth
 api = plc.PLC(auth.auth, auth.plc)
 
 import policy
-
+import traceback
 from config import config as cfg
-import config as config2
+import config as configmodule
 from optparse import OptionParser
 
 from nodecommon import *
 from nodequery import verify,query_to_dict,node_select
 import soltesz
 from unified_model import *
+import os
 
 import time
 
@@ -71,7 +72,7 @@ class Reboot(object):
                                        return ret
 
                                except Exception,e:
-                                       import traceback; print traceback.print_exc(); print e
+                                       print traceback.print_exc(); print e
 
                                        # NOTE: this failure could be an implementation issue on
                                        #               our end.  So, extra notices are confusing...
@@ -112,7 +113,7 @@ class Reboot(object):
                try:
                        return monitor.reboot(host)
                except Exception, e:
-                       import traceback; print traceback.print_exc(); print e
+                       print traceback.print_exc(); print e
                        return False
 
 class RebootDebug(Reboot):
@@ -230,9 +231,8 @@ if config.nodeselect:
 
 if config.findbad:
        # rerun findbad with the nodes in the given nodes.
-       import os
        file = "findbad.txt"
-       config2.setFileFromList(file, hostnames)
+       configmodule.setFileFromList(file, hostnames)
        os.system("./findbad.py --cachenodes --debug=0 --dbname=findbad --increment --nodelist %s" % file)
 
 fb = soltesz.dbLoad("findbad")
@@ -246,7 +246,7 @@ for host in hostnames:
                try:
                        node = api.GetNodes(host)[0]
                except:
-                       import traceback; print traceback.print_exc(); 
+                       print traceback.print_exc(); 
                        print "FAILED GETNODES for host: %s" % host
                        continue
                        
@@ -328,8 +328,8 @@ for host in hostnames:
                                print "ALL METHODS OF RESTARTING %s FAILED" % host
                                args = {}
                                args['hostname'] = host
-                               m = PersistMessage(host, "ALL FAIL for %(hostname)s" % args,
-                                                                                        "nada", False, db='suspect_persistmessages')
+                               m = PersistMessage(host, "ALL METHODS FAILED for %(hostname)s" % args,
+                                                                                        "CANNOT CONTACT", False, db='suspect_persistmessages')
                                m.reset()
                                m.send(['monitor-list@lists.planet-lab.org'])
 
@@ -340,7 +340,7 @@ for host in hostnames:
                print "Killed by interrupt"
                sys.exit(0)
        except:
-               import traceback; print traceback.print_exc();
+               print traceback.print_exc();
                print "Continuing..."
 
        time.sleep(1)
index 78db954..b8fe5cb 100644 (file)
@@ -102,7 +102,6 @@ def main():
        for host in sys.argv[1:]:
                reboot(host)
 
-print "hello?"
 if __name__ == '__main__':
        print "calling main"
        main()
index 70eee04..e8789da 100644 (file)
@@ -398,7 +398,7 @@ class Diagnose:
 
                # NOTE: these settings can be overridden by command line arguments,
                #       or the state of a record, i.e. if already in RT's Support Queue.
-               pf = PersistFlags(loginbase, 1, db='site_persitflags')
+               pf = PersistFlags(loginbase, 1, db='site_persistflags')
                nodes_up = pf.nodes_up
                if nodes_up < MINUP:
                        d_diag_site[loginbase]['config']['squeeze'] = True
diff --git a/nodeaction.py b/nodeaction.py
new file mode 100755 (executable)
index 0000000..1b0d38e
--- /dev/null
@@ -0,0 +1,50 @@
+#!/usr/bin/python
+
+import plc
+import auth
+api = plc.PLC(auth.auth, auth.plc)
+
+import soltesz
+#fb = soltesz.dbLoad("findbad")
+#act_all = soltesz.dbLoad("act_all")
+
+import reboot
+
+import time
+from model import *
+
+from config import config
+from optparse import OptionParser
+
+# Command-line driver: apply the requested administrative action(s) to every
+# hostname passed as a positional argument.
+parser = OptionParser()
+# 'backoff' is given an explicit default like the other flags, so the
+# attribute is always a boolean instead of optparse's implicit None.
+parser.set_defaults(node=None, rins=False, bootstate=None, endrecord=False,
+                                       backoff=False)
+parser.add_option("", "--backoff", dest="backoff", action="store_true",
+                                       help="Back off all penalties applied to a site.")
+# The help text used to be a copy of the --backoff description; it now
+# describes what the flag actually does below.
+parser.add_option("", "--rins", dest="rins", action="store_true",
+                                       help="Set the boot_state of each node to 'rins'.")
+parser.add_option("", "--bootstate", dest="bootstate", 
+                                       help="set the bootstate for a node.")
+config = config(parser)
+config.parse_args()
+
+for node in config.args:
+       config.node = node
+
+       #plc_nodeinfo = api.GetNodes({'hostname': config.node}, None)[0]
+       #fb_nodeinfo  = fb['nodes'][config.node]['values']
+
+       # The actions are independent; when both --bootstate and --rins are
+       # given, the 'rins' update is issued last.
+       if config.bootstate:
+               print "Setting %s to bootstate %s" % ( node, config.bootstate )
+               api.UpdateNode(node, {'boot_state' : config.bootstate})
+
+       if config.rins:
+               print "Setting %s to rins" % node
+               api.UpdateNode(node, {'boot_state' : 'rins'})
+
+       if config.backoff:
+               print "Enabling Slices & Slice Creation for %s" % node
+               plc.enableSlices(node)
+               plc.enableSliceCreation(node)
+
+               # plc_print_nodeinfo(plc_nodeinfo)
+               # fb_print_nodeinfo(fb_nodeinfo)
diff --git a/nodebad.py b/nodebad.py
new file mode 100755 (executable)
index 0000000..74117a1
--- /dev/null
@@ -0,0 +1,161 @@
+#!/usr/bin/python
+
+import os
+import sys
+import string
+import time
+
+
+import soltesz
+import comon
+import threadpool
+import syncplcdb
+from nodequery import verify,query_to_dict,node_select
+
+import plc
+import auth
+api = plc.PLC(auth.auth, auth.plc)
+from unified_model import *
+from monitor_policy import MINUP
+
+round = 1
+externalState = {'round': round, 'nodes': {}}
+count = 0
+
+def main(config):
+       """
+               Load the cached round state (or start from the module default),
+               optionally bump the global round, then check every selected node.
+
+               config supplies 'dbname', 'increment' and 'node'.
+       """
+       global externalState
+       externalState = soltesz.if_cached_else(1, config.dbname, lambda : externalState) 
+       if config.increment:
+               # a higher global round makes every node look stale, forcing a refresh
+               externalState['round'] = externalState['round'] + 1
+
+       # The return value is not used; the node list is read back from the
+       # "l_plcnodes" database right after this call.
+       syncplcdb.create_plcdb()
+       plc_nodes = soltesz.dbLoad("l_plcnodes")
+
+       if config.node:
+               targets = [config.node]
+       else:
+               targets = [entry['hostname'] for entry in plc_nodes]
+
+       checkAndRecordState(targets, plc_nodes)
+
+def checkAndRecordState(l_nodes, l_plcnodes):
+       """
+               Refresh the recorded status of each hostname in l_nodes whose
+               stored round is behind the global round, and periodically dump
+               externalState to the database named by config.dbname.
+
+               NOTE(review): 'config' is read from module scope; in this file it
+               is only bound by the __main__ block.
+       """
+       global externalState
+       global count
+       global_round = externalState['round']
+
+       for nodename in l_nodes:
+               # first sighting of a node: start it at round 0 with no values
+               record = externalState['nodes'].setdefault(nodename, {'round': 0, 'values': []})
+
+               if record['round'] < global_round:
+                       # stale entry: collect fresh values and stamp the current round
+                       values = collectStatusAndState(nodename, l_plcnodes)
+                       global_round = externalState['round']
+                       record['values'] = values
+                       record['round'] = global_round
+               else:
+                       count += 1
+
+               # checkpoint whenever the running count is a multiple of 20
+               if count % 20 == 0:
+                       soltesz.dbDump(config.dbname, externalState)
+
+       # final dump so the last partial batch is saved as well
+       soltesz.dbDump(config.dbname, externalState)
+
+fb = soltesz.dbLoad('findbad')
+hn2lb = soltesz.dbLoad("plcdb_hn2lb")
+
+def getnodesup(nodelist):
+       """
+               Return how many nodes in nodelist have a record in the global
+               findbad database 'fb' whose state is "BOOT".
+
+               nodelist is a sequence of dicts with a 'hostname' key.  Nodes that
+               are absent from fb, or whose record cannot be read, are not counted.
+       """
+       up = 0
+       for node in nodelist:
+               hostname = node['hostname']
+               # Membership is tested on the dict itself; the previous
+               # '.keys()' test scanned a list for every node.
+               if hostname not in fb['nodes']:
+                       continue
+               try:
+                       if fb['nodes'][hostname]['values']['state'] == "BOOT":
+                               up = up + 1
+               except Exception:
+                       # Best effort: an incomplete or malformed record simply is
+                       # not counted.  Unlike a bare 'except:', this no longer
+                       # swallows KeyboardInterrupt or SystemExit.
+                       pass
+       return up
+
+def get(fb, path):
+       """
+               Follow the "/"-separated path through the nested dict fb.
+
+               Returns the value found at the end of the path, or None as soon as
+               a component is missing or the value reached so far is not a dict.
+       """
+       values = fb
+       for index in path.split("/"):
+               # Only dicts can be descended into.  Without this check a string
+               # value that happened to contain the next component passed the
+               # 'in' test and then raised TypeError on indexing, and a number
+               # raised TypeError on the 'in' test itself.
+               if not isinstance(values, dict) or index not in values:
+                       return None
+               values = values[index]
+       return values
+
+def collectStatusAndState(nodename, l_plcnodes):
+       """
+               Derive the status of nodename from the global findbad database
+               'fb' and store it in the node's PersistFlags record.
+
+               Returns None when nodename is not listed in l_plcnodes, True
+               otherwise.  Increments the global 'count' for every node processed.
+       """
+       global count
+
+       # locate the PLC record for this hostname
+       d_node = None
+       for candidate in l_plcnodes:
+               if candidate['hostname'] == nodename:
+                       d_node = candidate
+                       break
+       if not d_node:
+               return None
+
+       pf = PersistFlags(nodename, 1, db='node_persistflags')
+
+       # seed fields that a new record does not have yet
+       if not pf.checkattr('last_changed'):
+               pf.last_changed = time.time()
+               
+       pf.last_checked = time.time()
+
+       if not pf.checkattr('status'):
+               pf.status = "unknown"
+
+       # BOOT -> "good", DEBUG -> the PLC boot_state recorded by findbad,
+       # anything else -> "down"
+       state = get(fb, "nodes/" + nodename + "/values/state")
+       if state == "BOOT":
+               new_status = "good"
+       elif state == "DEBUG":
+               new_status = get(fb, "nodes/" + nodename + "/values/plcnode/boot_state")
+       else:
+               new_status = "down"
+
+       # last_changed only moves when the status actually differs
+       if pf.status != new_status:
+               pf.last_changed = time.time()
+       pf.status = new_status
+
+       count += 1
+       print "%d %35s %s since(%s)" % (count, nodename, pf.status, diff_time(pf.last_changed))
+       # pf.enabled and pf.suspended are updated by other modules
+
+       pf.save()
+
+       return True
+
+# Script entry point: parse the command line, run main(), and always save
+# the collected state, even when main() fails.
+if __name__ == '__main__':
+       from config import config
+       from optparse import OptionParser
+       parser = OptionParser()
+       parser.set_defaults(filename=None, node=None, nodeselect=False, nodegroup=None, 
+                                               increment=False, dbname="nodebad", cachenodes=False)
+       parser.add_option("", "--node", dest="node", metavar="hostname", 
+                                               help="Provide a single node to operate on")
+       parser.add_option("", "--nodelist", dest="nodelist", metavar="file.list", 
+                                               help="Provide a list of files to operate on")
+
+       parser.add_option("", "--dbname", dest="dbname", metavar="FILE", 
+                                               help="Specify the name of the database to which the information is saved")
+       parser.add_option("-i", "--increment", action="store_true", dest="increment", 
+                                               help="Increment round number to force refresh or retry")
+       config = config(parser)
+       config.parse_args()
+
+       try:
+               main(config)
+       except Exception, err:
+               import traceback
+               # print_exc() writes the traceback itself and returns None, so
+               # wrapping it in 'print' only added a stray "None" line.
+               traceback.print_exc()
+               print "Exception: %s" % err
+               print "Saving data... exitting."
+               soltesz.dbDump(config.dbname, externalState)
+               # NOTE(review): exits with status 0 even though main() failed;
+               # kept as is in case callers rely on it.
+               sys.exit(0)
index 20f1513..fcaaefe 100755 (executable)
@@ -119,15 +119,8 @@ def main():
        hostnames = [ n['hostname'] for n in nodelist ]
 
        # commands:
-       if config.list:
-               print " ---- Nodes in the %s Node Group ----" % group_str
-               i = 1
-               for node in nodelist:
-                       print "%-2d" % i, 
-                       print nodegroup_display(node, fb, config)
-                       i += 1
 
-       elif config.add and config.nodegroup:
+       if config.add and config.nodegroup:
                for node in hostnames:
                        print "Adding %s to %s nodegroup" % (node, config.nodegroup)
                        api.AddNodeToNodeGroup(node, config.nodegroup)
@@ -137,6 +130,14 @@ def main():
                        print "Deleting %s from %s nodegroup" % (node, config.nodegroup)
                        api.DeleteNodeFromNodeGroup(node, config.nodegroup)
 
+       elif config.list:
+               print " ---- Nodes in the %s Node Group ----" % group_str
+               i = 1
+               for node in nodelist:
+                       print "%-2d" % i, 
+                       print nodegroup_display(node, fb, config)
+                       i += 1
+
        else:
                print "no other options supported."
 
index b6d0a58..16e48a2 100755 (executable)
@@ -15,19 +15,20 @@ import time
 from model import *
 from nodecommon import *
 
-from config import config
-from optparse import OptionParser
-
-parser = OptionParser()
-parser.set_defaults(node=None, fields='state', fromtime=None)
-parser.add_option("", "--node", dest="node", metavar="nodename.edu", 
-                                       help="A single node name to add to the nodegroup")
-parser.add_option("", "--fields", dest="fields", metavar="key",
-                                       help="Which record field to extract from all files.")
-parser.add_option("", "--fromtime", dest="fromtime", metavar="YYYY-MM-DD",
-                                       help="Specify a starting date from which to begin the query.")
-config = config(parser)
-config.parse_args()
+def get_filefromglob(d, str):
+       """
+               Return the name, without its ".pkl" suffix, of the first file in
+               the "archive-pdb" directory matching "<YYYY-MM-DD>*.<str>.pkl".
+
+               d must provide strftime(); str is the name part that follows the
+               date.  Raises IndexError when no file matches.
+       """
+       import os
+       import glob
+       path = "archive-pdb"
+       # NOTE(review): 'archive' is not used here; the constructor call is kept
+       # in case SPickle(path) has side effects -- confirm and remove.
+       archive = soltesz.SPickle(path)
+       glob_str = "%s*.%s.pkl" % (d.strftime("%Y-%m-%d"), str)
+       # Glob inside the directory instead of chdir()-ing into it and back:
+       # the old code left the process in "archive-pdb" whenever nothing
+       # matched, because the IndexError skipped the chdir("..").
+       matches = glob.glob(os.path.join(path, glob_str))
+       filename = os.path.basename(matches[0])
+       # strip the ".pkl" suffix
+       return filename[:-4]
 
 
 def fb_print_nodeinfo(fbnode, verbose, date=None):
@@ -72,59 +73,60 @@ def pcu_print_info(pcuinfo, hostname):
                        print "\t cmdhttps/locfg.pl -s %s -f iloxml/Reset_Server.xml -u %s -p '%s' | grep MESSAGE" % \
                                (reboot.pcu_name(pcuinfo), pcuinfo['username'], pcuinfo['password'])
 
-path = "archive-pdb"
-archive = soltesz.SPickle(path)
-
-if config.fromtime:
-       begin = config.fromtime
-else:
-       begin = "2007-11-06"
-
-if config.node is None and len(config.args) > 0:
-       config.node = config.args[0]
-elif config.node is None:
-       print "Add a hostname to arguments"
-       print "exit."
-       sys.exit(1)
+def main():
+       from config import config
+       from optparse import OptionParser
+
+       parser = OptionParser()
+       parser.set_defaults(node=None, fields='state', fromtime=None)
+       parser.add_option("", "--node", dest="node", metavar="nodename.edu", 
+                                               help="A single node name to add to the nodegroup")
+       parser.add_option("", "--fields", dest="fields", metavar="key",
+                                               help="Which record field to extract from all files.")
+       parser.add_option("", "--fromtime", dest="fromtime", metavar="YYYY-MM-DD",
+                                               help="Specify a starting date from which to begin the query.")
+       config = config(parser)
+       config.parse_args()
 
-d = datetime_fromstr(begin)
-tdelta = timedelta(1)
-verbose = 1
-
-def get_filefromglob(d, str):
-       import os
-       import glob
-       # TODO: This is aweful.
        path = "archive-pdb"
        archive = soltesz.SPickle(path)
-       glob_str = "%s*.%s.pkl" % (d.strftime("%Y-%m-%d"), str)
-       os.chdir(path)
-       #print glob_str
-       file = glob.glob(glob_str)[0]
-       #print "loading %s" % file
-       os.chdir("..")
-       return file[:-4]
-       #fb = archive.load(file[:-4])
-       
-
-while True:
-       file = get_filefromglob(d, "production.findbad")
-       #file = "%s.production.findbad" % d.strftime("%Y-%m-%d")
-       
-       try:
-               fb = archive.load(file)
-               if config.node in fb['nodes']:
-                       fb_nodeinfo  = fb['nodes'][config.node]['values']
-                       fb_print_nodeinfo(fb_nodeinfo, verbose, d.strftime("%Y-%m-%d"))
-
-               del fb
-               verbose = 0
-       except KeyboardInterrupt:
+
+       if config.fromtime:
+               begin = config.fromtime
+       else:
+               begin = "2007-11-06"
+
+       if config.node is None and len(config.args) > 0:
+               config.node = config.args[0]
+       elif config.node is None:
+               print "Add a hostname to arguments"
+               print "exit."
                sys.exit(1)
-       except:
-               #import traceback; print traceback.print_exc()
-               print d.strftime("%Y-%m-%d"), "No record"
 
-       d = d + tdelta
-       if d > datetime.now(): break
+       d = datetime_fromstr(begin)
+       tdelta = timedelta(1)
+       verbose = 1
 
+       while True:
+               file = get_filefromglob(d, "production.findbad")
+               #file = "%s.production.findbad" % d.strftime("%Y-%m-%d")
+               
+               try:
+                       fb = archive.load(file)
+                       if config.node in fb['nodes']:
+                               fb_nodeinfo  = fb['nodes'][config.node]['values']
+                               fb_print_nodeinfo(fb_nodeinfo, verbose, d.strftime("%Y-%m-%d"))
+
+                       del fb
+                       verbose = 0
+               except KeyboardInterrupt:
+                       sys.exit(1)
+               except:
+                       #import traceback; print traceback.print_exc()
+                       print d.strftime("%Y-%m-%d"), "No record"
+
+               d = d + tdelta
+               if d > datetime.now(): break
+
+if __name__ == "__main__":
+       main()
index 9458cf2..2a1d5f0 100755 (executable)
@@ -142,11 +142,15 @@ def pcu_print_info(pcuinfo, hostname):
                        print "\t telnet %s" % (reboot.pcu_name(pcuinfo))
                if pcuinfo['portstatus']['80'] == "open" or \
                        pcuinfo['portstatus']['443'] == "open":
-                       print "\t http://%s" % (reboot.pcu_name(pcuinfo))
+                       print "\t https://%s" % (reboot.pcu_name(pcuinfo))
+                       print "\t import %s.png" % (reboot.pcu_name(pcuinfo))
+                       print """\t mutt -s "crash for %s" -a %s.png sapanb@cs.princeton.edu < /dev/null""" % (hostname, reboot.pcu_name(pcuinfo))
                if pcuinfo['portstatus']['443'] == "open":
                        print "\t racadm.py -r %s -u %s -p '%s'" % (pcuinfo['ip'], pcuinfo['username'], pcuinfo['password'])
                        print "\t cmdhttps/locfg.pl -s %s -f iloxml/Reset_Server.xml -u %s -p '%s' | grep MESSAGE" % \
                                (reboot.pcu_name(pcuinfo), pcuinfo['username'], pcuinfo['password'])
+                       print "\t cmdhttps/locfg.pl -s %s -f iloxml/License.xml -u %s -p '%s' | grep MESSAGE" % \
+                               (reboot.pcu_name(pcuinfo), pcuinfo['username'], pcuinfo['password'])
                if pcuinfo['portstatus']['16992'] == "open":
                        print "\t ./cmdamt/remoteControl -A -verbose 'http://%s:16992/RemoteControlService' -user admin -pass '%s'" % (reboot.pcu_name(pcuinfo), pcuinfo['password'])
 
index 465c309..28cedb2 100755 (executable)
@@ -19,6 +19,8 @@ import re
 fb = soltesz.dbLoad("findbad")
 fbpcu = {}
 
+class NoKeyException(Exception):
+       """Signals that a path component was not found in the data."""
+       pass
+
 def daysdown_print_nodeinfo(fbnode, hostname):
        fbnode['hostname'] = hostname
        fbnode['daysdown'] = Diagnose.getStrDaysDown(fbnode)
@@ -49,6 +51,79 @@ def fb_print_nodeinfo(fbnode, hostname, fields=None):
                        format += "%%(%s)s " % f
                print format % fbnode
 
+def get(fb, path):
+    """
+    Follow the "/"-separated path through the nested container fb and
+    return the value found at its end.
+
+    Raises NoKeyException, carrying the offending component, as soon as
+    one component is not present.
+    """
+    current = fb
+    for component in path.split("/"):
+        if component not in current:
+            raise NoKeyException(component)
+        current = current[component]
+    return current
+
+def verifyType(constraints, data):
+       """
+               Evaluate a list of constraint dicts against the nested dict 'data'.
+
+               constraints is a list of key, value pairs.
+               # [ {... : ...}==AND , ... , ... , ] == OR
+
+               Each key is a "/"-separated path resolved with get(); each value
+               is an object providing name() and a 'value' attribute.  Every
+               entry of one dict must hold (AND); the result is True as soon as
+               any dict holds (OR).  An empty dict never matches, and a dict
+               evaluates to False when data is None.
+       """
+       con_or_true = False
+       for con in constraints:
+               #print "con: %s" % con
+               # an empty constraint dict starts out (and stays) False
+               if len(con.keys()) == 0:
+                       con_and_true = False
+               else:
+                       con_and_true = True
+
+               for key in con.keys():
+                       #print "looking at key: %s" % key
+                       if data is None:
+                               con_and_true = False
+                               break
+
+                       try:
+                               # probe the path first: raises NoKeyException before the
+                               # constraint object is consulted
+                               get(data,key)
+                               o = con[key]
+                               if o.name() == "Match":
+                                       # regex search on the value; a None value fails
+                                       if get(data,key) is not None:
+                                               value_re = re.compile(o.value)
+                                               con_and_true = con_and_true & (value_re.search(get(data,key)) is not None)
+                                       else:
+                                               con_and_true = False
+                               elif o.name() == "ListMatch":
+                                       # holds if the regex matches at least one list item
+                                       if get(data,key) is not None:
+                                               match = False
+                                               for listitem in get(data,key):
+                                                       value_re = re.compile(o.value)
+                                                       if value_re.search(listitem) is not None:
+                                                               match = True
+                                                               break
+                                               con_and_true = con_and_true & match
+                                       else:
+                                               con_and_true = False
+                               elif o.name() == "Is":
+                                       # plain equality
+                                       con_and_true = con_and_true & (get(data,key) == o.value)
+                               elif o.name() == "FilledIn":
+                                       # holds when the value has non-zero length
+                                       con_and_true = con_and_true & (len(get(data,key)) > 0)
+                               elif o.name() == "PortOpen":
+                                       # the value is indexed by str(o.value) and must equal "open"
+                                       if get(data,key) is not None:
+                                               v = get(data,key)
+                                               con_and_true = con_and_true & (v[str(o.value)] == "open")
+                                       else:
+                                               con_and_true = False
+                               else:
+                                       # any other constraint name: regex search, with no
+                                       # None check on the value
+                                       value_re = re.compile(o.value)
+                                       con_and_true = con_and_true & (value_re.search(get(data,key)) is not None)
+
+                       except NoKeyException, key:
+                               # A missing path is reported but leaves con_and_true as it
+                               # was, so it does not make the clause fail.  'key' is rebound
+                               # to the exception here and reassigned by the for loop.
+                               print "missing key %s" % key,
+                               pass
+                               #print "missing key %s" % key
+                               #con_and_true = False
+
+               con_or_true = con_or_true | con_and_true
+
+       return con_or_true
+
 def verify(constraints, data):
        """
                constraints is a list of key, value pairs.
@@ -95,7 +170,7 @@ def query_to_dict(query):
        
        return ad
 
-def _pcu_in(fbdata):
+def pcu_in(fbdata):
        if 'plcnode' in fbdata:
                if 'pcu_ids' in fbdata['plcnode']:
                        if len(fbdata['plcnode']['pcu_ids']) > 0:
@@ -116,7 +191,7 @@ def pcu_select(str_query, nodelist=None):
                        if node not in nodelist: continue
        
                fb_nodeinfo  = fb['nodes'][node]['values']
-               if _pcu_in(fb_nodeinfo):
+               if pcu_in(fb_nodeinfo):
                        pcuinfo = fbpcu['nodes']['id_%s' % fb_nodeinfo['plcnode']['pcu_ids'][0]]['values']
                        if verify(dict_query, pcuinfo):
                                nodenames.append(node)
@@ -125,7 +200,7 @@ def pcu_select(str_query, nodelist=None):
                                pcunames.append(str)
        return (nodenames, pcunames)
 
-def node_select(str_query, nodelist=None):
+def node_select(str_query, nodelist=None, fbdb=None):
        hostnames = []
        if str_query is None: return hostnames
 
@@ -134,6 +209,9 @@ def node_select(str_query, nodelist=None):
        #print dict_query
        global fb
 
+       if fbdb is not None:
+               fb = fbdb
+
        for node in fb['nodes'].keys():
                if nodelist is not None: 
                        if node not in nodelist: continue
diff --git a/nodesets.py b/nodesets.py
new file mode 100755 (executable)
index 0000000..85bf9b7
--- /dev/null
@@ -0,0 +1,48 @@
+#!/usr/bin/python
+
+from config import config as cfg
+import sys
+import os
+from sets import Set
+from optparse import OptionParser
+
+def main():
+       """
+               Combine two node-list files with a set operation and print the result.
+
+               The two positional arguments name the list files.  --operation selects
+               "and" (the default), "or", or "minus"/"uniquetoone".  A summary line with
+               the size of the result is printed first, followed by one entry per line.
+       """
+       parser = OptionParser()
+       parser.set_defaults(operation="and",)
+       parser.add_option("", "--operation", dest="operation", metavar="and", 
+                                               help="""Which operation to perform on the two sets.  (and, or, minus""")
+
+       config = cfg(parser)
+       config.parse_args()
+
+       # Both list files are required; report that through the parser instead of
+       # failing with an IndexError traceback.
+       if len(config.args) < 2:
+               parser.error("two node list files are required")
+
+       s1 = config.getListFromFile(config.args[0])
+       s2 = config.getListFromFile(config.args[1])
+
+       s = nodesets(config.operation, s1, s2)
+
+       # The counts come straight from the computed result.  The "or" branch must
+       # test config.operation; a bare 'operation' is not defined in this function.
+       if config.operation == "and":
+               print "Nodes in both sets", len(s)
+       elif config.operation == "uniquetoone" or config.operation == "minus":
+               print "Nodes unique to set 1", len(s)
+       elif config.operation == "or":
+               print "Union of nodes in both sets", len(s)
+
+       for i in s:
+               print i
+
+
+def nodesets(operation, s1, s2):
+       """
+               Apply the named set operation to the two sequences s1 and s2.
+
+               "and" gives the intersection, "uniquetoone" or "minus" the members of s1
+               that are not in s2, and "or" the union, each as a Set.  Any other name
+               prints a complaint and gives back an empty list.
+       """
+       if operation == "and":
+               return Set(s1).intersection(Set(s2))
+
+       if operation in ("uniquetoone", "minus"):
+               return Set(s1).difference(Set(s2))
+
+       if operation == "or":
+               return Set(s1).union(Set(s2))
+
+       print "Unknown operation: %s " % operation
+       return []
diff --git a/pcubad.py b/pcubad.py
new file mode 100755 (executable)
index 0000000..ba9e83c
--- /dev/null
+++ b/pcubad.py
@@ -0,0 +1,163 @@
+#!/usr/bin/python
+
+import os
+import sys
+import string
+import time
+
+from reboot import pcu_name
+
+import soltesz
+import comon
+import threadpool
+import syncplcdb
+from nodequery import verify,query_to_dict,node_select
+
+import plc
+import auth
+api = plc.PLC(auth.auth, auth.plc)
+from unified_model import *
+from monitor_policy import MINUP
+
+# Module-wide bookkeeping shared by main(), checkAndRecordState() and
+# collectStatusAndState().
+# NOTE: 'round' shadows the builtin of the same name inside this module.
+round = 1
+# Persisted state: the global round number plus one record per PCU id under 'nodes'.
+externalState = {'round': round, 'nodes': {}}
+# Running counter: bumped for every up-to-date entry that is skipped and for
+# every PCU that is actually processed.
+count = 0
+
+def main(config):
+       """
+               Load the cached PCU state, pick the PCUs to examine and refresh them.
+
+               config supplies dbname (name of the state database), increment (bump the
+               global round so every PCU is collected again) and pcu (optional hostname
+               or IP address restricting the run to the matching PCUs).  Exits with
+               status 1 when config.pcu matches no PCU in the list.
+       """
+       global externalState
+       externalState = soltesz.if_cached_else(1, config.dbname, lambda : externalState) 
+       if config.increment:
+               # update global round number to force refreshes across all pcus
+               externalState['round'] += 1
+
+       l_plcpcus = soltesz.if_cached_else_refresh(1, 1, "pculist", lambda : plc.GetPCUs())
+
+       if config.pcu:
+               # Keep the ids of the PCUs whose hostname or IP matches.  The emptiness
+               # check has to look at this same list; checking a separate, never
+               # assigned variable sends every single-PCU run into the error branch.
+               l_pcus = [pcu['pcu_id'] for pcu in l_plcpcus
+                                       if pcu['hostname'] == config.pcu or pcu['ip'] == config.pcu]
+               if not l_pcus:
+                       print "ERROR: could not find pcu %s" % config.pcu
+                       sys.exit(1)
+       else:
+               l_pcus = [pcu['pcu_id'] for pcu in l_plcpcus]
+       
+       checkAndRecordState(l_pcus, l_plcpcus)
+
+def checkAndRecordState(l_pcus, l_plcpcus):
+       """
+               Bring every PCU in l_pcus up to the current global round.
+
+               l_pcus is a list of PCU ids and l_plcpcus the list of PCU records that is
+               handed on to collectStatusAndState().  An entry whose stored round is
+               behind the global round is collected again and stamped with that round;
+               an up-to-date entry only bumps the module counter.  The state is written
+               with soltesz.dbDump() whenever the counter is a multiple of 20 and once
+               more after the loop.
+
+               NOTE(review): relies on a module-level name 'config', which this file
+               only binds in its __main__ section.
+       """
+       global externalState
+       global count
+       current_round = externalState['round']
+
+       for pcu_id in l_pcus:
+               # unseen ids start at round 0, below the initial global round of 1
+               externalState['nodes'].setdefault(pcu_id, {'round': 0, 'values': []})
+
+               if externalState['nodes'][pcu_id]['round'] >= current_round:
+                       # already refreshed in this round
+                       count += 1
+               else:
+                       result = collectStatusAndState(pcu_id, l_plcpcus)
+                       # pick the round up again after collecting, then stamp the record
+                       current_round = externalState['round']
+                       record = externalState['nodes'][pcu_id]
+                       record['values'] = result
+                       record['round'] = current_round
+
+               if count % 20 == 0:
+                       soltesz.dbDump(config.dbname, externalState)
+
+       soltesz.dbDump(config.dbname, externalState)
+
+# Loaded once when the module is imported.  collectStatusAndState() reads each
+# PCU's 'reboot' value out of fbpcu through get().
+fbpcu = soltesz.dbLoad('findbadpcus')
+# NOTE(review): hn2lb is not referenced by any function in this file -- confirm
+# whether the load is still needed.
+hn2lb = soltesz.dbLoad("plcdb_hn2lb")
+
+def get(fb, path):
+       """
+               Walk the nested container fb along the "/"-separated path.
+
+               Returns the value found at the end of the path, or None as soon as one
+               path component is not contained in the current level.
+       """
+       node = fb
+       for part in path.split("/"):
+               if part not in node:
+                       return None
+               node = node[part]
+       return node
+
+def collectStatusAndState(pcuname, l_plcpcus):
+       """
+               Update the persistent status flags of one PCU from the findbadpcus data.
+
+               pcuname is a PCU id and l_plcpcus the list of PCU records to look it up
+               in.  The 'reboot' value stored for the PCU in fbpcu is mapped to a status
+               of "good" (0), "netdown" ('NetDown'), "badconfig" ('Not_Run') or "error"
+               (anything else); last_changed is reset whenever the status differs from
+               the stored one.  Returns True, or None when the id is not in l_plcpcus.
+       """
+       global count
+
+       # find the record that belongs to this id; the first hit wins
+       d_pcu = None
+       for candidate in l_plcpcus:
+               if candidate['pcu_id'] != pcuname:
+                       continue
+               d_pcu = candidate
+               break
+       if not d_pcu:
+               return None
+
+       pf = PersistFlags(pcuname, 1, db='pcu_persistflags')
+
+       # defaults for the attributes checkattr() reports as missing
+       if not pf.checkattr('last_changed'):
+               pf.last_changed = time.time()
+       pf.last_checked = time.time()
+       if not pf.checkattr('valid'):
+               pf.valid = "unknown"
+               pf.last_valid = 0
+       if not pf.checkattr('status'):
+               pf.status = "unknown"
+
+       id_prefix = "nodes/id_" + str(pcuname)
+       state_path = id_prefix + "/values/reboot"
+       # kept as before; nothing in this function reads it
+       bootstate_path = id_prefix + "/values/plcpcu/boot_state"
+
+       # translate the recorded reboot result into a status word
+       current_state = get(fbpcu, state_path)
+       if current_state == 0:
+               new_status = "good"
+       elif current_state == 'NetDown':
+               new_status = "netdown"
+       elif current_state == 'Not_Run':
+               new_status = "badconfig"
+       else:
+               new_status = "error"
+
+       # only a change of status moves the last_changed timestamp
+       if pf.status != new_status:
+               pf.last_changed = time.time()
+       pf.status = new_status
+
+       count += 1
+       print "%d %35s %s since(%s)" % (count, pcu_name(d_pcu), pf.status, diff_time(pf.last_changed))
+       # updated by other modules
+       #pf.enabled = 
+       #pf.suspended = 
+
+       pf.save()
+
+       return True
+
+# Script entry point: parse the command line, run main() and make sure the
+# collected state is written out even when the run fails part-way.
+if __name__ == '__main__':
+       from config import config
+       from optparse import OptionParser
+       parser = OptionParser()
+       parser.set_defaults(filename=None, pcu=None, pcuselect=False, pcugroup=None, 
+                                               increment=False, dbname="pcubad", cachepcus=False)
+       parser.add_option("", "--pcu", dest="pcu", metavar="hostname", 
+                                               help="Provide a single pcu to operate on")
+       parser.add_option("", "--pculist", dest="pculist", metavar="file.list", 
+                                               help="Provide a list of files to operate on")
+
+       parser.add_option("", "--dbname", dest="dbname", metavar="FILE", 
+                                               help="Specify the name of the database to which the information is saved")
+       parser.add_option("-i", "--increment", action="store_true", dest="increment", 
+                                               help="Increment round number to force refresh or retry")
+       # The config class is replaced by its instance from here on;
+       # checkAndRecordState() reads this module-level 'config'.
+       config = config(parser)
+       config.parse_args()
+
+       try:
+               main(config)
+       except Exception, err:
+               import traceback
+               # print_exc() writes the traceback itself and returns None, so its
+               # result must not be printed.
+               traceback.print_exc()
+               print "Exception: %s" % err
+               print "Saving data... exitting."
+               soltesz.dbDump(config.dbname, externalState)
+               # NOTE(review): the exit status stays 0 even though the run failed --
+               # confirm that callers do not rely on the status to detect errors.
+               sys.exit(0)
diff --git a/pcuinfo.py b/pcuinfo.py
new file mode 100755 (executable)
index 0000000..3c61cd1
--- /dev/null
@@ -0,0 +1,57 @@
+#!/usr/bin/python
+
+import soltesz
+import plc
+from optparse import OptionParser
+import sys
+from reboot import pcu_name, get_pcu_values
+
+import sys
+from config import config
+
+def print_dict(dict):
+       """Print every entry of dict as a right-aligned "key : value" line."""
+       for key, value in dict.items():
+               print "%30s : %s" % (key, value)
+
+# Command-line options specific to this script.
+parser = OptionParser()
+parser.set_defaults(withpcu=False,
+                                       refresh=False)
+parser.add_option("-f", "--nodelist",dest="filename",default="", metavar="FILE",
+                                 help="Provide the input file for the downnode list")
+parser.add_option("", "--refresh", action="store_true", dest="refresh",
+                                       help="Refresh the cached values")
+
+
+# The imported config class is replaced by its instance from here on.
+config = config(parser)
+config.parse_args()
+
+# Dry-run guard: without --run only the effective settings are listed, sorted
+# by name, and the script stops with status 1.
+# NOTE(review): 'run' is not defined by the options above -- presumably the
+# config wrapper adds it; confirm.
+if not config.run:
+       k = config.__dict__.keys()
+       k.sort()
+       for o in k:
+               print o, "=", config.__dict__[o]
+       print "Add --run to actually perform the command"
+       sys.exit(1)
+
+# Report: one line per PCU whose recorded reboot result is 0, showing its id,
+# password, user@host and the state of ports 22 and 23.
+pculist = soltesz.if_cached_else_refresh(1, config.refresh, "pculist", lambda : plc.GetPCUs())
+for pcu in pculist:
+       # PCUs without a model are left out
+       if pcu['model'] is None:
+               continue
+
+       # no model filter is applied at the moment, e.g.
+       # pcu['model'].find("APC AP79xx/Masterswitch") >= 0
+       host = pcu_name(pcu)
+       values = get_pcu_values(pcu['pcu_id'])
+
+       if 'portstatus' not in values:
+               portstatus = ""
+       else:
+               rb = values['reboot']
+               # a clean result, or a textual result that mentions an error
+               if rb == 0 or (not isinstance(rb, int) and rb.find("error") >= 0):
+                       portstatus = "22:%(22)s 23:%(23)s" % values['portstatus']
+
+       if values['reboot'] == 0:
+               login = "%s@%s" % (pcu['username'], host)
+               print "%6d %20s %50s %s" % (pcu['pcu_id'], pcu['password'], login, portstatus)
+
+#soltesz.dbDump("pculist", pculist, 'php')
diff --git a/plc.py b/plc.py
index 7b92445..f609abb 100644 (file)
--- a/plc.py
+++ b/plc.py
@@ -52,6 +52,10 @@ class PLC:
        def __repr__(self):
                return self.api.__repr__()
 
+def getAuthAPI():
+       """Return a PLC object constructed from auth.auth and auth.plc."""
+       # imported here rather than at module level, as before
+       import auth
+       api = PLC(auth.auth, auth.plc)
+       return api
+
 '''
 Returns list of nodes in dbg as reported by PLC
 '''
index 20c12b4..2afba4d 100644 (file)
--- a/policy.py
+++ b/policy.py
@@ -373,34 +373,59 @@ class Diagnose(Thread):
                
        def getDaysDown(cls, diag_record):
                daysdown = -1
-               if diag_record['comonstats']['uptime'] != "null":
-                       #print "uptime %s" % (int(float(diag_record['comonstats']['uptime'])) // (60*60*24))
+               last_contact = diag_record['plcnode']['last_contact']
+               date_created = diag_record['plcnode']['date_created']
+
+               if diag_record['comonstats']['uptime'] != "null" and diag_record['comonstats']['uptime'] != "-1":
                        daysdown = - int(float(diag_record['comonstats']['uptime'])) // (60*60*24)
-               elif diag_record['comonstats']['sshstatus'] != "null":
-                       daysdown = int(diag_record['comonstats']['sshstatus']) // (60*60*24)
-               elif diag_record['comonstats']['lastcotop'] != "null":
-                       daysdown = int(diag_record['comonstats']['lastcotop']) // (60*60*24)
+               elif last_contact is None:
+                       if date_created is not None:
+                               now = time.time()
+                               diff = now - date_created
+                               daysdown = diff // (60*60*24)
+                       else:
+                               daysdown = -1
                else:
                        now = time.time()
-                       last_contact = diag_record['plcnode']['last_contact']
-                       if last_contact == None:
-                               # the node has never been up, so give it a break
-                               daysdown = -1
-                       else:
-                               diff = now - last_contact
-                               daysdown = diff // (60*60*24)
+                       diff = now - last_contact
+                       daysdown = diff // (60*60*24)
                return daysdown
        getDaysDown = classmethod(getDaysDown)
 
        def getStrDaysDown(cls, diag_record):
-               daysdown = cls.getDaysDown(diag_record)
-               if daysdown > 0:
-                       return "%d days down"%daysdown
-               elif daysdown == -1:
-                       return "Unknown number of days"
+               daysdown = "unknown"
+               last_contact = diag_record['plcnode']['last_contact']
+               date_created = diag_record['plcnode']['date_created']
+
+               if      diag_record['comonstats']['uptime'] != "null" and \
+                       diag_record['comonstats']['uptime'] != "-1":
+                       daysdown = int(float(diag_record['comonstats']['uptime'])) // (60*60*24)
+                       daysdown = "%d days up" % daysdown
+
+               elif last_contact is None:
+                       if date_created is not None:
+                               now = time.time()
+                               diff = now - date_created
+                               daysdown = diff // (60*60*24)
+                               daysdown = "Never contacted PLC, created %s days ago" % daysdown
+                       else:
+                               daysdown = "Never contacted PLC"
                else:
-                       return "%d days up"% -daysdown
+                       now = time.time()
+                       diff = now - last_contact
+                       daysdown = diff // (60*60*24)
+                       daysdown = "%s days down" % daysdown
+               return daysdown
        getStrDaysDown = classmethod(getStrDaysDown)
+       #def getStrDaysDown(cls, diag_record):
+       #       daysdown = cls.getDaysDown(diag_record)
+       #       if daysdown > -1:
+       #               return "%d days down"%daysdown
+       #       elif daysdown == -1:
+       #               return "Has never contacted PLC"
+       #       else:
+       #               return "%d days up"% -daysdown
+       #getStrDaysDown = classmethod(getStrDaysDown)
 
        def __getCDVersion(self, diag_record, nodename):
                cdversion = ""
index 82cb33c..4cccdf0 100755 (executable)
--- a/reboot.py
+++ b/reboot.py
@@ -1196,12 +1196,13 @@ def reboot_test(nodename, values, continue_probe, verbose, dryrun):
 
        try:
                # DataProbe iPal (many sites)
-               if  continue_probe and values['model'].find("Dataprobe IP-41x/IP-81x") >= 0:
+               if  continue_probe and values['model'].find("IP-41x_IP-81x") >= 0:
                        ipal = IPAL(values, verbose, ['23', '80', '9100'])
                        rb_ret = ipal.reboot(values[nodename], dryrun)
                                
                # APC Masterswitch (Berkeley)
-               elif continue_probe and values['model'].find("APC AP79xx/Masterswitch") >= 0:
+               elif continue_probe and ( values['model'].find("AP79xx") >= 0 or \
+                                                                 values['model'].find("Masterswitch") >= 0 ):
                        print values
 
                        # TODO: make a more robust version of APC
@@ -1226,7 +1227,7 @@ def reboot_test(nodename, values, continue_probe, verbose, dryrun):
                                rb_ret = apc.reboot(values[nodename], dryrun)
 
                # BayTech DS4-RPC
-               elif continue_probe and values['model'].find("Baytech DS4-RPC") >= 0:
+               elif continue_probe and values['model'].find("DS4-RPC") >= 0:
                        if values['pcu_id'] in [1237,1052,1209,1002,1008,1041,1013,1022]:
                                # These  require a 'ctrl-c' to be sent... 
                                baytech = BayTechCtrlC(values, verbose, ['22', '23'])
@@ -1255,7 +1256,7 @@ def reboot_test(nodename, values, continue_probe, verbose, dryrun):
                                rb_ret = baytech.reboot(values[nodename], dryrun)
 
                # iLO
-               elif continue_probe and values['model'].find("HP iLO") >= 0:
+               elif continue_probe and values['model'].find("ilo") >= 0:
                        try:
                                hpilo = HPiLO(values, verbose, ['22'])
                                rb_ret = hpilo.reboot(0, dryrun)
@@ -1267,7 +1268,7 @@ def reboot_test(nodename, values, continue_probe, verbose, dryrun):
                                rb_ret = hpilo.reboot(0, dryrun)
 
                # DRAC ssh
-               elif continue_probe and values['model'].find("Dell RAC") >= 0:
+               elif continue_probe and values['model'].find("DRAC") >= 0:
                        # TODO: I don't think DRACRacAdm will throw an exception for the
                        # default method to catch...
                        try:
@@ -1281,15 +1282,12 @@ def reboot_test(nodename, values, continue_probe, verbose, dryrun):
                                wti = WTIIPS4(values, verbose, ['23'])
                                rb_ret = wti.reboot(values[nodename], dryrun)
 
-               elif continue_probe and values['model'].find("Intel AMT") >= 0:
+               elif continue_probe and values['model'].find("AMT") >= 0:
                                amt = IntelAMT(values, verbose, ['16992'])
                                rb_ret = amt.reboot(values[nodename], dryrun)
 
                # BlackBox PSExxx-xx (e.g. PSE505-FR)
-               elif continue_probe and \
-                       (values['model'].find("BlackBox PS5xx") >= 0 or
-                        values['model'].find("ePowerSwitch 1/4/8x") >=0 ):
-
+               elif continue_probe and values['model'].find("ePowerSwitch") >=0:
                        # TODO: allow a different port than http 80.
                        if values['pcu_id'] in [1089, 1071, 1046, 1035, 1118]:
                                eps = ePowerSwitchGood(values, verbose, ['80'])
diff --git a/rt.py b/rt.py
index 4a951b9..4a9c3fd 100644 (file)
--- a/rt.py
+++ b/rt.py
@@ -142,7 +142,7 @@ def rt_tickets():
 #WHERE Tk.Queue != 10 AND Tk.id > 10000 AND 
 #Tr.id=At.TransactionID AND ( Tk.Status = 'open' OR
 #Tk.Status = 'new') """
-       sqlall = """SELECT distinct Tk.id, Tk.Status, Tk.Subject, At.Content, Us.EmailAddress, Tk.LastUpdated FROM Tickets AS Tk, Attachments AS At, Users as Us JOIN Transactions AS Tr ON Tk.id=Tr.ObjectId WHERE (Tk.Queue=3 OR Tk.Queue=22) AND Tk.id > 10000 AND Tr.id=At.TransactionID AND ( Tk.Status = 'open' OR Tk.Status = 'new') AND Us.id=Tk.LastUpdatedBy """
+       sqlall = """SELECT distinct Tk.id, Tk.Status, Tk.Subject, At.Content, Us.EmailAddress, Tk.LastUpdated, Q.Name, Tk.Owner FROM Tickets AS Tk, Attachments AS At, Queues as Q, Users as Us JOIN Transactions AS Tr ON Tk.id=Tr.ObjectId WHERE (Tk.Queue=3 OR Tk.Queue=22) AND Tk.id > 10000 AND Tr.id=At.TransactionID AND ( Tk.Status = 'open' OR Tk.Status = 'new') AND Us.id=Tk.LastUpdatedBy AND Q.id=Tk.Queue """
 
 
        raw = fetch_from_db(db, sql)
@@ -163,6 +163,8 @@ def rt_tickets():
                                "content":str(x[3]),
                                "email":str(x[4]),
                                "lastupdated":str(x[5]),
+                               "queue":str(x[6]),
+                               "owner":str(x[7]),
                                },
                                raw)
 
diff --git a/showlatlon.py b/showlatlon.py
new file mode 100755 (executable)
index 0000000..6406c49
--- /dev/null
@@ -0,0 +1,204 @@
+#!/usr/bin/python
+
+import plc
+import auth
+api = plc.PLC(auth.auth, auth.plc)
+
+import sys
+import reboot
+from datetime import datetime, timedelta
+
+import soltesz
+import comon
+from nodecommon import color_pcu_state, datetime_fromstr
+from nodehistory import get_filefromglob
+import time
+
+# region
+# total
+# up
+# up with good hardware
+# up with good hardware & functional pcu
+
+#cm_url="http://summer.cs.princeton.edu/status/tabulator.cgi?table=table_nodeviewshort&format=formatcsv&dumpcols='name,cpuspeed,memsize,disksize'"
+#cm = soltesz.if_cached_else(1, "cmhardware", lambda : comon.comonget(cm_url))
+
+def gethardwarequality(nodename, fb):
+       """
+               Grade a node's hardware from the comon statistics stored in fb.
+
+               Returns "N/A" when the node is not in fb['nodes'] or when cpuspeed,
+               memsize and disksize are all "null"; "BAD" when any known value is below
+               its minimum (cpuspeed 2.4, memsize 2.9, disksize 320.0); "A-OK" when cpu
+               and memory meet their minimum and the disk does too or is unknown; and
+               "ZOO" for anything that fits none of these.
+
+               Side effect: fields missing from the node's comonstats dict are filled
+               in with "null".
+       """
+       if nodename not in fb['nodes']:
+               return "N/A"
+
+       cstat = fb['nodes'][nodename]['values']['comonstats']
+       for field in ['cpuspeed', 'memsize', 'disksize']:
+               if field not in cstat: cstat[field] = "null"
+
+       if cstat['cpuspeed'] != "null" and float(cstat['cpuspeed']) < 2.4:
+               return "BAD" # "cpu_slow",
+       if cstat['memsize'] != "null" and float(cstat['memsize']) < 2.9:
+               return "BAD" # "mem_small",
+       if cstat['disksize'] != "null" and float(cstat['disksize']) < 320.0:
+               return "BAD" # "disk_small",
+
+       if cstat['disksize'] == "null" and \
+          cstat['cpuspeed'] == "null" and \
+          cstat['memsize'] == "null":
+               return "N/A"
+
+       try:
+               if  float(cstat['cpuspeed']) >= 2.4 and \
+                       float(cstat['memsize']) >= 2.9 and \
+                       (cstat['disksize'] == "null" or float(cstat['disksize']) >= 320.0):
+                       return "A-OK"
+       except (ValueError, TypeError):
+               # A "null" cpu or memory value cannot be converted.  Only conversion
+               # failures are handled here, so that KeyboardInterrupt and SystemExit
+               # still propagate.
+               print cstat
+
+       return "ZOO"
+
+def addtostats(stats, a):
+       """
+               Count one node record a in the per-country-code table stats.
+
+               Every record raises 'total' for its a['cc'].  A record with status
+               "boot" also raises 'up'; if its hardware is "A-OK" as well it raises
+               'goodhw', and if on top of that its pcuok value is "PCUOK  " or
+               "PCUA-OK" it raises 'pcuok'.
+       """
+       entry = stats.setdefault(a['cc'], {'total' : 0, 'up' : 0, 'goodhw': 0, 'pcuok' : 0})
+       entry['total'] += 1
+
+       # each counter only applies to nodes that passed the previous test
+       if a['status'] != "boot":
+               return
+       entry['up'] += 1
+
+       if a['hardware'] != "A-OK":
+               return
+       entry['goodhw'] += 1
+
+       if a['pcuok'] in ("PCUOK  ", "PCUA-OK"):
+               entry['pcuok'] += 1
+
+def main():
+       """
+               Print node statistics per region and per country code for one archived
+               findbad snapshot, followed by one line per node.
+
+               With at least two command-line arguments, argv[1] is the date string of
+               the snapshot; otherwise today's date in %Y-%m-%d form is used.  A node
+               counts towards 'up' when its state is "boot", towards 'goodhw' when its
+               hardware additionally grades as "A-OK", and towards 'pcuok' when its PCU
+               state is additionally reported as OK (see addtostats()).
+       """
+
+       stats = {}
+       path = "archive-pdb"
+       archive = soltesz.SPickle(path)
+
+       # NOTE(review): 'format' taken from argv[2] is not used again in this
+       # function; only the default format is passed to strftime().
+       if len(sys.argv) > 2:
+               timestr = sys.argv[1]
+               format = sys.argv[2]
+               begin = timestr
+       else:
+               format = "%Y-%m-%d"
+               begin = time.strftime(format)
+
+       # names of the archived findbad / findbadpcus snapshots for that date
+       d = datetime_fromstr(begin)
+       fbstr = get_filefromglob(d, "production.findbad")
+       fbpcustr = get_filefromglob(d, "production.findbadpcus")
+
+       # NOTE(review): l_plcnodes is loaded but not used below.
+       l_plcnodes = soltesz.dbLoad("l_plcnodes")
+       l_plcsites = soltesz.dbLoad("l_plcsites")
+       lb2hn = soltesz.dbLoad("plcdb_lb2hn")
+       fb = archive.load(fbstr) 
+       fbpcu = archive.load(fbpcustr)
+       # hand the PCU snapshot to the reboot module
+       reboot.fb = fbpcu
+
+       results = []
+       # COLLECT nodegroups, nodes and node lists
+       for site in l_plcsites:
+               CC="none"
+               if site['login_base'] in lb2hn:
+                       nodes = lb2hn[site['login_base']]
+                       for node in nodes:
+                               hostname = node['hostname']
+                               fields = hostname.split(".")
+                               # Derive a country code: a two-letter top-level domain wins, then
+                               # ".edu", then a list of known sites, then the generic domains.
+                               # The order of the tests matters.
+                               if len(fields[-1]) == 2:
+                                       CC=fields[-1]
+                               elif fields[-1] == "edu":
+                                       CC="usedu"
+                               elif site['login_base'] == "ft":
+                                       CC="fr"
+                               elif site['login_base'] == "ntu":
+                                       CC="tw"
+                               elif site['login_base'] in ["mcgill", "canarieottawa", 'canariecalgary',
+                                                                                       'canariehalifax', 'canariemontreal',
+                                                                                       'canarietoronto', 'canariewinnipeg']:
+                                       CC="ca"
+                               elif site['login_base'] in ["plcoloclarasanti", "plcoloclarasaopa",
+                                                                                       "plcoloclarabueno", "plcoloclaratijua", 
+                                                                                       "plcoloclarapanam"]:
+                                       CC="southamerica"
+                               elif site['login_base'] in ["plcoloamst", 'cwi']:
+                                       CC="nl"
+                               elif site['login_base'] == "urv":
+                                       CC="es"
+                               elif site['login_base'] == "ncl":
+                                       CC="uk"
+                               elif site['login_base'] == "waterford":
+                                       CC="ie"
+                               elif site['login_base'] in ["kisti", "snummlab"]:
+                                       CC="kr"
+                               elif site['login_base'] == "astri":
+                                       CC="cn"
+                               elif fields[-1] in [ "org", "net" ]:
+                                       CC="usorg"
+                               elif fields[-1] == "com":
+                                       CC="uscom"
+                               else:
+                                       CC=fields[-1]
+
+                               # only nodes present in the findbad snapshot are reported and counted
+                               if hostname in fb['nodes']:
+                                       args = {'cc': CC, 
+                                               'site' : site['login_base'],
+                                               'host' : hostname,
+                                               'status' : fb['nodes'][hostname]['values']['state'].lower(),
+                                               'hardware' : gethardwarequality(hostname, fb),
+                                               'pcuok' : color_pcu_state(fb['nodes'][hostname]['values']) }
+                                       results.append("%(cc)7s %(status)8s %(hardware)8s %(pcuok)8s %(site)15s %(host)42s " % args)
+                                       addtostats(stats, args)
+               else:
+                       # site without any node: placeholder coordinates, read only by the
+                       # commented-out print below
+                       site['latitude'] = -2
+                       site['longitude'] = -2
+
+               #print "%4s %20s %8s %8s" % (CC, site['login_base'], site['latitude'], site['longitude'])
+
+       # country codes (as assigned above) grouped into the regions that are reported
+       regions = { 'mideast'   : ['cy', 'gr', 'il', 'in', 'lb', 'pk'],
+                               'ca'        : ['ca'],
+                               'usa'           : ['pr','us', 'uscom', 'usedu', 'usorg'],
+                               'europe'        : ['at','ch','cz','be', 'de', 'dk', 
+                                                          'es','fi', 'fr', 'hu', 'ie', 'is', 'it','nl',
+                                                          'no', 'pl', 'pt', 'se', 'tr', 'uk'],
+                               'asia'          : ['cn','hk','jp','kr', 'ru', 'sg', 'si','tw',],
+                               'australia': ['au', 'nz',],
+                               'southam'       : ['ar','br','southamerica','uy', 've'],
+                               }
+       # fold stats
+       statsfold = {}
+       for key in regions.keys():
+               statsfold[key] = {'total' : 0, 'up' : 0, 
+                                               'goodhw': 0, 'pcuok' : 0}
+
+       # grand total; country codes that belong to no region are not included
+       totaltotal = {  'total' : 0, 'up' : 0, 
+                                       'goodhw': 0, 'pcuok' : 0}
+       # for all of the cc stats
+       for cc in stats.keys():
+               # search for the cc in the regions dict
+               for region in regions:
+                       # if the cc is assigned to a region
+                       if cc in regions[region]:
+                               # add all values in cc stats to that region
+                               for key in statsfold[region]:
+                                       statsfold[region][key] += stats[cc][key]
+                                       totaltotal[key] += stats[cc][key]
+
+       # print folded stats
+       print "       REGION | total | up  |& goodhw |& pcuok "
+       for region in statsfold.keys():
+               # the name is stored in the row itself so the format string can use it
+               statsfold[region]['region'] = region
+               print "%(region)13s | %(total)5s | %(up)3s | %(goodhw)7s | %(pcuok)3s" % statsfold[region]
+       print "       totals | %(total)5s | %(up)3s | %(goodhw)7s | %(pcuok)3s" % totaltotal
+
+
+       # same table again, unfolded: one row per country code
+       print "      Region  | total | up  |& goodhw |& pcuok "
+       for region in stats.keys():
+               stats[region]['region'] = region
+               print "%(region)13s | %(total)5s | %(up)3s | %(goodhw)7s | %(pcuok)3s" % stats[region]
+
+       # finally the per-node lines collected above
+       for line in results:
+               print line
+               
+# Script entry point.
+if __name__ == "__main__":
+       try:
+               main()
+       except IOError:
+               # NOTE(review): IOError is swallowed silently -- presumably to stay quiet
+               # when the output pipe is closed early (e.g. piping into head); confirm.
+               pass
diff --git a/sitebad.py b/sitebad.py
new file mode 100755 (executable)
index 0000000..eccaa28
--- /dev/null
@@ -0,0 +1,148 @@
+#!/usr/bin/python
+
+import os
+import sys
+import string
+import time
+
+
+import soltesz
+import comon
+import threadpool
+import syncplcdb
+from nodequery import verify,query_to_dict,node_select
+
+import plc
+import auth
+api = plc.PLC(auth.auth, auth.plc)
+from unified_model import *
+from monitor_policy import MINUP
+
+# Module-level state shared by the functions below.
+# NOTE: 'round' shadows the builtin round() for the rest of this module.
+round = 1
+# Persisted state: the global round number plus one entry per site, keyed by
+# site name; each entry holds that site's own 'round' and 'values'.
+externalState = {'round': round, 'sites': {}}
+# Running counter, incremented as sites are processed or skipped.
+count = 0
+
+def main(config):
+       """
+       Refresh the recorded state of one site or of every known site.
+
+       config supplies 'dbname' (name of the state database), 'increment'
+       (bump the global round so every site is re-checked) and 'site'
+       (optional single login_base to restrict the run to).
+
+       The module-level externalState is replaced by whatever
+       soltesz.if_cached_else() returns.
+       NOTE(review): presumably that is the copy cached under config.dbname
+       when one exists, else the value produced by the callback -- confirm.
+       """
+       global externalState
+
+       def default_state():
+               return externalState
+       externalState = soltesz.if_cached_else(1, config.dbname, default_state)
+
+       if config.increment:
+               # update global round number to force refreshes across all nodes
+               externalState['round'] = externalState['round'] + 1
+
+       # called for its side effects only; the return value is not used here
+       syncplcdb.create_plcdb()
+       plc_sites = soltesz.dbLoad("l_plcsites")
+
+       site_names = []
+       if config.site:
+               site_names.append(config.site)
+       else:
+               for plc_site in plc_sites:
+                       site_names.append(plc_site['login_base'])
+
+       checkAndRecordState(site_names, plc_sites)
+
+def checkAndRecordState(l_sites, l_plcsites):
+       """
+       Bring every site in l_sites up to the current global round.
+
+       l_sites is a list of site names; l_plcsites is the list of PLC site
+       records handed on to collectStatusAndState().  A site whose stored
+       round is behind the global round is re-collected and stamped with the
+       global round; an up-to-date site only bumps the module counter.
+
+       externalState is dumped whenever the counter is a multiple of 20 and
+       once more at the end.  The database name comes from the module-level
+       'config', which is only bound when this file runs as a script.
+       """
+       global externalState
+       global count
+
+       current_round = externalState['round']
+       for sitename in l_sites:
+               known_sites = externalState['sites']
+               if sitename not in known_sites:
+                       known_sites[sitename] = {'round': 0, 'values': []}
+
+               if known_sites[sitename]['round'] >= current_round:
+                       # already checked during this round
+                       count += 1
+               else:
+                       # do work
+                       status = collectStatusAndState(sitename, l_plcsites)
+                       current_round = externalState['round']
+                       record = externalState['sites'][sitename]
+                       record['values'] = status
+                       record['round'] = current_round
+
+               if count % 20 == 0:
+                       soltesz.dbDump(config.dbname, externalState)
+
+       soltesz.dbDump(config.dbname, externalState)
+
+# Data loaded once at import time and read by the functions below.
+# 'findbad' is indexed as fb['nodes'][hostname]['values']['state'].
+fb = soltesz.dbLoad('findbad')
+# Indexed by site name (login_base); each entry is a list of node records
+# that carry a 'hostname' key.
+lb2hn = soltesz.dbLoad("plcdb_lb2hn")
+
+def getnodesup(nodelist):
+       """
+       Count the nodes in nodelist whose findbad record reports state "BOOT".
+
+       nodelist is an iterable of node records, each with a 'hostname' key.
+       Nodes that are absent from fb['nodes'], or whose record lacks the
+       'values'/'state' entries, are not counted.
+
+       Returns the number of such nodes as an integer.
+       """
+       up = 0
+       for node in nodelist:
+               hostname = node['hostname']
+               # test membership on the dict itself; .keys() would build a new
+               # list for every node
+               if hostname not in fb['nodes']:
+                       continue
+               try:
+                       if fb['nodes'][hostname]['values']['state'] == "BOOT":
+                               up += 1
+               except (KeyError, TypeError):
+                       # incomplete or malformed record: treat the node as not up,
+                       # but let KeyboardInterrupt and other errors propagate
+                       pass
+       return up
+
+def collectStatusAndState(sitename, l_plcsites):
+       """
+       Refresh the persistent status flags of a single site.
+
+       sitename is matched against the 'login_base' of the records in
+       l_plcsites.  Returns None when no record matches, and True otherwise
+       (including when the site has no entry in lb2hn and nothing is updated).
+
+       For a site present in lb2hn, the PersistFlags record stored in
+       'site_persistflags' gets its check time, node total, slice count and
+       up-node count updated; the status becomes "good" when at least MINUP
+       nodes are up and "down" otherwise.  The module counter is incremented,
+       a one-line summary is printed and the record is saved.
+       """
+       global count
+
+       # find the PLC record for this site
+       d_site = None
+       for site in l_plcsites:
+               if site['login_base'] == sitename:
+                       d_site = site
+                       break
+       if not d_site:
+               return None
+
+       if sitename in lb2hn:
+               pf = PersistFlags(sitename, 1, db='site_persistflags')
+
+               # first time this site is seen: start its change clock now
+               if not pf.checkattr('last_changed'):
+                       pf.last_changed = time.time()
+               
+               pf.last_checked = time.time()
+               pf.nodes_total = len(lb2hn[sitename])
+               pf.slices_used = len(d_site['slice_ids'])
+               pf.nodes_up = getnodesup(lb2hn[sitename])
+               if not pf.checkattr('status'):
+                       pf.status = "unknown"
+
+               # last_changed is only touched when the status actually flips
+               if pf.nodes_up >= MINUP:
+                       if pf.status != "good": pf.last_changed = time.time()
+                       pf.status = "good"
+               else:
+                       if pf.status != "down": pf.last_changed = time.time()
+                       pf.status = "down"
+
+               count += 1
+               print "%d %15s slices(%2s) nodes(%2s) up(%2s) %s" % (count, sitename, pf.slices_used, 
+                                                                               pf.nodes_total, pf.nodes_up, pf.status)
+               # updated by other modules
+               #pf.enabled = 
+               #pf.suspended = 
+
+               pf.save()
+
+       return True
+
+# Command-line entry point: parse the options, then run main().  On any
+# exception the state collected so far is still written out before exiting.
+if __name__ == '__main__':
+       from config import config
+       from optparse import OptionParser
+       parser = OptionParser()
+       parser.set_defaults(filename=None, node=None, site=None, nodeselect=False, nodegroup=None, 
+                                               increment=False, dbname="sitebad", cachenodes=False)
+       parser.add_option("", "--site", dest="site", metavar="login_base", 
+                                               help="Provide a single site to operate on")
+       parser.add_option("", "--sitelist", dest="sitelist", metavar="file.list", 
+                                               help="Provide a list of files to operate on")
+
+       parser.add_option("", "--dbname", dest="dbname", metavar="FILE", 
+                                               help="Specify the name of the database to which the information is saved")
+       parser.add_option("-i", "--increment", action="store_true", dest="increment", 
+                                               help="Increment round number to force refresh or retry")
+       # from here on 'config' is the parsed configuration object that
+       # checkAndRecordState() reads as a module-level name
+       config = config(parser)
+       config.parse_args()
+
+       try:
+               main(config)
+       except Exception, err:
+               import traceback
+               # print_exc() writes the traceback itself and returns None, so
+               # wrapping it in a print statement emitted a stray "None" line
+               traceback.print_exc()
+               print "Exception: %s" % err
+               print "Saving data... exitting."
+               soltesz.dbDump(config.dbname, externalState)
+               # exit status is 0 even though the run failed (kept as before)
+               sys.exit(0)
diff --git a/siteinfo.py b/siteinfo.py
new file mode 100755 (executable)
index 0000000..10c42ef
--- /dev/null
@@ -0,0 +1,108 @@
+#!/usr/bin/python
+
+import plc
+import auth
+api = plc.PLC(auth.auth, auth.plc)
+
+import soltesz
+import reboot
+
+import time
+from model import *
+from nodecommon import *
+
+import config as configmodule
+
+from config import config as cfg
+from optparse import OptionParser
+
+# Command-line options for this script; they are parsed at import time.
+parser = OptionParser()
+parser.set_defaults(site=None, 
+                                       findbad=False,
+                                       enable=False,
+                                       disable=False
+                                       )
+parser.add_option("", "--site", dest="site", metavar="login_base", 
+                                       help="The sitename to present")
+parser.add_option("", "--findbad", dest="findbad", action="store_true", 
+                                       help="Re-run findbad on the nodes we're going to check before acting.")
+# --enable / --disable update the PLC site's 'enabled' field once the report
+# for that site has been printed
+parser.add_option("", "--enable", dest="enable", action="store_true",
+                                       help="Set the site's 'enabled' field to True in PLC after printing its info")
+parser.add_option("", "--disable", dest="disable", action="store_true",
+                                       help="Set the site's 'enabled' field to False in PLC after printing its info")
+config = cfg(parser)
+config.parse_args()
+
+from unified_model import *
+def color_sitestatus(status):
+       """
+       Return status decorated for display.
+
+       "good" is passed through green() and "down" through red(); any other
+       value is returned unchanged.
+       """
+       if status == "good":
+               return green(status)
+       if status == "down":
+               return red(status)
+       # anything else is shown as-is
+       return status
+               
+
+def pf_print_siteinfo(sitename):
+       """
+       Print the monitor's own view of a site, read from the PersistFlags
+       record stored under sitename in 'site_persistflags'.
+
+       When the record has a 'last_changed' attribute, the time of the last
+       check, the status, the up/total node counts and the time of the last
+       status change are printed; otherwise a "no such site" notice is.
+       """
+       flags = PersistFlags(sitename, 1, db='site_persistflags')
+       if not flags.checkattr('last_changed'):
+               print "no  such site in pf"
+       else:
+               print "   Checked: %s" % diff_time(flags.last_checked)
+               print "\t status | nodes up / total | last_change"
+               summary = ( color_sitestatus(flags.status), flags.nodes_up,
+                                       flags.nodes_total, diff_time(flags.last_changed) )
+               print "\t   %6s | %8s / %5s | %s" % summary
+       del flags
+
+
+def plc_print_siteinfo(plcsite):
+       """
+       Print PLC's view of a site followed by one line per node of that site.
+
+       plcsite is a site record providing 'login_base', 'slice_ids',
+       'max_slices', 'enabled', 'last_updated' and 'node_ids'.  The node
+       records are fetched with api.GetNodes(); each one gets a 'state' entry
+       added in place, derived from the module-level findbad database 'fb'.
+
+       Raises KeyError when a node's hostname has no entry in fb['nodes'].
+       """
+       print ""
+       print "   Checked: %s" % time.ctime()
+       print "\t login_base    | used / max | enabled | last_updated "
+       print "\t %13s | %4s / %3s | %7s | %12s" % \
+                       (plcsite['login_base'], 
+                        len(plcsite['slice_ids']),
+                        plcsite['max_slices'],
+                        plcsite['enabled'],
+                        diff_time(plcsite['last_updated']))
+
+       print ""
+       nodes = api.GetNodes(plcsite['node_ids'])
+       print "   Checked: %s" % time.ctime()
+       print "\t                               host     | state | obs   |   created   |   updated   | last_contact "
+       for plcnode in nodes:
+               # 'state' column: PLC's boot_state; 'obs' column: the state
+               # computed from the findbad record for the same host
+               fbnode = fb['nodes'][plcnode['hostname']]['values']
+               plcnode['state'] = color_boot_state(get_current_state(fbnode))
+               print "\t  %37s |  %5s |  %5s | %11.11s | %11.11s | %12s " % \
+               (plcnode['hostname'], color_boot_state(plcnode['boot_state']), plcnode['state'], 
+                       diff_time(plcnode['date_created']), diff_time(plcnode['last_updated']), 
+               diff_time(plcnode['last_contact']))
+
+
+# Observation database read by plc_print_siteinfo(); indexed as
+# fb['nodes'][hostname]['values'].
+fb = soltesz.dbLoad("findbad")
+act_all = soltesz.dbLoad("act_all")
+
+# Every positional command-line argument is treated as a site login_base.
+for site in config.args:
+       config.site = site
+
+       matches = api.GetSites({'login_base': config.site})
+       if not matches:
+               # unknown login_base: report it and go on to the next argument
+               # instead of dying with an IndexError on the empty result
+               print "No such site: %s" % config.site
+               continue
+       plc_siteinfo = matches[0]
+       url = "https://www.planet-lab.org/db/sites/index.php?site_pattern="
+       plc_siteinfo['url'] = url + plc_siteinfo['login_base']
+
+       if config.findbad:
+               # rerun findbad with the nodes in the given nodes.
+               import os
+               file = "findbad.txt"
+               nodes = api.GetNodes(plc_siteinfo['node_ids'], ['hostname'])
+               nodes = [ n['hostname'] for n in nodes ]
+               configmodule.setFileFromList(file, nodes)
+               os.system("./findbad.py --cachenodes --debug=0 --dbname=findbad --increment --nodelist %s" % file)
+
+       print "%(login_base)s %(url)s" % plc_siteinfo
+       pf_print_siteinfo(config.site)
+       plc_print_siteinfo(plc_siteinfo)
+
+       # with both flags given the site is enabled and then disabled again
+       if config.enable:
+               api.UpdateSite(config.site, {'enabled' : True})
+       if config.disable:
+               api.UpdateSite(config.site, {'enabled' : False})
index 1c65e15..a0fe9a5 100644 (file)
@@ -11,8 +11,8 @@ except:
 
 import inspect
 import shutil
-from config2 import config
-config = config()
+from config import config as cfg
+config = cfg()
 
 DEBUG= 0
 PICKLE_PATH="pdb"
diff --git a/template.py b/template.py
new file mode 100644 (file)
index 0000000..2e9150b
--- /dev/null
@@ -0,0 +1,77 @@
+#!/usr/bin/python
+
+
+import emailTxt
+class Type:
+       """
+       Base class for the constraint markers used in the templates below.
+
+       An instance simply carries the argument it was built with in 'value';
+       name() reports which marker class it belongs to.
+       """
+       def __init__(self, value):
+               # stored exactly as given
+               self.value = value
+
+       def name(self):
+               """Return the name of the instance's class, e.g. 'Match'."""
+               klass = self.__class__
+               return klass.__name__
+       
+# Marker subclasses of Type.  They add no behavior of their own; the only
+# difference between them is the class name reported by name().
+# NOTE(review): the code that interprets these markers is not in this file;
+# the examples below are taken from the templates that follow.
+class Is(Type): pass                   # e.g. Is(0)
+class Match(Type): pass                # e.g. Match('BOOT')
+class ListMatch(Type): pass            # e.g. ListMatch('DC7800Deployment')
+class FilledIn(Type): pass             # e.g. FilledIn(True)
+class PortOpen(Type): pass             # e.g. PortOpen(16992)
+class NodesUp(Type): pass              # e.g. NodesUp(2)
+
+# a failed constraint leads to a message-escalation process.
+# so, define a constraint, which defines the set of nodes it operates on, the
+#        message to send if it fails, and maybe a thank-you message when it's
+#        satisfied (if it previously failed).
+# Template with a 'membership' pattern of '.*' on plcnode/nodegroups.
+# Layout: 'membership' and 'site' are lists of {field: marker} dicts; 'node'
+# and 'pcu' each hold a 'constraint' list plus the emailTxt messages for the
+# failed and resolved cases.
+# NOTE(review): how the entries of a 'constraint' list are combined (all of
+# them vs. any one of them) is decided by the evaluating code, which is not
+# in this file -- confirm there.
+standardnode = {
+       'membership' : [ { 'plcnode/nodegroups' : Match('.*') } ],
+       'site'       : [ { 'nodes'    : NodesUp(2), } ],
+       'node'       : { 'constraint' : 
+                                                               [ {     'state'    : Match('BOOT'),
+                                                               'kernel'   : Match('2.6.22.19-vs2.3.0.34'), } ],
+                                        'failed_message'       : [ emailTxt.mailtxt.newdown ],
+                                        'resolved_message'     : [ emailTxt.mailtxt.newthankyou ],
+                                       },
+       # two constraint entries: hostname + password, and ip + password
+       'pcu'            : { 
+                                        'constraint' : [ {     'hostname' : FilledIn(True),
+                                                                               'password' : FilledIn(True), },
+                                                                        {      'ip' : FilledIn(True),
+                                                                               'password' : FilledIn(True), },
+                                                                       ],
+                                        'failed_message'       : [ emailTxt.mailtxt.pcudown ],
+                                        'resolved_message'     : [ emailTxt.mailtxt.pcuthankyou ],
+                                       },
+}
+
+# Template for nodes in the 'DC7800Deployment' node group.  Same layout as a
+# general template: a 'membership' list, then 'pcu' and 'node' sections that
+# each carry a 'constraint' list and the messages for the failed and
+# resolved cases.
+dc7800 = {
+       # if the membership constraint is true, then apply the other constraints.
+       'membership'    : [ { 'plcnode/nodegroups' : ListMatch('DC7800Deployment'), } ],
+
+       # first entry: fully described AMT unit with port 16992 open;
+       # second entry: address and password only
+       'pcu'                   : { 'constraint' : [ { 'hostname'       : FilledIn(True),
+                                                                       'ip'            : FilledIn(True),
+                                                                       'password'      : FilledIn(True),
+                                                                       'model'         : Match('AMT'),
+                                                                       'username'      : Match('admin'),
+                                                                       'portstatus': PortOpen(16992),
+                                                                       'reboot'        : Is(0),
+                                                                },
+                                                                {      'hostname'      : FilledIn(True),
+                                                                       'ip'            : FilledIn(True),
+                                                                       'password'      : FilledIn(True),
+                                                                       'reboot'        : Is(0),
+                                                                       #'valid'                : Is(True),
+                                                                },],
+                                        'failed_message'       : [ emailTxt.mailtxt.donation_nopcu],
+                                        'resolved_message'     : [ emailTxt.mailtxt.pcuthankyou ],
+                                        },
+       'node'       : { 'constraint' : 
+                                                               [ {     'state'    : Match('BOOT'),
+                                                                       'kernel'   : Match('2.6.22.19-vs2.3.0.34'), } ],
+                                        'failed_message'       : [ emailTxt.mailtxt.donation_down],
+                                        'resolved_message'     : [ emailTxt.mailtxt.newthankyou ],
+                               },
+}
+
+#
+# data source, { constraints ... value } 
+# action on failure of constraint
+# information about why it failed
+# # stop action if constraint is satisfied at a later time.
+# kind of like asynchronous constraint solving.
+# or stored procedures.
index 58c223b..e0a6ffa 100755 (executable)
@@ -449,12 +449,11 @@ class Record(object):
        def getDaysDown(cls, diag_record):
                daysdown = -1
                if diag_record['comonstats']['uptime'] != "null":
-                       #print "uptime %s" % (int(float(diag_record['comonstats']['uptime'])) // (60*60*24))
                        daysdown = - int(float(diag_record['comonstats']['uptime'])) // (60*60*24)
-               elif diag_record['comonstats']['sshstatus'] != "null":
-                       daysdown = int(diag_record['comonstats']['sshstatus']) // (60*60*24)
-               elif diag_record['comonstats']['lastcotop'] != "null":
-                       daysdown = int(diag_record['comonstats']['lastcotop']) // (60*60*24)
+               #elif diag_record['comonstats']['sshstatus'] != "null":
+               #       daysdown = int(diag_record['comonstats']['sshstatus']) // (60*60*24)
+               #elif diag_record['comonstats']['lastcotop'] != "null":
+               #       daysdown = int(diag_record['comonstats']['lastcotop']) // (60*60*24)
                else:
                        now = time.time()
                        last_contact = diag_record['plcnode']['last_contact']
@@ -468,15 +467,41 @@ class Record(object):
        getDaysDown = classmethod(getDaysDown)
 
        def getStrDaysDown(cls, diag_record):
-               daysdown = cls.getDaysDown(diag_record)
-               if daysdown > 0:
-                       return "%d days down"%daysdown
-               elif daysdown == -1:
-                       return "Unknown number of days"
+               daysdown = "unknown"
+               last_contact = diag_record['plcnode']['last_contact']
+               date_created = diag_record['plcnode']['date_created']
+
+               if      diag_record['comonstats']['uptime'] != "null" and \
+                       diag_record['comonstats']['uptime'] != "-1":
+                       daysdown = int(float(diag_record['comonstats']['uptime'])) // (60*60*24)
+                       daysdown = "%d days up" % daysdown
+
+               elif last_contact is None:
+                       if date_created is not None:
+                               now = time.time()
+                               diff = now - date_created
+                               daysdown = diff // (60*60*24)
+                               daysdown = "Never contacted PLC, created %s days ago" % daysdown
+                       else:
+                               daysdown = "Never contacted PLC"
                else:
-                       return "%d days up"% -daysdown
+                       now = time.time()
+                       diff = now - last_contact
+                       daysdown = diff // (60*60*24)
+                       daysdown = "%s days down" % daysdown
+               return daysdown
        getStrDaysDown = classmethod(getStrDaysDown)
 
+       #def getStrDaysDown(cls, diag_record):
+       #       daysdown = cls.getDaysDown(diag_record)
+       #       if daysdown > 0:
+       #               return "%d days down"%daysdown
+       #       elif daysdown == -1:
+       #               return "Never online"
+       #       else:
+       #               return "%d days up"% -daysdown
+       #getStrDaysDown = classmethod(getStrDaysDown)
+
        def takeAction(self):
                pp = PersistSitePenalty(self.hostname, 0, db='persistpenalty_hostnames')
                if 'improvement' in self.data['stage'] or self.improved():