Massive commit. Just put all local changes into svn.
authorStephen Soltesz <soltesz@cs.princeton.edu>
Mon, 23 Jun 2008 16:57:53 +0000 (16:57 +0000)
committerStephen Soltesz <soltesz@cs.princeton.edu>
Mon, 23 Jun 2008 16:57:53 +0000 (16:57 +0000)
20 files changed:
automate_pl03.sh
config.py
dumpact.py
emailTxt.py
findbad.py
findbadpcu.py
getsshkeys.py
mailer.py
monitor.py
nodecommon.py
nodegroups.py
nodeinfo.py
nodequery.py
nodereboot.py
plc.py
policy.py
reboot.py
soltesz.py
ssh/pxssh.py
syncplcdb.py

index 82f25dc..e31ead9 100755 (executable)
@@ -6,14 +6,19 @@ DATE=`date +%Y-%m-%d-%T`
 
 
 if [ -f $HOME/monitor/SKIP ] ; then 
-       echo "SKIPPING Monitor"
        # TODO: should be possible to kill the old version if 
        # desired and prevent lingering instances of automate.
-       #./kill.cmd.sh `cat $HOME/monitor/SKIP`
-       exit
-else
-       echo $$ > $HOME/monitor/SKIP
+       if [ -z "$1" ] ; then 
+               echo "KILLING Monitor"
+               ./kill.cmd.sh `cat $HOME/monitor/SKIP`
+               rm -f $HOME/monitor/SKIP
+       else 
+               # skipping monitor
+               echo "SKIPPING Monitor"
+               exit
+       fi 
 fi
+echo $$ > $HOME/monitor/SKIP
 #########################
 # 1. FINDBAD NODES 
 rm -f pdb/production.findbad2.pkl
@@ -37,6 +42,10 @@ cp badcsv.txt /plc/data/var/www/html/monitor/
 # 2. FINDBAD PCUS
 rm -f pdb/production.findbadpcus2.pkl
 ./findbadpcu.py --increment --refresh --debug=0 --dbname=findbadpcus2 $DATE            
+
+# clean up stray 'locfg' processes that hang around inappropriately...
+ps ax | grep locfg | grep -v grep | awk '{print $1}' | xargs kill
+
 # convert pkl to php serialize format.
 cp pdb/production.findbadpcus2.pkl pdb/production.findbadpcus.pkl
 ./pkl2php.py -i findbadpcus2 -o findbadpcus
index 906c447..0a66b75 100644 (file)
--- a/config.py
+++ b/config.py
@@ -23,6 +23,13 @@ def parse_bool(option, opt_str, value, parser):
        else:
                print "blue"
 
+def setFileFromList(file, list):
+       f = open(file, 'w')
+       for line in list:
+               f.write(line + "\n")
+       f.close()
+       return True
+
 def getListFromFile(file):
        f = open(file, 'r')
        list = []
index fd5be08..1ac0cb1 100755 (executable)
@@ -43,8 +43,7 @@ def main():
                                        for k in keys:
                                                if "message" not in k and "msg" not in k:
                                                        if 'time' in k:
-                                                               s_time=time.strftime("%Y/%m/%d %H:%M:%S",
-                                                                                                               time.gmtime(diag_node[k]))
+                                                               s_time=time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(diag_node[k]))
                                                                print "\t'%s' : %s" % (k, s_time)
                                                        else:
                                                                print "\t'%s' : %s" % (k, diag_node[k])
index 23adc95..f92451d 100644 (file)
@@ -293,6 +293,58 @@ Thank you for your help,
        pcutonodemapping=[pcutonodemapping_one, pcutonodemapping_one, pcutonodemapping_one]
        pcudown=[pcudown_one, pcudown_one, pcudown_one]
 
+       unknownsequence = ("""Unrecognized Error on PlanetLab host %(hostname)s""", 
+                                          """
+While trying to automatically recover this machine:
+
+    http://www.planet-lab.org/db/nodes/index.php?nodepattern=%(hostname)s
+
+We encountered an unknown situation.  Please re-code to handle, or manually intervene to repair this host.
+
+Abbreviated BootManager Sequence:
+
+    %(sequence)s
+
+BootManager.log output follows:
+---------------------------------------------------------
+%(bmlog)s
+"""      )
+
+       minimalhardware = ("""Hardware requirements not met on PlanetLab host %(hostname)s""", 
+                                          """
+While trying to automatically recover this machine:
+
+    http://www.planet-lab.org/db/nodes/index.php?nodepattern=%(hostname)s
+
+We encountered a failed hardware requirement.  Please look at the log below to determine the exact nature of the failure, either Disk, CPU, Network, or Minimal RAM was not satisfied.
+
+If your machine does not meet the current hardware specifications for a PlanetLab node (http://www.planet-lab.org/hardware), please upgrade it to meet the current recommended configuration.  
+
+If you believe this message is an error, please email support@planet-lab.org explaining the problem.  You may need to create an updated Boot Image that includes drivers for your hardware.
+
+Thank you,
+ - PlanetLab Support
+
+BootManager.log output follows:
+---------------------------------------------------------
+%(bmlog)s
+"""      )
+
+       baddisk = ("""Bad Disk on PlanetLab node %(hostname)s""", 
+                          """As part of PlanetLab node monitoring, we noticed %(hostname)s has a number of disk or media related I/O errors, that prevent it from either booting or reliably running as a PlanetLab node.
+
+Please verify the integrity of the disk, and order a replacement if needed.  If you need to schedule downtime for the node, please let us know at support@planet-lab.org. 
+
+Thanks.
+
+  -- PlanetLab Central (support@planet-lab.org)
+
+The output of `dmesg` follows:
+-------------------------------------------------------------------------
+
+%(log)s
+""")
+
        down=("""PlanetLab node %(hostname)s down.""", """As part of PlanetLab node monitoring, we noticed %(hostname)s has been down for %(days)s days.
 
 Please check the node's connectivity and, if properly networked, power cycle the machine. Note that rebooting the machine may not fully resolve the problems we're seeing. Once the machine has come back up, please visit the Comon status page to verify that your node is accessible from the network.
@@ -321,13 +373,14 @@ Thank you for your help,
   -- PlanetLab Central (support@planet-lab.org)
 """)
 
-       planet_cnf=(""" Planetlab node %(hostname)s needs an updated configuration file""", """As part of PlanetLab node monitoring, we noticed %(hostname)s has an out-dated planet.cnf file with no NODE_ID.  This can happen after an upgrade and requires your assistance in correcting.  All that is needed is to visit:
+       plnode_cfg=(""" Please Update Configuration file for PlanetLab node %(hostname)s""", 
+                               """As part of PlanetLab node monitoring, we noticed %(hostname)s has an out-dated plnode.txt file with no NODE_ID or a mis-matched HOSTNAME.  This can happen either due to an initial configuration failure at your site, with information entered into our database, or after a software upgrade.  To resolve the issue we require your assistance.  All that is needed is to visit:
 
-       https://www.planet-lab.org/db/nodes/index.php?id=%(node_id)d
+       https://www.planet-lab.org/db/nodes/index.php?nodepattern=%(hostname)s
 
-And follow the "Download conf file" link to generate a new configuration file for each node.  Copy this file to the appropriate read-only media, either floppy or USB stick, and reboot the machines.
+Then, select, "Download -> Download plnode.txt file for %(hostname)s" menu.  This will generate a new configuration file for your node.  Copy this file to the appropriate read-only media, either floppy or USB stick, and reboot the machine.
 
-There's no need to respond to this message if you're able to update the configuration files without difficulty and your node returns to normal operation.  However, if there are any console messages relating to the node's failure, please report them to PlanetLab support (support@planet-lab.org) so we can help resolve the issue. 
+There is no need to respond to this message if you're able to update the configuration file without difficulty and your node returns to normal operation.  However, if there are any console messages relating to the node's failure, please report them to PlanetLab support (support@planet-lab.org) so we can help resolve the issue. 
 
 Thank you for your help,
   -- PlanetLab Central (support@planet-lab.org)
index fa4c76a..e08b554 100755 (executable)
@@ -5,20 +5,6 @@ import sys
 import string
 import time
 
-from config import config
-from optparse import OptionParser
-parser = OptionParser()
-parser.set_defaults(filename=None, increment=False, dbname="findbadnodes", cachenodes=False)
-parser.add_option("-f", "--nodelist", dest="filename", metavar="FILE", 
-                                       help="Provide the input file for the node list")
-parser.add_option("", "--cachenodes", action="store_true",
-                                       help="Cache node lookup from PLC")
-parser.add_option("", "--dbname", dest="dbname", metavar="FILE", 
-                                       help="Specify the name of the database to which the information is saved")
-parser.add_option("-i", "--increment", action="store_true", dest="increment", 
-                                       help="Increment round number to force refresh or retry")
-config = config(parser)
-config.parse_args()
 
 # QUERY all nodes.
 COMON_COTOPURL= "http://summer.cs.princeton.edu/status/tabulator.cgi?" + \
@@ -36,11 +22,14 @@ count = 0
 
 
 import soltesz
-import plc
 import comon
 import threadpool
 import syncplcdb
 
+import plc
+import auth
+api = plc.PLC(auth.auth, auth.plc)
+
 def collectPingAndSSH(nodename, cohash):
        ### RUN PING ######################
        ping = soltesz.CMD()
@@ -275,6 +264,12 @@ def main():
        if config.filename:
                f_nodes = config.getListFromFile(config.filename)
                l_nodes = filter(lambda x: x['hostname'] in f_nodes, l_nodes)
+       elif config.node:
+               f_nodes = [config.node]
+               l_nodes = filter(lambda x: x['hostname'] in f_nodes, l_nodes)
+       elif config.nodegroup:
+               ng = api.GetNodeGroups({'name' : config.nodegroup})
+               l_nodes = api.GetNodes(ng[0]['node_ids'])
 
        l_nodes = [node['hostname'] for node in l_nodes]
 
@@ -286,6 +281,26 @@ def main():
 
 
 if __name__ == '__main__':
+       from config import config
+       from optparse import OptionParser
+       parser = OptionParser()
+       parser.set_defaults(filename=None, node=None, nodegroup=None, increment=False, dbname="findbadnodes", cachenodes=False)
+       parser.add_option("", "--node", dest="node", metavar="hostname", 
+                                               help="Provide a single node to operate on")
+       parser.add_option("-f", "--nodelist", dest="filename", metavar="FILE", 
+                                               help="Provide the input file for the node list")
+       parser.add_option("", "--nodegroup", dest="nodegroup", metavar="FILE", 
+                                               help="Provide the nodegroup for the list of nodes.")
+
+       parser.add_option("", "--cachenodes", action="store_true",
+                                               help="Cache node lookup from PLC")
+       parser.add_option("", "--dbname", dest="dbname", metavar="FILE", 
+                                               help="Specify the name of the database to which the information is saved")
+       parser.add_option("-i", "--increment", action="store_true", dest="increment", 
+                                               help="Increment round number to force refresh or retry")
+       config = config(parser)
+       config.parse_args()
+
        try:
                main()
        except Exception, err:
index 2900b65..122d8a5 100755 (executable)
@@ -86,6 +86,122 @@ def nmap_portstatus(status):
                        continue_probe = True
        return (ps, continue_probe)
 
+def get_pcu(pcuname):
+       plc_lock.acquire()
+       try:
+               print "GetPCU from PLC %s" % pcuname
+               l_pcu  = plc.GetPCUs({'pcu_id' : pcuname})
+               print l_pcu
+               if len(l_pcu) > 0:
+                       l_pcu = l_pcu[0]
+       except:
+               try:
+                       print "GetPCU from file %s" % pcuname
+                       l_pcus = soltesz.dbLoad("pculist")
+                       for i in l_pcus:
+                               if i['pcu_id'] == pcuname:
+                                       l_pcu = i
+               except:
+                       import traceback
+                       traceback.print_exc()
+                       l_pcu = None
+
+       plc_lock.release()
+       return l_pcu
+
+def get_nodes(node_ids):
+       plc_lock.acquire()
+       l_node = []
+       try:
+               l_node = plc.getNodes(node_ids, ['hostname', 'last_contact', 'node_id', 'ports'])
+       except:
+               try:
+                       plc_nodes = soltesz.dbLoad("l_plcnodes")
+                       for n in plc_nodes:
+                               if n['node_id'] in node_ids:
+                                       l_node.append(n)
+               except:
+                       import traceback
+                       traceback.print_exc()
+                       l_node = None
+
+       plc_lock.release()
+       if l_node == []:
+               l_node = None
+       return l_node
+       
+
+def get_plc_pcu_values(pcuname):
+       """
+               Try to contact PLC to get the PCU info.
+               If that fails, try a backup copy from the last run.
+               If that fails, return None
+       """
+       values = {}
+
+       l_pcu = get_pcu(pcuname)
+       
+       if l_pcu is not None:
+               site_id = l_pcu['site_id']
+               node_ids = l_pcu['node_ids']
+               l_node = get_nodes(node_ids) 
+                               
+               if l_node is not None:
+                       for node in l_node:
+                               values[node['hostname']] = node['ports'][0]
+
+                       values['nodenames'] = [node['hostname'] for node in l_node]
+
+                       # NOTE: this is for a dry run later. It doesn't matter which node.
+                       values['node_id'] = l_node[0]['node_id']
+
+               values.update(l_pcu)
+       else:
+               values = None
+       
+       return values
+
+def get_plc_site_values(site_id):
+       ### GET PLC SITE ######################
+       plc_lock.acquire()
+       values = {}
+       d_site = None
+
+       try:
+               d_site = plc.getSites({'site_id': site_id}, ['max_slices', 'slice_ids', 'node_ids', 'login_base'])
+               if len(d_site) > 0:
+                       d_site = d_site[0]
+       except:
+               try:
+                       plc_sites = soltesz.dbLoad("l_plcsites")
+                       for site in plc_sites:
+                               if site['site_id'] == site_id:
+                                       d_site = site
+                                       break
+               except:
+                       import traceback
+                       traceback.print_exc()
+                       values = None
+
+       plc_lock.release()
+
+       if d_site is not None:
+               max_slices = d_site['max_slices']
+               num_slices = len(d_site['slice_ids'])
+               num_nodes = len(d_site['node_ids'])
+               loginbase = d_site['login_base']
+               values['plcsite'] = {'num_nodes' : num_nodes, 
+                                                       'max_slices' : max_slices, 
+                                                       'num_slices' : num_slices,
+                                                       'login_base' : loginbase,
+                                                       'status'     : 'SUCCESS'}
+       else:
+               values = None
+
+
+       return values
+
+
 def collectPingAndSSH(pcuname, cohash):
 
        continue_probe = True
@@ -94,39 +210,19 @@ def collectPingAndSSH(pcuname, cohash):
        ### GET PCU ######################
        try:
                b_except = False
-               plc_lock.acquire()
-
                try:
-                       l_pcu  = plc.GetPCUs({'pcu_id' : pcuname})
-                       
-                       if len(l_pcu) > 0:
-                               site_id = l_pcu[0]['site_id']
-
-                               node_ids = l_pcu[0]['node_ids']
-                               l_node = plc.getNodes(node_ids, ['hostname', 'last_contact', 
-                                                                                                'node_id', 'ports'])
-                       if len(l_node) > 0:
-                               for node in l_node:
-                                       values[node['hostname']] = node['ports'][0]
-
-                               values['nodenames'] = [node['hostname'] for node in l_node]
-                               # NOTE: this is for a dry run later. It doesn't matter which node.
-                               values['node_id'] = l_node[0]['node_id']
-
-                       if len(l_pcu) > 0:
-                               values.update(l_pcu[0])
+                       v = get_plc_pcu_values(pcuname)
+                       if v is not None:
+                               values.update(v)
                        else:
                                continue_probe = False
-
                except:
                        b_except = True
                        import traceback
                        traceback.print_exc()
-
                        continue_probe = False
 
-               plc_lock.release()
-               if b_except: return (None, None)
+               if b_except or not continue_probe: return (None, None, None)
 
                if values['hostname'] is not None:
                        values['hostname'] = values['hostname'].strip()
@@ -206,32 +302,12 @@ def collectPingAndSSH(pcuname, cohash):
                values['reboot'] = rb_ret
 
                ### GET PLC SITE ######################
-               b_except = False
-               plc_lock.acquire()
-
-               try:
-                       d_site = plc.getSites({'site_id': site_id}, 
-                                                               ['max_slices', 'slice_ids', 'node_ids', 'login_base'])
-               except:
-                       b_except = True
-                       import traceback
-                       traceback.print_exc()
-
-               plc_lock.release()
-               if b_except: return (None, None)
-
-               if d_site and len(d_site) > 0:
-                       max_slices = d_site[0]['max_slices']
-                       num_slices = len(d_site[0]['slice_ids'])
-                       num_nodes = len(d_site[0]['node_ids'])
-                       loginbase = d_site[0]['login_base']
-                       values['plcsite'] = {'num_nodes' : num_nodes, 
-                                                               'max_slices' : max_slices, 
-                                                               'num_slices' : num_slices,
-                                                               'login_base' : loginbase,
-                                                               'status'     : 'SUCCESS'}
+               v = get_plc_site_values(values['site_id'])
+               if v is not None:
+                       values.update(v)
                else:
                        values['plcsite'] = {'status' : "GS_FAILED"}
+                       
        except:
                print "____________________________________"
                print values
@@ -317,6 +393,7 @@ def checkAndRecordState(l_pcus, cohash):
 def main():
        global externalState
 
+       l_pcus = soltesz.if_cached_else_refresh(1, config.refresh, "pculist", lambda : plc.GetPCUs())
        externalState = soltesz.if_cached_else(1, config.dbname, lambda : externalState) 
        cohash = {}
 
@@ -326,8 +403,6 @@ def main():
 
        if config.filename == None and config.pcuid == None:
                print "Calling API GetPCUs() : refresh(%s)" % config.refresh
-               l_pcus = soltesz.if_cached_else_refresh(1, 
-                                                               config.refresh, "pculist", lambda : plc.GetPCUs())
                l_pcus  = [pcu['pcu_id'] for pcu in l_pcus]
        elif config.filename is not None:
                l_pcus = config.getListFromFile(config.filename)
@@ -335,7 +410,6 @@ def main():
        elif config.pcuid is not None:
                l_pcus = [ config.pcuid ] 
                l_pcus = [int(pcu) for pcu in l_pcus]
-               
 
        checkAndRecordState(l_pcus, cohash)
 
@@ -360,6 +434,8 @@ if __name__ == '__main__':
                main()
                time.sleep(1)
        except Exception, err:
+               import traceback
+               traceback.print_exc()
                print "Exception: %s" % err
                print "Saving data... exitting."
                soltesz.dbDump(config.dbname, externalState)
index d0084db..fc306e4 100755 (executable)
@@ -86,12 +86,14 @@ class SSHKnownHosts:
 
        def delete(self, host):
                node = self.getNodes(host) 
-               (host, ip, _, _) = self._record_from_node(node[0])
-               index = "%s,%s" % (host,ip)
-               if index in self.pl_keys:
-                       del self.pl_keys[index]
-               if index in self.other_keys:
-                       del self.other_keys[index]
+               if len(node) > 0:
+                       (host, ip, _, _) = self._record_from_node(node[0])
+                       index = "%s,%s" % (host,ip)
+                       if index in self.pl_keys:
+                               del self.pl_keys[index]
+                       if index in self.other_keys:
+                               del self.other_keys[index]
+               return node
 
        def updateDirect(self, host):
                cmd = os.popen("/usr/bin/ssh-keyscan -t rsa %s 2>/dev/null" % host)
@@ -105,14 +107,16 @@ class SSHKnownHosts:
                self.other_keys.update(rec)
 
        def update(self, host):
-               node = self.getNodes(host) 
-               ret = self._record_from_node(node[0])
-               (host, ip, key, comment)  = ret
-               if ip == None:
-                       self.updateDirect(host)
-               else:
-                       rec = { "%s,%s" % (host,ip) : "%s %s" % (key, comment) }
-                       self.pl_keys.update(rec)
+               node = self.delete(host)
+               #node = self.getNodes(host) 
+               if node is not []:
+                       ret = self._record_from_node(node[0])
+                       (host, ip, key, comment)  = ret
+                       if ip == None:
+                               self.updateDirect(host)
+                       else:
+                               rec = { "%s,%s" % (host,ip) : "%s %s" % (key, comment) }
+                               self.pl_keys.update(rec)
 
        def getNodes(self, host=None):
                if type(host) == type(""): host = [host]
@@ -163,7 +167,7 @@ def main(hosts):
        k = SSHKnownHosts()
        if len (hosts) > 0:
                for host in hosts:
-                       k.update(host)
+                       k.updateDirect(host)
        else:
                k.updateAll()
        k.write()
index bb7c781..5fc0320 100755 (executable)
--- a/mailer.py
+++ b/mailer.py
@@ -169,8 +169,9 @@ def closeTicketViaRT(ticket_id, comment):
 
 def emailViaRT(subject, text, to, ticket_id=None):
        if ticket_id == None or ticket_id == "":
+               print "No TICKET"
                return emailViaRT_NoTicket(subject, text, to)
-               
+
 
        # Set ENV Variables/PATH
        _setupRTenvironment()
@@ -314,22 +315,30 @@ def email(subject, text, to):
        #       mail and debug, 'to' changed at the beginning'
        #   nomail, but report who I'd send to.
        if config.mail:
-               try:
-                       # This is normal operation
-                       server = smtplib.SMTP(MTA)
-                       server.sendmail(FROM, to,  msg)
-                       if config.bcc and not config.debug:
-                               server.sendmail(FROM, config.email,  msg)
-                       server.quit()
-               except Exception, err:
-                       print "Mailer error: %s" % err
+               for mta in [MTA, 'golf.cs.princeton.edu']:
+                       try:
+                               # This is normal operation
+                               #print MTA
+                               #print FROM
+                               #print to
+                               #print msg
+                               server = smtplib.SMTP(mta)
+                               #server = smtplib.SMTP('golf.cs.princeton.edu')
+                               server.sendmail(FROM, to,  msg)
+                               if config.bcc and not config.debug:
+                                       server.sendmail(FROM, config.email,  msg)
+                               server.quit()
+                       except Exception, err:
+                               print "Mailer error1: failed using MTA(%s) with: %s" % (mta, err)
+
        elif not config.debug and not config.mail and config.bcc:
-               try:
-                       server = smtplib.SMTP(MTA)
-                       server.sendmail(FROM, to,  msg)
-                       server.quit()
-               except Exception, err:
-                       print "Mailer error: %s" % err
+               for mta in [MTA, 'golf.cs.princeton.edu']:
+                       try:
+                               server = smtplib.SMTP(mta)
+                               server.sendmail(FROM, to,  msg)
+                               server.quit()
+                       except Exception, err:
+                               print "Mailer error2: failed using MTA(%s) with: %s" % (mta, err)
        else:
                #print "Would mail %s" %to
                logger.debug("Would send mail to %s" % to)
index d876dc3..8891b25 100644 (file)
@@ -9,6 +9,7 @@
 import soltesz
 
 from monitor_policy import *
+import rt
 
 import plc
 import auth
@@ -23,6 +24,38 @@ def reboot(hostname):
        l_blacklist = soltesz.if_cached_else(1, "l_blacklist", lambda : [])
        l_ticket_blacklist = soltesz.if_cached_else(1,"l_ticket_blacklist",lambda : [])
 
+       l_nodes  = filter(lambda x : not x['hostname'] in l_blacklist, l_nodes)
+       if len(l_nodes) == 0:
+               raise Exception("Host removed via blacklist: %s" % hostname)
+
+       ad_dbTickets = soltesz.if_cached_else_refresh(True, False, "ad_dbTickets", lambda : [])
+       if ad_dbTickets == None:
+               raise Exception("Could not find cached dbTickets")
+
+       #print "merge"
+       merge = Merge( [node['hostname'] for node in l_nodes])
+       record_list = merge.run()
+       #print "rt"
+       rt = RT(record_list, ad_dbTickets, l_ticket_blacklist)
+       record_list = rt.run()
+       #print "diagnose"
+       diag = Diagnose(record_list)
+       diagnose_out = diag.run()
+       #print diagnose_out
+       #print "action"
+       action = Action(diagnose_out)
+       action.run()
+
+       return True
+
+def reboot2(hostname):
+       l_nodes = api.GetNodes(hostname)
+       if len(l_nodes) == 0:
+               raise Exception("No such host: %s" % hostname)
+       
+       l_blacklist = soltesz.if_cached_else(1, "l_blacklist", lambda : [])
+       l_ticket_blacklist = soltesz.if_cached_else(1,"l_ticket_blacklist",lambda : [])
+
        l_nodes  = filter(lambda x : not x['hostname'] in l_blacklist, l_nodes)
        if len(l_nodes) == 0:
                raise Exception("Host removed via blacklist: %s" % hostname)
@@ -31,6 +64,15 @@ def reboot(hostname):
        if ad_dbTickets == None:
                raise Exception("Could not find cached dbTickets")
 
+
+       args = {}
+       args['hostname'] = "%s" % hostname
+       args['hostname_list'] = "%s" % hostname
+       args['loginbase'] = plc.siteId(hostname)
+
+       m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname,
+                                                       mailtxt.newalphacd_one[1] % args, True, db='bootcd_persistmessages')
+       
        #print "merge"
        merge = Merge( [node['hostname'] for node in l_nodes])
        record_list = merge.run()
@@ -47,6 +89,7 @@ def reboot(hostname):
 
        return True
 
+
 def main():
        pass
 
index db46e40..0f3d0fb 100644 (file)
@@ -63,6 +63,33 @@ def color_boot_state(l):
        else:
                return l
 
+def diff_time(timestamp):
+       now = time.time()
+       if timestamp == None:
+               return "unknown"
+       diff = now - timestamp
+       # return the number of seconds as a difference from current time.
+       t_str = ""
+       if diff < 60: # sec in min.
+               t = diff // 1
+               t_str = "%s sec ago" % t
+       elif diff < 60*60: # sec in hour
+               t = diff // (60)
+               t_str = "%s min ago" % int(t)
+       elif diff < 60*60*24: # sec in day
+               t = diff // (60*60)
+               t_str = "%s hrs ago" % int(t)
+       elif diff < 60*60*24*7: # sec in week
+               t = diff // (60*60*24)
+               t_str = "%s days ago" % int(t)
+       elif diff < 60*60*24*30: # approx sec in month
+               t = diff // (60*60*24*7)
+               t_str = "%s wks ago" % int(t)
+       elif diff > 60*60*24*30: # approx sec in month
+               t = diff // (60*60*24*7*30)
+               t_str = "%s mnths ago" % int(t)
+       return t_str
+
 def nodegroup_display(node, fb):
        if node['hostname'] in fb['nodes']:
                node['current'] = get_current_state(fb['nodes'][node['hostname']]['values'])
@@ -84,7 +111,9 @@ def nodegroup_display(node, fb):
        #node['boot_state']     = node['boot_state']
        #node['current']        = node['current']
        node['pcu'] = fb['nodes'][node['hostname']]['values']['pcu']
-       return "%(hostname)-38s %(boot_state)5s %(current)5s %(pcu)6s %(key)45s %(kernel)s" % node
+       node['lastupdate'] = diff_time(node['last_contact'])
+
+       return "%(hostname)-38s %(boot_state)5s %(current)5s %(pcu)6s %(key)45s %(kernel)32s %(lastupdate)12s " % node
 
 from model import *
 import soltesz
@@ -94,6 +123,11 @@ def node_end_record(node):
        if node not in act_all:
                del act_all
                return False
+
+       if len(act_all[node]) == 0:
+               del act_all
+               return False
+
        a = Action(node, act_all[node][0])
        a.delField('rt')
        a.delField('found_rt_ticket')
index 430bb7b..725d0e0 100755 (executable)
@@ -17,92 +17,106 @@ import plc
 import auth
 api = plc.PLC(auth.auth, auth.plc)
 
-from config import config
 from optparse import OptionParser
 from sets import Set
 
 from nodecommon import *
 import soltesz
-fb = soltesz.dbLoad("findbad")
-
-parser = OptionParser()
-parser.set_defaults(nodegroup="Alpha",
-                                       node=None,
-                                       nodelist=None,
-                                       list=False,
-                                       add=False,
-                                       notng=False,
-                                       delete=False,
-                                       )
-parser.add_option("", "--not", dest="notng", action="store_true", 
-                                       help="All nodes NOT in nodegroup.")
-parser.add_option("", "--nodegroup", dest="nodegroup", metavar="NodegroupName",
-                                       help="Specify a nodegroup to perform actions on")
-
-parser.add_option("", "--list", dest="list", action="store_true", 
-                                       help="List all nodes in the given nodegroup")
-parser.add_option("", "--add", dest="add", action="store_true", 
-                                       help="Add nodes to the given nodegroup")
-parser.add_option("", "--delete", dest="delete", action="store_true", 
-                                       help="Delete nodes from the given nodegroup")
-parser.add_option("", "--node", dest="node", metavar="nodename.edu", 
-                                       help="A single node name to add to the nodegroup")
-parser.add_option("", "--nodelist", dest="nodelist", metavar="list.txt", 
-                                       help="Use all nodes in the given file for operation.")
-config = config(parser)
-config.parse_args()
-
-# COLLECT nodegroups, nodes and node lists
-if config.node or config.nodelist:
-       if config.node: 
-               hostlist = [ config.node ] 
-       else: 
-               hostlist = config.getListFromFile(config.nodelist)
-       nodelist = api.GetNodes(hostlist)
-
-       group_str = "Given"
-
-else:
-       ng = api.GetNodeGroups({'name' : config.nodegroup})
-       nodelist = api.GetNodes(ng[0]['node_ids'])
-
-       group_str = config.nodegroup
-
-if config.notng:
-       # Get nodegroup nodes
-       ng_nodes = nodelist
-
-       # Get all nodes
-       all_nodes = api.GetNodes({'peer_id': None})
-       
-       # remove ngnodes from all node list
-       ng_list = [ x['hostname'] for x in ng_nodes ]
-       all_list = [ x['hostname'] for x in all_nodes ]
-       not_ng_nodes = Set(all_list) - Set(ng_list)
-
-       # keep each node that *is* in the not_ng_nodes set
-       nodelist = filter(lambda x : x['hostname'] in not_ng_nodes, all_nodes)
-
-hostnames = [ n['hostname'] for n in nodelist ]
-
-# commands:
-if config.list:
-       print " ---- Nodes in the %s Node Group ----" % group_str
-       i = 1
-       for node in nodelist:
-               print "%-2d" % i, 
-               print nodegroup_display(node, fb)
-               i += 1
-
-elif config.add and config.nodegroup:
-       for node in hostnames:
-               print "Adding %s to %s nodegroup" % (node, config.nodegroup)
-               api.AddNodeToNodeGroup(node, config.nodegroup)
-
-elif config.delete:
-       for node in hostnames:
-               print "Deleting %s from %s nodegroup" % (node, config.nodegroup)
-               api.DeleteNodeFromNodeGroup(node, config.nodegroup)
-
-else:
-       print "no other options supported."
+
+def main():
+       from config import config
+       fb = soltesz.dbLoad("findbad")
+
+       parser = OptionParser()
+       parser.set_defaults(nodegroup="Alpha",
+                                               node=None,
+                                               nodelist=None,
+                                               list=False,
+                                               add=False,
+                                               notng=False,
+                                               delete=False,
+                                               )
+       parser.add_option("", "--not", dest="notng", action="store_true", 
+                                               help="All nodes NOT in nodegroup.")
+       parser.add_option("", "--nodegroup", dest="nodegroup", metavar="NodegroupName",
+                                               help="Specify a nodegroup to perform actions on")
+
+       parser.add_option("", "--list", dest="list", action="store_true", 
+                                               help="List all nodes in the given nodegroup")
+       parser.add_option("", "--add", dest="add", action="store_true", 
+                                               help="Add nodes to the given nodegroup")
+       parser.add_option("", "--delete", dest="delete", action="store_true", 
+                                               help="Delete nodes from the given nodegroup")
+       parser.add_option("", "--node", dest="node", metavar="nodename.edu", 
+                                               help="A single node name to add to the nodegroup")
+       parser.add_option("", "--nodelist", dest="nodelist", metavar="list.txt", 
+                                               help="Use all nodes in the given file for operation.")
+       config = config(parser)
+       config.parse_args()
+
+       # COLLECT nodegroups, nodes and node lists
+       if config.node or config.nodelist:
+               if config.node: 
+                       hostlist = [ config.node ] 
+               else: 
+                       hostlist = config.getListFromFile(config.nodelist)
+
+               # NOTE: preserve order given in file.  Otherwise, return values are not in order
+               # given to GetNodes
+               nodelist = []
+               for h in hostlist:
+                       nodelist += api.GetNodes(h)
+
+               #nodelist = api.GetNodes(hostlist)
+               group_str = "Given"
+
+       else:
+               ng = api.GetNodeGroups({'name' : config.nodegroup})
+               nodelist = api.GetNodes(ng[0]['node_ids'])
+
+               group_str = config.nodegroup
+
+       if config.notng:
+               # Get nodegroup nodes
+               ng_nodes = nodelist
+
+               # Get all nodes
+               all_nodes = api.GetNodes({'peer_id': None})
+               
+               # remove ngnodes from all node list
+               ng_list = [ x['hostname'] for x in ng_nodes ]
+               all_list = [ x['hostname'] for x in all_nodes ]
+               not_ng_nodes = Set(all_list) - Set(ng_list)
+
+               # keep each node that *is* in the not_ng_nodes set
+               nodelist = filter(lambda x : x['hostname'] in not_ng_nodes, all_nodes)
+
+       hostnames = [ n['hostname'] for n in nodelist ]
+
+       # commands:
+       if config.list:
+               print " ---- Nodes in the %s Node Group ----" % group_str
+               i = 1
+               for node in nodelist:
+                       print "%-2d" % i, 
+                       print nodegroup_display(node, fb)
+                       i += 1
+
+       elif config.add and config.nodegroup:
+               for node in hostnames:
+                       print "Adding %s to %s nodegroup" % (node, config.nodegroup)
+                       api.AddNodeToNodeGroup(node, config.nodegroup)
+
+       elif config.delete:
+               for node in hostnames:
+                       print "Deleting %s from %s nodegroup" % (node, config.nodegroup)
+                       api.DeleteNodeFromNodeGroup(node, config.nodegroup)
+
+       else:
+               print "no other options supported."
+
+if __name__ == "__main__":
+       try:
+               main()
+       except IOError:
+               pass
index be2b93c..3376257 100755 (executable)
@@ -5,27 +5,30 @@ import auth
 api = plc.PLC(auth.auth, auth.plc)
 
 import soltesz
-fb = soltesz.dbLoad("findbad")
-act_all = soltesz.dbLoad("act_all")
-
 import reboot
 
 import time
 from model import *
 from nodecommon import *
 
-from config import config
+import config as configmodule
+
+from config import config as cfg
 from optparse import OptionParser
 
 parser = OptionParser()
-parser.set_defaults(node=None, endrecord=False)
+parser.set_defaults(node=None, 
+                                       findbad=False,
+                                       endrecord=False)
 parser.add_option("", "--node", dest="node", metavar="nodename.edu", 
                                        help="A single node name to add to the nodegroup")
 parser.add_option("", "--endrecord", dest="endrecord", action="store_true",
                                        help="Force an end to the action record; to prompt Montior to start messaging again.")
+parser.add_option("", "--findbad", dest="findbad", action="store_true", 
+                                       help="Re-run findbad on the nodes we're going to check before acting.")
 parser.add_option("", "--bootcd", dest="bootcd", action="store_true",
                                        help="A stock help message for fetching a new BootCD from the PLC GUI.")
-config = config(parser)
+config = cfg(parser)
 config.parse_args()
 
 def diff_time(timestamp):
@@ -143,6 +146,18 @@ def pcu_print_info(pcuinfo, hostname):
                        print "\t racadm.py -r %s -u %s -p '%s'" % (pcuinfo['ip'], pcuinfo['username'], pcuinfo['password'])
                        print "\t cmdhttps/locfg.pl -s %s -f iloxml/Reset_Server.xml -u %s -p '%s' | grep MESSAGE" % \
                                (reboot.pcu_name(pcuinfo), pcuinfo['username'], pcuinfo['password'])
+               if pcuinfo['portstatus']['16992'] == "open":
+                       print "\t ./cmdamt/remoteControl -A -verbose 'http://%s:16992/RemoteControlService' -user admin -pass '%s'" % (reboot.pcu_name(pcuinfo), pcuinfo['password'])
+
+if config.findbad:
+       # rerun findbad with the nodes in the given nodes.
+       import os
+       file = "findbad.txt"
+       configmodule.setFileFromList(file, config.args)
+       os.system("./findbad.py --cachenodes --debug=0 --dbname=findbad --increment --nodelist %s" % file)
+
+fb = soltesz.dbLoad("findbad")
+act_all = soltesz.dbLoad("act_all")
 
 for node in config.args:
        config.node = node
index 6253215..a6b6c1c 100755 (executable)
@@ -6,49 +6,22 @@ api = plc.PLC(auth.auth, auth.plc)
 
 import soltesz
 fb = soltesz.dbLoad("findbad")
+fbpcu = soltesz.dbLoad("findbadpcus")
 from nodecommon import *
+from policy import Diagnose
 
 import time
+import re
 
-from config import config
-from optparse import OptionParser
-parser = OptionParser()
-parser.set_defaults(node=None, category=None, nodelist=None)
-parser.add_option("", "--category", dest="category", metavar="category", 
-                                       help="List all nodes in the given category")
-parser.add_option("", "--nodelist", dest="nodelist", metavar="nodelist.txt", 
-                                       help="A list of nodes to bring out of debug mode.")
-config = config(parser)
-config.parse_args()
-
-def diff_time(timestamp):
-       now = time.time()
-       if timestamp == None:
-               return "unknown"
-       diff = now - timestamp
-       # return the number of seconds as a difference from current time.
-       t_str = ""
-       if diff < 60: # sec in min.
-               t = diff
-               t_str = "%s sec ago" % t
-       elif diff < 60*60: # sec in hour
-               t = diff // (60)
-               t_str = "%s min ago" % int(t)
-       elif diff < 60*60*24: # sec in day
-               t = diff // (60*60)
-               t_str = "%s hours ago" % int(t)
-       elif diff < 60*60*24*7: # sec in week
-               t = diff // (60*60*24)
-               t_str = "%s days ago" % int(t)
-       elif diff < 60*60*24*30: # approx sec in month
-               t = diff // (60*60*24*7)
-               t_str = "%s weeks ago" % int(t)
-       elif diff > 60*60*24*30: # approx sec in month
-               t = diff // (60*60*24*7*30)
-               t_str = "%s months ago" % int(t)
-       return t_str
 
 
+def daysdown_print_nodeinfo(fbnode, hostname):
+       fbnode['hostname'] = hostname
+       fbnode['daysdown'] = Diagnose.getStrDaysDown(fbnode)
+       fbnode['intdaysdown'] = Diagnose.getDaysDown(fbnode)
+
+       print "%(intdaysdown)5s %(hostname)-44s | %(state)10.10s | %(daysdown)s" % fbnode
+
 def fb_print_nodeinfo(fbnode, hostname):
        fbnode['hostname'] = hostname
        fbnode['checked'] = diff_time(fbnode['checked'])
@@ -60,32 +33,142 @@ def fb_print_nodeinfo(fbnode, hostname):
                fbnode['kernel'] = ""
        else:
                fbnode['kernel'] = fbnode['kernel'].split()[2]
-       #fbnode['pcu'] = color_pcu_state(fbnode)
+       fbnode['pcu'] = color_pcu_state(fbnode)
        print "%(hostname)-39s | %(checked)11.11s | %(state)10.10s | %(ssh)5.5s | %(pcu)6.6s | %(bootcd)6.6s | %(category)8.8s | %(kernel)s" % fbnode
 
-if config.nodelist:
-       nodelist = config.getListFromFile(config.nodelist)
-else:
-       nodelist = fb['nodes'].keys()
-
-
-for node in nodelist:
-       config.node = node
+def verify(constraints, data):
+       """
+               constraints is a list of key, value pairs.
+               # [ {... : ...}==AND , ... , ... , ] == OR
+       """
+       con_or_true = False
+       for con in constraints:
+               #print "con: %s" % con
+               if len(con.keys()) == 0:
+                       con_and_true = False
+               else:
+                       con_and_true = True
+
+               for key in con.keys():
+                       #print "looking at key: %s" % key
+                       if key in data: 
+                               value_re = re.compile(con[key])
+                               con_and_true = con_and_true & (value_re.search(data[key]) is not None)
+                       elif key not in data:
+                               print "missing key %s" % key
+                               con_and_true = False
+
+               con_or_true = con_or_true | con_and_true
+
+       return con_or_true
+
+def query_to_dict(query):
+       
+       ad = []
+
+       or_queries = query.split('||')
+       for or_query in or_queries:
+               and_queries = or_query.split('&&')
+
+               d = {}
+
+               for and_query in and_queries:
+                       (key, value) = and_query.split('=')
+                       d[key] = value
+
+               ad.append(d)
+       
+       return ad
+
+def _pcu_in(fbdata):
+       if 'plcnode' in fbdata:
+               if 'pcu_ids' in fbdata['plcnode']:
+                       if len(fbdata['plcnode']['pcu_ids']) > 0:
+                               return True
+       return False
+
+def pcu_select(str_query):
+       pcunames = []
+       if str_query is None: return pcunames
+
+       #print str_query
+       dict_query = query_to_dict(str_query)
+       #print dict_query
+
+       for node in fb['nodes'].keys():
+       
+               fb_nodeinfo  = fb['nodes'][node]['values']
+               if _pcu_in(fb_nodeinfo):
+                       pcuinfo = fbpcu['nodes']['id_%s' % fb_nodeinfo['plcnode']['pcu_ids'][0]]['values']
+                       if verify(dict_query, pcuinfo):
+                               pcunames.append(node)
+       
+       return pcunames
+
+def node_select(str_query):
+       hostnames = []
+       if str_query is None: return hostnames
+
+       #print str_query
+       dict_query = query_to_dict(str_query)
+       #print dict_query
+
+       for node in fb['nodes'].keys():
+       
+               fb_nodeinfo  = fb['nodes'][node]['values']
+
+               if verify(dict_query, fb_nodeinfo):
+                       #print node #fb_nodeinfo
+                       hostnames.append(node)
+               else:
+                       #print "NO MATCH", node
+                       pass
+       
+       return hostnames
+
+
+def main():
+       from config import config
+       from optparse import OptionParser
+       parser = OptionParser()
+       parser.set_defaults(node=None, select=None, pcuselect=None, nodelist=None, daysdown=None)
+       parser.add_option("", "--daysdown", dest="daysdown", action="store_true",
+                                               help="List the node state and days down...")
+       parser.add_option("", "--select", dest="select", metavar="key=value", 
+                                               help="List all nodes with the given key=value pattern")
+       parser.add_option("", "--pcuselect", dest="pcuselect", metavar="key=value", 
+                                               help="List all nodes with the given key=value pattern")
+       parser.add_option("", "--nodelist", dest="nodelist", metavar="nodelist.txt", 
+                                               help="A list of nodes to bring out of debug mode.")
+       config = config(parser)
+       config.parse_args()
+
+       if config.nodelist:
+               nodelist = config.getListFromFile(config.nodelist)
+       elif config.select is not None:
+               nodelist = node_select(config.select)
+       elif config.pcuselect is not None:
+               nodelist = pcu_select(config.pcuselect)
+       else:
+               nodelist = fb['nodes'].keys()
 
-       if node not in fb['nodes']:
-               continue
+       for node in nodelist:
+               config.node = node
 
-       fb_nodeinfo  = fb['nodes'][node]['values']
+               if node not in fb['nodes']:
+                       continue
 
-       if config.category and \
-               'state' in fb_nodeinfo and \
-               config.category == fb_nodeinfo['state']:
+               fb_nodeinfo  = fb['nodes'][node]['values']
 
-                       fb_print_nodeinfo(fb_nodeinfo, node)
-       elif 'state' in fb_nodeinfo:
-               fb_print_nodeinfo(fb_nodeinfo, node)
-       else:
-               pass
+               if config.daysdown:
+                       daysdown_print_nodeinfo(fb_nodeinfo, node)
+               else:
+                       if config.select:
+                               fb_print_nodeinfo(fb_nodeinfo, node)
+                       elif not config.select and 'state' in fb_nodeinfo:
+                               fb_print_nodeinfo(fb_nodeinfo, node)
+                       else:
+                               pass
                
-
-
+if __name__ == "__main__":
+       main()
index 48e99ee..a7a99d9 100755 (executable)
@@ -9,6 +9,7 @@ api = plc.PLC(auth.auth, auth.plc)
 
 import sys
 import os
+import policy
 
 from getsshkeys import SSHKnownHosts
 
@@ -20,8 +21,8 @@ from sets import Set
 import ssh.pxssh as pxssh
 import ssh.fdpexpect as fdpexpect
 import ssh.pexpect as pexpect
-
-
+from unified_model import *
+from emailTxt import mailtxt
 
 import signal
 class Sopen(subprocess.Popen):
@@ -136,13 +137,22 @@ class NodeConnection:
        def restart_node(self, state='boot'):
                api.UpdateNode(self.node, {'boot_state' : state})
 
-               print "   Killing all slice processes... : %s" %  self.node
-               cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
-               self.c.modules.os.system(cmd_slicekill)
+               pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags')
+               if not pflags.getRecentFlag('gentlekill'):
+                       print "   Killing all slice processes... : %s" %  self.node
+                       cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
+                       self.c.modules.os.system(cmd_slicekill)
+                       cmd = """ shutdown -r +1 & """
+                       print "   Restarting %s : %s" % ( self.node, cmd)
+                       self.c.modules.os.system(cmd)
+
+                       pflags.setRecentFlag('gentlekill')
+                       pflags.save()
+               else:
+                       print "   Restarting with sysrq 'sub' %s" % self.node
+                       cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """
+                       self.c.modules.os.system(cmd)
 
-               cmd = """ shutdown -r +1 & """
-               print "   Restarting %s : %s" % ( self.node, cmd)
-               self.c.modules.os.system(cmd)
                return
 
        def restart_bootmanager(self, forceState):
@@ -162,8 +172,9 @@ class NodeConnection:
                return 
 
 
+import random
 class PlanetLabSession:
-       globalport = 22222
+       globalport = 22000 + int(random.random()*1000)
 
        def __init__(self, node, nosetup, verbose):
                self.verbose = verbose
@@ -193,48 +204,60 @@ class PlanetLabSession:
                # COPY Rpyc files to host
                cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc-2.45-2.3/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
                if self.verbose: print cmd
-               ret = os.system(cmd)
+               # TODO: Add timeout
+               timeout = 120
+               localos = soltesz.CMD()
+
+               ret = localos.system(cmd, timeout)
+               print ret
                if ret != 0:
-                       print "UNKNOWN SSH KEY FOR %s" % self.node
-                       print "MAKE EXPLICIT EXCEPTION FOR %s" % self.node
+                       print "\tUNKNOWN SSH KEY FOR %s; making an exception" % self.node
+                       #print "MAKE EXPLICIT EXCEPTION FOR %s" % self.node
                        k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k
-                       ret = os.system(cmd)
+                       ret = localos.system(cmd, timeout)
+                       print ret
                        if ret != 0:
-                               print "FAILED TWICE"
-                               sys.exit(1)
-
-               #cmd = "rsync -qv -az -e ssh %(monitordir)s/BootManager.py 
-               # %(monitordir)s/ChainBoot.py %(user)s@%(hostname)s:/tmp/source" % args
-               #print cmd; os.system(cmd)
+                               print "\tFAILED TWICE"
+                               #sys.exit(1)
+                               raise Exception("Failed twice trying to login with updated ssh host key")
 
+               t1 = time.time()
                # KILL any already running servers.
                cmd = """ssh %(user)s@%(hostname)s """ + \
-                    """'ps ax | grep Rpyc | grep -v grep | awk "{print \$1}" | xargs kill 2> /dev/null' """
+                        """'ps ax | grep Rpyc | grep -v grep | awk "{print \$1}" | xargs kill 2> /dev/null' """
                cmd = cmd % args
                if self.verbose: print cmd
-               os.system(cmd)
+               # TODO: Add timeout
+               print localos.system(cmd,timeout)
 
                # START a new rpyc server.
-               cmd = """ssh %(user)s@%(hostname)s "export PYTHONPATH=\$HOME; """ + \
+               cmd = """ssh -n %(user)s@%(hostname)s "export PYTHONPATH=\$HOME; """ + \
                         """python Rpyc/Servers/forking_server.py &> server.log < /dev/null &" """ 
                cmd = cmd % args
                if self.verbose: print cmd
-               os.system(cmd)
+               print localos.system(cmd,timeout)
 
+               # TODO: Add timeout
                # This was tricky to make synchronous.  The combination of ssh-clients-4.7p1, 
                # and the following options seems to work well.
                cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \
-                     """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \
-                     """-o ConnectTimeout=120 """ + \
-                     """-n -N -L %(port)s:localhost:18812 """ + \
-                     """%(user)s@%(hostname)s"""
+                         """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \
+                         """-o ConnectTimeout=120 """ + \
+                         """-n -N -L %(port)s:localhost:18812 """ + \
+                         """%(user)s@%(hostname)s"""
                cmd = cmd % args
                if self.verbose: print cmd
                self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
+               # TODO: the read() here may block indefinitely.  Need a better
+               # approach therefore, that includes a timeout.
                ret = self.command.stdout.read(5)
+
+               t2 = time.time()
                if 'READY' in ret:
-                       # We can return without delay.
-                       time.sleep(1)
+                       # NOTE: There is still a slight race for machines that are slow...
+                       self.timeout = 2*(t2-t1)
+                       print "Sleeping for %s sec" % self.timeout
+                       time.sleep(self.timeout)
                        return
 
                if self.command.returncode is not None:
@@ -267,13 +290,35 @@ def reboot(hostname, config=None, forced_action=None):
        print "Creating session for %s" % node
        # update known_hosts file (in case the node has rebooted since last run)
        if config and not config.quiet: print "...updating known_hosts ssh-rsa key for %s" % node
-       k = SSHKnownHosts(); k.update(node); k.write(); del k
+       try:
+               k = SSHKnownHosts(); k.update(node); k.write(); del k
+       except:
+               import traceback; print traceback.print_exc()
+               return False
 
-       if config == None:
-               session = PlanetLabSession(node, False, False)
-       else:
-               session = PlanetLabSession(node, config.nosetup, config.verbose)
-       conn = session.get_connection(config)
+       try:
+               if config == None:
+                       session = PlanetLabSession(node, False, True)
+               else:
+                       session = PlanetLabSession(node, config.nosetup, config.verbose)
+       except Exception, e:
+               print "ERROR setting up session for %s" % hostname
+               import traceback; print traceback.print_exc()
+               print e
+               return False
+
+       try:
+               conn = session.get_connection(config)
+       except EOFError:
+               # NOTE: sometimes the wait in setup_host() is not long enough.  
+               # So, here we try to wait a little longer before giving up entirely.
+               try:
+                       time.sleep(session.timeout*4)
+                       conn = session.get_connection(config)
+               except:
+                       import traceback; print traceback.print_exc()
+                       return False
+                       
 
        if forced_action == "reboot":
                conn.restart_node('rins')
@@ -282,7 +327,7 @@ def reboot(hostname, config=None, forced_action=None):
        boot_state = conn.get_boot_state()
        if boot_state == "boot":
                print "...Boot state of %s already completed : skipping..." % node
-               return False
+               return True
        elif boot_state == "unknown":
                print "...Unknown bootstate for %s : skipping..."% node
                return False
@@ -291,12 +336,16 @@ def reboot(hostname, config=None, forced_action=None):
 
        if conn.bootmanager_running():
                print "...BootManager is currently running.  Skipping host %s" % node
-               return False
+               return True
 
-       if config != None:
-               if config.force:
-                       conn.restart_bootmanager(config.force)
-                       return True
+       #if config != None:
+       #       if config.force:
+       #               conn.restart_bootmanager(config.force)
+       #               return True
+
+       # Read persistent flags, tagged on one week intervals.
+       pflags = PersistFlags(hostname, 3*60*60*24, db='debug_persistflags')
+               
 
        if config and not config.quiet: print "...downloading dmesg from %s" % node
        dmesg = conn.get_dmesg()
@@ -334,7 +383,7 @@ def reboot(hostname, config=None, forced_action=None):
                        break
 
        s = Set(sequence)
-       if config and not config.quiet: print "SET: ", s
+       if config and not config.quiet: print "\tSET: ", s
 
        if len(s) > 1:
                print "...Potential drive errors on %s" % node
@@ -342,6 +391,16 @@ def reboot(hostname, config=None, forced_action=None):
                        print "...Should investigate.  Continuing with node."
                else:
                        print "...Should investigate.  Skipping node."
+                       # TODO: send message related to these errors.
+                       args = {}
+                       args['hostname'] = hostname
+                       args['log'] = conn.get_dmesg().read()
+
+                       m = PersistMessage(hostname, mailtxt.baddisk[0] % args,
+                                                                                mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
+
+                       loginbase = plc.siteId(hostname)
+                       m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
                        return False
 
        print "...Downloading bm.log from %s" % node
@@ -382,15 +441,18 @@ def reboot(hostname, config=None, forced_action=None):
                        ('exception'    , 'Exception'),
                        ('nocfg'        , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
                        ('protoerror'   , 'XML RPC protocol error'),
+                       ('nodehostname' , 'Configured node hostname does not resolve'),
                        ('implementerror', 'Implementation Error'),
                        ('readonlyfs'   , '[Errno 30] Read-only file system'),
                        ('noinstall'    , 'notinstalled'),
                        ('bziperror'    , 'bzip2: Data integrity error when decompressing.'),
                        ('noblockdev'   , "No block devices detected."),
+                       ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
                        ('hardwarefail' , 'Hardware requirements not met'),
                        ('chrootfail'   , 'Running chroot /tmp/mnt/sysimg'),
                        ('modulefail'   , 'Unable to get list of system modules'),
                        ('writeerror'   , 'write error: No space left on device'),
+                       ('nospace'      , "No space left on device"),
                        ('nonode'       , 'Failed to authenticate call: No such node'),
                        ('authfail'     , 'Failed to authenticate call: Call could not be authenticated'),
                        ('bootcheckfail'     , 'BootCheckAuthentication'),
@@ -410,81 +472,181 @@ def reboot(hostname, config=None, forced_action=None):
        s = "-".join(sequence)
        print "   FOUND SEQUENCE: ", s
 
-       if s == "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done":
-               if config and not config.quiet: print "...Restarting BootManager.py on %s "% node
-               conn.restart_bootmanager('boot')
-       elif s == "bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done":
-               if conn.compare_and_repair_nodekeys():
-                       # the keys either are in sync or were forced in sync.
-                       # so try to reboot the node again.
-                       conn.restart_bootmanager('boot')
-               else:
-                       # there was some failure to synchronize the keys.
-                       print "...Unable to repair node keys on %s" % node
-       elif s == "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done" or \
-                s == "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done":
-               conn.restart_bootmanager('boot')
-       elif s == "bminit-cfg-auth-getplc-update-debug-done":
-               conn.restart_bootmanager('boot')
-       elif s == "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done" or \
-                s == "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done":
-               conn.restart_bootmanager('rins')
-       elif s == "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done":
-               conn.restart_bootmanager('boot')
-       elif s == "bminit-cfg-auth-protoerror-exception-update-debug-done":
-               conn.restart_bootmanager('boot')
-       elif s == "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done" or \
-                s == "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done":
-               # reinstall b/c it is not installed.
-               conn.restart_bootmanager('rins')
-       elif s == "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done" or \
-                s == "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done":
-
-               conn.restart_bootmanager('rins')
-       elif s == "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done" or \
-                s == "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done":
-               conn.restart_node('rins')
-       elif s == "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done":
-               conn.restart_node('rins')
-       elif s == "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done":
-               conn.restart_node('rins')
-       elif s == "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done":
-               conn.restart_bootmanager('rins')
-       elif s == "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done":
-               conn.restart_bootmanager('rins')
-       elif s == "bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done" or \
-                s == "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done":
-               conn.dump_plconf_file()
-       elif s == "bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarefail-update-debug-done" or \
-            s == "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarefail-update-debug-done" or \
-                s == "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarefail-update-debug-done":
-               print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
-               pass
+       # NOTE: We get or set the flag based on the current sequence identifier.
+       #  By using the sequence identifier, we guarantee that there will be no
+       #  frequent loops.  I'm guessing there is a better way to track loops,
+       #  though.
+       if not config.force and ( pflags.getFlag(s) or pflags.isRecent() ):
+               pflags.resetFlag(s)
+               pflags.setRecent()
+               pflags.save() 
+               print "... flag is set or it has already run recently. Skipping %s" % node
+               return True
 
-       elif s == "bminit-cfg-auth-getplc-update-hardware-exception-hardwarefail-update-debug-done":
-               # MAKE An ACTION record that this host has failed hardware.  May
-               # require either an exception "/minhw" or other manual intervention.
-               # Definitely need to send out some more EMAIL.
-               print "...NOTIFY OWNER OF BROKEN HARDWARE!!!"
-               pass
+       sequences = {}
+
+
+       # restart_bootmanager_boot
+       for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
+                       "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
+                       "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
+                       "bminit-cfg-auth-getplc-update-debug-done",
+                       "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
+                       "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
+                       "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
+                       "bminit-cfg-auth-protoerror-exception-update-debug-done",
+                       "bminit-cfg-auth-getplc-implementerror-update-debug-done",
+                       ]:
+               sequences.update({n : "restart_bootmanager_boot"})
+
+       #       conn.restart_bootmanager('rins')
+       for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
+                       "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
+                       "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
+                       "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
+                       "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
+                       "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done",
+                       "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
+                       "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
+                       "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
+                       ]:
+               sequences.update({n : "restart_bootmanager_rins"})
+
+       # repair_node_keys
+       sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"})
+
+       #   conn.restart_node('rins')
+       for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
+                       "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
+                       "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
+                       "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done",
+                       "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
+                       "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
+                       "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done",
+                       "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done",
+                       "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
+                       "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
+                       "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
+                       ]:
+               sequences.update({n : "restart_node_rins"})
+
+       #       restart_node_boot
+       for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done",
+                        "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
+                        "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
+                        "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
+                        ]:
+               sequences.update({n: "restart_node_boot"})
+
+       # update_node_config_email
+       for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
+                       "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
+                       "bminit-cfg-exception-nodehostname-update-debug-done",
+                       ]:
+               sequences.update({n : "update_node_config_email"})
+
+       # update_bootcd_email
+       for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarefail-update-debug-done",
+                       "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarefail-update-debug-done",
+                       "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarefail-update-debug-done",
+                       "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarefail-update-debug-done",
+                       "bminit-cfg-auth-getplc-hardware-exception-hardwarefail-update-debug-done",
+                       ]:
+               sequences.update({n : "update_bootcd_email"})
+
+       # update_hardware_email
+       sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarefail-update-debug-done" : "update_hardware_email"})
+
+       # broken_hardware_email
+       sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarefail-update-debug-done" : "broken_hardware_email"})
 
-       elif s == "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done" or \
-            s == "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done" or \
-            s == "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done" or \
-                s == "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done":
-               conn.restart_node('rins')
-               #conn.restart_bootmanager('rins')
-               print "...Need to follow up on this one."
+       
+       if s not in sequences:
+               print "   HOST %s" % hostname
+               print "   UNKNOWN SEQUENCE: %s" % s
 
-               ## If the disk is full, just start over.
-               #conn.restart_bootmanager('rins')
-       elif s == "":
-               pass
+               args = {}
+               args['hostname'] = hostname
+               args['sequence'] = s
+               args['bmlog'] = conn.get_bootmanager_log().read()
+               m = PersistMessage(hostname, mailtxt.unknownsequence[0] % args,
+                                                                        mailtxt.unknownsequence[1] % args, False, db='unknown_persistmessages')
+               m.reset()
+               m.send(['monitor-list@lists.planet-lab.org'])
+
+               conn.restart_bootmanager('boot')
 
        else:
-               print "   HOST %s" % hostname
-               print "   UNKNOWN SEQUENCE: %s" % s
-               pass
+
+               if   sequences[s] == "restart_bootmanager_boot":
+                       if config and not config.quiet: print "...Restarting BootManager.py on %s "% node
+                       conn.restart_bootmanager('boot')
+               elif sequences[s] == "restart_bootmanager_rins":
+                       if config and not config.quiet: print "...Restarting BootManager.py on %s "% node
+                       conn.restart_bootmanager('rins')
+               elif sequences[s] == "restart_node_rins":
+                       conn.restart_node('rins')
+               elif sequences[s] == "restart_node_boot":
+                       conn.restart_node('boot')
+               elif sequences[s] == "repair_node_keys":
+                       if conn.compare_and_repair_nodekeys():
+                               # the keys either are in sync or were forced in sync.
+                               # so try to reboot the node again.
+                               conn.restart_bootmanager('boot')
+                       else:
+                               # there was some failure to synchronize the keys.
+                               print "...Unable to repair node keys on %s" % node
+               elif sequences[s] == "update_node_config_email":
+                       print "...Sending message to UPDATE NODE CONFIG"
+                       args = {}
+                       args['hostname'] = hostname
+                       m = PersistMessage(hostname,  mailtxt.plnode_cfg[0] % args,  mailtxt.plnode_cfg[1] % args, 
+                                                               True, db='nodeid_persistmessages')
+                       loginbase = plc.siteId(hostname)
+                       m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+                       conn.dump_plconf_file()
+
+               elif sequences[s] == "update_bootcd_email":
+                       print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
+                       import getconf
+                       args = {}
+                       args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
+                       args['hostname_list'] = "%s" % hostname
+
+                       m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname,
+                                                               mailtxt.newalphacd_one[1] % args, True, db='bootcd_persistmessages')
+
+                       loginbase = plc.siteId(hostname)
+                       m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+
+               elif sequences[s] == "broken_hardware_email":
+                       # MAKE An ACTION record that this host has failed hardware.  May
+                       # require either an exception "/minhw" or other manual intervention.
+                       # Definitely need to send out some more EMAIL.
+                       print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
+                       # TODO: email notice of broken hardware
+                       args = {}
+                       args['hostname'] = hostname
+                       args['log'] = conn.get_dmesg().read()
+                       m = PersistMessage(hostname, mailtxt.baddisk[0] % args,
+                                                                                mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
+
+                       loginbase = plc.siteId(hostname)
+                       m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+
+               elif sequences[s] == "update_hardware_email":
+                       print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
+                       args = {}
+                       args['hostname'] = hostname
+                       args['bmlog'] = conn.get_bootmanager_log().read()
+                       m = PersistMessage(hostname, mailtxt.minimalhardware[0] % args,
+                                                                                mailtxt.minimalhardware[1] % args, True, db='minhardware_persistmessages')
+
+                       loginbase = plc.siteId(hostname)
+                       m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+
+       pflags.setFlag(s)
+       pflags.save() 
 
        return True
        
diff --git a/plc.py b/plc.py
index 2bd580b..7b92445 100644 (file)
--- a/plc.py
+++ b/plc.py
@@ -138,8 +138,11 @@ def getSites(filter=None, fields=None):
        sites = []
        anon = {'AuthMethod': "anonymous"}
        try:
-               sites = api.GetSites(anon, filter, fields)
+               #sites = api.GetSites(anon, filter, fields)
+               sites = api.GetSites(auth.auth, filter, fields)
        except Exception, exc:
+               import traceback
+               traceback.print_exc()
                print "getSites:  %s" % exc
                logger.info("getSites:  %s" % exc)
        return sites
@@ -255,6 +258,7 @@ def enableSliceCreation(nodename):
 Removes ability to create slices. Returns previous max_slices
 '''
 def removeSliceCreation(nodename):
+       print "removeSliceCreation(%s)" % nodename
        api = xmlrpclib.Server(XMLRPC_SERVER, verbose=False)
        try:
                loginbase = siteId(nodename)
index 2019462..20c12b4 100644 (file)
--- a/policy.py
+++ b/policy.py
@@ -23,7 +23,7 @@ import soltesz
 import string
 from www.printbadnodes import cmpCategoryVal
 from config import config
-print "policy"
+#print "policy"
 config = config()
 
 DAT="./monitor.dat"
@@ -371,9 +371,12 @@ class Diagnose(Thread):
 
                pass
                
-       def __getDaysDown(self, diag_record, nodename):
+       def getDaysDown(cls, diag_record):
                daysdown = -1
-               if diag_record['comonstats']['sshstatus'] != "null":
+               if diag_record['comonstats']['uptime'] != "null":
+                       #print "uptime %s" % (int(float(diag_record['comonstats']['uptime'])) // (60*60*24))
+                       daysdown = - int(float(diag_record['comonstats']['uptime'])) // (60*60*24)
+               elif diag_record['comonstats']['sshstatus'] != "null":
                        daysdown = int(diag_record['comonstats']['sshstatus']) // (60*60*24)
                elif diag_record['comonstats']['lastcotop'] != "null":
                        daysdown = int(diag_record['comonstats']['lastcotop']) // (60*60*24)
@@ -387,13 +390,17 @@ class Diagnose(Thread):
                                diff = now - last_contact
                                daysdown = diff // (60*60*24)
                return daysdown
+       getDaysDown = classmethod(getDaysDown)
 
-       def __getStrDaysDown(self, diag_record, nodename):
-               daysdown = self.__getDaysDown(diag_record, nodename)
+       def getStrDaysDown(cls, diag_record):
+               daysdown = cls.getDaysDown(diag_record)
                if daysdown > 0:
-                       return "(%d days down)"%daysdown
-               else:
+                       return "%d days down"%daysdown
+               elif daysdown == -1:
                        return "Unknown number of days"
+               else:
+                       return "%d days up"% -daysdown
+       getStrDaysDown = classmethod(getStrDaysDown)
 
        def __getCDVersion(self, diag_record, nodename):
                cdversion = ""
@@ -461,13 +468,13 @@ class Diagnose(Thread):
                if  "ERROR" in category:        # i.e. "DOWN"
                        diag_record = {}
                        diag_record.update(node_record)
-                       daysdown = self.__getDaysDown(diag_record, nodename)
+                       daysdown = self.getDaysDown(diag_record)
                        if daysdown < 7:
                                format = "DIAG: %20s : %-40s Down only %s days  NOTHING DONE"
                                print format % (loginbase, nodename, daysdown)
                                return None
 
-                       s_daysdown = self.__getStrDaysDown(diag_record, nodename)
+                       s_daysdown = self.getStrDaysDown(diag_record)
                        diag_record['message'] = emailTxt.mailtxt.newdown
                        diag_record['args'] = {'nodename': nodename}
                        diag_record['info'] = (nodename, s_daysdown, "")
@@ -498,7 +505,7 @@ class Diagnose(Thread):
 
                elif "OLDBOOTCD" in category:
                        # V2 boot cds as determined by findbad
-                       s_daysdown = self.__getStrDaysDown(node_record, nodename)
+                       s_daysdown = self.getStrDaysDown(node_record)
                        s_cdversion = self.__getCDVersion(node_record, nodename)
                        diag_record = {}
                        diag_record.update(node_record)
@@ -756,12 +763,17 @@ class Diagnose(Thread):
                                act_record['first-found'] = True
                                act_record['log'] += " firstfound"
                                act_record['action'] = ['ticket_waitforever']
-                               act_record['message'] = None
+                               act_record['message'] = message[0]
                                act_record['time'] = current_time
                        else:
                                if delta >= 7*SPERDAY:
                                        act_record['action'] = ['ticket_waitforever']
-                                       act_record['message'] = None
+                                       if 'rt' in act_record and 'Status' in act_record['rt'] and \
+                                                       act_record['rt']['Status'] == 'new':
+                                               act_record['message'] = message[0]
+                                       else:
+                                               act_record['message'] = None
+                                               
                                        act_record['time'] = current_time               # reset clock
                                else:
                                        act_record['action'] = ['ticket_waitforever']
@@ -831,7 +843,10 @@ class Diagnose(Thread):
                if site_stats == None:
                        raise Exception, "loginbase with no nodes in findbad"
                else:
-                       return site_stats['num_nodes']
+                       if 'num_nodes' in site_stats:
+                               return site_stats['num_nodes']
+                       else:
+                               return 0
 
        """
        Returns number of up nodes as the total number *NOT* in act_all with a
index 9406ddb..495d366 100755 (executable)
--- a/reboot.py
+++ b/reboot.py
@@ -404,6 +404,7 @@ class APCFolsom(PCUControl):
 
 class APCMaster(PCUControl):
        def run(self, node_port, dryrun):
+               print "Rebooting %s" % self.host
                self.open(self.host, self.username)
                self.sendPassword(self.password)
 
@@ -512,22 +513,12 @@ class HPiLOHttps(PCUControl):
        def run(self, node_port, dryrun):
                import soltesz
 
-               #cmd = "cmdhttps/locfg.pl -s %s -f %s -u %s -p '%s'" % (
-               #                       self.host, "iloxml/Get_Network.xml", 
-               #                       self.username, self.password)
-               #p_ilo  = Popen(cmd, stdout=PIPE, shell=True)
-               #cmd2 = "grep 'MESSAGE' | grep -v 'No error'"
-               #p_grep = Popen(cmd2, stdin=p_ilo.stdout, stdout=PIPE, stderr=PIPE, shell=True)
-               #sout, serr = p_grep.communicate()
-
                locfg = soltesz.CMD()
                cmd = "cmdhttps/locfg.pl -s %s -f %s -u %s -p '%s' | grep 'MESSAGE' | grep -v 'No error'" % (
                                        self.host, "iloxml/Get_Network.xml", 
                                        self.username, self.password)
                sout, serr = locfg.run_noexcept(cmd)
 
-               #p_ilo.wait()
-               #p_grep.wait()
                if sout.strip() != "":
                        print "sout: %s" % sout.strip()
                        return sout.strip()
@@ -539,23 +530,6 @@ class HPiLOHttps(PCUControl):
                                                self.username, self.password)
                        sout, serr = locfg.run_noexcept(cmd)
 
-                       #cmd = "cmdhttps/locfg.pl -s %s -f %s -u %s -p '%s'" % (
-                       #               self.host, "iloxml/Reset_Server.xml", 
-                       #               self.username, self.password)
-                       #print cmd
-                       #p_ilo = Popen(cmd, stdin=PIPE, stdout=PIPE, shell=True)
-                       #cmd2 = "grep 'MESSAGE' | grep -v 'No error'"
-                       #p_grep = Popen(cmd2, shell=True, stdin=p_ilo.stdout, stdout=PIPE, stderr=PIPE)
-                       #sout, serr = p_grep.communicate()
-                       #try: p_ilo.wait()
-                       #except: 
-                       #       print "p_ilo wait failed."
-                       #       pass
-                       #try: p_grep.wait()
-                       #except: 
-                       #       print "p_grep wait failed."
-                       #       pass
-
                        if sout.strip() != "":
                                print "sout: %s" % sout.strip()
                                #return sout.strip()
@@ -1053,10 +1027,11 @@ def pcu_name(pcu):
        else:
                return None
 
+import soltesz
+fb =soltesz.dbLoad("findbadpcus")
+
 def get_pcu_values(pcu_id):
        # TODO: obviously, this shouldn't be loaded each time...
-       import soltesz
-       fb =soltesz.dbLoad("findbadpcus")
 
        try:
                values = fb['nodes']["id_%s" % pcu_id]['values']
@@ -1070,24 +1045,31 @@ def reboot(nodename):
        
 def reboot_policy(nodename, continue_probe, dryrun):
        global verbose
+       print "this is a test of reboot_policy()"
 
        pcu = plc.getpcu(nodename)
        if not pcu:
+               logger.debug("no pcu for %s" % nodename)
+               print "no pcu for %s" % nodename
                return False # "%s has no pcu" % nodename
 
        values = get_pcu_values(pcu['pcu_id'])
        if values == None:
+               logger.debug("No values for pcu probe %s" % nodename)
+               print "No values for pcu probe %s" % nodename
                return False #"no info for pcu_id %s" % pcu['pcu_id']
        
        # Try the PCU first
        logger.debug("Trying PCU %s %s" % (pcu['hostname'], pcu['model']))
 
+       print "reboot_test"
        ret = reboot_test(nodename, values, continue_probe, verbose, dryrun)
 
        if ret != 0:
                print ret
                return False
        else:
+               print "return true"
                return True
 
 def reboot_test(nodename, values, continue_probe, verbose, dryrun):
@@ -1101,6 +1083,7 @@ def reboot_test(nodename, values, continue_probe, verbose, dryrun):
                                
                # APC Masterswitch (Berkeley)
                elif continue_probe and values['model'].find("APC AP79xx/Masterswitch") >= 0:
+                       print values
 
                        # TODO: make a more robust version of APC
                        if values['pcu_id'] in [1163,1055,1111,1231,1113,1127,1128,1148]:
@@ -1227,13 +1210,21 @@ def main():
        logger.addHandler(ch)
 
        try:
+               if "test" in sys.argv:
+                       dryrun = True
+               else:
+                       dryrun = False
+
                for node in sys.argv[1:]:
+                       if node == "test": continue
+
                        print "Rebooting %s" % node
-                       if reboot_policy(node, True, False):
+                       if reboot_policy(node, True, dryrun):
                                print "success"
                        else:
                                print "failed"
        except Exception, err:
+               import traceback; traceback.print_exc()
                print err
 
 if __name__ == '__main__':
index eb44910..d89eed0 100644 (file)
@@ -17,6 +17,8 @@ config = config()
 DEBUG= 0
 PICKLE_PATH="pdb"
 
+class ExceptionTimeout(Exception): pass
+
 def dbLoad(name, type=None):
        return SPickle().load(name, type)
 
@@ -169,16 +171,57 @@ class CMD:
 
        def run_noexcept(self, cmd, timeout=COMMAND_TIMEOUT*2):
 
+               try:
+                       return CMD.run(self,cmd,timeout)
+               except ExceptionTimeout:
+                       import traceback; traceback.print_exc()
+                       return ("", "SCRIPTTIMEOUT")
+                       
+
+#              s = Sopen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True)
+#              #(f_in, f_out, f_err) = os.popen3(cmd)
+#              (f_in, f_out, f_err) = (s.stdin, s.stdout, s.stderr)
+#              lout, lin, lerr = select([f_out,f_err], [], [], timeout)
+#              if len(lin) == 0 and len(lout) == 0 and len(lerr) == 0:
+#                      # Reached a timeout!  Nuke process so it does not hang.
+#                      s.kill(signal.SIGKILL)
+#                      return ("", "SCRIPTTIMEOUT")
+#              o_value = f_out.read()
+#              e_value = ""
+#              if o_value == "":       # An error has occured
+#                      e_value = f_err.read()
+#
+#              o_value = o_value.strip()
+#              e_value = e_value.strip()
+#
+#              f_out.close()
+#              f_in.close()
+#              f_err.close()
+#              try:
+#                      s.kill()
+#              except OSError:
+#                      # no such process, due to it already exiting...
+#                      pass
+#
+#              return (o_value, e_value)
+       def system(self, cmd, timeout=COMMAND_TIMEOUT*2):
+               (o,e) = self.run(cmd, timeout)
+               self.output = o
+               self.error = e
+               if self.s.returncode is None:
+                       self.s.wait()
+               return self.s.returncode
+
+       def run(self, cmd, timeout=COMMAND_TIMEOUT*2):
+
                s = Sopen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True)
-               #(f_in, f_out, f_err) = os.popen3(cmd)
+               self.s = s
                (f_in, f_out, f_err) = (s.stdin, s.stdout, s.stderr)
                lout, lin, lerr = select([f_out,f_err], [], [], timeout)
                if len(lin) == 0 and len(lout) == 0 and len(lerr) == 0:
-                       # Reached a timeout!
-                       #print "TODO: kill subprocess: '%s'" % cmd
-                       # TODO: kill subprocess??
+                       # Reached a timeout!  Nuke process so it does not hang.
                        s.kill(signal.SIGKILL)
-                       return ("", "SCRIPTTIMEOUT")
+                       raise ExceptionTimeout("TIMEOUT Running: %s" % cmd)
                o_value = f_out.read()
                e_value = ""
                if o_value == "":       # An error has occured
@@ -198,42 +241,6 @@ class CMD:
 
                return (o_value, e_value)
 
-       def run_noexcept2(self, cmd):
-
-               (f_in, f_out, f_err) = os.popen3(cmd)
-               lout, lin, lerr = select([f_out,f_err], [], [], COMMAND_TIMEOUT*2)
-               if len(lin) == 0 and len(lout) == 0 and len(lerr) == 0:
-                       # Reached a timeout!
-                       print "TODO: kill subprocess: '%s'" % cmd
-                       # TODO: kill subprocess??
-                       return ("", "SCRIPTTIMEOUT")
-               o_value = f_out.read()
-               e_value = ""
-               if o_value == "":       # An error has occured
-                       e_value = f_err.read()
-
-               o_value = o_value.strip()
-               e_value = e_value.strip()
-
-               f_out.close()
-               f_in.close()
-               f_err.close()
-               return (o_value, e_value)
-
-       def run(self, cmd):
-
-               (f_in, f_out, f_err) = os.popen3(cmd)
-               value = f_out.read()
-               if value == "":
-                       raise Exception, f_err.read()
-               value = value.strip()
-
-               f_out.close()
-               f_in.close()
-               f_err.close()
-               return value
-
-               
 
 class SSH(CMD):
        def __init__(self, user, host, options = ssh_options):
@@ -248,10 +255,10 @@ class SSH(CMD):
                        options = options + "-o %s=%s " % (o,v)
                return options
 
-       def run(self, cmd):
+       def run(self, cmd, timeout=COMMAND_TIMEOUT*2):
                cmd = "ssh %s %s@%s '%s'" % (self.__options_to_str(), 
                                                                        self.user, self.host, cmd)
-               return CMD.run(self, cmd)
+               return CMD.run(self, cmd, timeout)
 
        def get_file(self, rmt_filename, local_filename=None):
                if local_filename == None:
index 9eb0633..c73079b 100644 (file)
@@ -39,7 +39,7 @@ class pxssh (spawn):
 
     ### TODO: This is getting messy and I'm pretty sure this isn't perfect.
     ### TODO: I need to draw a better flow chart for this.
-    def login (self,server,username,password='',ssh_options="",terminal_type='ansi',original_prompts=r"][#$]|~[#$]|bash.*?[#$]|[#$] ",login_timeout=10):
+    def login (self,server,username,password='',ssh_options="",terminal_type='ansi',original_prompts=r"][#$]|~[#$]|-bash.*?[#$]|[#$] ",login_timeout=10):
         """This logs the user into the given server. By default the prompt is
         rather optimistic and should be considered more of an example. It's
         better to try to match the prompt as exactly as possible to prevent
@@ -97,6 +97,7 @@ class pxssh (spawn):
             return False
         # We appear to be in -- reset prompt to something more unique.
         #if not self.set_unique_prompt():
+        #    print "couldn't reset prompt"
         #    self.close()
         #    return False
         return True
index 35ac57b..d83e8d3 100755 (executable)
@@ -64,13 +64,15 @@ def create_netid2ip(l_nodes, l_nodenetworks):
 def create_plcdb():
 
        # get sites, and stats
-       l_sites = plc.getSites({'peer_id':None}, ['login_base', 'site_id', 'abbreviated_name', 'latitude', 'longitude'])
+       l_sites = plc.getSites({'peer_id':None}, ['login_base', 'site_id', 'abbreviated_name', 'latitude', 'longitude', 
+                                                                                         'max_slices', 'slice_ids', 'node_ids' ])
        if len(l_sites) == 0:
                sys.exit(1)
        (d_sites,id2lb) = dsites_from_lsites(l_sites)
 
        # get nodes at each site, and 
-       l_nodes = plc.getNodes({'peer_id':None}, ['hostname', 'site_id', 'version', 'last_updated', 'date_created', 'last_contact', 'pcu_ids', 'nodenetwork_ids'])
+       l_nodes = plc.getNodes({'peer_id':None}, ['hostname', 'node_id', 'ports', 'site_id', 'version', 
+                                                 'last_updated', 'date_created', 'last_contact', 'pcu_ids', 'nodenetwork_ids'])
 
        l_nodenetworks = plc.getNodeNetworks()
        (plcdb, hn2lb, lb2hn) = dsn_from_dsln(d_sites, id2lb, l_nodes)