The most current version of everything.

author Stephen Soltesz <soltesz@cs.princeton.edu>

Mon, 21 Jul 2008 16:30:31 +0000 (16:30 +0000)

committer Stephen Soltesz <soltesz@cs.princeton.edu>

Mon, 21 Jul 2008 16:30:31 +0000 (16:30 +0000)
author Stephen Soltesz <soltesz@cs.princeton.edu>
Mon, 21 Jul 2008 16:30:31 +0000 (16:30 +0000)
committer Stephen Soltesz <soltesz@cs.princeton.edu>
Mon, 21 Jul 2008 16:30:31 +0000 (16:30 +0000)
diff --git a/automate_pl03.sh b/automate_pl03.sh

index e31ead9..32a1a17 100755 (executable)
--- a/automate_pl03.sh
+++ b/automate_pl03.sh
@@ -4,14 +4,16 @@ set -e
  cd $HOME/monitor/
  DATE=`date +%Y-%m-%d-%T`
  
-
  if [ -f $HOME/monitor/SKIP ] ; then 
+       #       echo "SKIPPING Monitor"
+       #       exit
         # TODO: should be possible to kill the old version if 
         # desired and prevent lingering instances of automate.
         if [ -z "$1" ] ; then 
                 echo "KILLING Monitor"
-               ./kill.cmd.sh `cat $HOME/monitor/SKIP`
+               PID=`cat $HOME/monitor/SKIP`
                 rm -f $HOME/monitor/SKIP
+               ./kill.cmd.sh $PID
         else 
                 # skipping monitor
                 echo "SKIPPING Monitor"
@@ -22,7 +24,9 @@ echo $$ > $HOME/monitor/SKIP
  #########################
  # 1. FINDBAD NODES 
  rm -f pdb/production.findbad2.pkl
-./findbad.py --cachenodes --debug=0 --dbname="findbad2" $DATE
+./findbad.py --increment --cachenodes --debug=0 --dbname="findbad2" $DATE
+
+ps ax | grep BatchMode | grep -v grep | awk '{print $1}' | xargs kill
  
  ########################
  # COPY to golf for diagnose.py and action.py
@@ -43,6 +47,8 @@ cp badcsv.txt /plc/data/var/www/html/monitor/
  rm -f pdb/production.findbadpcus2.pkl
  ./findbadpcu.py --increment --refresh --debug=0 --dbname=findbadpcus2 $DATE            
  
+./sitebad.py --increment
+
  # clean up stray 'locfg' processes that hang around inappropriately...
  ps ax | grep locfg | grep -v grep | awk '{print $1}' | xargs kill
  
@@ -57,7 +63,7 @@ cp pdb/production.findbadpcus2.pkl pdb/production.findbadpcus.pkl
  ./pkl2php.py -i idTickets -o idTickets
  
  for f in findbad act_all findbadpcus l_plcnodes; do 
-       cp pdb/production.$f.pkl archive-pdb/`date +%F`.production.$f.pkl
+       cp pdb/production.$f.pkl archive-pdb/`date +%F-%H:%M`.production.$f.pkl
  done
  
  rm -f $HOME/monitor/SKIP
diff --git a/emailTxt.py b/emailTxt.py

index f92451d..8a666c8 100644 (file)
--- a/emailTxt.py
+++ b/emailTxt.py
@@ -30,18 +30,18 @@ If the machine has booted successfully, you may check it more quickly by logging
  
      sudo /usr/sbin/vps ax
  
-If you have a BootCD older than 3.0, you will need to create a new BootCD and configuration file.  You can find instructions for this at the Technical Contact's Guide:
+If you have a BootCD older than 3.0, you will need to create and burn a new BootImage to CD or USB.  You can find instructions for this at the Technical Contact's Guide:
  
      https://www.planet-lab.org/doc/guides/bootcdsetup
  
  If after following these directions, and either logging in with your site_admin account or seeing the CoMon report of your machine, there is no need to respond to this message.  However, if there are any console messages relating to the node's failure, please report them to PlanetLab support (support@planet-lab.org) so we can help resolve the issue.   Including this message in your reply will help us coordinate our records with the actions you've taken.
  
-After a week, we will disable your site's ability to create new slices.  Because this action will directly affect your site's registered PI, we will also CC the PI for help at that time.
-
  Thank you for your help,
    -- PlanetLab Central (support@planet-lab.org)
  """)
  
+#If no one responds, then after a week, we will disable your site's ability to create new slices.  Because this action will directly affect your site's registered PI, we will also CC the PI for help at that time.
+
         newdown_two=("""PlanetLab node(s) down: %(loginbase)s""", 
  """
  Hello,
@@ -114,11 +114,11 @@ Instructions to perform the steps necessary for a BootCD upgrade are available i
  
  If your node returns to normal operation after following these directions, then there's no need to respond to this message.  However, if there are any console messages relating to the node's failure, please report them to PlanetLab support (support@planet-lab.org) so we can help resolve the issue.  Including this message in your reply will help us coordinate our records with the actions you've taken.  
  
-After a week, we will disable your site's ability to create new slices.  Because this action will directly affect your site's registered PI, we will also CC the PI for help at that time.
-
  Thank you for your help,
    -- PlanetLab Central (support@planet-lab.org)
  """)
+#After a week, we will disable your site's ability to create new slices.  Because this action will directly affect your site's registered PI, we will also CC the PI for help at that time.
+
         newbootcd_two=(""" Planetlab nodes need a new BootCD: %(loginbase)s""", # : %(hostname)s""", 
  """As part of PlanetLab node monitoring, we noticed the following nodes have an out-dated BootCD: 
  
@@ -196,7 +196,7 @@ Monitor restarted NM on the following machines:
  """As part of our machine monitoring and maintenance, we tried to use the PCU
  registered below, but could not for the following reason at the link below:
  
-       https://pl-virtual-03.cs.princeton.edu/cgi-bin/printbadpcus.php#id%(pcu_id)s
+       https://pl-virtual-03.cs.princeton.edu/cgi-bin/printbadpcus.php?id=%(pcu_id)s
  
  We need your help resolving this issue in two ways:  
  
@@ -228,7 +228,7 @@ Thank you very much for your help,
  registered below, and though it appears to succeed, we do not subsequently
  observe the associated nodes rebooting:
  
-    https://pl-virtual-03.cs.princeton.edu/cgi-bin/printbadpcus.php#id%(pcu_id)s
+    https://pl-virtual-03.cs.princeton.edu/cgi-bin/printbadpcus.php?id=%(pcu_id)s
  
  %(hostname_list)s
  
@@ -333,7 +333,7 @@ BootManager.log output follows:
         baddisk = ("""Bad Disk on PlanetLab node %(hostname)s""", 
                            """As part of PlanetLab node monitoring, we noticed %(hostname)s has a number of disk or media related I/O errors, that prevent it from either booting or reliably running as a PlanetLab node.
  
-Please verify the integrity of the disk, and order a replacment if needed.  If you need to schedule downtime for the node, please let us know at support@planet-lab.org. 
+Please verify the integrity of the disk, and order a replacement if needed.  If you need to schedule downtime for the node, please let us know at support@planet-lab.org. 
  
  Thanks.
  
@@ -373,8 +373,31 @@ Thank you for your help,
    -- PlanetLab Central (support@planet-lab.org)
  """)
  
+       plnode_cfg=(""" Please Verify Network Configuration for PlanetLab node %(hostname)s""", 
+"""Hello,
+
+As part of PlanetLab node monitoring, we noticed that %(hostname)s has a network configuration error related to DNS or hostname lookups.  Often this can happen either due to local configuration changes, or a misconfiguration of the node's DNS servers.  To resolve the issue we require your assistance.  All that is needed is to visit:
+
+       https://www.planet-lab.org/db/nodes/index.php?nodepattern=%(hostname)s
+
+Find the primary node network entry and confirm that the settings are correct.  
+
+If you use 'static' network configuration, verify that the DNS servers are correct.  If you are using 'dhcp' then you will need to confirm that the information returned for the node will allow it to perform lookups on its own hostname.
+
+If you change the network settings, then select, "Download -> Download plnode.txt file for %(hostname)s" menu.  This will generate a new configuration file for your node.  Copy this file to the appropriate read-only media, either floppy or USB stick, and reboot the machine.  If you are using an All-in-One boot image, then you will need to download the All-in-One image instead, burn it to the appropriate media (CD or USB) and reboot.
+
+Please let us know if you need any assistance.
+
+Thank you for your help,
+  -- PlanetLab Central (support@planet-lab.org)
+
+BootManager.log output follows:
+---------------------------------------------------------
+%(bmlog)s
+""")
+
         plnode_cfg=(""" Please Update Configuration file for PlanetLab node %(hostname)s""", 
-                               """As part of PlanetLab node monitoring, we noticed %(hostname)s has an out-dated plnode.txt file with no NODE_ID or a mis-matched HOSTNAME.  This can happen either due to an initial configuration failure at your site, with information entered into our database, or after a software upgrade.  To resolve the issue we require your assistance.  All that is needed is to visit:
+"""As part of PlanetLab node monitoring, we noticed %(hostname)s has an out-dated plnode.txt file with no NODE_ID or a mis-matched HOSTNAME.  This can happen either due to an initial configuration failure at your site, with information entered into our database, or after a software upgrade.  To resolve the issue we require your assistance.  All that is needed is to visit:
  
         https://www.planet-lab.org/db/nodes/index.php?nodepattern=%(hostname)s
  
diff --git a/fetch.py b/fetch.py

index 968ad66..e2f33a8 100755 (executable)
--- a/fetch.py
+++ b/fetch.py
@@ -11,31 +11,6 @@ from config import config
  from optparse import OptionParser
  from automate import *
  
-parser = OptionParser()
-parser.set_defaults(nodelist=None, 
-                                       node=None,
-                                   outdir=None,
-                                       querystr=None,
-                                       timeout=0,
-                                       simple=False,
-                                       run=False,
-                                       cmdfile=None,)
-
-parser.add_option("", "--nodelist", dest="nodelist", metavar="filename",
-                                       help="Read list of nodes from specified file")
-parser.add_option("", "--node", dest="node", metavar="hostname",
-                                       help="specify a single node name.")
-parser.add_option("", "--timeout", dest="timeout", metavar="seconds",
-                                       help="Number of seconds to wait before timing out on host.")
-parser.add_option("", "--outdir", dest="outdir", metavar="dirname",
-                                       help="Name of directory to place output")
-parser.add_option("", "--cmd", dest="cmdfile", metavar="filename",
-                                       help="Name of file that contains a unix-to-csv command " + \
-                                            "to run on the hosts.")
-
-config = config(parser)
-config.parse_args()
-
  def build_vx_args(shell_cmd):
         ssh_options="-q -o UserKnownHostsFile=junkssh -o StrictHostKeyChecking=no"
         cmd="""ssh %s root@{} """  % ssh_options
@@ -43,37 +18,62 @@ def build_vx_args(shell_cmd):
         args.append(shell_cmd)
         return args
  
-def vx_start(filelist,outdir,cmd):
+def vx_start(filelist,outdir,cmd, timeout=0):
         args = build_vx_args(cmd)
-       #vxargs.start(None, 10, filelist, outdir, False, args, 120)
-       vxargs.start(None, 10, filelist, outdir, False, args, int(config.timeout))
+       vxargs.start(None, 20, filelist, outdir, False, args, timeout)
+
+if __name__ == "__main__":
+       parser = OptionParser()
+       parser.set_defaults(nodelist=None, 
+                                               node=None,
+                                               outdir=None,
+                                               querystr=None,
+                                               timeout=0,
+                                               simple=False,
+                                               run=False,
+                                               cmdfile=None,)
+
+       parser.add_option("", "--nodelist", dest="nodelist", metavar="filename",
+                                               help="Read list of nodes from specified file")
+       parser.add_option("", "--node", dest="node", metavar="hostname",
+                                               help="specify a single node name.")
+       parser.add_option("", "--timeout", dest="timeout", metavar="seconds",
+                                               help="Number of seconds to wait before timing out on host.")
+       parser.add_option("", "--outdir", dest="outdir", metavar="dirname",
+                                               help="Name of directory to place output")
+       parser.add_option("", "--cmd", dest="cmdfile", metavar="filename",
+                                               help="Name of file that contains a unix-to-csv command " + \
+                                                        "to run on the hosts.")
+
+       config = config(parser)
+       config.parse_args()
  
-if config.outdir == None: 
-       outdir="checkhosts"
-else: 
-       outdir=config.outdir
+       if config.outdir == None: 
+               outdir="checkhosts"
+       else: 
+               outdir=config.outdir
  
-if not os.path.exists(outdir):
-       os.system('mkdir -p %s' % outdir)
+       if not os.path.exists(outdir):
+               os.system('mkdir -p %s' % outdir)
  
-if config.nodelist == None and config.node == None:
-       filelist="nocomon.txt"
-       filelist = vxargs.getListFromFile(open(filelist,'r'))
-elif os.path.exists(str(config.nodelist)) and os.path.isfile(config.nodelist):
-       filelist = vxargs.getListFromFile(open(config.nodelist,'r'))
-elif os.path.exists(str(config.nodelist)) and os.path.isdir(config.nodelist):
-       filelist = get_hostlist_from_dir(config.nodelist)
-elif config.node is not None:
-       filelist = [(config.node, '')]
-else:
-       # probably no such file.
-       raise Exception("No such file %s" % config.nodelist)
+       if config.nodelist == None and config.node == None:
+               filelist="nocomon.txt"
+               filelist = vxargs.getListFromFile(open(filelist,'r'))
+       elif os.path.exists(str(config.nodelist)) and os.path.isfile(config.nodelist):
+               filelist = vxargs.getListFromFile(open(config.nodelist,'r'))
+       elif os.path.exists(str(config.nodelist)) and os.path.isdir(config.nodelist):
+               filelist = get_hostlist_from_dir(config.nodelist)
+       elif config.node is not None:
+               filelist = [(config.node, '')]
+       else:
+               # probably no such file.
+               raise Exception("No such file %s" % config.nodelist)
  
-if config.cmdfile == None:
-       f = open("command.txt",'r')
-       cmd = f.read()
-else:
-       f = open(config.cmdfile,'r')
-       cmd = f.read()
+       if config.cmdfile == None:
+               f = open("command.txt",'r')
+               cmd = f.read()
+       else:
+               f = open(config.cmdfile,'r')
+               cmd = f.read()
  
-vx_start(filelist, outdir, cmd)
+       vx_start(filelist, outdir, cmd, int(config.timeout))
diff --git a/findbad.py b/findbad.py

index e08b554..141f9ac 100755 (executable)
--- a/findbad.py
+++ b/findbad.py
@@ -9,7 +9,7 @@ import time
  # QUERY all nodes.
  COMON_COTOPURL= "http://summer.cs.princeton.edu/status/tabulator.cgi?" + \
                                         "table=table_nodeview&" + \
-                                   "dumpcols='name,resptime,sshstatus,uptime,lastcotop'&" + \
+                                   "dumpcols='name,resptime,sshstatus,uptime,lastcotop,cpuspeed,memsize,disksize'&" + \
                                     "formatcsv"
                                     #"formatcsv&" + \
                                         #"select='lastcotop!=0'"
@@ -25,6 +25,7 @@ import soltesz
  import comon
  import threadpool
  import syncplcdb
+from nodequery import verify,query_to_dict,node_select
  
  import plc
  import auth
@@ -33,7 +34,7 @@ api = plc.PLC(auth.auth, auth.plc)
  def collectPingAndSSH(nodename, cohash):
         ### RUN PING ######################
         ping = soltesz.CMD()
-       (oval,eval) = ping.run_noexcept("ping -c 1 -q %s | grep rtt" % nodename)
+       (oval,errval) = ping.run_noexcept("ping -c 1 -q %s | grep rtt" % nodename)
  
         values = {}
  
@@ -43,31 +44,62 @@ def collectPingAndSSH(nodename, cohash):
         else:
                 values['ping'] = "PING"
  
-       #uptime = soltesz.SSH('root', nodename)
-       #(oval,eval) = uptime.run_noexcept("uptime | awk '{print $3,$4}' | tr , ' '")
+       try:
+               for port in [22, 806]: 
+                       ssh = soltesz.SSH('root', nodename, port)
+
+                       (oval, errval) = ssh.run_noexcept2(""" <<\EOF
+                               echo "{"
+                               echo '  "kernel":"'`uname -a`'",'
+                               echo '  "bmlog":"'`ls /tmp/bm.log`'",'
+                               echo '  "bootcd":"'`cat /mnt/cdrom/bootme/ID`'",'
+                               echo '  "nm":"'`ps ax | grep nm.py | grep -v grep`'",'
+                               echo '  "princeton_comon":"'`ls -d /vservers/princeton_comon`'",'
+
+                               ID=`grep princeton_comon /etc/passwd | awk -F : '{if ( $3 > 500 ) { print $3}}'` 
+
+                               echo '  "princeton_comon_running":"'`ls -d /proc/virtual/$ID`'",'
+                               echo '  "princeton_comon_procs":"'`vps ax | grep $ID | grep -v grep | wc -l`'",'
+                               echo "}"
+EOF                    """)
+                       
+                       if len(oval) > 0:
+                               values.update(eval(oval))
+                               values['sshport'] = port
+                               break
+                       else:
+                               values.update({'kernel': "", 'bmlog' : "", 'bootcd' : '', 'nm' :
+                               '', 'princeton_comon' : '', 'princeton_comon_running' : '',
+                               'princeton_comon_procs' : '', 'sshport' : None})
+       except:
+               import traceback; print traceback.print_exc()
+               sys.exit(1)
  
         ### RUN SSH ######################
         b_getbootcd_id = True
-       ssh = soltesz.SSH('root', nodename)
-       oval = ""
-       eval = ""
-       (oval, eval) = ssh.run_noexcept('echo `uname -a ; ls /tmp/bm.log`')
-       val = oval
+       #ssh = soltesz.SSH('root', nodename)
+       #oval = ""
+       #errval = ""
+       #(oval, errval) = ssh.run_noexcept('echo `uname -a ; ls /tmp/bm.log`')
+
+       oval = values['kernel']
         if "2.6.17" in oval or "2.6.2" in oval:
                 values['ssh'] = 'SSH'
                 values['category'] = 'ALPHA'
-               if "bm.log" in oval:
+               if "bm.log" in values['bmlog']:
                         values['state'] = 'DEBUG'
                 else:
                         values['state'] = 'BOOT'
         elif "2.6.12" in oval or "2.6.10" in oval:
                 values['ssh'] = 'SSH'
                 values['category'] = 'PROD'
-               if "bm.log" in oval:
+               if "bm.log" in values['bmlog']:
                         values['state'] = 'DEBUG'
                 else:
                         values['state'] = 'BOOT'
-       elif "2.4" in oval:
+       
+       # NOTE: on 2.6.8 kernels, with 4.2 bootstrapfs, the chroot command fails.  I have no idea why.
+       elif "2.4" in oval or "2.6.8" in oval:
                 b_getbootcd_id = False
                 values['ssh'] = 'SSH'
                 values['category'] = 'OLDBOOTCD'
@@ -75,7 +107,7 @@ def collectPingAndSSH(nodename, cohash):
         elif oval != "":
                 values['ssh'] = 'SSH'
                 values['category'] = 'UNKNOWN'
-               if "bm.log" in oval:
+               if "bm.log" in values['bmlog']:
                         values['state'] = 'DEBUG'
                 else:
                         values['state'] = 'BOOT'
@@ -85,17 +117,18 @@ def collectPingAndSSH(nodename, cohash):
                 values['ssh'] = 'NOSSH'
                 values['category'] = 'ERROR'
                 values['state'] = 'DOWN'
-               val = eval.strip()
+               val = errval.strip()
+               values['kernel'] = val
  
-       values['kernel'] = val
+       #values['kernel'] = val
  
         if b_getbootcd_id:
                 # try to get BootCD for all nodes that are not 2.4 nor inaccessible
-               (oval, eval) = ssh.run_noexcept('cat /mnt/cdrom/bootme/ID')
-               val = oval
-               if "BootCD" in val:
-                       values['bootcd'] = val
-                       if "v2" in val and \
+               #(oval, errval) = ssh.run_noexcept('cat /mnt/cdrom/bootme/ID')
+               oval = values['bootcd']
+               if "BootCD" in oval:
+                       values['bootcd'] = oval
+                       if "v2" in oval and \
                                 ( nodename is not "planetlab1.cs.unc.edu" and \
                                   nodename is not "planetlab2.cs.unc.edu" ):
                                 values['category'] = 'OLDBOOTCD'
@@ -106,6 +139,41 @@ def collectPingAndSSH(nodename, cohash):
  
         # TODO: get bm.log for debug nodes.
         # 'zcat /tmp/bm.log'
+       
+       #(oval, errval) = ssh.run_noexcept('ps ax | grep nm.py | grep -v grep')
+       oval = values['nm']
+       if "nm.py" in oval:
+               values['nm'] = "Y"
+       else:
+               values['nm'] = "N"
+
+       continue_slice_check = True
+       #(oval, errval) = ssh.run_noexcept('ls -d /vservers/princeton_comon')
+       oval = values['princeton_comon']
+       if "princeton_comon" in oval:
+               values['princeton_comon'] = "Y"
+       else:
+               values['princeton_comon'] = "N"
+               continue_slice_check = False
+
+       if continue_slice_check:
+               #(oval, errval) = ssh.run_noexcept('ID=`grep princeton_comon /etc/passwd | awk -F : "{if ( \\\$3 > 500 ) { print \\\$3}}"`; ls -d /proc/virtual/$ID')
+               oval = values['princeton_comon_running']
+               if len(oval) > len('/proc/virtual/'):
+                       values['princeton_comon_running'] = "Y"
+               else:
+                       values['princeton_comon_running'] = "N"
+                       continue_slice_check = False
+       else:
+               values['princeton_comon_running'] = "-"
+               
+       if continue_slice_check:
+               #(oval, errval) = ssh.run_noexcept('ID=`grep princeton_comon /etc/passwd | awk -F : "{if ( \\\$3 > 500 ) { print \\\$3}}"`; vps ax | grep $ID | grep -v grep | wc -l')
+               oval = values['princeton_comon_procs']
+               values['princeton_comon_procs'] = oval
+       else:
+               values['princeton_comon_procs'] = "-"
+
                 
         if nodename in cohash: 
                 values['comonstats'] = cohash[nodename]
@@ -196,7 +264,8 @@ def recordPingAndSSH(request, result):
  
                 count += 1
                 print "%d %s %s" % (count, nodename, externalState['nodes'][nodename]['values'])
-               soltesz.dbDump(config.dbname, externalState)
+               if count % 20 == 0:
+                       soltesz.dbDump(config.dbname, externalState)
  
  # this will be called when an exception occurs within a thread
  def handle_exception(request, result):
@@ -242,6 +311,8 @@ def checkAndRecordState(l_nodes, cohash):
                         print "All results collected."
                         break
  
+       soltesz.dbDump(config.dbname, externalState)
+
  
  
  def main():
@@ -259,6 +330,7 @@ def main():
         cotop_url = COMON_COTOPURL
  
         # history information for all nodes
+       #cohash = {}
         cohash = cotop.coget(cotop_url)
         l_nodes = syncplcdb.create_plcdb()
         if config.filename:
@@ -270,9 +342,17 @@ def main():
         elif config.nodegroup:
                 ng = api.GetNodeGroups({'name' : config.nodegroup})
                 l_nodes = api.GetNodes(ng[0]['node_ids'])
-
+       elif config.site:
+               site = api.GetSites(config.site)
+               l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname'])
+               
         l_nodes = [node['hostname'] for node in l_nodes]
  
+       # perform this query after the above options, so that the filter above
+       # does not break.
+       if config.nodeselect:
+               l_nodes = node_select(config.nodeselect)
+
         print "fetching %s hosts" % len(l_nodes)
  
         checkAndRecordState(l_nodes, cohash)
@@ -284,13 +364,18 @@ if __name__ == '__main__':
         from config import config
         from optparse import OptionParser
         parser = OptionParser()
-       parser.set_defaults(filename=None, node=None, nodegroup=None, increment=False, dbname="findbadnodes", cachenodes=False)
+       parser.set_defaults(filename=None, node=None, site=None, nodeselect=False, nodegroup=None, 
+                                               increment=False, dbname="findbadnodes", cachenodes=False)
         parser.add_option("", "--node", dest="node", metavar="hostname", 
                                                 help="Provide a single node to operate on")
         parser.add_option("-f", "--nodelist", dest="filename", metavar="FILE", 
                                                 help="Provide the input file for the node list")
+       parser.add_option("", "--nodeselect", dest="nodeselect", metavar="query string", 
+                                               help="Provide a selection string to return a node list.")
         parser.add_option("", "--nodegroup", dest="nodegroup", metavar="FILE", 
                                                 help="Provide the nodegroup for the list of nodes.")
+       parser.add_option("", "--site", dest="site", metavar="site name",
+                                               help="Specify a site to view node status")
  
         parser.add_option("", "--cachenodes", action="store_true",
                                                 help="Cache node lookup from PLC")
diff --git a/findbadpcu.py b/findbadpcu.py

index 122d8a5..017b4c4 100755 (executable)
--- a/findbadpcu.py
+++ b/findbadpcu.py
@@ -24,30 +24,6 @@ import signal
  #
  #orig_sig_handler = signal.signal(signal.SIGCHLD, sig_handler)
  
-from config import config
-from optparse import OptionParser
-parser = OptionParser()
-parser.set_defaults(filename=None, 
-                                       increment=False, 
-                                       pcuid=None,
-                                       dbname="findbadpcus", 
-                                       cachenodes=False,
-                                       refresh=False,
-                                       )
-parser.add_option("-f", "--nodelist", dest="filename", metavar="FILE", 
-                                       help="Provide the input file for the node list")
-parser.add_option("", "--pcuid", dest="pcuid", metavar="id", 
-                                       help="Provide the id for a single pcu")
-parser.add_option("", "--cachenodes", action="store_true",
-                                       help="Cache node lookup from PLC")
-parser.add_option("", "--dbname", dest="dbname", metavar="FILE", 
-                                       help="Specify the name of the database to which the information is saved")
-parser.add_option("", "--refresh", action="store_true", dest="refresh",
-                                       help="Refresh the cached values")
-parser.add_option("-i", "--increment", action="store_true", dest="increment", 
-                                       help="Increment round number to force refresh or retry")
-config = config(parser)
-config.parse_args()
  
  # QUERY all nodes.
  COMON_COTOPURL= "http://summer.cs.princeton.edu/status/tabulator.cgi?" + \
@@ -286,7 +262,7 @@ def collectPingAndSSH(pcuname, cohash):
                 #### RUN NMAP ###############################
                 if continue_probe:
                         nmap = soltesz.CMD()
-                       (oval,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,23,80,443,5869,16992 %s | grep Host:" % pcu_name(values))
+                       (oval,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,23,80,443,5869,9100,16992 %s | grep Host:" % pcu_name(values))
                         # NOTE: an empty / error value for oval, will still work.
                         (values['portstatus'], continue_probe) = nmap_portstatus(oval)
                 else:
@@ -415,17 +391,40 @@ def main():
  
         return 0
  
-import logging
-logger = logging.getLogger("monitor")
-logger.setLevel(logging.DEBUG)
-fh = logging.FileHandler("monitor.log", mode = 'a')
-fh.setLevel(logging.DEBUG)
-formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
-fh.setFormatter(formatter)
-logger.addHandler(fh)
-
  
  if __name__ == '__main__':
+       import logging
+       logger = logging.getLogger("monitor")
+       logger.setLevel(logging.DEBUG)
+       fh = logging.FileHandler("monitor.log", mode = 'a')
+       fh.setLevel(logging.DEBUG)
+       formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
+       fh.setFormatter(formatter)
+       logger.addHandler(fh)
+       from config import config
+       from optparse import OptionParser
+       parser = OptionParser()
+       parser.set_defaults(filename=None, 
+                                               increment=False, 
+                                               pcuid=None,
+                                               dbname="findbadpcus", 
+                                               cachenodes=False,
+                                               refresh=False,
+                                               )
+       parser.add_option("-f", "--nodelist", dest="filename", metavar="FILE", 
+                                               help="Provide the input file for the node list")
+       parser.add_option("", "--pcuid", dest="pcuid", metavar="id", 
+                                               help="Provide the id for a single pcu")
+       parser.add_option("", "--cachenodes", action="store_true",
+                                               help="Cache node lookup from PLC")
+       parser.add_option("", "--dbname", dest="dbname", metavar="FILE", 
+                                               help="Specify the name of the database to which the information is saved")
+       parser.add_option("", "--refresh", action="store_true", dest="refresh",
+                                               help="Refresh the cached values")
+       parser.add_option("-i", "--increment", action="store_true", dest="increment", 
+                                               help="Increment round number to force refresh or retry")
+       config = config(parser)
+       config.parse_args()
         try:
                 # NOTE: evidently, there is a bizarre interaction between iLO and ssh
                 # when LANG is set... Do not know why.  Unsetting LANG, fixes the problem.
diff --git a/getconf.py b/getconf.py

index 802eced..f40e108 100755 (executable)
--- a/getconf.py
+++ b/getconf.py
@@ -5,11 +5,11 @@ import plc
  import sys
  import os
  
-def getconf(hostname):
+def getconf(hostname, force=False, media=None):
         api = plc.PLC(auth.auth, auth.plc)
         n = api.GetNodes(hostname)
         filename = "bootcd-alpha/" + hostname + ".txt"
-       if not os.path.exists(filename):
+       if not os.path.exists(filename) or force:
                 f = open("bootcd-alpha/" + hostname + ".txt", 'w')
                 f.write( api.AdmGenerateNodeConfFile(n[0]['node_id']) )
                 f.close()
@@ -20,8 +20,18 @@ def getconf(hostname):
                 pass
  
         args = {}
-       args['url_list']  = "   http://pl-virtual-03.cs.princeton.edu/bootcds/%s-partition.usb\n" % hostname
-       args['url_list'] += "   http://pl-virtual-03.cs.princeton.edu/bootcds/%s.iso" % hostname
+       if not media:
+               args['url_list']  = "   http://pl-virtual-03.cs.princeton.edu/bootcds/%s-partition.usb\n" % hostname
+               args['url_list'] += "   http://pl-virtual-03.cs.princeton.edu/bootcds/%s.iso" % hostname
+       else:
+               if media == "usb":
+                       args['url_list']  = "   http://pl-virtual-03.cs.princeton.edu/bootcds/%s-partition.usb\n" % hostname
+               elif media == "iso":
+                       args['url_list']  = "   http://pl-virtual-03.cs.princeton.edu/bootcds/%s.iso" % hostname
+               else:
+                       args['url_list']  = "   http://pl-virtual-03.cs.princeton.edu/bootcds/%s-partition.usb\n" % hostname
+                       args['url_list'] += "   http://pl-virtual-03.cs.princeton.edu/bootcds/%s.iso" % hostname
+                       
         #print "http://pl-virtual-03.cs.princeton.edu/bootcds/%s.usb\n" % hostname
  
         return args
@@ -30,16 +40,18 @@ if __name__ == '__main__':
         from config import config as cfg
         from optparse import OptionParser
         parser = OptionParser()
-       parser.set_defaults(media='both')
+       parser.set_defaults(media='both', force=False)
         parser.add_option("", "--media", dest="media", metavar="usb, iso, both", 
                                                 help="""Which media to generate the message for.""")
+       parser.add_option("", "--force", dest="force", action="store_true", 
+                                               help="""Force the recreation of the usb images.""")
  
         config = cfg(parser)
         config.parse_args()
  
         ret = {'url_list' : ''} 
         for i in config.args:
-               conf = getconf(i)
+               conf = getconf(i, config.force, config.media)
                 ret['url_list'] += conf['url_list']
                 ret['hostname'] = i
  
@@ -68,7 +80,7 @@ Thank you,
  
  """ % ret
  
-       elif config.media == "cd":
+       elif config.media == "iso":
                 print """
  Hello,
  
diff --git a/getsshkeys.py b/getsshkeys.py

index fc306e4..0819abe 100755 (executable)
--- a/getsshkeys.py
+++ b/getsshkeys.py
@@ -154,7 +154,7 @@ class SSHKnownHosts:
  
                 key = key.strip()
                 # TODO: check for '==' at end of key.
-               if key[-1] != '=':
+               if len(key) > 0 and key[-1] != '=':
                         print "Host with corrupt key! for %s %s" % (node['boot_state'], node['hostname'])
  
                 s_date = time.strftime("%Y/%m/%d_%H:%M:%S",time.gmtime(time.time()))
diff --git a/grouprins.py b/grouprins.py

index 99af752..a1e18d6 100755 (executable)
--- a/grouprins.py
+++ b/grouprins.py
@@ -19,6 +19,7 @@ api = plc.PLC(auth.auth, auth.plc)
  import policy
  
  from config import config as cfg
+import config as config2
  from optparse import OptionParser
  
  from nodecommon import *
@@ -32,7 +33,8 @@ from model import *
  import bootman                 # debug nodes
  import monitor         # down nodes with pcu
  import reboot          # down nodes without pcu
-reboot.verbose = 0
+from emailTxt import mailtxt
+#reboot.verbose = 0
  import sys
  
  class Reboot(object):
@@ -50,7 +52,7 @@ class Reboot(object):
                 m = PersistMessage(host, mailtxt.pcudown_one[0] % args,
                                                                  mailtxt.pcudown_one[1] % args, True, db='pcu_persistmessages')
  
-               loginbase = plc.siteId(hostname)
+               loginbase = plc.siteId(host)
                 m.send([policy.TECHEMAIL % loginbase])
  
         def pcu(self, host):
@@ -59,8 +61,8 @@ class Reboot(object):
                 if self.fbnode['pcu'] == "PCU": 
                         self.action = "reboot.reboot('%s')" % host
  
-                       pflags = PersistFlags(host, 1*60*60*24, db='pcu_persistflags')
-                       if not pflags.getRecentFlag('pcutried'): # or not pflags.getFlag('pcufailed'):
+                       pflags = PersistFlags(host, 2*60*60*24, db='pcu_persistflags')
+                       if not pflags.getRecentFlag('pcutried'):
                                 pflags.setRecentFlag('pcutried')
                                 try:
                                         ret = reboot.reboot(host)
@@ -87,10 +89,11 @@ class Reboot(object):
  
                                         pflags.setRecentFlag('pcumessagesent')
                                         pflags.save()
+                                       # NOTE: this will result in just one message sent at a time.
+                                       return True
  
                                 else:
-                                       pass # just skip it?
-
+                                       return False
                 else:
                         self.action = "None"
                         return False
@@ -130,6 +133,26 @@ class RebootDown(Reboot):
                 self.action = "None"
                 return False    # this always fails, since the node will be down.
  
+def set_node_to_rins(host, fb):
+       """Set host's boot_state to 'rins' through the PLC api; return the Log
+       entry for the request on success, or None (after printing a notice)
+       when the update fails."""
+       fields = ['boot_state', 'last_contact', 'last_updated', 'date_created']
+       before = api.GetNodes(host, fields)
+       entry = Log(host, {'observation' : before[0],
+                         'model' : 'USER_REQUEST',
+                         'action' : 'api.UpdateNode(%s, {"boot_state" : "rins"})' % host,
+                         'time' : time.time()})
+       if not api.UpdateNode(host, {'boot_state' : 'rins'}):
+               print "FAILED TO UPDATE NODE BOOT STATE : %s" % host
+               return None
+       # re-fetch so the console shows the node's state after the update
+       current = api.GetNodes(host)[0]
+       print entry
+       # NOTE(review): 'i' is not defined in this function; presumably a module-level counter -- confirm
+       print "%-2d" % (i-1), nodegroup_display(current, fb)
+       return entry
+
  
  try:
         rebootlog = soltesz.dbLoad("rebootlog")
@@ -141,7 +164,7 @@ parser.set_defaults(nodegroup=None,
                                         node=None,
                                         nodelist=None,
                                         nodeselect=None,
-                                       timewait=30,
+                                       timewait=0,
                                         skip=0,
                                         rins=False,
                                         reboot=False,
@@ -209,7 +232,7 @@ if config.findbad:
         # rerun findbad with the nodes in the given nodes.
         import os
         file = "findbad.txt"
-       config.setFileFromList(file, hostnames)
+       config2.setFileFromList(file, hostnames)
         os.system("./findbad.py --cachenodes --debug=0 --dbname=findbad --increment --nodelist %s" % file)
  
  fb = soltesz.dbLoad("findbad")
@@ -219,7 +242,6 @@ count = 1
  for host in hostnames:
  
         #if 'echo' in host or 'hptest-1' in host: continue
-
         try:
                 try:
                         node = api.GetNodes(host)[0]
@@ -259,26 +281,6 @@ for host in hostnames:
                         print "recently rebooted %s.  skipping... " % host
                         continue
  
-               if config.rins:
-                       # reset the boot_state to 'rins'
-                       node = api.GetNodes(host, ['boot_state', 'last_contact', 'last_updated', 'date_created'])
-                       record = {'observation' : node[0], 
-                                         'model' : 'USER_REQUEST', 
-                                         'action' : 'api.UpdateNode(%s, {"boot_state" : "rins"})' % host, 
-                                         'time' : time.time()}
-                       l = Log(host, record)
-
-                       ret = api.UpdateNode(host, {'boot_state' : 'rins'})
-                       if ret:
-                               # it's nice to see the current status rather than the previous status on the console
-                               node = api.GetNodes(host)[0]
-                               print l
-                               print "%-2d" % (i-1), nodegroup_display(node, fb)
-                               rebootlog.add(l)
-                       else:
-                               print "FAILED TO UPDATE NODE BOOT STATE : %s" % host
-
-
                 if config.reboot:
  
                         fbnode = fb['nodes'][host]['values']
@@ -288,9 +290,17 @@ for host in hostnames:
                                 o = RebootDebug(fbnode)
  
                         elif observed_state == "boot" :
+                               if config.rins:
+                                       l = set_node_to_rins(host, fb)
+                                       if l: rebootlog.add(l)
+
                                 o = RebootBoot(fbnode)
  
                         elif observed_state == "down":
+                               if config.rins:
+                                       l = set_node_to_rins(host, fb)
+                                       if l: rebootlog.add(l)
+
                                 o = RebootDown(fbnode)
  
  
@@ -316,6 +326,12 @@ for host in hostnames:
                                                   'time' : time.time()}
  
                                 print "ALL METHODS OF RESTARTING %s FAILED" % host
+                               args = {}
+                               args['hostname'] = host
+                               m = PersistMessage(host, "ALL FAIL for %(hostname)s" % args,
+                                                                                        "nada", False, db='suspect_persistmessages')
+                               m.reset()
+                               m.send(['monitor-list@lists.planet-lab.org'])
  
                         l = Log(host, record)
                         print l
diff --git a/mailer.py b/mailer.py

index 5fc0320..407390f 100755 (executable)
--- a/mailer.py
+++ b/mailer.py
@@ -8,6 +8,7 @@
  from emailTxt import *
  import smtplib
  from config import config
+import calendar
  import logging
  import os
  import time
@@ -50,7 +51,7 @@ def getTicketStatus(ticket_id):
         if ticket_id == None or ticket_id == "":
                 return {}
  
-       cmd = "rt show -t ticket -f id,subject,status,queue %s" % (ticket_id)
+       cmd = "rt show -t ticket -f id,subject,status,queue,created %s" % (ticket_id)
         (f_in, f_out, f_err) = os.popen3(cmd)
         value = f_out.read()
         l_values = value.split('\n')
@@ -59,8 +60,10 @@ def getTicketStatus(ticket_id):
                 if len(line) == 0: continue
                 vals = line.split(':')
                 key = vals[0]
-               r_values[key] = "".join(vals[1:])
+               r_values[key] = ":".join(vals[1:])
                 r_values[key] = r_values[key].strip()
+
+       r_values['Created'] = calendar.timegm(time.strptime(r_values['Created']))
         return r_values
  
  def setAdminCCViaRT(ticket_id, to):
diff --git a/model.py b/model.py

index 3628c00..558d04d 100644 (file)
--- a/model.py
+++ b/model.py
@@ -61,7 +61,7 @@ class LogRoll:
                                         cmp = re.compile(filter[key])
                                         res = cmp.search(log.__getattribute__(key))
                                         if res != None:
-                                               print "found match in log: %s  %s ~=~ %s" % (log, key, filter[key])
+                                               #print "found match in log: %s  %s ~=~ %s" % (log, key, filter[key])
                                                 if log.time > time.time() - timerange:
                                                         print "returning log b/c it occured within time."
                                                         return log
diff --git a/monitor.py b/monitor.py

index 8891b25..78db954 100644 (file)
--- a/monitor.py
+++ b/monitor.py
@@ -10,12 +10,16 @@ import soltesz
  
  from monitor_policy import *
  import rt
+import sys
  
  import plc
  import auth
  api = plc.PLC(auth.auth, auth.plc)
  
+from clean_policy import *
+
  def reboot(hostname):
+       print "calling reboot!!! %s " % hostname
  
         l_nodes = api.GetNodes(hostname)
         if len(l_nodes) == 0:
@@ -32,19 +36,23 @@ def reboot(hostname):
         if ad_dbTickets == None:
                 raise Exception("Could not find cached dbTickets")
  
+       print "starting new thing"
+       mon = MonitorMergeDiagnoseSendEscellate(hostname, True)
+       mon.run()
+
         #print "merge"
-       merge = Merge( [node['hostname'] for node in l_nodes])
-       record_list = merge.run()
-       #print "rt"
-       rt = RT(record_list, ad_dbTickets, l_ticket_blacklist)
-       record_list = rt.run()
-       #print "diagnose"
-       diag = Diagnose(record_list)
-       diagnose_out = diag.run()
+       #merge = Merge( [node['hostname'] for node in l_nodes])
+       #record_list = merge.run()
+       ##print "rt"
+       #rt = RT(record_list, ad_dbTickets, l_ticket_blacklist)
+       #record_list = rt.run()
+       ##print "diagnose"
+       #diag = Diagnose(record_list)
+       #diagnose_out = diag.run()
         #print diagnose_out
         #print "action"
-       action = Action(diagnose_out)
-       action.run()
+       #action = Action(diagnose_out)
+       #action.run()
  
         return True
  
@@ -91,7 +99,10 @@ def reboot2(hostname):
  
  
  def main():
-       pass
+       for host in sys.argv[1:]:
+               reboot(host)
  
+print "hello?"
  if __name__ == '__main__':
+       print "calling main"
         main()
diff --git a/monitor_policy.py b/monitor_policy.py

index a44e9a1..70eee04 100644 (file)
--- a/monitor_policy.py
+++ b/monitor_policy.py
@@ -42,6 +42,8 @@ PI=2
  USER=4
  ADMIN=8
  
+from unified_model import *
+
  class Merge:
         def __init__(self, l_merge):
                 self.merge_list = l_merge
@@ -396,12 +398,13 @@ class Diagnose:
  
                 # NOTE: these settings can be overridden by command line arguments,
                 #       or the state of a record, i.e. if already in RT's Support Queue.
-               nodes_up = self.getUpAtSite(loginbase, d_diag_site)
+               pf = PersistFlags(loginbase, 1, db='site_persitflags')
+               nodes_up = pf.nodes_up
                 if nodes_up < MINUP:
                         d_diag_site[loginbase]['config']['squeeze'] = True
  
                 max_slices = self.getMaxSlices(loginbase)
-               num_nodes = self.getNumNodes(loginbase)
+               num_nodes = pf.nodes_total #self.getNumNodes(loginbase)
                 # NOTE: when max_slices == 0, this is either a new site (the old way)
                 #       or an old disabled site from previous monitor (before site['enabled'])
                 if nodes_up < num_nodes and max_slices != 0:
@@ -867,6 +870,7 @@ class Action:
                         if config.policysavedb:
                                 print "Saving Databases... act_all"
                                 soltesz.dbDump("act_all", self.act_all)
+                               soltesz.dbDump("diagnose_out", self.diagnose_db)
                         sys.exit(1)
  
                 #print_stats("sites_observed", stats)
@@ -882,6 +886,7 @@ class Action:
                         # TODO: remove 'diagnose_out', 
                         #       or at least the entries that were acted on.
                         soltesz.dbDump("act_all", self.act_all)
+                       soltesz.dbDump("diagnose_out", self.diagnose_db)
  
         def accumSites(self):
                 """
@@ -1058,6 +1063,7 @@ class Action:
                                 if ticket_id == 0:
                                         # error.
                                         print "ticket_id == 0 for %s %s" % (loginbase, act_record['nodename'])
+                                       import os
                                         os._exit(1)
                                         pass
  
@@ -1084,8 +1090,8 @@ class Action:
                                 i_nodes_actedon += 1
                 
                 if config.policysavedb:
-                       print "Saving Databases... act_all, diagnose_out"
-                       soltesz.dbDump("act_all", self.act_all)
+                       #print "Saving Databases... act_all, diagnose_out"
+                       #soltesz.dbDump("act_all", self.act_all)
                         # remove site record from diagnose_out, it's in act_all as done.
                         del self.diagnose_db[loginbase]
                         #soltesz.dbDump("diagnose_out", self.diagnose_db)
diff --git a/nodecommon.py b/nodecommon.py

index 0f3d0fb..3256b69 100644 (file)
--- a/nodecommon.py
+++ b/nodecommon.py
@@ -6,6 +6,7 @@ RED     = esc + "[1;31m"
  GREEN  = esc + "[1;32m"
  YELLOW = esc + "[1;33m"
  BLUE   = esc + "[1;34m"
+LIGHTBLUE      = esc + "[1;36m"
  NORMAL  = esc + "[0;39m"
  
  def red(str):
@@ -17,6 +18,9 @@ def yellow(str):
  def green(str):
         return GREEN + str + NORMAL
  
+def lightblue(str):
+       return LIGHTBLUE + str + NORMAL
+
  def blue(str):
         return BLUE + str + NORMAL
  
@@ -37,12 +41,16 @@ def color_pcu_state(fbnode):
                 if values == None:
                         return fbnode['pcu']
         else:
-               return fbnode['pcu']
+               if 'pcu' not in fbnode:
+                       return 'NOPCU'
+               else:
+                       return fbnode['pcu']
  
         if 'reboot' in values:
                 rb = values['reboot']
                 if rb == 0 or rb == "0":
                         return fbnode['pcu'] + "OK  "
+                       #return fbnode['pcu'] + "OK  "
                         #return green(fbnode['pcu'])
                 elif "NetDown" == rb  or "Not_Run" == rb:
                         return fbnode['pcu'] + "DOWN"
@@ -55,8 +63,10 @@ def color_pcu_state(fbnode):
                 return fbnode['pcu'] + "BAD "
  
  def color_boot_state(l):
-       if    l == "dbg": return yellow("dbg ")
-       elif  l == "dbg ": return yellow(l)
+       if    l == "dbg": return yellow("debg")
+       elif  l == "dbg ": return yellow("debg")
+       elif  l == "diag": return lightblue(l)
+       elif  l == "disable": return red("dsbl")
         elif  l == "down": return red(l)
         elif  l == "boot": return green(l)
         elif  l == "rins": return blue(l)
@@ -64,6 +74,7 @@ def color_boot_state(l):
                 return l
  
  def diff_time(timestamp):
+       import math
         now = time.time()
         if timestamp == None:
                 return "unknown"
@@ -71,26 +82,26 @@ def diff_time(timestamp):
         # return the number of seconds as a difference from current time.
         t_str = ""
         if diff < 60: # sec in min.
-               t = diff // 1
-               t_str = "%s sec ago" % t
+               t = diff / 1
+               t_str = "%s sec ago" % int(math.ceil(t))
         elif diff < 60*60: # sec in hour
-               t = diff // (60)
-               t_str = "%s min ago" % int(t)
+               t = diff / (60)
+               t_str = "%s min ago" % int(math.ceil(t))
         elif diff < 60*60*24: # sec in day
-               t = diff // (60*60)
-               t_str = "%s hrs ago" % int(t)
+               t = diff / (60*60)
+               t_str = "%s hrs ago" % int(math.ceil(t))
         elif diff < 60*60*24*7: # sec in week
-               t = diff // (60*60*24)
-               t_str = "%s days ago" % int(t)
-       elif diff < 60*60*24*30: # approx sec in month
-               t = diff // (60*60*24*7)
-               t_str = "%s wks ago" % int(t)
+               t = diff / (60*60*24)
+               t_str = "%s days ago" % int(math.ceil(t))
+       elif diff <= 60*60*24*30: # approx sec in month
+               t = diff / (60*60*24*7)
+               t_str = "%s wks ago" % int(math.ceil(t))
         elif diff > 60*60*24*30: # approx sec in month
-               t = diff // (60*60*24*7*30)
+               t = diff / (60*60*24*30)
                 t_str = "%s mnths ago" % int(t)
         return t_str
  
-def nodegroup_display(node, fb):
+def nodegroup_display(node, fb, conf=None):
         if node['hostname'] in fb['nodes']:
                 node['current'] = get_current_state(fb['nodes'][node['hostname']]['values'])
         else:
@@ -106,14 +117,15 @@ def nodegroup_display(node, fb):
                 node['kernel'] = fb['nodes'][node['hostname']]['values']['kernel']
                 
         if '2.6' not in node['kernel']: node['kernel'] = ""
-       node['boot_state']      = color_boot_state(node['boot_state'])
-       node['current']         = color_boot_state(node['current'])
+       if conf and not conf.nocolor:
+           node['boot_state']  = color_boot_state(node['boot_state'])
+           node['current']     = color_boot_state(node['current'])
         #node['boot_state']     = node['boot_state']
         #node['current']        = node['current']
         node['pcu'] = fb['nodes'][node['hostname']]['values']['pcu']
         node['lastupdate'] = diff_time(node['last_contact'])
  
-       return "%(hostname)-38s %(boot_state)5s %(current)5s %(pcu)6s %(key)45s %(kernel)32s %(lastupdate)12s " % node
+       return "%(hostname)-42s %(boot_state)8s %(current)5s %(pcu)6s %(key)20.20s... %(kernel)43s %(lastupdate)12s " % node
  
  from model import *
  import soltesz
@@ -143,3 +155,16 @@ def node_end_record(node):
         soltesz.dbDump("act_all", act_all)
         del act_all
         return True
+
+def datetime_fromstr(str):
+       """Parse 'YYYY-MM-DD', 'YYYY-MM-DD-HH:MM' or 'MM/DD/YYYY' into a local-time
+       datetime; raises ValueError when str matches none of these formats."""
+       if '-' in str:
+               try:
+                       tup = time.strptime(str, "%Y-%m-%d")
+               except ValueError:
+                       tup = time.strptime(str, "%Y-%m-%d-%H:%M")
+       else:
+               # anything without dashes is treated as MM/DD/YYYY
+               tup = time.strptime(str, "%m/%d/%Y")
+       return datetime.fromtimestamp(time.mktime(tup))
diff --git a/nodeconfig.py b/nodeconfig.py

index 0b23c7b..d69ccfe 100755 (executable)
--- a/nodeconfig.py
+++ b/nodeconfig.py
@@ -32,16 +32,16 @@ def main():
  
                 try:
                         n = api.GetNodes(node)[0]
-                       print n
+                       #print n
                         net = api.GetNodeNetworks(n['nodenetwork_ids'])[0]
-                       print net
+                       #print net
  
                         node_keys = ['boot_state', 'key', 'last_updated', 'last_contact']
                         for k in node_keys:
                                 if 'last' in k:
-                                       print "%15s == %s" % (k, diff_time(net[k]))
+                                       print "%15s == %s" % (k, diff_time(n[k]))
                                 else:
-                                       print "%15s == %s" % (k, net[k])
+                                       print "%15s == %s" % (k, n[k])
  
                         static_keys = ['method', 'ip', 'gateway', 'network', 'broadcast', 'netmask', 'dns1', 'dns2', 'mac', 'is_primary']
                         for k in static_keys:
@@ -51,6 +51,7 @@ def main():
                         #       print k, "==" , net[k]
                 except:
                         print "Error with %s" % node
+                       import traceback; print traceback.print_exc()
                         pass
  
         # commands:
diff --git a/nodegroups.py b/nodegroups.py

index 725d0e0..20f1513 100755 (executable)
--- a/nodegroups.py
+++ b/nodegroups.py
@@ -19,6 +19,7 @@ api = plc.PLC(auth.auth, auth.plc)
  
  from optparse import OptionParser
  from sets import Set
+from nodequery import verify,query_to_dict,node_select
  
  from nodecommon import *
  import soltesz
@@ -31,16 +32,24 @@ def main():
         parser.set_defaults(nodegroup="Alpha",
                                                 node=None,
                                                 nodelist=None,
-                                               list=False,
+                                               list=True,
                                                 add=False,
+                        nocolor=False,
                                                 notng=False,
                                                 delete=False,
+                                               nodeselect=None,
                                                 )
         parser.add_option("", "--not", dest="notng", action="store_true", 
                                                 help="All nodes NOT in nodegroup.")
         parser.add_option("", "--nodegroup", dest="nodegroup", metavar="NodegroupName",
                                                 help="Specify a nodegroup to perform actions on")
+       parser.add_option("", "--nodeselect", dest="nodeselect", metavar="querystring",
+                                               help="Specify a query to perform on findbad db")
+       parser.add_option("", "--site", dest="site", metavar="site name",
+                                               help="Specify a site to view node status")
  
+       parser.add_option("", "--nocolor", dest="nocolor", action="store_true", 
+                                               help="Enable color")
         parser.add_option("", "--list", dest="list", action="store_true", 
                                                 help="List all nodes in the given nodegroup")
         parser.add_option("", "--add", dest="add", action="store_true", 
@@ -70,6 +79,22 @@ def main():
                 #nodelist = api.GetNodes(hostlist)
                 group_str = "Given"
  
+       elif config.site:
+               site = api.GetSites(config.site)
+               if len (site) > 0:
+                       site = site[0]
+                       nodelist = api.GetNodes(site['node_ids'])
+               else:
+                       nodelist = []
+
+               group_str = config.site
+
+       elif config.nodeselect:
+               hostlist = node_select(config.nodeselect)
+               nodelist = api.GetNodes(hostlist)
+
+               group_str = "selection"
+               
         else:
                 ng = api.GetNodeGroups({'name' : config.nodegroup})
                 nodelist = api.GetNodes(ng[0]['node_ids'])
@@ -99,7 +124,7 @@ def main():
                 i = 1
                 for node in nodelist:
                         print "%-2d" % i, 
-                       print nodegroup_display(node, fb)
+                       print nodegroup_display(node, fb, config)
                         i += 1
  
         elif config.add and config.nodegroup:
diff --git a/nodehistory.py b/nodehistory.py

index f40ecc7..b6d0a58 100755 (executable)
--- a/nodehistory.py
+++ b/nodehistory.py
@@ -10,6 +10,7 @@ import time
  from datetime import datetime, timedelta
  import calendar
  
+import sys
  import time
  from model import *
  from nodecommon import *
@@ -28,41 +29,6 @@ parser.add_option("", "--fromtime", dest="fromtime", metavar="YYYY-MM-DD",
  config = config(parser)
  config.parse_args()
  
-def datetime_fromstr(str):
-    if '-' in str:
-        tup = time.strptime(str, "%Y-%m-%d")
-    elif '/' in str:
-        tup = time.strptime(str, "%m/%d/%Y")
-    else:
-        tup = time.strptime(str, "%m/%d/%Y")
-    return datetime.fromtimestamp(calendar.timegm(tup))
-
-def diff_time(timestamp):
-       now = time.time()
-       if timestamp == None:
-               return "unknown"
-       diff = now - timestamp
-       # return the number of seconds as a difference from current time.
-       t_str = ""
-       if diff < 60: # sec in min.
-               t = diff
-               t_str = "%s sec ago" % t
-       elif diff < 60*60: # sec in hour
-               t = diff // (60)
-               t_str = "%s min ago" % int(t)
-       elif diff < 60*60*24: # sec in day
-               t = diff // (60*60)
-               t_str = "%s hours ago" % int(t)
-       elif diff < 60*60*24*7: # sec in week
-               t = diff // (60*60*24)
-               t_str = "%s days ago" % int(t)
-       elif diff < 60*60*24*30: # approx sec in month
-               t = diff // (60*60*24*7)
-               t_str = "%s weeks ago" % int(t)
-       elif diff > 60*60*24*30: # approx sec in month
-               t = diff // (60*60*24*7*30)
-               t_str = "%s months ago" % int(t)
-       return t_str
  
  def fb_print_nodeinfo(fbnode, verbose, date=None):
         if verbose: print "              state |  ssh  |  pcu  | bootcd | category | kernel"
@@ -77,7 +43,8 @@ def fb_print_nodeinfo(fbnode, verbose, date=None):
         else:
                 fbnode['bootcd'] = "unknown"
         fbnode['state'] = color_boot_state(get_current_state(fbnode))
-       fbnode['kernel'] = fbnode['kernel'].split()[2]
+       if len(fbnode['kernel'].split()) >= 3:
+               fbnode['kernel'] = fbnode['kernel'].split()[2]
         print "    %(state)5s | %(ssh)5.5s | %(pcu)5.5s | %(bootcd)6.6s | %(category)8.8s | %(kernel)s" % fbnode
  
  def pcu_print_info(pcuinfo, hostname):
@@ -113,12 +80,36 @@ if config.fromtime:
  else:
         begin = "2007-11-06"
  
+if config.node is None and len(config.args) > 0:
+       config.node = config.args[0]
+elif config.node is None:
+       print "Add a hostname to arguments"
+       print "exit."
+       sys.exit(1)
+
  d = datetime_fromstr(begin)
  tdelta = timedelta(1)
  verbose = 1
  
+def get_filefromglob(d, str):
+       """Return the name, minus '.pkl', of the first archive-pdb file matching
+       '<d as YYYY-MM-DD>*.<str>.pkl'; raises IndexError if none matches."""
+       import os
+       import glob
+       path = "archive-pdb"
+       # NOTE(review): this local SPickle is never used below; kept only in
+       # case constructing it has side effects -- confirm and remove.
+       archive = soltesz.SPickle(path)
+       glob_str = "%s*.%s.pkl" % (d.strftime("%Y-%m-%d"), str)
+       # Match via the directory prefix rather than chdir(), so a day with
+       # no match cannot leave the process cwd inside archive-pdb.
+       matches = glob.glob(os.path.join(path, glob_str))
+       return os.path.basename(matches[0])[:-4]
+       
+
  while True:
-       file = "%s.production.findbad" % d.strftime("%Y-%m-%d")
+       file = get_filefromglob(d, "production.findbad")
+       #file = "%s.production.findbad" % d.strftime("%Y-%m-%d")
         
         try:
                 fb = archive.load(file)
@@ -128,7 +119,10 @@ while True:
  
                 del fb
                 verbose = 0
+       except KeyboardInterrupt:
+               sys.exit(1)
         except:
+               #import traceback; print traceback.print_exc()
                 print d.strftime("%Y-%m-%d"), "No record"
  
         d = d + tdelta
diff --git a/nodeinfo.py b/nodeinfo.py

index 3376257..9458cf2 100755 (executable)
--- a/nodeinfo.py
+++ b/nodeinfo.py
@@ -86,7 +86,8 @@ def fb_print_nodeinfo(fbnode):
                 fbnode['state'] = color_boot_state(get_current_state(fbnode))
         else:
                 fbnode['state'] = "none"
-       fbnode['kernel'] = fbnode['kernel'].split()[2]
+       if len(fbnode['kernel'].split()) > 2:
+               fbnode['kernel'] = fbnode['kernel'].split()[2]
         print "\t       %(state)5s | %(ssh)5.5s | %(pcu)5.5s | %(bootcd)6.6s | %(category)8.8s | %(kernel)s" % fbnode
  
  def act_print_nodeinfo(actnode, header):
diff --git a/nodequery.py b/nodequery.py

index a6b6c1c..465c309 100755 (executable)
--- a/nodequery.py
+++ b/nodequery.py
@@ -4,16 +4,20 @@ import plc
  import auth
  api = plc.PLC(auth.auth, auth.plc)
  
+import sys
  import soltesz
-fb = soltesz.dbLoad("findbad")
-fbpcu = soltesz.dbLoad("findbadpcus")
  from nodecommon import *
  from policy import Diagnose
+import glob
+import os
+from reboot import pcu_name
  
  import time
  import re
  
-
+#fb = {}
+fb = soltesz.dbLoad("findbad")
+fbpcu = {}
  
  def daysdown_print_nodeinfo(fbnode, hostname):
         fbnode['hostname'] = hostname
@@ -22,19 +26,28 @@ def daysdown_print_nodeinfo(fbnode, hostname):
  
         print "%(intdaysdown)5s %(hostname)-44s | %(state)10.10s | %(daysdown)s" % fbnode
  
-def fb_print_nodeinfo(fbnode, hostname):
+def fb_print_nodeinfo(fbnode, hostname, fields=None):
         fbnode['hostname'] = hostname
         fbnode['checked'] = diff_time(fbnode['checked'])
         if fbnode['bootcd']:
                 fbnode['bootcd'] = fbnode['bootcd'].split()[-1]
         else:
                 fbnode['bootcd'] = "unknown"
-       if 'ERROR' in fbnode['category']:
-               fbnode['kernel'] = ""
-       else:
-               fbnode['kernel'] = fbnode['kernel'].split()[2]
         fbnode['pcu'] = color_pcu_state(fbnode)
-       print "%(hostname)-39s | %(checked)11.11s | %(state)10.10s | %(ssh)5.5s | %(pcu)6.6s | %(bootcd)6.6s | %(category)8.8s | %(kernel)s" % fbnode
+
+       if not fields:
+               if 'ERROR' in fbnode['category']:
+                       fbnode['kernel'] = ""
+               else:
+                       fbnode['kernel'] = fbnode['kernel'].split()[2]
+               fbnode['boot_state'] = fbnode['plcnode']['boot_state']
+
+               print "%(hostname)-39s | %(checked)11.11s | %(boot_state)5.5s| %(state)8.8s | %(ssh)5.5s | %(pcu)6.6s | %(bootcd)6.6s | %(category)8.8s | %(kernel)s" % fbnode
+       else:
+               format = ""
+               for f in fields:
+                       format += "%%(%s)s " % f
+               print format % fbnode
  
  def verify(constraints, data):
         """
@@ -55,8 +68,10 @@ def verify(constraints, data):
                                 value_re = re.compile(con[key])
                                 con_and_true = con_and_true & (value_re.search(data[key]) is not None)
                         elif key not in data:
-                               print "missing key %s" % key
-                               con_and_true = False
+                               print "missing key %s" % key,
+                               pass
+                               #print "missing key %s" % key
+                               #con_and_true = False
  
                 con_or_true = con_or_true | con_and_true
  
@@ -87,36 +102,53 @@ def _pcu_in(fbdata):
                                 return True
         return False
  
-def pcu_select(str_query):
+def pcu_select(str_query, nodelist=None):
         pcunames = []
-       if str_query is None: return pcunames
+       nodenames = []
+       if str_query is None: return (nodenames, pcunames)
  
         #print str_query
         dict_query = query_to_dict(str_query)
         #print dict_query
  
         for node in fb['nodes'].keys():
+               if nodelist is not None: 
+                       if node not in nodelist: continue
         
                 fb_nodeinfo  = fb['nodes'][node]['values']
                 if _pcu_in(fb_nodeinfo):
                         pcuinfo = fbpcu['nodes']['id_%s' % fb_nodeinfo['plcnode']['pcu_ids'][0]]['values']
                         if verify(dict_query, pcuinfo):
-                               pcunames.append(node)
-       
-       return pcunames
+                               nodenames.append(node)
+                               str = "cmdhttps/locfg.pl -s %s -f iloxml/License.xml -u %s -p '%s' | grep MESSAGE" % \
+                                                       (pcu_name(pcuinfo), pcuinfo['username'], pcuinfo['password'])
+                               pcunames.append(str)
+       return (nodenames, pcunames)
  
-def node_select(str_query):
+def node_select(str_query, nodelist=None):
         hostnames = []
         if str_query is None: return hostnames
  
         #print str_query
         dict_query = query_to_dict(str_query)
         #print dict_query
+       global fb
  
         for node in fb['nodes'].keys():
+               if nodelist is not None: 
+                       if node not in nodelist: continue
         
                 fb_nodeinfo  = fb['nodes'][node]['values']
  
+               if fb_nodeinfo == []:
+                       #print node, "has lost values"
+                       continue
+                       #sys.exit(1)
+               fb_nodeinfo['pcu'] = color_pcu_state(fb_nodeinfo)
+               fb_nodeinfo['hostname'] = node
+               if 'plcnode' in fb_nodeinfo:
+                       fb_nodeinfo.update(fb_nodeinfo['plcnode'])
+
                 if verify(dict_query, fb_nodeinfo):
                         #print node #fb_nodeinfo
                         hostnames.append(node)
@@ -128,30 +160,65 @@ def node_select(str_query):
  
  
  def main():
+       global fb
+       global fbpcu
+
         from config import config
         from optparse import OptionParser
         parser = OptionParser()
-       parser.set_defaults(node=None, select=None, pcuselect=None, nodelist=None, daysdown=None)
+       parser.set_defaults(node=None, fromtime=None, select=None, list=None, pcuselect=None, nodelist=None, daysdown=None, fields=None)
         parser.add_option("", "--daysdown", dest="daysdown", action="store_true",
                                                 help="List the node state and days down...")
         parser.add_option("", "--select", dest="select", metavar="key=value", 
                                                 help="List all nodes with the given key=value pattern")
+       parser.add_option("", "--fields", dest="fields", metavar="key,list,...", 
+                                               help="a list of keys to display for each entry.")
+       parser.add_option("", "--list", dest="list", action="store_true", 
+                                               help="Write only the hostnames as output.")
         parser.add_option("", "--pcuselect", dest="pcuselect", metavar="key=value", 
                                                 help="List all nodes with the given key=value pattern")
         parser.add_option("", "--nodelist", dest="nodelist", metavar="nodelist.txt", 
                                                 help="A list of nodes to bring out of debug mode.")
+       parser.add_option("", "--fromtime", dest="fromtime", metavar="YYYY-MM-DD",
+                                       help="Specify a starting date from which to begin the query.")
         config = config(parser)
         config.parse_args()
+       
+       if config.fromtime:
+               path = "archive-pdb"
+               archive = soltesz.SPickle(path)
+               d = datetime_fromstr(config.fromtime)
+               glob_str = "%s*.production.findbad.pkl" % d.strftime("%Y-%m-%d")
+               os.chdir(path)
+               #print glob_str
+               file = glob.glob(glob_str)[0]
+               #print "loading %s" % file
+               os.chdir("..")
+               fb = archive.load(file[:-4])
+       else:
+               fb = soltesz.dbLoad("findbad")
+
+       fbpcu = soltesz.dbLoad("findbadpcus")
  
         if config.nodelist:
                 nodelist = config.getListFromFile(config.nodelist)
-       elif config.select is not None:
-               nodelist = node_select(config.select)
-       elif config.pcuselect is not None:
-               nodelist = pcu_select(config.pcuselect)
         else:
                 nodelist = fb['nodes'].keys()
  
+       pculist = None
+       if config.select is not None and config.pcuselect is not None:
+               nodelist = node_select(config.select, nodelist)
+               nodelist, pculist = pcu_select(config.pcuselect, nodelist)
+       elif config.select is not None:
+               nodelist = node_select(config.select, nodelist)
+       elif config.pcuselect is not None:
+               nodelist, pculist = pcu_select(config.pcuselect, nodelist)
+
+
+       if pculist:
+               for pcu in pculist:
+                       print pcu
+
         for node in nodelist:
                 config.node = node
  
@@ -160,15 +227,23 @@ def main():
  
                 fb_nodeinfo  = fb['nodes'][node]['values']
  
-               if config.daysdown:
-                       daysdown_print_nodeinfo(fb_nodeinfo, node)
+               if config.list:
+                       print node
                 else:
-                       if config.select:
-                               fb_print_nodeinfo(fb_nodeinfo, node)
-                       elif not config.select and 'state' in fb_nodeinfo:
-                               fb_print_nodeinfo(fb_nodeinfo, node)
+                       if config.daysdown:
+                               daysdown_print_nodeinfo(fb_nodeinfo, node)
                         else:
-                               pass
+                               if config.select:
+                                       if config.fields:
+                                               fields = config.fields.split(",")
+                                       else:
+                                               fields = None
+
+                                       fb_print_nodeinfo(fb_nodeinfo, node, fields)
+                               elif not config.select and 'state' in fb_nodeinfo:
+                                       fb_print_nodeinfo(fb_nodeinfo, node)
+                               else:
+                                       pass
                 
  if __name__ == "__main__":
         main()
diff --git a/nodereboot.py b/nodereboot.py

index a7a99d9..3f9d6c6 100755 (executable)
--- a/nodereboot.py
+++ b/nodereboot.py
@@ -2,7 +2,6 @@
  
  # Attempt to reboot a node in debug state.
  
-
  import plc
  import auth
  api = plc.PLC(auth.auth, auth.plc)
@@ -33,6 +32,10 @@ class Sopen(subprocess.Popen):
  from Rpyc import SocketConnection, Async
  from Rpyc.Utils import *
  
+def get_fbnode(node):
+       fb = soltesz.dbLoad("findbad")
+       fbnode = fb['nodes'][node]['values']
+       return fbnode
  
  class NodeConnection:
         def __init__(self, connection, node, config):
@@ -106,8 +109,8 @@ class NodeConnection:
                 try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
                 except Exception, x:
                         bm_continue = False
-                       if not config.quiet: print "exception"
-                       if not config.quiet: print x
+                       print "exception"
+                       print x
                         print "   Possibly, unable to find valid configuration file"
  
                 if bm_continue:
@@ -134,6 +137,9 @@ class NodeConnection:
                 else:
                         return False
  
+       def set_nodestate(self, state='boot'):
+               return api.UpdateNode(self.node, {'boot_state' : state})
+
         def restart_node(self, state='boot'):
                 api.UpdateNode(self.node, {'boot_state' : state})
  
@@ -196,6 +202,7 @@ class PlanetLabSession:
                 args['user'] = 'root'
                 args['hostname'] = self.node
                 args['monitordir'] = "/home/soltesz/monitor"
+               ssh_port = 22
  
                 if self.nosetup:
                         print "Skipping setup"
@@ -223,19 +230,31 @@ class PlanetLabSession:
  
                 t1 = time.time()
                 # KILL any already running servers.
-               cmd = """ssh %(user)s@%(hostname)s """ + \
-                        """'ps ax | grep Rpyc | grep -v grep | awk "{print \$1}" | xargs kill 2> /dev/null' """
-               cmd = cmd % args
-               if self.verbose: print cmd
-               # TODO: Add timeout
-               print localos.system(cmd,timeout)
-
-               # START a new rpyc server.
-               cmd = """ssh -n %(user)s@%(hostname)s "export PYTHONPATH=\$HOME; """ + \
-                        """python Rpyc/Servers/forking_server.py &> server.log < /dev/null &" """ 
-               cmd = cmd % args
-               if self.verbose: print cmd
-               print localos.system(cmd,timeout)
+               ssh = soltesz.SSH(args['user'], args['hostname'], ssh_port)
+               (ov,ev) = ssh.run_noexcept2("""<<\EOF
+            rm -f out.log
+            echo "kill server" >> out.log
+            ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ; 
+            echo "export" >> out.log
+            export PYTHONPATH=$HOME  ;
+            echo "start server" >> out.log
+            python Rpyc/Servers/forking_server.py &> server.log &
+            echo "done" >> out.log
+EOF""")
+               #cmd = """ssh %(user)s@%(hostname)s """ + \
+               #        """'ps ax | grep Rpyc | grep -v grep | awk "{print \$1}" | xargs kill 2> /dev/null' """
+               #cmd = cmd % args
+               #if self.verbose: print cmd
+               ## TODO: Add timeout
+               #print localos.system(cmd,timeout)
+
+               ## START a new rpyc server.
+               #cmd = """ssh -n %(user)s@%(hostname)s "export PYTHONPATH=\$HOME; """ + \
+               #        """python Rpyc/Servers/forking_server.py &> server.log < /dev/null &" """ 
+               #cmd = cmd % args
+               #if self.verbose: print cmd
+               #print localos.system(cmd,timeout)
+               print ssh.ret
  
                 # TODO: Add timeout
                 # This was tricky to make synchronous.  The combination of ssh-clients-4.7p1, 
@@ -250,7 +269,8 @@ class PlanetLabSession:
                 self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
                 # TODO: the read() here may block indefinitely.  Need a better
                 # approach therefore, that includes a timeout.
-               ret = self.command.stdout.read(5)
+               #ret = self.command.stdout.read(5)
+               ret = soltesz.read_t(self.command.stdout, 5)
  
                 t2 = time.time()
                 if 'READY' in ret:
@@ -286,6 +306,24 @@ def index_to_id(steps,index):
  
  def reboot(hostname, config=None, forced_action=None):
  
+       # NOTE: Nothing works if the bootcd is REALLY old.
+       #       So, this is the first step.
+       fbnode = get_fbnode(hostname)
+       if fbnode['category'] == "OLDBOOTCD":
+               print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
+               args = {}
+               args['hostname_list'] = "    %s" % hostname
+
+               m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname,
+                                                       mailtxt.newbootcd_one[1] % args, True, db='bootcd_persistmessages')
+
+               loginbase = plc.siteId(hostname)
+               m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+
+               print "\tDisabling %s due to out-of-date BOOTCD" % hostname
+               api.UpdateNode(hostname, {'boot_state' : 'disable'})
+               return True
+
         node = hostname
         print "Creating session for %s" % node
         # update known_hosts file (in case the node has rebooted since last run)
@@ -356,10 +394,13 @@ def reboot(hostname, config=None, forced_action=None):
                 steps = [
                         ('scsierror'  , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
                         ('ioerror'    , 'end_request: I/O error, dev sd\w+, sector \d+'),
+                       ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION  byte \w+ = \w+'),
+
                         ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
                         ('atareadyerror'   , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
                         ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),
                         ('sdXerror'   , 'sd\w: Current: sense key: Medium Error'),
+                       ('ext3error'   , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),
                         ('floppytimeout','floppy0: floppy timeout called'),
                         ('floppyerror',  'end_request: I/O error, dev fd\w+, sector \d+'),
  
@@ -401,12 +442,18 @@ def reboot(hostname, config=None, forced_action=None):
  
                         loginbase = plc.siteId(hostname)
                         m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+                       conn.set_nodestate('diag')
                         return False
  
         print "...Downloading bm.log from %s" % node
         log = conn.get_bootmanager_log()
         child = fdpexpect.fdspawn(log)
  
+       try:
+               if config.collect: return True
+       except:
+               pass
+
         time.sleep(1)
  
         if config and not config.quiet: print "...Scanning bm.log for errors"
@@ -447,8 +494,11 @@ def reboot(hostname, config=None, forced_action=None):
                         ('noinstall'    , 'notinstalled'),
                         ('bziperror'    , 'bzip2: Data integrity error when decompressing.'),
                         ('noblockdev'   , "No block devices detected."),
+                       ('downloadfail' , 'Unable to download main tarball /boot-alpha/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
                         ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
-                       ('hardwarefail' , 'Hardware requirements not met'),
+                       ('hardwarerequirefail' , 'Hardware requirements not met'),
+                       ('mkfsfail'         , 'while running: Running mkfs.ext2 -q  -m 0 -j /dev/planetlab/vservers failed'),
+                       ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
                         ('chrootfail'   , 'Running chroot /tmp/mnt/sysimg'),
                         ('modulefail'   , 'Unable to get list of system modules'),
                         ('writeerror'   , 'write error: No space left on device'),
@@ -476,9 +526,8 @@ def reboot(hostname, config=None, forced_action=None):
         #  By using the sequence identifier, we guarantee that there will be no
         #  frequent loops.  I'm guessing there is a better way to track loops,
         #  though.
-       if not config.force and ( pflags.getFlag(s) or pflags.isRecent() ):
-               pflags.resetFlag(s)
-               pflags.setRecent()
+       if not config.force and pflags.getRecentFlag(s):
+               pflags.setRecentFlag(s)
                 pflags.save() 
                 print "... flag is set or it has already run recently. Skipping %s" % node
                 return True
@@ -509,6 +558,10 @@ def reboot(hostname, config=None, forced_action=None):
                         "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
                         "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
                         "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
+                       "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
+                       "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
+                       "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
+                       "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
                         ]:
                 sequences.update({n : "restart_bootmanager_rins"})
  
@@ -527,6 +580,7 @@ def reboot(hostname, config=None, forced_action=None):
                         "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
+                       "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
                         ]:
                 sequences.update({n : "restart_node_rins"})
  
@@ -535,30 +589,40 @@ def reboot(hostname, config=None, forced_action=None):
                          "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
                          "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
                          "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
+                        "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
                          ]:
                 sequences.update({n: "restart_node_boot"})
  
         # update_node_config_email
         for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
                         "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
-                       "bminit-cfg-exception-nodehostname-update-debug-done",
                         ]:
                 sequences.update({n : "update_node_config_email"})
  
+       for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", ]:
+               sequences.update({n : "nodenetwork_email"})
+
         # update_bootcd_email
-       for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarefail-update-debug-done",
-                       "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarefail-update-debug-done",
-                       "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarefail-update-debug-done",
-                       "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarefail-update-debug-done",
-                       "bminit-cfg-auth-getplc-hardware-exception-hardwarefail-update-debug-done",
+       for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
+                       "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
+                       "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
+                       "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
+                       "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
                         ]:
                 sequences.update({n : "update_bootcd_email"})
  
+       for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
+                       ]:
+               sequences.update({n: "suspect_error_email"})
+
         # update_hardware_email
-       sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarefail-update-debug-done" : "update_hardware_email"})
+       sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
+       sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
  
         # broken_hardware_email
-       sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarefail-update-debug-done" : "broken_hardware_email"})
+       sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})
+
+       flag_set = True
  
         
         if s not in sequences:
@@ -576,6 +640,10 @@ def reboot(hostname, config=None, forced_action=None):
  
                 conn.restart_bootmanager('boot')
  
+               # NOTE: Do not set the pflags value for this sequence if it's unknown.
+               # This way, we can check it again after we've fixed it.
+               flag_set = False
+
         else:
  
                 if   sequences[s] == "restart_bootmanager_boot":
@@ -596,6 +664,19 @@ def reboot(hostname, config=None, forced_action=None):
                         else:
                                 # there was some failure to synchronize the keys.
                                 print "...Unable to repair node keys on %s" % node
+
+               elif sequences[s] == "suspect_error_email":
+                       args = {}
+                       args['hostname'] = hostname
+                       args['sequence'] = s
+                       args['bmlog'] = conn.get_bootmanager_log().read()
+                       m = PersistMessage(hostname, "Suspicous error from BootManager on %s" % args,
+                                                                                mailtxt.unknownsequence[1] % args, False, db='suspect_persistmessages')
+                       m.reset()
+                       m.send(['monitor-list@lists.planet-lab.org'])
+
+                       conn.restart_bootmanager('boot')
+
                 elif sequences[s] == "update_node_config_email":
                         print "...Sending message to UPDATE NODE CONFIG"
                         args = {}
@@ -605,6 +686,19 @@ def reboot(hostname, config=None, forced_action=None):
                         loginbase = plc.siteId(hostname)
                         m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
                         conn.dump_plconf_file()
+                       conn.set_nodestate('diag')
+
+               elif sequences[s] == "nodenetwork_email":
+                       print "...Sending message to LOOK AT NODE NETWORK"
+                       args = {}
+                       args['hostname'] = hostname
+                       args['bmlog'] = conn.get_bootmanager_log().read()
+                       m = PersistMessage(hostname,  mailtxt.plnode_network[0] % args,  mailtxt.plnode_cfg[1] % args, 
+                                                               True, db='nodenet_persistmessages')
+                       loginbase = plc.siteId(hostname)
+                       m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+                       conn.dump_plconf_file()
+                       conn.set_nodestate('diag')
  
                 elif sequences[s] == "update_bootcd_email":
                         print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
@@ -619,6 +713,9 @@ def reboot(hostname, config=None, forced_action=None):
                         loginbase = plc.siteId(hostname)
                         m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
  
+                       #print "\tDisabling %s due to out-of-date BOOTCD" % hostname
+                       #conn.set_nodestate('disable')
+
                 elif sequences[s] == "broken_hardware_email":
                         # MAKE An ACTION record that this host has failed hardware.  May
                         # require either an exception "/minhw" or other manual intervention.
@@ -633,6 +730,7 @@ def reboot(hostname, config=None, forced_action=None):
  
                         loginbase = plc.siteId(hostname)
                         m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+                       conn.set_nodestate('disable')
  
                 elif sequences[s] == "update_hardware_email":
                         print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
@@ -644,9 +742,11 @@ def reboot(hostname, config=None, forced_action=None):
  
                         loginbase = plc.siteId(hostname)
                         m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+                       conn.set_nodestate('disable')
  
-       pflags.setFlag(s)
-       pflags.save() 
+       if flag_set:
+               pflags.setRecentFlag(s)
+               pflags.save() 
  
         return True
         
@@ -657,7 +757,7 @@ def main():
         from config import config
         from optparse import OptionParser
         parser = OptionParser()
-       parser.set_defaults(node=None, nodelist=None, child=False, nosetup=False, verbose=False, force=None, quiet=False)
+       parser.set_defaults(node=None, nodelist=None, child=False, collect=False, nosetup=False, verbose=False, force=None, quiet=False)
         parser.add_option("", "--child", dest="child", action="store_true", 
                                                 help="This is the child mode of this process.")
         parser.add_option("", "--force", dest="force", metavar="boot_state",
@@ -666,6 +766,8 @@ def main():
                                                 help="Extra quiet output messages.")
         parser.add_option("", "--verbose", dest="verbose", action="store_true", 
                                                 help="Extra debug output messages.")
+       parser.add_option("", "--collect", dest="collect", action="store_true", 
+                                               help="No action, just collect dmesg, and bm.log")
         parser.add_option("", "--nosetup", dest="nosetup", action="store_true", 
                                                 help="Do not perform the orginary setup phase.")
         parser.add_option("", "--node", dest="node", metavar="nodename.edu", 
diff --git a/reboot.py b/reboot.py

index 7c6bea3..82cb33c 100755 (executable)
--- a/reboot.py
+++ b/reboot.py
@@ -297,7 +297,7 @@ class IPAL(PCUControl):
                         ret = s.recv(count, socket.MSG_DONTWAIT)
                 except socket.error, e:
                         if e[0] == errno.EAGAIN:
-                               return Exception(e[1])
+                               raise Exception(e[1])
                         else:
                                 # TODO: not other exceptions.
                                 raise Exception(e)
@@ -317,7 +317,7 @@ class IPAL(PCUControl):
                         s.close()
                         if e[0] == errno.ECONNREFUSED:
                                 # cannot connect to remote host
-                               return Exception(e[1])
+                               raise Exception(e[1])
                         else:
                                 # TODO: what other conditions are there?
                                 raise Exception(e)
@@ -327,6 +327,10 @@ class IPAL(PCUControl):
                 s.send(self.format_msg("", 'O'))
                 ret = self.recv_noblock(s, 8)
                 print "Current status is '%s'" % ret
+
+               if ret == '':
+                       raise Exception("Status returned 'another session already open' %s : %s" % (node_port, ret))
+                       
                                 
                 if node_port < len(ret):
                         status = ret[node_port]
@@ -343,14 +347,14 @@ class IPAL(PCUControl):
                         
  
                 if not dryrun:
-                       print "Pulsing %s" % node_port
                         if power_on:
+                               print "Pulsing %s" % node_port
                                 s.send(self.format_msg("%s" % node_port, 'P'))
                         else:
-                               # NOTE: turn power on before pulsing the port.
-                               print "power was off, so turning on then pulsing..."
+                               # NOTE: turn power on ; do not pulse the port.
+                               print "Power was off, so turning on ..."
                                 s.send(self.format_msg("%s" % node_port, 'E'))
-                               s.send(self.format_msg("%s" % node_port, 'P'))
+                               #s.send(self.format_msg("%s" % node_port, 'P'))
  
                         print "Receiving response."
                         ret = self.recv_noblock(s, 8)
diff --git a/rt.py b/rt.py

index 3d0cac2..4a951b9 100644 (file)
--- a/rt.py
+++ b/rt.py
@@ -127,6 +127,9 @@ def rt_tickets():
  #                       % (hostname,hostname)
  
         # Queue == 10 is the spam Queue in RT.
+# SELECT Tk.* FROM Tickets AS Tk, Attachments AS At JOIN Transactions AS Tr ON Tk.id=Tr.ObjectId  WHERE Tk.Queue != 10 AND Tk.id > 10000 AND Tr.id=At.TransactionID AND Tk.Status = 'open' ;
+# 
+
         sql = """SELECT distinct Tk.id, Tk.Status, Tk.Subject, At.Content
                          FROM Tickets AS Tk, Attachments AS At 
                          JOIN Transactions AS Tr ON Tk.id=Tr.ObjectId  
@@ -139,7 +142,7 @@ def rt_tickets():
  #WHERE Tk.Queue != 10 AND Tk.id > 10000 AND 
  #Tr.id=At.TransactionID AND ( Tk.Status = 'open' OR
  #Tk.Status = 'new') """
-       sqlall = """SELECT distinct Tk.id, Tk.Status, Tk.Subject, At.Content, Us.EmailAddress FROM Tickets AS Tk, Attachments AS At, Users as Us JOIN Transactions AS Tr ON Tk.id=Tr.ObjectId WHERE (Tk.Queue=3 OR Tk.Queue=22) AND Tk.id > 10000 AND Tr.id=At.TransactionID AND ( Tk.Status = 'open' OR Tk.Status = 'new') AND Us.id=Tk.LastUpdatedBy """
+       sqlall = """SELECT distinct Tk.id, Tk.Status, Tk.Subject, At.Content, Us.EmailAddress, Tk.LastUpdated FROM Tickets AS Tk, Attachments AS At, Users as Us JOIN Transactions AS Tr ON Tk.id=Tr.ObjectId WHERE (Tk.Queue=3 OR Tk.Queue=22) AND Tk.id > 10000 AND Tr.id=At.TransactionID AND ( Tk.Status = 'open' OR Tk.Status = 'new') AND Us.id=Tk.LastUpdatedBy """
  
  
         raw = fetch_from_db(db, sql)
@@ -158,7 +161,9 @@ def rt_tickets():
                                 "status":x[1],
                                 "subj":str(x[2]),
                                 "content":str(x[3]),
-                               "email":str(x[4]) },
+                               "email":str(x[4]),
+                               "lastupdated":str(x[5]),
+                               },
                                 raw)
  
         db.close()
diff --git a/soltesz.py b/soltesz.py

index d89eed0..1c65e15 100644 (file)
--- a/soltesz.py
+++ b/soltesz.py
@@ -165,45 +165,26 @@ class Sopen(subprocess.Popen):
         def kill(self, signal = signal.SIGTERM):
                 os.kill(self.pid, signal)
  
+def read_t(stream, count, timeout=COMMAND_TIMEOUT*2):  # stream.read(count) once `stream` is readable; waits at most `timeout` seconds
+       lin, lout, lerr = select([stream], [], [], timeout)  # select() returns (readable, writable, exceptional); only `lin` is used
+       if len(lin) == 0:
+               raise ExceptionTimeout("TIMEOUT reading %s bytes from stream" % count)  # was '% cmd'; `cmd` is not a parameter or local here
+
+       return stream.read(count)
+
  class CMD:
         def __init__(self):
                 pass
  
         def run_noexcept(self, cmd, timeout=COMMAND_TIMEOUT*2):
  
+               #print "CMD.run_noexcept(%s)" % cmd
                 try:
                         return CMD.run(self,cmd,timeout)
                 except ExceptionTimeout:
                         import traceback; print traceback.print_exc()
                         return ("", "SCRIPTTIMEOUT")
                         
-
-#              s = Sopen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True)
-#              #(f_in, f_out, f_err) = os.popen3(cmd)
-#              (f_in, f_out, f_err) = (s.stdin, s.stdout, s.stderr)
-#              lout, lin, lerr = select([f_out,f_err], [], [], timeout)
-#              if len(lin) == 0 and len(lout) == 0 and len(lerr) == 0:
-#                      # Reached a timeout!  Nuke process so it does not hang.
-#                      s.kill(signal.SIGKILL)
-#                      return ("", "SCRIPTTIMEOUT")
-#              o_value = f_out.read()
-#              e_value = ""
-#              if o_value == "":       # An error has occured
-#                      e_value = f_err.read()
-#
-#              o_value = o_value.strip()
-#              e_value = e_value.strip()
-#
-#              f_out.close()
-#              f_in.close()
-#              f_err.close()
-#              try:
-#                      s.kill()
-#              except OSError:
-#                      # no such process, due to it already exiting...
-#                      pass
-#
-#              return (o_value, e_value)
         def system(self, cmd, timeout=COMMAND_TIMEOUT*2):
                 (o,e) = self.run(cmd, timeout)
                 self.output = o
@@ -214,10 +195,59 @@ class CMD:
  
         def run(self, cmd, timeout=COMMAND_TIMEOUT*2):
  
+               #print "CMD.run(%s)" % cmd
                 s = Sopen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True)
                 self.s = s
                 (f_in, f_out, f_err) = (s.stdin, s.stdout, s.stderr)
-               lout, lin, lerr = select([f_out,f_err], [], [], timeout)
+               #print "calling select(%s)" % timeout
+               lout, lin, lerr = select([f_out], [], [f_err], timeout)
+               #print "TIMEOUT!!!!!!!!!!!!!!!!!!!"
+               if len(lin) == 0 and len(lout) == 0 and len(lerr) == 0:
+                       # Reached a timeout!  Nuke process so it does not hang.
+                       #print "KILLING"
+                       s.kill(signal.SIGKILL)
+                       raise ExceptionTimeout("TIMEOUT Running: %s" % cmd)
+               else:
+                       #print "RETURNING"
+                       #print len(lin), len(lout), len(lerr)
+                       pass
+
+               o_value = ""
+               e_value = ""
+
+               #print "reading from f_out"
+               if len(lout) > 0: o_value = f_out.read()
+               #print "reading from f_err"
+               if len(lerr) > 0: e_value = f_err.read()
+
+               #print "striping output"
+               o_value = o_value.strip()
+               e_value = e_value.strip()
+
+               #print "OUTPUT", o_value, e_value
+
+               #print "closing files"
+               f_out.close()
+               f_in.close()
+               f_err.close()
+               try:
+                       #print "s.kill()"
+                       s.kill()
+                       #print "after s.kill()"
+               except OSError:
+                       # no such process, due to it already exiting...
+                       pass
+
+               #print o_value, e_value
+               return (o_value, e_value)
+
+       def runargs(self, args, timeout=COMMAND_TIMEOUT*2):
+
+               #print "CMD.run(%s)" % " ".join(args)
+               s = Sopen(args, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True)
+               self.s = s
+               (f_in, f_out, f_err) = (s.stdin, s.stdout, s.stderr)
+               lout, lin, lerr = select([f_out], [], [f_err], timeout)
                 if len(lin) == 0 and len(lout) == 0 and len(lerr) == 0:
                         # Reached a timeout!  Nuke process so it does not hang.
                         s.kill(signal.SIGKILL)
@@ -243,10 +273,11 @@ class CMD:
  
  
  class SSH(CMD):
-       def __init__(self, user, host, options = ssh_options):
+       def __init__(self, user, host, port=22, options = ssh_options):
                 self.options = options
                 self.user = user
                 self.host = host
+               self.port = port
                 return
  
         def __options_to_str(self):
@@ -256,14 +287,15 @@ class SSH(CMD):
                 return options
  
         def run(self, cmd, timeout=COMMAND_TIMEOUT*2):
-               cmd = "ssh %s %s@%s '%s'" % (self.__options_to_str(), 
+               cmd = "ssh -p %s %s %s@%s '%s'" % (self.port, self.__options_to_str(), 
                                                                         self.user, self.host, cmd)
+               #print "SSH.run(%s)" % cmd
                 return CMD.run(self, cmd, timeout)
  
         def get_file(self, rmt_filename, local_filename=None):
                 if local_filename == None:
                         local_filename = "./"
-               cmd = "scp -B %s %s@%s:%s %s" % (self.__options_to_str(), 
+               cmd = "scp -P %s -B %s %s@%s:%s %s" % (self.port, self.__options_to_str(), 
                                                                         self.user, self.host, 
                                                                         rmt_filename, local_filename)
                 # output :
@@ -272,12 +304,35 @@ class SSH(CMD):
                 return CMD.run_noexcept(self, cmd)
  
         def run_noexcept(self, cmd):
-               cmd = "ssh %s %s@%s '%s'" % (self.__options_to_str(), 
+               cmd = "ssh -p %s %s %s@%s '%s'" % (self.port, self.__options_to_str(), 
                                                                         self.user, self.host, cmd)
+               #print "SSH.run_noexcept(%s)" % cmd
                 return CMD.run_noexcept(self, cmd)
  
+       def run_noexcept2(self, cmd, timeout=COMMAND_TIMEOUT*2):
+               cmd = "ssh -p %s %s %s@%s %s" % (self.port, self.__options_to_str(), 
+                                                                       self.user, self.host, cmd)
+               #print "SSH.run_noexcept2(%s)" % cmd
+               r = CMD.run_noexcept(self, cmd, timeout)
+
+               # XXX: this may be resulting in deadlocks... not sure.
+               #if self.s.returncode is None:
+               #       #self.s.kill()
+               #       self.s.kill(signal.SIGKILL)
+               #       self.s.wait()
+               #       self.ret = self.s.returncode
+               self.ret = -1
+
+               return r
+
+       def system2(self, cmd, timeout=COMMAND_TIMEOUT*2):
+               cmd = "ssh -p %s %s %s@%s %s" % (self.port, self.__options_to_str(), 
+                                                                       self.user, self.host, cmd)
+               #print "SSH.system2(%s)" % cmd
+               return CMD.system(self, cmd, timeout)
+
         def runE(self, cmd):
-               cmd = "ssh %s %s@%s '%s'" % (self.__options_to_str(), 
+               cmd = "ssh -p %s %s %s@%s '%s'" % (self.port, self.__options_to_str(), 
                                                                         self.user, self.host, cmd)
                 if ( DEBUG == 1 ):
                         print cmd,
diff --git a/unified_model.py b/unified_model.py

index 918f653..58c223b 100755 (executable)
--- a/unified_model.py
+++ b/unified_model.py
@@ -6,11 +6,14 @@ import plc
  import auth
  api = plc.PLC(auth.auth, auth.plc)
  
-import config
  import mailer
  import time
+from nodecommon import *
+
+from const import *
  
  def gethostlist(hostlist_file):
+       import config
         return config.getListFromFile(hostlist_file)
         
         #nodes = api.GetNodes({'peer_id' : None}, ['hostname'])
@@ -110,8 +113,14 @@ class Message(object):
  class Recent(object):
         def __init__(self, withintime):
                 self.withintime = withintime
-               self.time = time.time()
-               self.action_taken = False
+
+               try:
+                       self.time = self.__getattribute__('time')
+               except:
+                       self.time = time.time()- 7*24*60*60
+
+               #self.time = time.time()
+               #self.action_taken = False
  
         def isRecent(self):
                 if self.time + self.withintime < time.time():
@@ -152,6 +161,8 @@ class PersistFlags(Recent):
                         obj = super(PersistFlags, typ).__new__(typ, *args, **kwargs)
                         for key in kwargs.keys():
                                 obj.__setattr__(key, kwargs[key])
+                       obj.time = time.time()
+                       obj.action_taken = False
  
                 obj.db = db
                 return obj
@@ -178,6 +189,10 @@ class PersistFlags(Recent):
                         self.__setattr__(name, False)
                         return False
  
+       def resetRecentFlag(self, name):
+               self.resetFlag(name)
+               self.unsetRecent()
+
         def setRecentFlag(self, name):
                 self.setFlag(name)
                 self.setRecent()
@@ -191,6 +206,14 @@ class PersistFlags(Recent):
                         self.__setattr__(name, False)
                         return False
  
+       def checkattr(self, name):  # True if attribute `name` can be looked up on this object, else False
+               try:
+                       self.__getattribute__(name)  # only the lookup matters; the value is discarded
+                       return True
+               except Exception:  # narrowed from bare 'except:' so KeyboardInterrupt/SystemExit are not swallowed
+                       return False
+               
+
  class PersistMessage(Message):
         def __new__(typ, id, subject, message, via_rt, **kwargs):
                 if 'db' in kwargs:
@@ -215,6 +238,9 @@ class PersistMessage(Message):
                         obj.actiontracker = Recent(3*60*60*24)
                         obj.ticket_id = None
  
+               if 'ticket_id' in kwargs and kwargs['ticket_id'] is not None:
+                       obj.ticket_id = kwargs['ticket_id']
+
                 obj.db = db
                 return obj
  
@@ -237,8 +263,7 @@ class PersistMessage(Message):
                         soltesz.dbDump(self.db, pm)
                 else:
                         # NOTE: only send a new message every week, regardless.
-                       print "Not sending to host b/c not within window of 6 days"
-                       pass
+                       print "Not sending to host b/c not within window of %s days" % (self.actiontracker.withintime // 60*60*24)
  
  class MonitorMessage(object):
         def __new__(typ, id, *args, **kwargs):
@@ -342,7 +367,6 @@ class PersistSitePenalty(SitePenalty):
  
         def __init__(self, id, index, **kwargs):
                 self.id = id
-               #SitePenalty.__init__(self, self.index)
  
         def save(self):
                 pm = soltesz.dbLoad(self.db)
@@ -350,7 +374,6 @@ class PersistSitePenalty(SitePenalty):
                 soltesz.dbDump(self.db, pm)
  
  
-
  class Target:
         """
                 Each host has a target set of attributes.  Some may be set manually,
@@ -385,10 +408,140 @@ class Target:
  
                 return con_or_true
  
+class Record(object):
+
+       def __init__(self, hostname, data):
+               self.hostname = hostname
+               self.data = data
+               self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")
+               self.loginbase = self.plcdb_hn2lb[self.hostname]
+               return
+
+
+       def stageIswaitforever(self):
+               if 'waitforever' in self.data['stage']:
+                       return True
+               else:
+                       return False
+
+       def severity(self):
+               category = self.data['category']
+               prev_category = self.data['prev_category']
+               val = cmpCategoryVal(category, prev_category)
+               return val 
+
+       def improved(self):
+               return self.severity() > 0
+       
+       def end_record(self):
+               return node_end_record(self.hostname)
+
+       def reset_stage(self):
+               self.data['stage'] = 'findbad'
+               return True
+       
+       def getCategory(self):
+               return self.data['category'].lower()
+
+       def getState(self):
+               return self.data['state'].lower()
+
+       def getDaysDown(cls, diag_record):
+               daysdown = -1
+               if diag_record['comonstats']['uptime'] != "null":
+                       #print "uptime %s" % (int(float(diag_record['comonstats']['uptime'])) // (60*60*24))
+                       daysdown = - int(float(diag_record['comonstats']['uptime'])) // (60*60*24)
+               elif diag_record['comonstats']['sshstatus'] != "null":
+                       daysdown = int(diag_record['comonstats']['sshstatus']) // (60*60*24)
+               elif diag_record['comonstats']['lastcotop'] != "null":
+                       daysdown = int(diag_record['comonstats']['lastcotop']) // (60*60*24)
+               else:
+                       now = time.time()
+                       last_contact = diag_record['plcnode']['last_contact']
+                       if last_contact == None:
+                               # the node has never been up, so give it a break
+                               daysdown = -1
+                       else:
+                               diff = now - last_contact
+                               daysdown = diff // (60*60*24)
+               return daysdown
+       getDaysDown = classmethod(getDaysDown)
+
+       def getStrDaysDown(cls, diag_record):
+               daysdown = cls.getDaysDown(diag_record)
+               if daysdown > 0:
+                       return "%d days down"%daysdown
+               elif daysdown == -1:
+                       return "Unknown number of days"
+               else:
+                       return "%d days up"% -daysdown
+       getStrDaysDown = classmethod(getStrDaysDown)
+
+       def takeAction(self):
+               pp = PersistSitePenalty(self.hostname, 0, db='persistpenalty_hostnames')
+               if 'improvement' in self.data['stage'] or self.improved():
+                       print "decreasing penalty for %s"%self.hostname
+                       pp.decrease()
+               else:
+                       print "increasing penalty for %s"%self.hostname
+                       pp.increase()
+               pp.apply(self.hostname)
+               pp.save()
+
+       def _format_diaginfo(self):
+               info = self.data['info']
+               if self.data['stage'] == 'monitor-end-record':
+                       hlist = "    %s went from '%s' to '%s'\n" % (info[0], info[1], info[2]) 
+               else:
+                       hlist = "    %s %s - %s\n" % (info[0], info[2], info[1]) #(node,ver,daysdn)
+               return hlist
+
+       def getMessage(self, ticket_id=None):
+               self.data['args']['hostname'] = self.hostname
+               self.data['args']['loginbase'] = self.loginbase
+               self.data['args']['hostname_list'] = self._format_diaginfo()
+               message = PersistMessage(self.hostname, 
+                                                                self.data['message'][0] % self.data['args'],
+                                                                self.data['message'][1] % self.data['args'],
+                                                                True, db='monitor_persistmessages',
+                                                                ticket_id=ticket_id)
+               return message
+       
+       def getContacts(self):
+               from config import config
+               #print "policy"
+               config = config()
+
+               roles = self.data['email']
+
+               if not config.mail and not config.debug and config.bcc:
+                       roles = ADMIN
+               if config.mail and config.debug:
+                       roles = ADMIN
+
+               # build targets
+               contacts = []
+               if ADMIN & roles:
+                       contacts += [config.email]
+               if TECH & roles:
+                       contacts += [TECHEMAIL % self.loginbase]
+               if PI & roles:
+                       contacts += [PIEMAIL % self.loginbase]
+               if USER & roles:
+                       slices = plc.slices(self.loginbase)
+                       if len(slices) >= 1:
+                               for slice in slices:
+                                       contacts += [SLICEMAIL % slice]
+                               print "SLIC: %20s : %d slices" % (self.loginbase, len(slices))
+                       else:
+                               print "SLIC: %20s : 0 slices" % self.loginbase
+
+               return contacts
+
+
  class NodeRecord:
         def __init__(self, hostname, target):
                 self.hostname = hostname
-               self.pcu = PCU(hostname)
                 self.ticket = None
                 self.target = target
                 if hostname in fb['nodes']:
@@ -396,13 +549,28 @@ class NodeRecord:
                 else:
                         raise Exception("Hostname not in scan database")
  
-       def get(self):
-               pass
+       def stageIswaitforever(self):
+               if 'waitforever' in self.data['stage']:
+                       return True
+               else:
+                       return False
+
         def severity(self):
                 category = self.data['category']
                 prev_category = self.data['prev_category']
                 val = cmpCategoryVal(category, prev_category)
                 return val 
+
+       def improved(self):
+               return self.severity() > 0
+       
+       def end_record(self):
+               return node_end_record(self.hostname)
+
+       def reset_stage(self):
+               self.data['stage'] = 'findbad'
+               return True
+
         def open_tickets(self):
                 if self.ticket and self.ticket.status['status'] == 'open':
                         return 1
@@ -452,12 +620,13 @@ class NodeRecord:
  if __name__ == "__main__":
         #r = RT()
         #r.email("test", "body of test message", ['soltesz@cs.princeton.edu'])
-       from emailTxt import mailtxt
-       soltesz.dbDump("persistmessages", {});
-       args = {'url_list': 'http://www.planet-lab.org/bootcds/planet1.usb\n','hostname': 'planet1','hostname_list': ' blahblah -  days down\n'}
-       m = PersistMessage("blue", "test 1", mailtxt.newdown_one[1] % args, True)
-       m.send(['soltesz@cs.utk.edu'])
-       m = PersistMessage("blue", "test 1 - part 2", mailtxt.newalphacd_one[1] % args, True)
+       #from emailTxt import mailtxt
+       print "loaded"
+       #soltesz.dbDump("persistmessages", {});
+       #args = {'url_list': 'http://www.planet-lab.org/bootcds/planet1.usb\n','hostname': 'planet1','hostname_list': ' blahblah -  days down\n'}
+       #m = PersistMessage("blue", "test 1", mailtxt.newdown_one[1] % args, True)
+       #m.send(['soltesz@cs.utk.edu'])
+       #m = PersistMessage("blue", "test 1 - part 2", mailtxt.newalphacd_one[1] % args, True)
         # TRICK timer to thinking some time has passed.
-       m.actiontracker.time = time.time() - 6*60*60*24
-       m.send(['soltesz@cs.utk.edu'])
+       #m.actiontracker.time = time.time() - 6*60*60*24
+       #m.send(['soltesz@cs.utk.edu'])
diff --git a/www/printbadpcus.php b/www/printbadpcus.php

index c2d8daa..90f6645 100644 (file)
--- a/www/printbadpcus.php
+++ b/www/printbadpcus.php
@@ -46,7 +46,7 @@ function format_ports($pcu)
                 $portstat = $pcu['portstatus'];
  
                 #foreach ( array('22', '23', '80', '443') $portstat as $port => $state)
-               foreach ( array('22', '23', '80', '443') as $port)
+               foreach ( array('22', '23', '80', '443', '9100',  '16992') as $port)
                 {
                         $state = $portstat[$port];
                         switch ($state)
@@ -195,6 +195,13 @@ if ( $_GET['category'] )
         }
  }
  
+if ( $_REQUEST['id'] )
+{
+       $id = $_REQUEST['id'];
+} else{
+       $id = "all";
+}
+#print print_r($_SERVER) . "<BR>";
  
  //array_multisort($protocols, SORT_ASC, SORT_STRING, $pculist);
  ?>
@@ -203,42 +210,123 @@ if ( $_GET['category'] )
  <html>
  <body>
  
-Total PCUs : <?= $total ?>
-<table border=1>
-               <tr>
-                       <th>Count</th>
-                       <th><?= get_category_link("pcu_id", "PCU ID") ?></th>
-                       <th><?= get_category_link("login_base", "Site") ?></th>
-                       <th><?= get_category_link("hostname", "PCU Name") ?></th>
-                       <th><?= get_category_link("complete_entry", "Incomplete Fields") ?></th>
-                       <th><?= get_category_link("dnsmatch", "DNS Status") ?></th>
-                       <th><?= get_category_link("portstatus", "Port Status") ?></th>
-                       <th><?= get_category_link("reboot", "Dry Run Results") ?></th>
-                       <th><?= get_category_link("model", "Model") ?></th>
-                       <th><?= get_category_link("node_ids", "Nodes") ?></th>
+<?php if ( $id == "all" ): ?>
+       Total PCUs : <?= $total ?>
+       <table border=1>
+                       <tr>
+                               <th>Count</th>
+                               <th><?= get_category_link("pcu_id", "PCU ID") ?></th>
+                               <th><?= get_category_link("login_base", "Site") ?></th>
+                               <th><?= get_category_link("hostname", "PCU Name") ?></th>
+                               <th><?= get_category_link("complete_entry", "Incomplete Fields") ?></th>
+                               <th><?= get_category_link("dnsmatch", "DNS Status") ?></th>
+                               <th><?= get_category_link("portstatus", "Port Status") ?></th>
+                               <th><?= get_category_link("reboot", "Test Results") ?></th>
+                               <th><?= get_category_link("model", "Model") ?></th>
+                               <th><?= get_category_link("node_ids", "Nodes") ?></th>
+                       </tr>
+       <?php $count = 0; ?>
+       <?php $reachable_nodes = 0; ?>
+       <?php foreach ( $pculist as $pcu ): ?>
+                       <tr>
+                               <td><?= $count ?></td>
+                               <td id='id<?= $pcu['pcu_id'] ?>'><a href='<?= pcu_link($pcu) ?>'><?= $pcu['pcu_id'] ?></a></td>
+                               <td><a href='<?= plc_site_link(pcu_site($pcu)) ?>'><?= pcu_site($pcu) ?></a></td>
+                               <td><?= pcu_name($pcu) ?></td>
+                               <td><?= pcu_entry($pcu) ?></td>
+                               <td bgcolor='<?= DNS_to_color($pcu['dnsmatch']) ?>'><?= $pcu['dnsmatch'] ?></td>
+                               <td><?= format_ports($pcu) ?></td>
+                               <td bgcolor='<?= reboot_to_color($pcu['reboot']) ?>'><?= reboot_to_str($pcu['reboot']) ?></td>
+                               <td nowrap><?= $pcu['model'] ?></td>
+                               <td><?= count( $pcu['node_ids'] ) ?></td>
+                       </tr>
+
+       <?php if ( $pcu['reboot'] == "0" ) $reachable_nodes+=count($pcu['node_ids']); ?>
+       <?php $count += 1; ?>
+       <?php endforeach; ?>
+       </table>
+       <b>Reachable Nodes:</b> <?= $reachable_nodes ?>
+<?php else: ?>
+       <table align=center border=1>
+                       <tr>
+                               <th><?= get_category_link("pcu_id", "PCU ID") ?></th>
+                               <th><?= get_category_link("login_base", "Site") ?></th>
+                               <th><?= get_category_link("hostname", "PCU Name") ?></th>
+                               <th><?= get_category_link("complete_entry", "Incomplete Fields") ?></th>
+                               <th><?= get_category_link("dnsmatch", "DNS Status") ?></th>
+                               <th><?= get_category_link("portstatus", "Port Status") ?></th>
+                               <th><?= get_category_link("reboot", "Test Results") ?></th>
+                               <th><?= get_category_link("model", "Model") ?></th>
+                               <th><?= get_category_link("node_ids", "Nodes") ?></th>
+                       </tr>
+       <?php $count = 0; ?>
+       <?php $reachable_nodes = 0; ?>
+       <?php foreach ( $pculist as $pcu ): ?>
+               <?php if ( $pcu['pcu_id'] == $id ): ?>
+                       <tr>
+                               <td id='id<?= $pcu['pcu_id'] ?>'><a href='<?= pcu_link($pcu) ?>'><?= $pcu['pcu_id'] ?></a></td>
+                               <td><a href='<?= plc_site_link(pcu_site($pcu)) ?>'><?= pcu_site($pcu) ?></a></td>
+                               <td><?= pcu_name($pcu) ?></td>
+                               <td><?= pcu_entry($pcu) ?></td>
+                               <td bgcolor='<?= DNS_to_color($pcu['dnsmatch']) ?>'><?= $pcu['dnsmatch'] ?></td>
+                               <td><?= format_ports($pcu) ?></td>
+                               <td bgcolor='<?= reboot_to_color($pcu['reboot']) ?>'><?= reboot_to_str($pcu['reboot']) ?></td>
+                               <td nowrap><?= $pcu['model'] ?></td>
+                               <td><?= count( $pcu['node_ids'] ) ?></td>
+                       </tr>
+               <?php endif; ?>
+       <?php endforeach; ?>
+       </table>
+       <br>
+       <table border=1 align=center>
+               <tr><th colspan=2>Legend for 'DNS Status'</th></tr>
+
+               <tr><td bgcolor=lightgreen>DNS-OK</td>
+                       <td>This indicates that the DNS name and registered IP address match.</td>
+               </tr>
+               <tr><td bgcolor=lightgrey>DNS-MISMATCH</td>
+                       <td>Sometimes, the registered IP and DNS IP address do not match.  In these cases it is not clear which is correct, 
+                               so an error is flagged.</td>
+               </tr>
+               <tr><td bgcolor=lightgrey>DNS-NOENTRY</td>
+                       <td>While a hostname is provided in the registration, the hostname is not actually registered in DNS.</td>
+               </tr>
+               <tr><td bgcolor=white>NOHOSTNAME</td>
+                       <td>While we prefer that a hostname be registered, it is not
+                       strictly required, since simply the IP address, if it is static, is enough to access the PCU.</td>
+               </tr>
+       <!--/table>
+       <table border=1-->
+               <tr><th colspan=2>Legend for 'Port Status'</th></tr>
+
+               <tr><td bgcolor=lightgreen>Open</td>
+                       <td>Green port numbers are believed to be open.</td>
                 </tr>
-<?php $count = 0; ?>
-<?php $reachable_nodes = 0; ?>
-<?php foreach ( $pculist as $pcu ): ?>
-               <tr>
-                       <td><?= $count ?></td>
-                       <td id='id<?= $pcu['pcu_id'] ?>'><a href='<?= pcu_link($pcu) ?>'><?= $pcu['pcu_id'] ?></a></td>
-                       <td><a href='<?= plc_site_link(pcu_site($pcu)) ?>'><?= pcu_site($pcu) ?></a></td>
-                       <td><?= pcu_name($pcu) ?></td>
-                       <td><?= pcu_entry($pcu) ?></td>
-                       <td bgcolor='<?= DNS_to_color($pcu['dnsmatch']) ?>'><?= $pcu['dnsmatch'] ?></td>
-                       <td><?= format_ports($pcu) ?></td>
-                       <td bgcolor='<?= reboot_to_color($pcu['reboot']) ?>'><?= reboot_to_str($pcu['reboot']) ?></td>
-                       <td nowrap><?= $pcu['model'] ?></td>
-                       <td><?= count( $pcu['node_ids'] ) ?></td>
+               <tr><td bgcolor=gold>Filtered</td>
+                       <td>Gold port numbers are believed to be filtered or simply offline.</td>
                 </tr>
+               <tr><td bgcolor=indianred>Closed</td>
+                       <td>Finally, red ports appear to be closed.</td>
+               </tr>
+       <!--/table>
+       <table border=1-->
+               <tr><th colspan=2>Legend for 'Test Results'</th></tr>
  
-<?php if ( $pcu['reboot'] == "0" ) $reachable_nodes+=count($pcu['node_ids']); ?>
-<?php $count += 1; ?>
-<?php endforeach; ?>
-</table>
+               <tr><td bgcolor=darkseagreen>OK</td>
+                       <td>The PCU is accessible, and short of actually rebooting the node, everything appears to work.</td>
+               </tr>
+               <tr><td bgcolor=lightgrey>NetDown</td>
+                       <td>The PCU is inaccessible from the PlanetLab address block 128.112.139.0/25, or it is simply offline.</td>
+               </tr>
+               <tr><td bgcolor=lightgrey>Not_Run</td>
+                       <td>Previous errors, such as DNS or an incomplete configuration prevented the actual test from begin performed.</td>
+               </tr>
+               <tr><td bgcolor=indianred>Other Errors</td>
+                       <td>Other errors are reported by the test that are more specific to the block encountered by the script.</td>
+               </tr>
+       </table>
+<?php endif; ?>
  
-<b>Reachable Nodes:</b> <?= $reachable_nodes ?>
  
  </body>
  </html>
author	Stephen Soltesz <soltesz@cs.princeton.edu>
	Mon, 21 Jul 2008 16:30:31 +0000 (16:30 +0000)
committer	Stephen Soltesz <soltesz@cs.princeton.edu>
	Mon, 21 Jul 2008 16:30:31 +0000 (16:30 +0000)
automate_pl03.sh		patch \| blob \| history
emailTxt.py		patch \| blob \| history
fetch.py		patch \| blob \| history
findbad.py		patch \| blob \| history
findbadpcu.py		patch \| blob \| history
getconf.py		patch \| blob \| history
getsshkeys.py		patch \| blob \| history
grouprins.py		patch \| blob \| history
mailer.py		patch \| blob \| history
model.py		patch \| blob \| history
monitor.py		patch \| blob \| history
monitor_policy.py		patch \| blob \| history
nodecommon.py		patch \| blob \| history
nodeconfig.py		patch \| blob \| history
nodegroups.py		patch \| blob \| history
nodehistory.py		patch \| blob \| history
nodeinfo.py		patch \| blob \| history
nodequery.py		patch \| blob \| history
nodereboot.py		patch \| blob \| history
reboot.py		patch \| blob \| history
rt.py		patch \| blob \| history
soltesz.py		patch \| blob \| history
unified_model.py		patch \| blob \| history
www/printbadpcus.php		patch \| blob \| history