adds checks for readonly fs, dns errors, resets message timer if
author Stephen Soltesz <soltesz@cs.princeton.edu>
Mon, 29 Sep 2008 19:19:16 +0000 (19:19 +0000)
committer Stephen Soltesz <soltesz@cs.princeton.edu>
Mon, 29 Sep 2008 19:19:16 +0000 (19:19 +0000)
stage==improvement, and minor tweaks to nodehistory/common

clean_policy.py
findbad.py
nodecommon.py
nodehistory.py
unified_model.py

index 8e35903..aa8f7de 100644 (file)
@@ -198,7 +198,6 @@ class MonitorMergeDiagnoseSendEscellate:
                        print "action: getting message"
                        message = record.getMessage(record.data['ticket_id'])
                        if message:
-                               #message.reset()
                                print "action: sending email"
                                message.send(record.getContacts())
                                #print "DEBUG NOT SENDING MESSAGE WHEN I SHOULD BE!!!!!"
index 47459ad..e156856 100755 (executable)
@@ -56,6 +56,8 @@ def collectPingAndSSH(nodename, cohash):
                                echo '  "bmlog":"'`ls /tmp/bm.log`'",'
                                echo '  "bootcd":"'`cat /mnt/cdrom/bootme/ID`'",'
                                echo '  "nm":"'`ps ax | grep nm.py | grep -v grep`'",'
+                               echo '  "readonlyfs":"'`touch /var/log/monitor 2>&1`'",'
+                               echo '  "dns":"'`host boot.planet-lab.org 2>&1`'",'
                                echo '  "princeton_comon":"'`ls -d /vservers/princeton_comon`'",'
 
                                ID=`grep princeton_comon /etc/passwd | awk -F : '{if ( $3 > 500 ) { print $3}}'` 
@@ -70,9 +72,13 @@ EOF                  """)
                                values['sshport'] = port
                                break
                        else:
-                               values.update({'kernel': "", 'bmlog' : "", 'bootcd' : '', 'nm' :
-                               '', 'princeton_comon' : '', 'princeton_comon_running' : '',
-                               'princeton_comon_procs' : '', 'sshport' : None})
+                               values.update({'kernel': "", 'bmlog' : "", 'bootcd' : '', 
+                                                               'nm' : '', 
+                                                               'readonlyfs' : '',
+                                                               'dns' : '',
+                                                               'princeton_comon' : '', 
+                                                               'princeton_comon_running' : '', 
+                                                               'princeton_comon_procs' : '', 'sshport' : None})
        except:
                print traceback.print_exc()
                sys.exit(1)
index 624ee2c..cbbc2c4 100644 (file)
@@ -109,6 +109,16 @@ def diff_time(timestamp, abstime=True):
                t_str = "%s mnths ago" % int(t)
        return t_str
 
+def getvalue(fb, path):
+    """Look up a nested value in fb by a "/"-separated path of keys.
+
+    path is split on "/" and fb is indexed one component at a time.
+    Returns the value reached after the last component, or None as soon
+    as a component is not found at the current level.
+    """
+    indexes = path.split("/")
+    values = fb
+    for index in indexes:
+        # NOTE(review): the `in` test followed by [] presumably expects every
+        # intermediate level to be dict-like; confirm against callers.
+        if index in values:
+            values = values[index]
+        else:
+            return None
+    return values
+
 def nodegroup_display(node, fb, conf=None):
        if node['hostname'] in fb['nodes']:
                node['current'] = get_current_state(fb['nodes'][node['hostname']]['values'])
index abbcee8..a7f030b 100755 (executable)
@@ -43,9 +43,10 @@ def fb_print_nodeinfo(fbnode, verbose, date=None):
        else:
                fbnode['bootcd'] = "unknown"
        fbnode['state'] = color_boot_state(get_current_state(fbnode))
+       fbnode['boot_state'] = getvalue(fbnode, 'plcnode/boot_state')
        if len(fbnode['kernel'].split()) >= 3:
                fbnode['kernel'] = fbnode['kernel'].split()[2]
-       print "    %(state)5s | %(ssh)5.5s | %(pcu)5.5s | %(bootcd)6.6s | %(category)8.8s | %(kernel)s" % fbnode
+       print "    %(state)5s | %(boot_state)s | %(ssh)5.5s | %(pcu)5.5s | %(bootcd)6.6s | %(category)8.8s | %(kernel)s" % fbnode
 
 def pcu_print_info(pcuinfo, hostname):
        print "   Checked: ",
index e237bc9..ea52ebc 100755 (executable)
@@ -230,7 +230,7 @@ class PersistMessage(Message):
                        #print "creating new object"
                        obj = super(PersistMessage, typ).__new__(typ, [id, subject, message, via_rt], **kwargs)
                        obj.id = id
-                       obj.actiontracker = Recent(3*60*60*24)
+                       obj.actiontracker = Recent(1*60*60*24)
                        obj.ticket_id = None
 
                if 'ticket_id' in kwargs and kwargs['ticket_id'] is not None:
@@ -259,6 +259,7 @@ class PersistMessage(Message):
                        self.save()
                else:
                        # NOTE: only send a new message every week, regardless.
+                       # NOTE: can cause thank-you messages to be lost, for instance when node comes back online within window.
                        print "Not sending to host b/c not within window of %s days" % (self.actiontracker.withintime // (60*60*24))
 
 class MonitorMessage(object):
@@ -539,6 +540,8 @@ class Record(object):
                                                                 self.data['message'][1] % self.data['args'],
                                                                 True, db='monitor_persistmessages',
                                                                 ticket_id=ticket_id)
+                       if self.data['stage'] == "improvement":
+                               message.reset()
                        return message
                else:
                        return None