From: Stephen Soltesz Date: Mon, 29 Sep 2008 19:19:16 +0000 (+0000) Subject: adds checks for readonly fs, dns errors, resets message timer if X-Git-Tag: Monitor-1.0-10~3 X-Git-Url: http://git.onelab.eu/?a=commitdiff_plain;h=32e6f3bdf4fe68b057cf070b756418333ae707d6;p=monitor.git adds checks for readonly fs, dns errors, resets message timer if stage==improvement, and minor tweaks to nodehistory/common --- diff --git a/clean_policy.py b/clean_policy.py index 8e35903..aa8f7de 100644 --- a/clean_policy.py +++ b/clean_policy.py @@ -198,7 +198,6 @@ class MonitorMergeDiagnoseSendEscellate: print "action: getting message" message = record.getMessage(record.data['ticket_id']) if message: - #message.reset() print "action: sending email" message.send(record.getContacts()) #print "DEBUG NOT SENDING MESSAGE WHEN I SHOULD BE!!!!!" diff --git a/findbad.py b/findbad.py index 47459ad..e156856 100755 --- a/findbad.py +++ b/findbad.py @@ -56,6 +56,8 @@ def collectPingAndSSH(nodename, cohash): echo ' "bmlog":"'`ls /tmp/bm.log`'",' echo ' "bootcd":"'`cat /mnt/cdrom/bootme/ID`'",' echo ' "nm":"'`ps ax | grep nm.py | grep -v grep`'",' + echo ' "readonlyfs":"'`touch /var/log/monitor 2>&1`'",' + echo ' "dns":"'`host boot.planet-lab.org 2>&1`'",' echo ' "princeton_comon":"'`ls -d /vservers/princeton_comon`'",' ID=`grep princeton_comon /etc/passwd | awk -F : '{if ( $3 > 500 ) { print $3}}'` @@ -70,9 +72,13 @@ EOF """) values['sshport'] = port break else: - values.update({'kernel': "", 'bmlog' : "", 'bootcd' : '', 'nm' : - '', 'princeton_comon' : '', 'princeton_comon_running' : '', - 'princeton_comon_procs' : '', 'sshport' : None}) + values.update({'kernel': "", 'bmlog' : "", 'bootcd' : '', + 'nm' : '', + 'readonlyfs' : '', + 'dns' : '', + 'princeton_comon' : '', + 'princeton_comon_running' : '', + 'princeton_comon_procs' : '', 'sshport' : None}) except: print traceback.print_exc() sys.exit(1) diff --git a/nodecommon.py b/nodecommon.py index 624ee2c..cbbc2c4 100644 --- a/nodecommon.py +++ 
b/nodecommon.py @@ -109,6 +109,16 @@ def diff_time(timestamp, abstime=True): t_str = "%s mnths ago" % int(t) return t_str +def getvalue(fb, path): + indexes = path.split("/") + values = fb + for index in indexes: + if index in values: + values = values[index] + else: + return None + return values + def nodegroup_display(node, fb, conf=None): if node['hostname'] in fb['nodes']: node['current'] = get_current_state(fb['nodes'][node['hostname']]['values']) diff --git a/nodehistory.py b/nodehistory.py index abbcee8..a7f030b 100755 --- a/nodehistory.py +++ b/nodehistory.py @@ -43,9 +43,10 @@ def fb_print_nodeinfo(fbnode, verbose, date=None): else: fbnode['bootcd'] = "unknown" fbnode['state'] = color_boot_state(get_current_state(fbnode)) + fbnode['boot_state'] = getvalue(fbnode, 'plcnode/boot_state') if len(fbnode['kernel'].split()) >= 3: fbnode['kernel'] = fbnode['kernel'].split()[2] - print " %(state)5s | %(boot_state)s | %(ssh)5.5s | %(pcu)5.5s | %(bootcd)6.6s | %(category)8.8s | %(kernel)s" % fbnode def pcu_print_info(pcuinfo, hostname): print " Checked: ", diff --git a/unified_model.py b/unified_model.py index e237bc9..ea52ebc 100755 --- a/unified_model.py +++ b/unified_model.py @@ -230,7 +230,7 @@ class PersistMessage(Message): #print "creating new object" obj = super(PersistMessage, typ).__new__(typ, [id, subject, message, via_rt], **kwargs) obj.id = id - obj.actiontracker = Recent(3*60*60*24) + obj.actiontracker = Recent(1*60*60*24) obj.ticket_id = None if 'ticket_id' in kwargs and kwargs['ticket_id'] is not None: @@ -259,6 +259,7 @@ class PersistMessage(Message): self.save() else: # NOTE: only send a new message every week, regardless. + # NOTE: this can cause thank-you messages to be lost, for instance when a node comes back online within the window. 
print "Not sending to host b/c not within window of %s days" % (self.actiontracker.withintime // (60*60*24)) class MonitorMessage(object): @@ -539,6 +540,8 @@ class Record(object): self.data['message'][1] % self.data['args'], True, db='monitor_persistmessages', ticket_id=ticket_id) + if self.data['stage'] == "improvement": + message.reset() return message else: return None