updated description of error message from CoMon since it has changed.
M showlatlon.py
updated hardware spec thresholds to include more machines.
M clean_policy.py
stricter activation of the 'action-levels' that each event triggers.
Previously things were out of sorts.
M unified_model.py
works with the 'action-level' changes above.
M nodesets.py
M grouprins.py
add a site option
M nodecommon.py
add missing module
M bootman.py
M rtinfo.py
M todo
# By using the sequence identifier, we guarantee that there will be no
# frequent loops. I'm guessing there is a better way to track loops,
# though.
- if not config.force and pflags.getRecentFlag(s):
- pflags.setRecentFlag(s)
- pflags.save()
- print "... flag is set or it has already run recently. Skipping %s" % node
- return True
+ #if not config.force and pflags.getRecentFlag(s):
+ # pflags.setRecentFlag(s)
+ # pflags.save()
+ # print "... flag is set or it has already run recently. Skipping %s" % node
+ # return True
sequences = {}
fbnode['log'] = None
fbnode['time'] = time.time()
fbnode['email'] = TECH
+ fbnode['action-level'] = 0
fbnode['action'] = ['noop']
fbnode['date_created'] = time.time()
print "diagnose: checkStageAndTime Returned Valid Record"
site = PersistFlags(self.loginbase, 1, db='site_persistflags')
- if site.status != "good":
+ if "good" not in site.status: # != "good":
print "diagnose: Setting site %s for 'squeeze'" % self.loginbase
diag.setFlag('Squeeze')
else:
#print record.data['stage']
#print "improvement" in record.data['stage']
#print self.getSendEmailFlag(record)
- if self.getSendEmailFlag(record) or "monitor-end-record" in record.data['stage']:
+ print "%s %s DAYS DOWN" % ( self.hostname, Record.getDaysDown(record.data) )
+ if ( self.getSendEmailFlag(record) and Record.getDaysDown(record.data) >= 2 ) or \
+ "monitor-end-record" in record.data['stage']:
print "action: getting message"
message = record.getMessage(record.data['ticket_id'])
if message:
print "action: setting record ticket_id"
record.data['ticket_id'] = message.rt.ticket_id
- if (record.data['takeaction'] and diag.getFlag('Squeeze') ) or diag.getFlag('BackOff'):
+ if ( record.data['takeaction'] and diag.getFlag('Squeeze') ):
print "action: taking action"
- record.takeAction()
+ record.takeAction(record.data['action-level'])
diag.resetFlag('Squeeze')
+ diag.save()
+ if diag.getFlag('BackOff'):
+ record.takeAction(0)
diag.resetFlag('BackOff')
diag.save()
record.data['message'] = record.data['message_series'][0]
record.data['stage'] = 'stage_actinoneweek'
record.data['save-act-all'] = True
+ record.data['action-level'] = 0
elif 'reboot_node' in record.data['stage']:
record.data['email'] = TECH
record.data['stage'] = 'stage_actinoneweek'
record.data['takeaction'] = False
record.data['save-act-all'] = False
+ record.data['action-level'] = 0
elif 'improvement' in record.data['stage']:
print "checkStageAndTime: backing off of %s" % self.hostname
record.data['message'] = record.data['message_series'][0]
record.data['stage'] = 'monitor-end-record'
record.data['save-act-all'] = True
+ record.data['action-level'] = 0
elif 'actinoneweek' in record.data['stage']:
if delta >= 7 * SPERDAY:
record.data['time'] = current_time # reset clock for waitforever
record.data['takeaction'] = True
record.data['save-act-all'] = True
+ record.data['action-level'] = 1
elif delta >= 3* SPERDAY and not 'second-mail-at-oneweek' in record.data:
print "checkStageAndTime: second message in one week"
record.data['email'] = TECH
record.data['second-mail-at-oneweek'] = True
record.data['takeaction'] = False
record.data['save-act-all'] = True
+ record.data['action-level'] = 0
else:
record.data['message'] = None
record.data['action'] = ['waitforoneweekaction' ]
record.data['takeaction'] = False
record.data['save-act-all'] = False
+ record.data['action-level'] = 0
print "checkStageAndTime: ignoring this record for: %s" % self.hostname
#return None # don't send if there's no action
record.data['time'] = current_time # reset clock for waitforever
record.data['takeaction'] = True
record.data['save-act-all'] = True
+ record.data['action-level'] = 2
elif delta >= 3* SPERDAY and not 'second-mail-at-twoweeks' in record.data:
print "checkStageAndTime: second message in one week for stage two"
record.data['email'] = TECH | PI
record.data['second-mail-at-twoweeks'] = True
record.data['takeaction'] = False
record.data['save-act-all'] = True
+ record.data['action-level'] = 1
else:
record.data['message'] = None
record.data['takeaction'] = False
record.data['action'] = ['waitfortwoweeksaction']
record.data['save-act-all'] = False
print "checkStageAndTime: second message in one week for stage two"
+ record.data['action-level'] = 1
#return None # don't send if there's no action
elif 'ticket_waitforever' in record.data['stage']:
record.data['message'] = None
record.data['time'] = current_time
record.data['save-act-all'] = True
+ record.data['action-level'] = 2
else:
if delta >= 7*SPERDAY:
record.data['action'] = ['ticket_waitforever']
record.data['message'] = None
record.data['time'] = current_time # reset clock
record.data['save-act-all'] = True
+ record.data['action-level'] = 2
else:
record.data['action'] = ['ticket_waitforever']
record.data['message'] = None
record.data['takeaction'] = False
record.data['save-act-all'] = False
- return None
+ record.data['action-level'] = 2
+ #return None
elif 'waitforever' in record.data['stage']:
# more than 3 days since last action
record.data['message'] = record.data['message_series'][2]
record.data['time'] = current_time # reset clock
record.data['save-act-all'] = True
+ record.data['action-level'] = 2
else:
record.data['action'] = ['waitforever']
record.data['message'] = None
record.data['takeaction'] = False
record.data['save-act-all'] = False
+ record.data['action-level'] = 2
#return None # don't send if there's no action
else:
%(hostname_list)s
We're writing because we need your help returning them to their regular operation.
-To help, please confirm that a verison 3.0 or greater BootCD is installed in the machine. Then, after checking that the node is properly networked, power cycle the machine. Note that rebooting the machine may not fully resolve the problems we are seeing. Once the machine has come back up, please visit the Comon status page to verify that your node is accessible from the network. It may take several minutes before Comon registers your node. Until that time, visiting the link below will return an 'Internal Server Error'.
+To help, please confirm that a version 3.0 or greater BootCD is installed in the machine. Then, after checking that the node is properly networked, power cycle the machine. Note that rebooting the machine may not fully resolve the problems we are seeing. Once the machine has come back up, please visit the Comon status page to verify that your node is accessible from the network. It may take several minutes before Comon registers your node. Until that time, visiting the link below will return the message 'could not find requested table - probably empty'.
http://summer.cs.princeton.edu/status/tabulator.cgi?table=nodes/table_%(hostname)s&limit=50
%(hostname_list)s
We're writing again because our previous correspondence, sent only to the registered Technical Contact, has gone unacknowledged for at least a week, and we need your help returning these machines to their regular operation. We understand that machine maintenance can take time. So, while we wait for the machines to return to their regular operation slice creation has been suspended at your site. No new slices may be created, but the existing slices and services running within them will be unaffected.
-To help, please confirm that a verison 3.0 or greater BootCD is installed in the machine. Then, after checking that the node is properly networked, power cycle the machine. Note that rebooting the machine may not fully resolve the problems we are seeing. Once the machine has come back up, please visit the Comon status page to verify that your node is accessible from the network. It may take several minutes before Comon registers your node. Until that time, visiting the link below will return an 'Internal Server Error'.
+To help, please confirm that a version 3.0 or greater BootCD is installed in the machine. Then, after checking that the node is properly networked, power cycle the machine. Note that rebooting the machine may not fully resolve the problems we are seeing. Once the machine has come back up, please visit the Comon status page to verify that your node is accessible from the network. It may take several minutes before Comon registers your node. Until that time, visiting the link below will return the message 'could not find requested table - probably empty'.
http://summer.cs.princeton.edu/status/tabulator.cgi?table=nodes/table_%(hostname)s&limit=50
%(hostname_list)s
We understand that machine maintenance can take time. We're writing again because our previous correspondences, sent first to the registered Technical Contact then to the Site PI, have gone unacknowledged for at least two weeks, and we need your help returning these machines to their regular operation. This is the third time attempting to contact someone in regard to these machines at your site. So, while we wait for the machines to return to their regular operation all current slice activity will be suspended. Current experiments will be stopped and will not be able to start again until there is evidence that you have begun to help with the maintenance of these machines.
-To help, please confirm that a verison 3.0 or greater BootCD is installed in the machine. Then, after checking that the node is properly networked, power cycle the machine. Note that rebooting the machine may not fully resolve the problems we are seeing. Once the machine has come back up, please visit the Comon status page to verify that your node is accessible from the network. It may take several minutes before Comon registers your node. Until that time, visiting the link below will return an 'Internal Server Error'.
+To help, please confirm that a version 3.0 or greater BootCD is installed in the machine. Then, after checking that the node is properly networked, power cycle the machine. Note that rebooting the machine may not fully resolve the problems we are seeing. Once the machine has come back up, please visit the Comon status page to verify that your node is accessible from the network. It may take several minutes before Comon registers your node. Until that time, visiting the link below will return the message 'could not find requested table - probably empty'.
http://summer.cs.princeton.edu/status/tabulator.cgi?table=nodes/table_%(hostname)s&limit=50
nodelist = api.GetNodes(ng[0]['node_ids'])
hostnames = [ n['hostname'] for n in nodelist ]
+if config.site:
+ site = api.GetSites(config.site)
+ l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname'])
+ hostnames = [ n['hostname'] for n in l_nodes ]
+
if config.node or config.nodelist:
if config.node: hostnames = [ config.node ]
else: hostnames = util.file.getListFromFile(config.nodelist)
print "ALL METHODS OF RESTARTING %s FAILED" % host
args = {}
args['hostname'] = host
- m = PersistMessage(host, "ALL METHODS FAILED for %(hostname)s" % args,
- "CANNOT CONTACT", False, db='suspect_persistmessages')
- m.reset()
- m.send(['monitor-list@lists.planet-lab.org'])
+ #m = PersistMessage(host, "ALL METHODS FAILED for %(hostname)s" % args,
+ # "CANNOT CONTACT", False, db='suspect_persistmessages')
+ #m.reset()
+ #m.send(['monitor-list@lists.planet-lab.org'])
l = Log(host, record)
print l
import time
import util.file
import plc
+from datetime import datetime
from monitor import database
from unified_model import PersistFlags
esc = struct.pack('i', 27)
import os
from sets import Set
import parser as parsermodule
+import util.file
def main():
parser = parsermodule.getParser()
f1 = config.args[0]
f2 = config.args[1]
- s1 = config.getListFromFile(f1)
- s2 = config.getListFromFile(f2)
+ s1 = util.file.getListFromFile(f1)
+ s2 = util.file.getListFromFile(f2)
s = nodesets(config.operation, s1, s2)
print "Unknown operation: %s " % operation
return []
+
+if __name__ == "__main__":
+ main()
#print sql[id].keys()
#sys.exit(1)
key = "%(queue)s-%(owner)s-%(status)s-%(lastupdated)s-%(email)-30s-%(subj)s" % sql[id]
- sortkeys[key] = "%(queue)s %(lastupdated)s %(status)6s %(email)-25s %(owner)6s %(subj)26.26s https://rt.planet-lab.org/Ticket/Display.html?id=%(ticket_id)s" % sql[id]
+ sortkeys[key] = "%(ticket_id)s %(queue)s %(lastupdated)s %(status)6s %(email)-25s %(owner)6s %(subj)26.26s https://rt.planet-lab.org/Ticket/Display.html?id=%(ticket_id)s" % sql[id]
#sortkeys[key] = "%(ticket_id)s %(status)6s %(email)-30s %(lastupdated)s %(subj)s" % sql[id]
keys = sortkeys.keys()
for field in ['cpuspeed', 'memsize', 'disksize']:
if field not in cstat: cstat[field] = "null"
- if cstat['cpuspeed'] != "null" and float(cstat['cpuspeed']) < 2.4:
+ if cstat['cpuspeed'] != "null" and float(cstat['cpuspeed']) < 2.2:
return "BAD" # "cpu_slow",
- if cstat['memsize'] != "null" and float(cstat['memsize']) < 2.9:
+ if cstat['memsize'] != "null" and float(cstat['memsize']) < 2.8:
return "BAD" # "mem_small",
- if cstat['disksize'] != "null" and float(cstat['disksize']) < 320.0:
+ if cstat['disksize'] != "null" and float(cstat['disksize']) < 300.0:
return "BAD" # "disk_small",
if cstat['disksize'] == "null" and \
return "N/A"
try:
- if float(cstat['cpuspeed']) >= 2.4 and \
- float(cstat['memsize']) >= 2.9 and \
- (cstat['disksize'] == "null" or float(cstat['disksize']) >= 320.0):
+ if float(cstat['cpuspeed']) >= 2.2 and \
+ float(cstat['memsize']) >= 2.8 and \
+ (cstat['disksize'] == "null" or float(cstat['disksize']) >= 300.0):
return "A-OK"
except:
print cstat
- testapi.py
- findbad.py on sample site.
- nodebad.py
+ - findbadpcus.py
- nodequery.py
- nodegroups.py
- - findbadpcus.py
- loads webpage for those retrieved values to confirm setup succeeded.
* reimplement the config.py / .config mechanism. I'd like for many commands
from monitor import database
import plc
-api = plc.getAuthAPI()
-
import mailer
import time
def gethostlist(hostlist_file):
return util.file.getListFromFile(hostlist_file)
-
- #nodes = api.GetNodes({'peer_id' : None}, ['hostname'])
- #return [ n['hostname'] for n in nodes ]
def array_to_priority_map(array):
""" Create a mapping where each entry of array is given a priority equal
def getDaysDown(cls, diag_record):
daysdown = -1
- if diag_record['comonstats']['uptime'] != "null":
+ if diag_record['comonstats']['uptime'] != "null" and diag_record['comonstats']['uptime'] != "-1":
daysdown = - int(float(diag_record['comonstats']['uptime'])) // (60*60*24)
#elif diag_record['comonstats']['sshstatus'] != "null":
# daysdown = int(diag_record['comonstats']['sshstatus']) // (60*60*24)
# return "%d days up"% -daysdown
#getStrDaysDown = classmethod(getStrDaysDown)
- def takeAction(self):
+ def takeAction(self, index=0):
pp = PersistSitePenalty(self.hostname, 0, db='persistpenalty_hostnames')
if 'improvement' in self.data['stage'] or self.improved() or \
'monitor-end-record' in self.data['stage']:
else:
print "takeAction: increasing penalty for %s"%self.hostname
pp.increase()
+ pp.index = index
pp.apply(self.hostname)
pp.save()