try:
k = SSHKnownHosts(); k.update(node); k.write(); del k
except:
+ from nodecommon import email_exception
+ email_exception()
print traceback.print_exc()
return False
else:
session = PlanetLabSession(node, config.nosetup, config.verbose)
except Exception, e:
- print "ERROR setting up session for %s" % hostname
+ msg = "ERROR setting up session for %s" % hostname
+ print msg
print traceback.print_exc()
+ from nodecommon import email_exception
+ email_exception(msg)
print e
return False
conn = session.get_connection(config)
except:
print traceback.print_exc()
+ from nodecommon import email_exception
+ email_exception()
return False
if forced_action == "reboot":
args = {}
args['hostname'] = hostname
args['bmlog'] = conn.get_bootmanager_log().read()
- m = PersistMessage(hostname, mailtxt.plnode_network[0] % args, mailtxt.plnode_cfg[1] % args,
+ m = PersistMessage(hostname, mailtxt.plnode_cfg[0] % args, mailtxt.plnode_cfg[1] % args,
True, db='nodenet_persistmessages')
loginbase = plc.siteId(hostname)
emails = plc.getTechEmails(loginbase)
node = api.GetNodes(hostname)[0]
net = api.GetNodeNetworks(node['nodenetwork_ids'])[0]
except:
+ from nodecommon import email_exception
+ email_exception()
print traceback.print_exc()
# TODO: api error. skip email, b/c all info is not available,
# flag_set will not be recorded.
actnode.update(fbnode)
actnode['ticket_id'] = ""
actnode['prev_category'] = "ERROR"
+ actnode['prev_state'] = "DOWN"
else:
actnode['prev_category']= actnode['category']
+ actnode['prev_state'] = actnode['state']
actnode['comonstats'] = fbnode['comonstats']
actnode['category'] = fbnode['category']
actnode['state'] = fbnode['state']
actnode= self.getActionRecord()
actrec = self.mergeRecord(fbnode, actnode)
record = Record(self.hostname, actrec)
+ #print record
+ #print actrec
+ #print record.data['time']
+ #print time.time() - record.data['time']
diag = self.diagnose(record)
if self.act and diag is not None:
self.action(record,diag)
record.data['ticket_id'] = message.rt.ticket_id
if ( record.data['takeaction'] and diag.getFlag('Squeeze') ):
- print "action: taking action"
+ print "action: taking squeeze action"
record.takeAction(record.data['action-level'])
diag.resetFlag('Squeeze')
diag.save()
if diag.getFlag('BackOff'):
+ print "action: taking backoff action"
record.takeAction(0)
diag.resetFlag('BackOff')
diag.save()
raise Exception, "No such file %s" % name
- #import traceback
- #print traceback.print_stack()
#print "loading %s" % self.__file(name, type)
#sys.stderr.write("-----------------------------\n")
f = open(self.__file(name, type), 'r')
'princeton_comon_procs' : '', 'sshport' : None})
except:
print traceback.print_exc()
+ from nodecommon import email_exception
+ email_exception()
sys.exit(1)
### RUN SSH ######################
except:
b_except = True
traceback.print_exc()
+ from nodecommon import email_exception
+ email_exception()
plc_lock.release()
if b_except: return (None, None)
except:
b_except = True
traceback.print_exc()
+ from nodecommon import email_exception
+ email_exception()
plc_lock.release()
if b_except: return (None, None)
main()
except Exception, err:
print traceback.print_exc()
+ from nodecommon import email_exception
+ email_exception()
print "Exception: %s" % err
print "Saving data... exitting."
database.dbDump(config.dbname, externalState)
l_pcu = i
except:
traceback.print_exc()
+ from nodecommon import email_exception
+ email_exception()
l_pcu = None
plc_lock.release()
l_node.append(n)
except:
traceback.print_exc()
+ from nodecommon import email_exception
+ email_exception()
l_node = None
plc_lock.release()
break
except:
traceback.print_exc()
+ from nodecommon import email_exception
+ email_exception()
values = None
plc_lock.release()
except:
b_except = True
traceback.print_exc()
+ from nodecommon import email_exception
+ email_exception()
continue_probe = False
if b_except or not continue_probe: return (None, None, None)
time.sleep(1)
except Exception, err:
traceback.print_exc()
+ from nodecommon import email_exception
+ email_exception()
print "Exception: %s" % err
print "Saving data... exitting."
database.dbDump(config.dbname, externalState)
return ret
except Exception,e:
+ email_exception()
print traceback.print_exc(); print e
# NOTE: this failure could be an implementation issue on
return ret
except Exception,e:
+ email_exception()
print traceback.print_exc(); print e
# NOTE: this failure could be an implementation issue on
try:
return mailmonitor.reboot(host)
except Exception, e:
+ email_exception(host)
print traceback.print_exc(); print e
return False
try:
node = api.GetNodes(host)[0]
except:
+ email_exception()
print traceback.print_exc();
print "FAILED GETNODES for host: %s" % host
continue
# todo: send thank you, etc.
mailmonitor.reboot(host)
except Exception, e:
+ email_exception()
print traceback.print_exc(); print e
continue
print "Killed by interrupt"
sys.exit(0)
except:
+ email_exception()
print traceback.print_exc();
print "Continuing..."
except ExceptionTimeout:
import traceback; print traceback.print_exc()
return ("", "SCRIPTTIMEOUT")
+ except:
+ from nodecommon import email_exception
+ email_exception()
def system(self, cmd, timeout=COMMAND_TIMEOUT*2):
(o,e) = self.run(cmd, timeout)
raise Exception, "No such file %s" % name
- #import traceback
- #print traceback.print_stack()
#print "loading %s" % self.__file(name, type)
#sys.stderr.write("-----------------------------\n")
f = open(self.__file(name, type), 'r')
print "----------------"
import traceback
print traceback.print_exc()
+ from nodecommon import email_exception
+ email_exception()
print err
#if config.policysavedb:
sys.exit(1)
print "----------------"
import traceback
print traceback.print_exc()
+ from nodecommon import email_exception
+ email_exception()
print err
if config.policysavedb:
print "Saving Databases... act_all"
print "exception on message:"
import traceback
print traceback.print_exc()
+ from nodecommon import email_exception
+ email_exception()
print message
return ticket_id
l_plcnodes = database.dbLoad("l_plcnodes")
l_nodes = get_nodeset(config)
+ print len(l_nodes)
#if config.node:
# l_nodes = [config.node]
##else:
externalState['nodes'][nodename]['values'] = values
externalState['nodes'][nodename]['round'] = global_round
else:
+ pf = PersistFlags(nodename, 1, db='node_persistflags')
+ print "%d %35s %s since %s" % (count, nodename, pf.status, pf.last_changed)
+ del pf
count += 1
if count % 20 == 0:
except Exception, err:
import traceback
print traceback.print_exc()
+ from nodecommon import email_exception
+ email_exception()
print "Exception: %s" % err
print "Saving data... exitting."
database.dbDump(config.dbname, externalState)
return l_nodes
+# Mail the traceback of the exception currently being handled.
+#
+# Intended to be called from inside an `except` block: the message body is
+# whatever traceback.format_exc() returns at call time.
+#
+# content -- optional text (e.g. a hostname or an error summary) placed on
+#            its own line before the traceback; skipped when None or empty.
+#
+# Returns None.
+# NOTE(review): nothing here catches failures from Message(...) or send(), so
+# an error while mailing would propagate out of the caller's except handler --
+# confirm that is acceptable.
+def email_exception(content=None):
+ # Imports are function-local; presumably to avoid import cycles between
+ # this module and config/unified_model -- TODO confirm.
+ import config
+ from unified_model import Message
+ import traceback
+ # Formatted traceback of the exception currently being handled.
+ msg=traceback.format_exc()
+ if content:
+ # Prefix the caller-supplied context, separated by a newline.
+ msg = content + "\n" + msg
+ # Subject is fixed; the meaning of the third argument (False) is defined by
+ # unified_model.Message and is not visible here -- TODO confirm.
+ m=Message("exception running monitor", msg, False)
+ # Single recipient: the address configured as config.cc_email.
+ m.send([config.cc_email])
+ return
except:
print "Error with %s" % node
import traceback; print traceback.print_exc()
+ from nodecommon import email_exception
+ email_exception()
pass
# commands:
def fb_print_nodeinfo(fbnode, verbose, date=None):
if verbose: print " state | ssh | pcu | bootcd | category | kernel"
if 'checked' in fbnode:
- print "%11.11s " % diff_time(fbnode['checked']),
+ if date: print date,
+ #print "%11.11s " % diff_time(fbnode['checked']),
else:
if date: print date,
else: print "Unknown",
except KeyboardInterrupt:
sys.exit(1)
except:
- #import traceback; print traceback.print_exc()
print d.strftime("%Y-%m-%d"), "No record"
d = d + tdelta
except Exception, err:
import traceback
print traceback.print_exc()
+ from nodecommon import email_exception
+ email_exception()
print "Exception: %s" % err
print "Saving data... exitting."
database.dbDump(config.dbname, externalState)
print "----------------"
import traceback
print traceback.print_exc()
+ from nodecommon import email_exception
+ email_exception()
print err
#if config.policysavedb:
sys.exit(1)
import traceback
traceback.print_exc()
return "EOF connection reset" + str(err)
+ except:
+ from nodecommon import email_exception
+ email_exception()
+ raise Exception('unknown')
class IPAL(PCUControl):
"""
class BayTechGeorgeTown(PCUControl):
def run(self, node_port, dryrun):
+ # this initial open/close is to prevent things from raising an
+ # exception. the pcu always is weird during the first connection, and
+ # even if it's not, what does it matter to open a second connection
+ # right away?
+ self.open(self.host, self.username, None, "Enter user name:")
+ self.close()
+ time.sleep(1)
self.open(self.host, self.username, None, "Enter user name:")
self.sendPassword(self.password, "Enter Password:")
if self.verbose: print f.read()
except:
import traceback; traceback.print_exc()
+ from nodecommon import email_exception
+ email_exception()
# fetch url one more time on cmd.html, econtrol.html or whatever.
# pass
print "failed"
except Exception, err:
import traceback; traceback.print_exc()
+ from nodecommon import email_exception
+ email_exception()
print err
if __name__ == '__main__':
from nodecommon import color_pcu_state, datetime_fromstr
from nodehistory import get_filefromglob
import time
-import traceback
# region
# total
'hardware' : gethardwarequality(hostname, fb),
'pcuok' : color_pcu_state(fb['nodes'][hostname]['values']) }
#except:
- # print traceback.print_exc()
# print args
# print fb['nodes'][hostname]['values']
results.append("%(cc)7s %(status)8s %(hardware)8s %(pcuok)8s %(site)15s %(host)42s " % args)
externalState['sites'][sitename]['values'] = values
externalState['sites'][sitename]['round'] = global_round
else:
+ pf = PersistFlags(sitename, 1, db=config.dbpfname )
+ print "%d noinc %15s slices(%2s) nodes(%2s) up(%2s) %s" % (count, sitename, pf.slices_used,
+ pf.nodes_total, pf.nodes_up, pf.status)
count += 1
if count % 20 == 0:
return None
if sitename in lb2hn:
- pf = PersistFlags(sitename, 1, db='site_persistflags')
+ pf = PersistFlags(sitename, 1, db=config.dbpfname )
if not pf.checkattr('last_changed'):
pf.last_changed = time.time()
parser = parsermodule.getParser()
parser.set_defaults(filename=None, node=None, site=None, nodeselect=False, nodegroup=None,
- increment=False, dbname="sitebad", cachenodes=False)
+ increment=False, dbname="sitebad", dbpfname="site_persistflags", cachenodes=False)
parser.add_option("", "--site", dest="site", metavar="login_base",
help="Provide a single site to operate on")
parser.add_option("", "--sitelist", dest="sitelist", metavar="file.list",
parser.add_option("", "--dbname", dest="dbname", metavar="FILE",
help="Specify the name of the database to which the information is saved")
+ parser.add_option("", "--dbpfname", dest="dbpfname", metavar="FILE",
+ help="Specify the persistflags db name")
parser.add_option("-i", "--increment", action="store_true", dest="increment",
help="Increment round number to force refresh or retry")
config = parsermodule.parse_args(parser)
except Exception, err:
import traceback
print traceback.print_exc()
+ from nodecommon import email_exception
+ email_exception()
print "Exception: %s" % err
print "Saving data... exitting."
database.dbDump(config.dbname, externalState)
return CMD.run(self,cmd,timeout)
except ExceptionTimeout:
import traceback; print traceback.print_exc()
+ from nodecommon import email_exception
+ email_exception()
return ("", "SCRIPTTIMEOUT")
def system(self, cmd, timeout=COMMAND_TIMEOUT*2):
network = api.GetNodeNetworks(node['nodenetwork_ids'])
print "ok"
except:
- sys.stderr.write(traceback.print_exc())
+ sys.stderr.write(traceback.format_exc())
+ from nodecommon import email_exception
+ email_exception()
print "fail"
if v1 == 'ALPHA': v1 = "PROD"
if v2 == 'ALPHA': v2 = "PROD"
#map = array_to_priority_map([ None, 'PROD', 'ALPHA', 'OLDBOOTCD', 'UNKNOWN', 'FORCED', 'ERROR', ])
- map = array_to_priority_map([ None, 'ALPHA', 'PROD', 'OLDBOOTCD', 'UNKNOWN', 'FORCED', 'ERROR', ])
+ map = array_to_priority_map([ None, 'ALPHA', 'PROD', 'OLDPROD', 'OLDBOOTCD', 'UNKNOWN', 'FORCED', 'ERROR', ])
return cmpValMap(v1,v2,map)
#print pm
if id in pm:
- print "Using existing object"
+ print "PersistSitePenalty Using existing object"
obj = pm[id]
else:
print "creating new object"
def severity(self):
category = self.data['category']
prev_category = self.data['prev_category']
- #print "SEVERITY: ", category, prev_category
+ print "SEVERITY: ", category, prev_category
+ try:
+ print "SEVERITY state: ", self.data['state'], self.data['prev_state']
+ except:
+ print "SEVERITY state: unknown unknown"
val = cmpCategoryVal(category, prev_category)
return val
else:
print "takeAction: increasing penalty for %s"%self.hostname
pp.increase()
+ print "takeAction: applying penalty to %s as index %s"% (self.hostname, index)
pp.index = index
pp.apply(self.hostname)
pp.save()