From 66b2c2382b2c4550f962a7224466f26a23510820 Mon Sep 17 00:00:00 2001 From: Stephen Soltesz Date: Fri, 2 Nov 2007 18:11:55 +0000 Subject: [PATCH] syncplcdb gets info from the PLC db necessary for site, node, and pcu associations. findbadpcu.py should output in native python pickle format, and be converted later using pkl2php.py. This will facilitate my using the input for diagnose and action.py --- emailTxt.py | 12 ++++- findbadpcu.py | 8 +-- printbadcsv.py | 138 +++++++++++++++++++++++++++++++++++++++++++++++++ syncplcdb.py | 73 ++++++++++++++++++++++++++ 4 files changed, 226 insertions(+), 5 deletions(-) create mode 100755 printbadcsv.py create mode 100755 syncplcdb.py diff --git a/emailTxt.py b/emailTxt.py index b029b20..b2d3435 100644 --- a/emailTxt.py +++ b/emailTxt.py @@ -3,7 +3,7 @@ # # Faiyaz Ahmed # -# $Id: emailTxt.py,v 1.9 2007/08/08 13:26:46 soltesz Exp $ +# $Id: emailTxt.py,v 1.10 2007/08/29 17:26:50 soltesz Exp $ # @@ -183,10 +183,20 @@ OLDBOOTCD- This state corresponds to the situation where an oldbootcd prevented ERROR- This is an error state, where there is absolutely no contact with PlanetLab. """) + + nmreset =("""NM Reset at %(loginbase)s""", + """ +Monitor restarted NM on the following machines: + +%(hostname_list)s + + """) + # TODO: need reminder versions for repeats... newdown=[newdown_one, newdown_two, newdown_three] newbootcd=[newbootcd_one, newbootcd_two, newbootcd_three] newthankyou=[thankyou,thankyou,thankyou] + NMReset=[nmreset,nmreset,nmreset] down=("""PlanetLab node %(hostname)s down.""", """As part of PlanetLab node monitoring, we noticed %(hostname)s has been down for %(days)s days.
diff --git a/findbadpcu.py b/findbadpcu.py index 0e06e17..365c281 100755 --- a/findbadpcu.py +++ b/findbadpcu.py @@ -355,7 +355,7 @@ def recordPingAndSSH(request, result): count += 1 print "%d %s %s" % (count, nodename, externalState['nodes'][pcu_id]['values']) - soltesz.dbDump(config.dbname, externalState, 'php') + soltesz.dbDump(config.dbname, externalState) # this will be called when an exception occurs within a thread def handle_exception(request, result): @@ -409,7 +409,7 @@ def checkAndRecordState(l_pcus, cohash): def main(): global externalState - externalState = soltesz.if_cached_else(1, config.dbname, lambda : externalState, 'php') + externalState = soltesz.if_cached_else(1, config.dbname, lambda : externalState) cohash = {} if config.increment: @@ -419,7 +419,7 @@ def main(): if config.filename == "": print "Calling API GetPCUs() : refresh(%s)" % config.refresh l_pcus = soltesz.if_cached_else_refresh(1, - config.refresh, "pculist", lambda : plc.GetPCUs(), 'php') + config.refresh, "pculist", lambda : plc.GetPCUs()) l_pcus = [pcu['pcu_id'] for pcu in l_pcus] else: l_pcus = config.getListFromFile(config.filename) @@ -450,5 +450,5 @@ if __name__ == '__main__': except Exception, err: print "Exception: %s" % err print "Saving data... exitting." 
- soltesz.dbDump(config.dbname, externalState, 'php') + soltesz.dbDump(config.dbname, externalState) sys.exit(0) diff --git a/printbadcsv.py b/printbadcsv.py new file mode 100755 index 0000000..8911a0a --- /dev/null +++ b/printbadcsv.py @@ -0,0 +1,138 @@ +#!/usr/bin/python +import soltesz +from config import config +from optparse import OptionParser +from printbadbysite import * + + +def main(): + db = soltesz.dbLoad(config.dbname) + + ## Field widths used for printing + maxFieldLengths = { 'nodename' : -45, + 'ping' : 6, + 'ssh' : 6, + 'pcu' : 7, + 'category' : 9, + 'state' : 5, + 'kernel' : 10.65, + 'comonstats' : 5, + 'plcsite' : 12, + 'bootcd' : 10.65} + ## create format string based on config.fields + fields = {} + format = "" + for f in config.fields.split(','): + fields[f] = "%%(%s)%ds" % (f, maxFieldLengths[f]) + for f in config.fields.split(','): + format += fields[f] + " " + + + d_n = db['nodes'] + l_nodes = d_n.keys() + + # category by site + #bysite = {} + #for nodename in l_nodes: + # if 'plcsite' in d_n[nodename]['values'] and \ + # 'login_base' in d_n[nodename]['values']['plcsite']: + # loginbase = d_n[nodename]['values']['plcsite']['login_base'] + # if loginbase not in bysite: + # bysite[loginbase] = [] + # d_n[nodename]['values']['nodename'] = nodename + # bysite[loginbase].append(d_n[nodename]['values']) + + # d2 was an array of [{node}, {}, ...] + # the bysite is a loginbase dict of [{node}, {node}] + d2 = [] + for nodename in l_nodes: + vals=d_n[nodename]['values'] + v = {} + v.update(vals) + v['nodename'] = nodename + if 'plcsite' in vals and 'status' in vals['plcsite'] and vals['plcsite']['status'] == "SUCCESS": + site_string = "%-20s %2s nodes :: %2s of %4s slices" % ( \ + vals['plcsite']['login_base'], + vals['plcsite']['num_nodes'], + vals['plcsite']['num_slices'], + vals['plcsite']['max_slices']) + v['site_string'] = site_string + d2.append(v) + else: + #print "ERROR: ", nodename, vals, "
" + pass + #site_string = "UNKNOWN" + + + if config.cmpping: + d2.sort(cmp=cmpPing) + elif config.cmpssh: + d2.sort(cmp=cmpSSH) + elif config.cmpcategory: + d2.sort(cmp=cmpCategory) + elif config.cmpstate: + d2.sort(cmp=cmpState) + elif config.cmpdays: + d2.sort(cmp=cmpDays) + elif config.cmpkernel: + d2.sort(cmp=cmpUname) + else: + d2.sort(cmp=cmpCategory) + + + for row in d2: + site_string = row['site_string'] + vals = row + # convert uname values into a single kernel version string + if 'kernel' in vals: + kernel = vals['kernel'].split() + if len(kernel) > 0: + if kernel[0] == "Linux": + vals['kernel'] = kernel[2] + else: + vals['ssherror'] = vals['kernel'] + vals['kernel'] = "" + else: + vals['ssherror'] = "" + vals['kernel'] = "" + continue + + str = format % vals + fields = str.split() + #print "" + s = fields_to_html(fields, vals) + + keys = categories.keys() + for cat in ['BOOT-ALPHA', 'BOOT-PROD', 'BOOT-OLDBOOTCD', 'DEBUG-ALPHA', + 'DEBUG-PROD', 'DEBUG-OLDBOOTCD', 'DOWN-ERROR']: + if cat not in keys: + categories[cat] = 0 + keys = categories.keys() + for cat in ['BOOT-ALPHA', 'BOOT-PROD', 'BOOT-OLDBOOTCD', 'DEBUG-ALPHA', + 'DEBUG-PROD', 'DEBUG-OLDBOOTCD', 'DOWN-ERROR']: + if cat in keys: + print "%d," % categories[cat], + print "" +import cgi +if __name__ == '__main__': + parser = OptionParser() + parser.set_defaults(cmpdays=False, + comon="sshstatus", + fields="nodename,ping,ssh,pcu,category,state,kernel,bootcd", + dbname="findbad", # -070724-1", + cmpping=False, + cmpssh=False, + cmpcategory=False, + cmpstate=False) + parser.add_option("", "--fields", dest="fields", help="") + parser.add_option("", "--dbname", dest="dbname", help="") + parser.add_option("", "--days", dest="cmpdays", action="store_true", help="") + parser.add_option("", "--ping", dest="cmpping", action="store_true", help="") + parser.add_option("", "--ssh", dest="cmpssh", action="store_true", help="") + parser.add_option("", "--category", dest="cmpcategory", action="store_true", help="")
+ parser.add_option("", "--kernel", dest="cmpkernel", action="store_true", help="") + parser.add_option("", "--state", dest="cmpstate", action="store_true", help="") + parser.add_option("", "--comon", dest="comon", help="") + config = config(parser) + config.parse_args() + main() diff --git a/syncplcdb.py b/syncplcdb.py new file mode 100755 index 0000000..0bae012 --- /dev/null +++ b/syncplcdb.py @@ -0,0 +1,73 @@ +#!/usr/bin/python + +import plc +from config import config +import soltesz +import sys + +config = config() + +def dsites_from_lsites(l_sites): + d_sites = {} + id2lb = {} + for site in l_sites: + if not site['login_base'] in d_sites: + d_sites[site['login_base']] = site + id2lb[site['site_id']] = site['login_base'] + else: + #print "Two sites have the same login_base value %s!" % site['login_base'] + sys.exit(1) + return (d_sites, id2lb) + +def dsn_from_dsln(d_sites, id2lb, l_nodes): + dsn = {} + hn2lb = {} + for node in l_nodes: + # this won't reach sites without nodes, which I guess isn't a problem. + if node['site_id'] in id2lb.keys(): + login_base = id2lb[node['site_id']] + else: + for i in id2lb: + print i, " ", id2lb[i] + raise Exception, "Node has missing site id!! 
%s %d" %(node['hostname'], node['site_id']) + if not login_base in dsn: + dsn[login_base] = {} + dsn[login_base]['plc'] = d_sites[login_base] + dsn[login_base]['monitor'] = {} # event log, or something + + hostname = node['hostname'] + dsn[login_base][hostname] = {} + dsn[login_base][hostname]['plc'] = node + dsn[login_base][hostname]['comon'] = {} + dsn[login_base][hostname]['monitor'] = {} + + hn2lb[hostname] = login_base + return (dsn, hn2lb) + +def create_plcdb(): + + # get sites, and stats + l_sites = plc.getSites({'peer_id':None}, ['login_base', 'site_id']) + if len(l_sites) == 0: + sys.exit(1) + (d_sites,id2lb) = dsites_from_lsites(l_sites) + + # get nodes at each site, and + l_nodes = plc.getNodes({'peer_id':None}, ['hostname', 'site_id', 'version', 'last_updated', 'date_created', 'last_contact', 'pcu_ids']) + (plcdb, hn2lb) = dsn_from_dsln(d_sites, id2lb, l_nodes) + + # save information for future. + id2lb = id2lb + hn2lb = hn2lb + db = plcdb + + if config.cachenodes: + soltesz.dbDump("plcdb_hn2lb", hn2lb) + soltesz.dbDump("l_plcnodes", l_nodes) + soltesz.dbDump("l_plcsites", l_sites) + + return l_nodes + + +if __name__ == '__main__': + create_plcdb() -- 2.43.0