with the new clean_policy and unified_model.
#from Rpyc import SocketConnection, Async
from Rpyc import SocketConnection, Async
from Rpyc.Utils import *
+fb = None
def get_fbnode(node):
- fb = database.dbLoad("findbad")
+ global fb
+ if fb is None:
+ fb = database.dbLoad("findbad")
fbnode = fb['nodes'][node]['values']
return fbnode
except:
print traceback.print_exc()
return False
-
if forced_action == "reboot":
conn.restart_node('rins')
return None
class MonitorMergeDiagnoseSendEscellate:
+ act_all = None
+ fb = None
+
def __init__(self, hostname, act):
self.hostname = hostname
self.act = act
return
def getFBRecord(self):
- fb = database.dbLoad("findbad")
+ if MonitorMergeDiagnoseSendEscellate.fb == None:
+ MonitorMergeDiagnoseSendEscellate.fb = database.dbLoad("findbad")
+
+ fb = MonitorMergeDiagnoseSendEscellate.fb
+
if self.hostname in fb['nodes']:
fbnode = fb['nodes'][self.hostname]['values']
else:
def getActionRecord(self):
# update ticket status
- act_all = database.dbLoad("act_all")
+ if MonitorMergeDiagnoseSendEscellate.act_all == None:
+ MonitorMergeDiagnoseSendEscellate.act_all = database.dbLoad("act_all")
+
+ act_all = MonitorMergeDiagnoseSendEscellate.act_all
+
if self.hostname in act_all and len(act_all[self.hostname]) > 0:
actnode = act_all[self.hostname][0]
else:
actnode = None
- del act_all
return actnode
def getKernel(self, unamestr):
fbnode['info'] = None
fbnode['log'] = None
fbnode['time'] = time.time()
+ fbnode['email'] = TECH
+ fbnode['action'] = ['noop']
fbnode['date_created'] = time.time()
- if actnode is None:
+ if actnode is None: # there is no entry in act_all
actnode = {}
actnode.update(fbnode)
actnode['ticket_id'] = ""
- actnode['prev_category'] = "NORECORD"
+ actnode['prev_category'] = "ERROR"
else:
actnode['prev_category']= actnode['category']
actnode['comonstats'] = fbnode['comonstats']
diag = PersistFlags(record.hostname, 60*60*24, db='persist_diagnose_flags')
# NOTE: change record stage based on RT status.
- diag.setFlag('ResetStage')
+ #diag.setFlag('ResetStage')
if record.stageIswaitforever():
ticket = record.data['rt']
if 'new' in ticket['Status']:
- diag.setFlag('ResetStage')
+ print "Resetting Stage!!!!!"
+ # diag.setFlag('ResetStage')
+ record.reset_stage()
+ #if diag.getFlag('ResetStage'):
+ # print "diagnose: resetting stage"
+ # diag.resetFlag('ResetStage')
if 'resolved' in ticket['Status']:
- diag.setFlag('EndRecord')
+ diag.setFlag('RTEndRecord')
# NOTE: take category, and prepare action
category = record.getCategory()
if category == "error":
diag.setFlag('SendNodedown')
- record.data['message'] = emailTxt.mailtxt.newdown
+ record.data['message_series'] = emailTxt.mailtxt.newdown
record.data['log'] = self.getDownLog(record)
- elif category == "prod":
+ elif category == "prod" or category == "alpha":
state = record.getState()
if state == "boot":
- diag.setFlag('SendThankyou')
- record.data['message'] = emailTxt.mailtxt.newthankyou
- record.data['log'] = self.getThankyouLog(record)
-
+ if record.severity() != 0:
+ diag.setFlag('SendThankyou')
+ print "RESETTING STAGE: improvement"
+ record.data['stage'] = 'improvement'
+ record.data['message_series'] = emailTxt.mailtxt.newthankyou
+ record.data['log'] = self.getThankyouLog(record)
+ else:
+ # NOTE: do nothing, since we've already done the above.
+ print "DIAGNOSED: %s is boot. no further action necessary." % record.hostname
+ return None
elif state == "debug":
pass
else:
else:
print "unknown category: %s" % category
- if diag.getFlag('ResetStage'):
- print "resetting stage"
- record.reset_stage()
+ # TODO: how to not send email?...
record = self.checkStageAndTime(diag,record)
- if record:
- print "checkStageAndTime Returned Valid Record"
- site = PersistFlags(self.loginbase, 1, db='site_persistflags')
+ #if record:
+ print "diagnose: checkStageAndTime Returned Valid Record"
+ site = PersistFlags(self.loginbase, 1, db='site_persistflags')
- if site.status is not "good":
- print "Setting site %s for 'squeeze'" % self.loginbase
- diag.setFlag('Squeeze')
- else:
- print "Setting site %s for 'backoff'" % self.loginbase
- diag.setFlag('BackOff')
-
- diag.save()
- return diag
+ if site.status != "good":
+ print "diagnose: Setting site %s for 'squeeze'" % self.loginbase
+ diag.setFlag('Squeeze')
else:
- print "checkStageAndTime Returned NULL Record"
- return None
+ print "diagnose: Setting site %s for 'backoff'" % self.loginbase
+ diag.setFlag('BackOff')
+
+ diag.save()
+ return diag
+ #else:
+ # print "checkStageAndTime Returned NULL Record"
+ # return None
def action(self, record, diag):
- if record.improved() or diag.getFlag('EndRecord'):
- print "end record for %s" % self.hostname
- record.end_record()
- diag.setFlag('CloseRT')
- return None
-
- if self.getSendEmailFlag(record):
- print "sending email"
+
+ message = None
+
+ #print record.data['stage']
+ #print "improvement" in record.data['stage']
+ #print self.getSendEmailFlag(record)
+ if self.getSendEmailFlag(record) or "monitor-end-record" in record.data['stage']:
+ print "action: getting message"
message = record.getMessage(record.data['ticket_id'])
- message.reset()
- message.send(record.getContacts())
- if message.rt.ticket_id:
- print "setting record ticket_id"
- record.data['ticket_id'] = message.rt.ticket_id
- if diag.getFlag('CloseRT'):
- message.rt.closeTicket()
+ if message:
+ #message.reset()
+ print "action: sending email"
+ message.send(record.getContacts())
+ #print "DEBUG NOT SENDING MESSAGE WHEN I SHOULD BE!!!!!"
+ #print "DEBUG NOT SENDING MESSAGE WHEN I SHOULD BE!!!!!"
+ #print "DEBUG NOT SENDING MESSAGE WHEN I SHOULD BE!!!!!"
+ #print message
+ if message.rt.ticket_id:
+ print "action: setting record ticket_id"
+ record.data['ticket_id'] = message.rt.ticket_id
+
+ if (record.data['takeaction'] and diag.getFlag('Squeeze') ) or diag.getFlag('BackOff'):
+ print "action: taking action"
+ record.takeAction()
+ diag.resetFlag('Squeeze')
+ diag.resetFlag('BackOff')
+ diag.save()
+
+ if record.saveAction():
+ print "action: saving act_all db"
+ self.add_and_save_act_all(record)
+ else:
+ print "action: NOT saving act_all db"
+ print "stage: %s %s" % ( record.data['stage'], record.data['save-act-all'] )
+
+ if record.improved() or diag.getFlag('RTEndRecord'):
+ print "action: end record for %s" % self.hostname
+ record.end_record()
+ diag.setFlag('CloseRT')
+ diag.resetFlag('RTEndRecord')
+ diag.save()
+ #return None
+
+ if message:
+ if diag.getFlag('CloseRT'):
+ message.rt.closeTicket()
+ diag.resetFlag('CloseRT')
+ diag.save()
+
else:
print "NOT sending email : %s %s" % (config.mail, record.data['rt'])
- if record.data['takeaction'] and diag.getFlag('Squeeze'):
- print "taking action"
- record.takeAction()
-
- print "saving act_all db"
- self.add_and_save_act_all(record)
-
return
def getSendEmailFlag(self, record):
if 'rt' in record.data and \
'Status' in record.data['rt'] and \
"open" in record.data['rt']['Status'] and \
- record.data['rt']['Created'] < 60*60*24*30:
+ record.data['rt']['Created'] > int(time.time() - 60*60*24*30):
+ # i.e. the open ticket was created less than thirty days before the current time
return False
return True
def add_and_save_act_all(self, record):
self.act_all = database.dbLoad("act_all")
+ if self.hostname not in self.act_all:
+ self.act_all[self.hostname] = []
self.act_all[self.hostname].insert(0,record.data)
database.dbDump("act_all", self.act_all)
#for key in record.data.keys():
# print "%10s %s %s " % (key, "==", record.data[key])
- if record.data['ticket_id'] == "":
+ if record.data['ticket_id'] == "" and 'found_rt_ticket' in record.data:
log = "DOWN: %20s : %-40s == %20s %s" % \
(self.loginbase, self.hostname, record.data['info'][1:], record.data['found_rt_ticket'])
else:
record.data['args'] = {'nodename': self.hostname}
record.data['info'] = (self.hostname, record.data['prev_category'], record.data['category'])
- if record.data['ticket_id'] == "":
- log = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
+ try:
+ if record.data['ticket_id'] == "" and 'found_rt_ticket' in record.data:
+ log = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
(self.loginbase, self.hostname, record.data['stage'],
- state, category, record.data['found_rt_ticket'])
- else:
- log = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
+ record.data['prev_category'], record.data['category'], record.data['found_rt_ticket'])
+ else:
+ log = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
(self.loginbase, self.hostname, record.data['stage'],
- state, category, record.data['ticket_id'])
+ record.data['prev_category'], record.data['category'], record.data['ticket_id'])
+ except:
+ log = "IMPR: %s improved to %s " % (self.hostname, record.data['category'])
return log
def checkStageAndTime(self, diag, record):
current_time = time.time()
delta = current_time - record.data['time']
+ #print record.data
if 'findbad' in record.data['stage']:
# The node is bad, and there's no previous record of it.
record.data['email'] = TECH
record.data['action'] = ['noop']
record.data['takeaction'] = False
- record.data['message'] = record.data['message'][0]
+ record.data['message'] = record.data['message_series'][0]
record.data['stage'] = 'stage_actinoneweek'
+ record.data['save-act-all'] = True
elif 'reboot_node' in record.data['stage']:
record.data['email'] = TECH
record.data['action'] = ['noop']
- record.data['message'] = record.data['message'][0]
+ record.data['message'] = record.data['message_series'][0]
record.data['stage'] = 'stage_actinoneweek'
record.data['takeaction'] = False
+ record.data['save-act-all'] = False
elif 'improvement' in record.data['stage']:
- print "backing off of %s" % self.hostname
+ print "checkStageAndTime: backing off of %s" % self.hostname
record.data['action'] = ['close_rt']
record.data['takeaction'] = True
- record.data['message'] = record.data['message'][0]
+ record.data['message'] = record.data['message_series'][0]
record.data['stage'] = 'monitor-end-record'
+ record.data['save-act-all'] = True
elif 'actinoneweek' in record.data['stage']:
if delta >= 7 * SPERDAY:
+ print "checkStageAndTime: transition to next stage actintwoweeks"
record.data['email'] = TECH | PI
record.data['stage'] = 'stage_actintwoweeks'
- record.data['message'] = record.data['message'][1]
+ record.data['message'] = record.data['message_series'][1]
record.data['action'] = ['nocreate' ]
record.data['time'] = current_time # reset clock for waitforever
record.data['takeaction'] = True
+ record.data['save-act-all'] = True
elif delta >= 3* SPERDAY and not 'second-mail-at-oneweek' in record.data:
+ print "checkStageAndTime: second message in one week"
record.data['email'] = TECH
- record.data['message'] = record.data['message'][0]
+ record.data['message'] = record.data['message_series'][0]
record.data['action'] = ['sendmailagain-waitforoneweekaction' ]
record.data['second-mail-at-oneweek'] = True
record.data['takeaction'] = False
+ record.data['save-act-all'] = True
else:
record.data['message'] = None
record.data['action'] = ['waitforoneweekaction' ]
- print "ignoring this record for: %s" % self.hostname
- return None # don't send if there's no action
+ record.data['takeaction'] = False
+ record.data['save-act-all'] = False
+ print "checkStageAndTime: ignoring this record for: %s" % self.hostname
+ #return None # don't send if there's no action
elif 'actintwoweeks' in record.data['stage']:
if delta >= 7 * SPERDAY:
+ print "checkStageAndTime: transition to next stage waitforever"
record.data['email'] = TECH | PI | USER
record.data['stage'] = 'stage_waitforever'
- record.data['message'] = record.data['message'][2]
+ record.data['message'] = record.data['message_series'][2]
record.data['action'] = ['suspendslices']
record.data['time'] = current_time # reset clock for waitforever
record.data['takeaction'] = True
+ record.data['save-act-all'] = True
elif delta >= 3* SPERDAY and not 'second-mail-at-twoweeks' in record.data:
+ print "checkStageAndTime: second message in one week for stage two"
record.data['email'] = TECH | PI
- record.data['message'] = record.data['message'][1]
+ record.data['message'] = record.data['message_series'][1]
record.data['action'] = ['sendmailagain-waitfortwoweeksaction' ]
record.data['second-mail-at-twoweeks'] = True
record.data['takeaction'] = False
+ record.data['save-act-all'] = True
else:
record.data['message'] = None
+ record.data['takeaction'] = False
record.data['action'] = ['waitfortwoweeksaction']
- return None # don't send if there's no action
+ record.data['save-act-all'] = False
+ print "checkStageAndTime: second message in one week for stage two"
+ #return None # don't send if there's no action
elif 'ticket_waitforever' in record.data['stage']:
record.data['email'] = TECH
record.data['action'] = ['ticket_waitforever']
record.data['message'] = None
record.data['time'] = current_time
+ record.data['save-act-all'] = True
else:
if delta >= 7*SPERDAY:
record.data['action'] = ['ticket_waitforever']
record.data['message'] = None
record.data['time'] = current_time # reset clock
+ record.data['save-act-all'] = True
else:
record.data['action'] = ['ticket_waitforever']
record.data['message'] = None
+ record.data['takeaction'] = False
+ record.data['save-act-all'] = False
return None
elif 'waitforever' in record.data['stage']:
record.data['takeaction'] = True
if delta >= 3*SPERDAY:
record.data['action'] = ['email-againwaitforever']
- record.data['message'] = record.data['message'][2]
+ record.data['message'] = record.data['message_series'][2]
record.data['time'] = current_time # reset clock
+ record.data['save-act-all'] = True
else:
record.data['action'] = ['waitforever']
record.data['message'] = None
- return None # don't send if there's no action
+ record.data['takeaction'] = False
+ record.data['save-act-all'] = False
+ #return None # don't send if there's no action
else:
# There is no action to be taken, possibly b/c the stage has
# TODO: figure out which. for now assume 2.
print "UNKNOWN stage for %s; nothing done" % self.hostname
record.data['action'] = ['unknown']
- record.data['message'] = record.data['message'][0]
+ record.data['message'] = record.data['message_series'][0]
record.data['email'] = TECH
record.data['action'] = ['noop']
- record.data['message'] = record.data['message'][0]
+ record.data['message'] = record.data['message_series'][0]
record.data['stage'] = 'stage_actinoneweek'
record.data['time'] = current_time # reset clock
record.data['takeaction'] = False
+ record.data['save-act-all'] = True
print "%s" % record.data['log'],
print "%15s" % record.data['action']
sudo /usr/sbin/vps ax
-If you have a BootCD older than 3.0, you will need to create burn a new BootImage to CD or USB. You can find instructions for this at the Technical Contact's Guide:
+If you have a BootCD older than 3.0, you will need to create a new BootImage on CD or USB. You can find instructions for this at the Technical Contact's Guide:
https://www.planet-lab.org/doc/guides/bootcdsetup
""")
pcudown_one =("""Could not use PCU to reboot %(hostname)s""",
-"""As part of our machine monitoring and maintenance, we tried to use the PCU
-registered below, but could not for the following reason at the link below:
+"""As part of PlanetLab node monitoring and maintenance, we tried to use the PCU
+registered below, but could not for the reasons at the link below:
https://monitor.planet-lab.org/cgi-bin/printbadpcus.php?id=%(pcu_id)s
-We need your help resolving this issue in two ways:
+We need your help resolving this issue in a few ways:
+
+ 1. First, we need your help rebooting %(hostname)s. Because the above PCU does
+ not appear to work, please manually reboot this machine. If it turns out that
+ there is a problem with the PCU configuration, we can help you
+ resolve that independently.
-* First, we need your help rebooting %(hostname)s. Because we cannot leverage
- the above PCU, please manually reboot this machine and we can help you
- resolve any configuration errors with the PCU independently.
+ 2. If there is nothing apparently wrong with the PCU, or the mapping between
+ the PCU and the host, then there is likely a problem with our bootstrap
+ software on your machine. To help us, please make a note of any text on
+ the console and report it to mailto:support@planet-lab.org . An example
+ might be that the console hangs waiting for a module to unload. The last
+ reported name or any error messages on the screen would be very helpful.
-* Second, if it is possible, please correcct the above PCU problem.
- By enabling us to take administrative actions automatically from
- PlanetLab Central without local intervention, you can trade a small
- amount of time now for a time savings in the future.
+ 3. Alternately, if it is possible, please correct the above PCU problem, or
+ let us know what steps you are taking. By enabling us to take administrative
+ actions automatically from PlanetLab Central without your intervention, you
+ can trade a small amount of time now for a time savings in the future.
If the PCU is up and running, but behind a firewall, please make it accessible
-from address block 128.112.139.0/25. You can confirm that this is the address
+from address block 128.112.139.0/24. You can confirm that this is the address
space from which the PlanetLab Central servers run.
If the above PCU is no longer in service, please delete it by visiting:
import signal
import traceback
+from nodequery import pcu_select
#old_handler = signal.getsignal(signal.SIGCHLD)
global count
global_round = externalState['round']
- tp = threadpool.ThreadPool(20)
+ tp = threadpool.ThreadPool(10)
# CREATE all the work requests
for pcuname in l_pcus:
pcus = []
for node in l_nodes:
pcus += node['pcu_ids']
+ # clear out dups.
+ l_pcus = [pcu for pcu in sets.Set(pcus)]
+ elif config.pcuselect is not None:
+ n, pcus = pcu_select(config.pcuselect)
+ # clear out dups.
l_pcus = [pcu for pcu in sets.Set(pcus)]
elif config.nodelist == None and config.pcuid == None:
parser.set_defaults(nodelist=None,
increment=False,
pcuid=None,
+ pcuselect=None,
site=None,
dbname="findbadpcus",
cachenodes=False,
help="Provide the input file for the node list")
parser.add_option("", "--site", dest="site", metavar="FILE",
help="Get all pcus associated with the given site's nodes")
+ parser.add_option("", "--pcuselect", dest="pcuselect", metavar="FILE",
+ help="Query string to apply to the findbad pcus")
parser.add_option("", "--pcuid", dest="pcuid", metavar="id",
help="Provide the id for a single pcu")
self.action = "reboot.reboot('%s')" % host
pflags = PersistFlags(host, 2*60*60*24, db='pcu_persistflags')
- pflags.resetRecentFlag('pcutried')
+ #pflags.resetRecentFlag('pcutried')
if not pflags.getRecentFlag('pcutried'):
- pflags.setRecentFlag('pcutried')
try:
print "CALLING REBOOT!!!"
ret = reboot.reboot(host)
+ pflags.setRecentFlag('pcutried')
+ pflags.save()
+ return ret
+
+ except Exception,e:
+ print traceback.print_exc(); print e
+
+ # NOTE: this failure could be an implementation issue on
+ # our end. So, extra notices are confusing...
+ # self._send_pcunotice(host)
+
+ pflags.setRecentFlag('pcufailed')
+ pflags.save()
+ return False
+
+ elif not pflags.getRecentFlag('pcu_rins_tried'):
+ try:
+ # set node to 'rins' boot state.
+ print "CALLING REBOOT +++ RINS"
+ plc.nodeBootState(host, 'rins')
+ ret = reboot.reboot(host)
+
+ pflags.setRecentFlag('pcu_rins_tried')
pflags.save()
return ret
pflags.setRecentFlag('pcumessagesent')
pflags.save()
- # NOTE: this will result in just one message sent at a time.
- return True
- else:
- print "GetRecentFlag()"
- return False
+ # This will result in mail() being called next, to try to
+ # engage the technical contact to take care of it also.
+ print "RETURNING FALSE"
+ return False
+
else:
print "NO PCUOK"
self.action = "None"
force=False,
nosetup=False,
verbose=False,
- stopkey=None,
- stopvalue=None,
quiet=False,
)
if config.node or config.nodelist:
if config.node: hostnames = [ config.node ]
- else: hostnames = config.getListFromFile(config.nodelist)
+ else: hostnames = util.file.getListFromFile(config.nodelist)
fb = database.dbLoad("findbad")
# rerun findbad with the nodes in the given nodes.
file = "findbad.txt"
util.file.setFileFromList(file, hostnames)
- os.system("./findbad.py --cachenodes --debug=0 --dbname=findbad --increment --nodelist %s" % file)
+ os.system("./findbad.py --cachenodes --increment --nodelist %s" % file)
+ # TODO: shouldn't we reload the node list now?
+l_blacklist = database.if_cached_else(1, "l_blacklist", lambda : [])
# commands:
i = 1
count = 1
+#print "hosts: %s" % hostnames
for host in hostnames:
#if 'echo' in host or 'hptest-1' in host: continue
+
try:
try:
node = api.GetNodes(host)[0]
print "%-2d" % i, nodegroup_display(node, fb)
i += 1
if i-1 <= int(config.skip): continue
+ if host in l_blacklist:
+ print "%s is blacklisted. Skipping." % host
+ continue
if config.stopselect:
dict_query = query_to_dict(config.stopselect)
if verify(dict_query, fbnode) and observed_state != "dbg ":
# evaluates to true, therefore skip.
print "%s evaluates true for %s ; skipping..." % ( config.stopselect, host )
- continue
-
- if config.stopkey and config.stopvalue:
- fbnode = fb['nodes'][host]['values']
- observed_state = get_current_state(fbnode)
+ try:
+ # todo: clean up act_all record here.
+ # todo: send thank you, etc.
+ mailmonitor.reboot(host)
+ except Exception, e:
+ print traceback.print_exc(); print e
- if config.stopkey in fbnode:
- if config.stopvalue in fbnode[config.stopkey] and observed_state != "dbg ":
- print "%s has stopvalue; skipping..." % host
- continue
- else:
- print "stopkey %s not in fbnode record for %s; skipping..." % (config.stopkey, host)
- print fbnode
continue
+ #else:
+ #print "%s failed to match %s: -%s-" % ( host, dict_query, observed_state )
+ #sys.exit(1)
if not config.force and rebootlog.find(host, {'action' : ".*reboot"}, 60*60*2):
print "recently rebooted %s. skipping... " % host
return
def emailViaRT(subject, text, to, ticket_id=None):
- if ticket_id == None or ticket_id == "":
+ if ticket_id == None or ticket_id == "" or ticket_id == 0:
print "No TICKET"
return emailViaRT_NoTicket(subject, text, to)
from clean_policy import *
def reboot(hostname):
- print "calling reboot!!! %s " % hostname
+ print "CALLING: mailmonitor.reboot(%s)" % hostname
l_nodes = api.GetNodes(hostname)
if len(l_nodes) == 0:
if len(l_nodes) == 0:
raise Exception("Host removed via blacklist: %s" % hostname)
- ad_dbTickets = database.if_cached_else_refresh(True, False, "ad_dbTickets", lambda : [])
- if ad_dbTickets == None:
- raise Exception("Could not find cached dbTickets")
+ #ad_dbTickets = database.if_cached_else_refresh(True, False, "ad_dbTickets", lambda : [])
+ #if ad_dbTickets == None:
+ # raise Exception("Could not find cached dbTickets")
- print "starting new thing"
+ #print "starting new thing"
mon = MonitorMergeDiagnoseSendEscellate(hostname, True)
mon.run()
node['pcu'] = fb['nodes'][node['hostname']]['values']['pcu']
node['lastupdate'] = diff_time(node['last_contact'])
pf = PersistFlags(node['hostname'], 1, db='node_persistflags')
- node['lc'] = diff_time(pf.last_changed)
+ try:
+ node['lc'] = diff_time(pf.last_changed)
+ except:
+ node['lc'] = "err"
ut = fb['nodes'][node['hostname']]['values']['comonstats']['uptime']
if ut != "null":
ut = diff_time(float(fb['nodes'][node['hostname']]['values']['comonstats']['uptime']), False)
node['uptime'] = ut
- return "%(hostname)-42s %(boot_state)8s %(current)5s %(pcu)6s %(key)10.10s... %(kernel)33s %(lastupdate)12s, %(lc)s, %(uptime)s" % node
+ return "%(hostname)-42s %(boot_state)8s %(current)5s %(pcu)6s %(key)10.10s... %(kernel)35.35s %(lastupdate)12s, %(lc)s, %(uptime)s" % node
def datetime_fromstr(str):
if '-' in str:
from nodecommon import *
import database
+import util.file
def main():
fb = database.dbLoad("findbad")
if config.node:
hostlist = [ config.node ]
else:
- hostlist = config.getListFromFile(config.nodelist)
+ hostlist = util.file.getListFromFile(config.nodelist)
# NOTE: preserve order given in file. Otherwise, return values are not in order
# given to GetNodes
if 'rt' in actnode and 'Status' in actnode['rt']:
print "\t %5.5s %5.5s | %8.8s | %15.15s | %s" % \
(actnode['rt']['Status'], actnode['rt']['id'][7:],
- actnode['category'], actnode['action'][0],
- actnode['msg_format'][:-1])
+ actnode['category'], actnode['action'][0], actnode['info'][1:])
else:
if type(actnode['action']) == type([]):
action = actnode['action'][0]
import re
#fb = {}
-fb = {}
-fbpcu = {}
+fb = None
+fbpcu = None
class NoKeyException(Exception): pass
fbnode['kernel'] = fbnode['kernel'].split()[2]
fbnode['boot_state'] = fbnode['plcnode']['boot_state']
- print "%(hostname)-39s | %(checked)11.11s | %(boot_state)5.5s| %(state)8.8s | %(ssh)5.5s | %(pcu)6.6s | %(bootcd)6.6s | %(category)8.8s | %(kernel)s" % fbnode
+ if len(fbnode['nodegroups']) > 0:
+ fbnode['category'] = fbnode['nodegroups'][0]
+
+ print "%(hostname)-45s | %(checked)11.11s | %(boot_state)5.5s| %(state)8.8s | %(ssh)5.5s | %(pcu)6.6s | %(bootcd)6.6s | %(category)8.8s | %(kernel)s" % fbnode
else:
format = ""
for f in fields:
#print "looking at key: %s" % key
if key in data:
value_re = re.compile(con[key])
- con_and_true = con_and_true & (value_re.search(data[key]) is not None)
+ if type([]) == type(data[key]):
+ local_or_true = False
+ for val in data[key]:
+ local_or_true = local_or_true | (value_re.search(val) is not None)
+ con_and_true = con_and_true & local_or_true
+ else:
+ con_and_true = con_and_true & (value_re.search(data[key]) is not None)
elif key not in data:
print "missing key %s" % key,
pass
return False
def pcu_select(str_query, nodelist=None):
+ global fb
+ global fbpcu
pcunames = []
nodenames = []
if str_query is None: return (nodenames, pcunames)
+ if fb is None:
+ fb = database.dbLoad("findbad")
+ if fbpcu is None:
+ fbpcu = database.dbLoad("findbadpcus")
+
#print str_query
dict_query = query_to_dict(str_query)
#print dict_query
nodenames.append(node)
str = "cmdhttps/locfg.pl -s %s -f iloxml/License.xml -u %s -p '%s' | grep MESSAGE" % \
(pcu_name(pcuinfo), pcuinfo['username'], pcuinfo['password'])
- pcunames.append(str)
+ #pcunames.append(str)
+ pcunames.append(pcuinfo['pcu_id'])
return (nodenames, pcunames)
def node_select(str_query, nodelist=None, fbdb=None):
# Control Outlets (5 ,1).........5
try:
- print s
- print "Enter Request" in s.before
- index = s.expect("Enter Request")
+ #index = s.expect("Enter Request")
+ index = s.expect(["Enter Request :"])
if index == 0:
print "3"
print "Reboot %d" % node_port
s.send("Reboot %d\r\n" % node_port)
- index = s.expect(["(Y/N)?"])
+ time.sleep(5)
+ index = s.expect(["\(Y/N\)\?", "Port in use", "DS-RPC>"])
if index == 0:
if dryrun:
print "sending N"
else:
print "sending Y"
s.send("Y\r\n")
+ elif index == 1:
+ raise ExceptionPrompt("PCU Reported 'Port in use.'")
+ elif index == 2:
+ raise ExceptionSequence("Issued command 'Reboot' failed.")
- #index = s.expect(["DS-RPC>"])
+ time.sleep(5)
+ index = s.expect(["DS-RPC>"])
#print "got prompt back"
s.close()
except pexpect.EOF:
- raise ExceptionPrompt("EOF before 'Enter Request' Prompt")
+ raise ExceptionPrompt("EOF before expected Prompt")
except pexpect.TIMEOUT:
- raise ExceptionPrompt("Timeout before 'Enter Request' Prompt")
+ raise ExceptionPrompt("Timeout before expected Prompt")
return 0
# Otherwise, the login succeeded.
# Send a ctrl-c to the remote process.
- print "sending ctrl-c"
+ print "SENDING ctrl-c"
s.send(chr(3))
# Control Outlets (5 ,1).........5
try:
+ print "EXPECTING: ", "Enter Request :"
index = s.expect(["Enter Request :"])
if index == 0:
- print "5"
+ print "SENDING: 5"
s.send("5\r\n")
- index = s.expect(["DS-RPC>", "Enter user name:"])
+ print "EXPECTING: ", "DS-RPC>"
+ index = s.expect(["DS-RPC>", "Enter user name:", "Port in use."])
if index == 1:
print "sending username"
s.send(self.username + "\r\n")
index = s.expect(["DS-RPC>"])
+ elif index == 2:
+ raise ExceptionPrompt("PCU Reported 'Port in use.'")
if index == 0:
- print "Reboot %d" % node_port
+ print "SENDING: Reboot %d" % node_port
s.send("Reboot %d\r\n" % node_port)
- index = s.expect(["(Y/N)?"])
+ print "SLEEPING: 5"
+ time.sleep(5)
+ print "EXPECTING: ", "Y/N?"
+ index = s.expect(["\(Y/N\)\?", "Port in use", "DS-RPC>"])
if index == 0:
if dryrun:
print "sending N"
s.send("N\r\n")
else:
- print "sending Y"
+ print "SENDING: Y"
s.send("Y\r\n")
+ elif index == 1:
+ raise ExceptionPrompt("PCU Reported 'Port in use.'")
+ elif index == 2:
+ raise ExceptionSequence("Issued command 'Reboot' failed.")
# NOTE: in automated runs (though not in manual tests) the script
# timed out on the expect below, so it had been disabled; the sleep
# before it is presumably a workaround -- TODO confirm.
- #index = s.expect(["DS-RPC>"])
- #print "got prompt back"
+ print "SLEEPING: 5"
+ time.sleep(5)
+ #print "TOTAL--", s.allstr, "--EOT"
+ index = s.expect(["DS-RPC>"])
+ print "got prompt back"
s.close()
# even after login...
print "msg: %s" % msg
self.transport.write(self.username + "\r\n")
+ time.sleep(5)
self.ifThenSend("DS-RPC>", "Reboot %d" % node_port)
# Reboot Outlet N (Y/N)?
self.ifThenSend("(Y/N)?", "N")
else:
self.ifThenSend("(Y/N)?", "Y")
+ time.sleep(5)
self.ifThenSend("DS-RPC>", "")
self.close()
print values
# TODO: make a more robust version of APC
- if values['pcu_id'] in [1163,1055,1111,1231,1113,1127,1128,1148]:
+ if values['pcu_id'] in [1102,1163,1055,1111,1231,1113,1127,1128,1148]:
apc = APCEurope(values, verbose, ['22', '23'])
rb_ret = apc.reboot(values[nodename], dryrun)
apc = APCBrazil(values, verbose, ['22', '23'])
rb_ret = apc.reboot(values[nodename], dryrun)
- elif values['pcu_id'] in [1221,1225]:
+ elif values['pcu_id'] in [1221,1225,1220]:
apc = APCBerlin(values, verbose, ['22', '23'])
rb_ret = apc.reboot(values[nodename], dryrun)
- elif values['pcu_id'] in [1173,1221,1220]:
+ elif values['pcu_id'] in [1173,1240]:
apc = APCFolsom(values, verbose, ['22', '23'])
rb_ret = apc.reboot(values[nodename], dryrun)
# BayTech DS4-RPC
elif continue_probe and values['model'].find("DS4-RPC") >= 0:
- if values['pcu_id'] in [1237,1052,1209,1002,1008,1041,1013,1022]:
+ if values['pcu_id'] in [1056,1237,1052,1209,1002,1008,1041,1013,1022]:
# These require a 'ctrl-c' to be sent...
baytech = BayTechCtrlC(values, verbose, ['22', '23'])
rb_ret = baytech.reboot(values[nodename], dryrun)
self.env = env
self.__irix_hack = sys.platform.lower().find('irix') >= 0 # This flags if we are running on irix
self.use_native_pty_fork = not (sys.platform.lower().find('solaris') >= 0) # Solaris uses internal __fork_pty(). All other use pty.fork().
+ self.allstr = ""
# allow dummy instances for subclasses that may not use command or args.
if command is None:
self.buffer = incoming[self.match.end() : ]
self.before = incoming[ : self.match.start()]
self.after = incoming[self.match.start() : self.match.end()]
+ #print "MATCH--", self.after, "--EOM"
return self.match_index
# No match at this point
if timeout < 0 and timeout is not None:
c = self.read_nonblocking (self.maxread, timeout)
time.sleep (0.0001)
incoming = incoming + c
+ self.allstr += c
+ #print "INCOMING--", c, "--EOI"
if timeout is not None:
timeout = end_time - time.time()
except EOF, e:
return self.status
def closeTicket(self):
- mailer.closeTicketViaRT(self.ticket_id)
+ mailer.closeTicketViaRT(self.ticket_id, "Ticket CLOSED automatically by SiteAssist.")
def email(self, subject, body, to):
self.ticket_id = mailer.emailViaRT(subject, body, to, self.ticket_id)
#print pm
if id in pm:
- print "Using existing object"
+ #print "Using existing object"
obj = pm[id]
else:
- print "creating new object"
+ #print "creating new object"
obj = super(PersistMessage, typ).__new__(typ, [id, subject, message, via_rt], **kwargs)
obj.id = id
obj.actiontracker = Recent(3*60*60*24)
def reset(self):
self.actiontracker.unsetRecent()
+ def save(self):
+ pm = database.dbLoad(self.db)
+ pm[self.id] = self
+ database.dbDump(self.db, pm)
+
def send(self, to):
if not self.actiontracker.isRecent():
self.ticket_id = Message.send(self, to)
self.actiontracker.setRecent()
-
- #print "recording object for persistance"
- pm = database.dbLoad(self.db)
- pm[self.id] = self
- database.dbDump(self.db, pm)
+ self.save()
else:
# NOTE: only send a new message once per actiontracker window, regardless.
- print "Not sending to host b/c not within window of %s days" % (self.actiontracker.withintime // 60*60*24)
+ print "Not sending to host b/c not within window of %s days" % (self.actiontracker.withintime // (60*60*24))
class MonitorMessage(object):
def __new__(typ, id, *args, **kwargs):
def severity(self):
category = self.data['category']
prev_category = self.data['prev_category']
+ #print "SEVERITY: ", category, prev_category
val = cmpCategoryVal(category, prev_category)
return val
def takeAction(self):
pp = PersistSitePenalty(self.hostname, 0, db='persistpenalty_hostnames')
- if 'improvement' in self.data['stage'] or self.improved():
- print "decreasing penalty for %s"%self.hostname
+ if 'improvement' in self.data['stage'] or self.improved() or \
+ 'monitor-end-record' in self.data['stage']:
+ print "takeAction: decreasing penalty for %s"%self.hostname
+ pp.decrease()
pp.decrease()
else:
- print "increasing penalty for %s"%self.hostname
+ print "takeAction: increasing penalty for %s"%self.hostname
pp.increase()
pp.apply(self.hostname)
pp.save()
def _format_diaginfo(self):
info = self.data['info']
+ print "FORMAT : STAGE: ", self.data['stage']
if self.data['stage'] == 'monitor-end-record':
+ if info[2] == "ALPHA": info = (info[0], info[1], "PROD")
hlist = " %s went from '%s' to '%s'\n" % (info[0], info[1], info[2])
else:
hlist = " %s %s - %s\n" % (info[0], info[2], info[1]) #(node,ver,daysdn)
return hlist
+ def saveAction(self):
+ if 'save-act-all' in self.data and self.data['save-act-all'] == True:
+ return True
+ else:
+ return False
def getMessage(self, ticket_id=None):
self.data['args']['hostname'] = self.hostname
self.data['args']['loginbase'] = self.loginbase
self.data['args']['hostname_list'] = self._format_diaginfo()
- message = PersistMessage(self.hostname,
+ #print self.data['message']
+ if self.data['message']:
+ message = PersistMessage(self.hostname,
self.data['message'][0] % self.data['args'],
self.data['message'][1] % self.data['args'],
True, db='monitor_persistmessages',
ticket_id=ticket_id)
- return message
+ return message
+ else:
+ return None
def getContacts(self):
roles = self.data['email']
def severity(self):
category = self.data['category']
prev_category = self.data['prev_category']
+ print "IMPROVED: ", category, prev_category
val = cmpCategoryVal(category, prev_category)
return val
del act_all
return False
+ pm = database.dbLoad("monitor_persistmessages")
+ if node not in pm:
+ del pm
+ return False
+ else:
+ print "deleting node record"
+ del pm[node]
+ database.dbDump("monitor_persistmessages", pm)
+
a = Action(node, act_all[node][0])
a.delField('rt')
a.delField('found_rt_ticket')
a.delField('first-found')
rec = a.get()
rec['action'] = ["close_rt"]
- rec['category'] = "UNKNOWN"
+ rec['category'] = "ALPHA" # assume that it's up...
rec['stage'] = "monitor-end-record"
+ rec['ticket_id'] = None
rec['time'] = time.time() - 7*60*60*24
act_all[node].insert(0,rec)
database.dbDump("act_all", act_all)