3 from monitor import database
5 from monitor.wrapper import plc
6 from monitor.wrapper import mailer
9 from monitor.const import *
10 from monitor import util
11 from monitor import config
14 from datetime import datetime, timedelta
def gethostlist(hostlist_file):
    """Return the entries read from hostlist_file.

    Thin wrapper: the path is handed unchanged to
    util.file.getListFromFile and its result is returned as-is.
    """
    hosts = util.file.getListFromFile(hostlist_file)
    return hosts
20 def array_to_priority_map(array):
21 """ Create a mapping where each entry of array is given a priority equal
22 to its position in the array. This is useful for subsequent use in the
31 def cmpValMap(v1, v2, map):
32 if v1 in map and v2 in map and map[v1] < map[v2]:
34 elif v1 in map and v2 in map and map[v1] > map[v2]:
36 elif v1 in map and v2 in map:
39 raise Exception("No index %s or %s in map" % (v1, v2))
def cmpCategoryVal(v1, v2):
    """Compare two category names using a fixed category ordering.

    Both names are looked up in a priority map built from the ordering
    below and the comparison itself is delegated to cmpValMap, whose
    result is returned unchanged.
    """
    # Terrible hack to manage migration to no more 'ALPHA' states:
    # an 'ALPHA' argument is compared as if it were "PROD".
    left = "PROD" if v1 == 'ALPHA' else v1
    right = "PROD" if v2 == 'ALPHA' else v2

    # An earlier revision listed 'PROD' ahead of 'ALPHA'; this is the
    # ordering currently in effect.
    ordering = [None, 'ALPHA', 'PROD', 'OLDBOOTCD', 'UNKNOWN', 'FORCED', 'ERROR']
    priorities = array_to_priority_map(ordering)
    return cmpValMap(left, right, priorities)
51 def __init__(self, hostname):
52 self.hostname = hostname
58 def previous_attempt(self):
60 def setValidMapping(self):
64 def __init__(self, key, valuepattern, action):
71 # connect one penalty to another, in a FSM diagram. After one
72 # condition/penalty is applied, move to the next phase.
76 def __init__(self, ticket_id = None):
77 self.ticket_id = ticket_id
79 print "getting ticket status",
80 self.status = mailer.getTicketStatus(self.ticket_id)
83 def setTicketStatus(self, status):
84 mailer.setTicketStatus(self.ticket_id, status)
85 self.status = mailer.getTicketStatus(self.ticket_id)
88 def getTicketStatus(self):
90 self.status = mailer.getTicketStatus(self.ticket_id)
def closeTicket(self):
    """Close this object's ticket via mailer.closeTicketViaRT with a fixed note."""
    note = "Ticket CLOSED automatically by SiteAssist."
    mailer.closeTicketViaRT(self.ticket_id, note)
96 def email(self, subject, body, to):
97 self.ticket_id = mailer.emailViaRT(subject, body, to, self.ticket_id)
100 class Message(object):
101 def __init__(self, subject, message, via_rt=True, ticket_id=None, **kwargs):
103 self.subject = subject
104 self.message = message
105 self.rt = RT(ticket_id)
109 return self.rt.email(self.subject, self.message, to)
111 return mailer.email(self.subject, self.message, to)
113 class Recent(object):
114 def __init__(self, withintime):
115 self.withintime = withintime
118 self.time = self.__getattribute__('time')
120 self.time = time.time()- 7*24*60*60
122 #self.time = time.time()
123 #self.action_taken = False
126 if self.time + self.withintime < time.time():
127 self.action_taken = False
129 if self.time + self.withintime > time.time() and self.action_taken:
134 def unsetRecent(self):
135 self.action_taken = False
136 self.time = time.time()
140 self.action_taken = True
141 self.time = time.time()
144 class PersistFlags(Recent):
145 def __new__(typ, id, *args, **kwargs):
153 pm = database.dbLoad(db)
155 database.dbDump(db, {})
156 pm = database.dbLoad(db)
161 obj = super(PersistFlags, typ).__new__(typ, *args, **kwargs)
162 for key in kwargs.keys():
163 obj.__setattr__(key, kwargs[key])
164 obj.time = time.time()
165 obj.action_taken = False
170 def __init__(self, id, withintime, **kwargs):
172 Recent.__init__(self, withintime)
175 pm = database.dbLoad(self.db)
177 database.dbDump(self.db, pm)
def resetFlag(self, name):
    """Set the attribute called name on this object to False."""
    setattr(self, name, False)
def setFlag(self, name):
    """Set the attribute called name on this object to True."""
    setattr(self, name, True)
185 def getFlag(self, name):
187 return self.__getattribute__(name)
189 self.__setattr__(name, False)
192 def resetRecentFlag(self, name):
196 def setRecentFlag(self, name):
200 def getRecentFlag(self, name):
201 # if recent and flag set -> true
204 return self.isRecent() & self.__getattribute__(name)
206 self.__setattr__(name, False)
209 def checkattr(self, name):
211 x = self.__getattribute__(name)
217 class PersistMessage(Message):
218 def __new__(typ, id, subject, message, via_rt, **kwargs):
222 db = "persistmessages"
225 pm = database.dbLoad(db)
227 database.dbDump(db, {})
228 pm = database.dbLoad(db)
232 #print "Using existing object"
235 #print "creating new object"
236 obj = super(PersistMessage, typ).__new__(typ, [id, subject, message, via_rt], **kwargs)
238 obj.actiontracker = Recent(1*60*60*24)
241 if 'ticket_id' in kwargs and kwargs['ticket_id'] is not None:
242 obj.ticket_id = kwargs['ticket_id']
247 def __init__(self, id, subject, message, via_rt=True, **kwargs):
248 print "initializing object: %s" % self.ticket_id
250 Message.__init__(self, subject, message, via_rt, self.ticket_id)
253 self.actiontracker.unsetRecent()
256 pm = database.dbLoad(self.db)
258 database.dbDump(self.db, pm)
261 if not self.actiontracker.isRecent():
262 self.ticket_id = Message.send(self, to)
263 self.actiontracker.setRecent()
266 # NOTE: only send a new message every week, regardless.
267 # NOTE: can cause thank-you messages to be lost, for instance when node comes back online within window.
268 print "Not sending to host b/c not within window of %s days" % (self.actiontracker.withintime // (60*60*24))
270 class MonitorMessage(object):
271 def __new__(typ, id, *args, **kwargs):
275 db = "monitormessages"
278 if 'reset' in kwargs and kwargs['reset'] == True:
279 database.dbDump(db, {})
280 pm = database.dbLoad(db)
282 database.dbDump(db, {})
283 pm = database.dbLoad(db)
287 print "Using existing object"
290 print "creating new object"
291 obj = super(object, typ).__new__(typ, id, *args, **kwargs)
293 obj.sp = PersistSitePenalty(id, 0)
298 def __init__(self, id, message):
302 class SitePenalty(object):
304 penalty_map.append( { 'name': 'noop', 'enable' : lambda host: None,
305 'disable' : lambda host: None } )
306 penalty_map.append( { 'name': 'nocreate', 'enable' : lambda host: plc.removeSliceCreation(host),
307 'disable' : lambda host: plc.enableSliceCreation(host) } )
308 penalty_map.append( { 'name': 'suspendslices', 'enable' : lambda host: plc.suspendSlices(host),
309 'disable' : lambda host: plc.enableSlices(host) } )
311 #def __init__(self, index=0, **kwargs):
314 def get_penalties(self):
315 # TODO: get penalties actually applied to a node from PLC DB.
316 return [ n['name'] for n in SitePenalty.penalty_map ]
319 self.index = self.index + 1
320 if self.index > len(SitePenalty.penalty_map)-1: self.index = len(SitePenalty.penalty_map)-1
324 self.index = self.index - 1
325 if self.index < 0: self.index = 0
328 def apply(self, host):
330 for i in range(len(SitePenalty.penalty_map)-1,self.index,-1):
331 print "\tdisabling %s on %s" % (SitePenalty.penalty_map[i]['name'], host)
332 SitePenalty.penalty_map[i]['disable'](host)
334 for i in range(0,self.index+1):
335 print "\tapplying %s on %s" % (SitePenalty.penalty_map[i]['name'], host)
336 SitePenalty.penalty_map[i]['enable'](host)
342 class PersistSitePenalty(SitePenalty):
343 def __new__(typ, id, index, **kwargs):
347 db = "persistpenalties"
350 if 'reset' in kwargs and kwargs['reset'] == True:
351 database.dbDump(db, {})
352 pm = database.dbLoad(db)
354 database.dbDump(db, {})
355 pm = database.dbLoad(db)
359 print "Using existing object"
362 print "creating new object"
363 obj = super(PersistSitePenalty, typ).__new__(typ, [index], **kwargs)
370 def __init__(self, id, index, **kwargs):
374 pm = database.dbLoad(self.db)
376 database.dbDump(self.db, pm)
381 Each host has a target set of attributes. Some may be set manually,
382 or others are set globally for the preferred target.
385 All nodes in the Alpha or Beta group would have constraints like:
386 [ { 'state' : 'BOOT', 'kernel' : '2.6.22' } ]
388 def __init__(self, constraints):
389 self.constraints = constraints
391 def verify(self, data):
393 self.constraints is a list of key, value pairs.
394 # [ {... : ...}==AND , ... , ... , ] == OR
397 for con in self.constraints:
398 #print "con: %s" % con
400 for key in con.keys():
401 #print "looking at key: %s" % key
403 #print "%s %s" % (con[key], data[key])
404 con_and_true = con_and_true & (con[key] in data[key])
405 elif key not in data:
406 print "missing key %s" % key
409 con_or_true = con_or_true | con_and_true
413 class Record(object):
415 def __init__(self, hostname, data):
416 from monitor.wrapper import plccache
417 self.hostname = hostname
419 self.plcdb_hn2lb = plccache.plcdb_hn2lb
420 self.loginbase = self.plcdb_hn2lb[self.hostname]
424 def stageIswaitforever(self):
425 if 'waitforever' in self.data['stage']:
431 category = self.data['category']
432 prev_category = self.data['prev_category']
433 #print "SEVERITY: ", category, prev_category
434 val = cmpCategoryVal(category, prev_category)
438 return self.severity() > 0
def end_record(self):
    """Call node_end_record for this record's hostname and return its result."""
    outcome = node_end_record(self.hostname)
    return outcome
443 def reset_stage(self):
444 self.data['stage'] = 'findbad'
def getCategory(self):
    """Return the record's 'category' value converted to lower case."""
    category = self.data['category']
    return category.lower()
451 return self.data['state'].lower()
453 def getDaysDown(cls, diag_record):
455 if diag_record['comonstats']['uptime'] != "null" and diag_record['comonstats']['uptime'] != "-1":
456 daysdown = - int(float(diag_record['comonstats']['uptime'])) // (60*60*24)
457 #elif diag_record['comonstats']['sshstatus'] != "null":
458 # daysdown = int(diag_record['comonstats']['sshstatus']) // (60*60*24)
459 #elif diag_record['comonstats']['lastcotop'] != "null":
460 # daysdown = int(diag_record['comonstats']['lastcotop']) // (60*60*24)
463 last_contact = diag_record['plcnode']['last_contact']
464 if last_contact == None:
465 # the node has never been up, so give it a break
468 diff = now - last_contact
469 daysdown = diff // (60*60*24)
471 getDaysDown = classmethod(getDaysDown)
473 def getStrDaysDown(cls, diag_record):
475 last_contact = diag_record['plcnode']['last_contact']
476 date_created = diag_record['plcnode']['date_created']
478 if diag_record['comonstats']['uptime'] != "null" and \
479 diag_record['comonstats']['uptime'] != "-1":
480 daysdown = int(float(diag_record['comonstats']['uptime'])) // (60*60*24)
481 daysdown = "%d days up" % daysdown
483 elif last_contact is None:
484 if date_created is not None:
486 diff = now - date_created
487 daysdown = diff // (60*60*24)
488 daysdown = "Never contacted PLC, created %s days ago" % daysdown
490 daysdown = "Never contacted PLC"
493 diff = now - last_contact
494 daysdown = diff // (60*60*24)
495 daysdown = "%s days down" % daysdown
497 getStrDaysDown = classmethod(getStrDaysDown)
499 def getSendEmailFlag(self):
503 # resend if open & created longer than 30 days ago.
504 if 'rt' in self.data and \
505 'Status' in self.data['rt'] and \
506 "open" in self.data['rt']['Status'] and \
507 self.data['rt']['Created'] > int(time.time() - 60*60*24*30):
508 # if created-time is greater than the thirty days ago from the current time
513 def getMostRecentStage(self):
514 lastact = self.data['last_action_record']
def getMostRecentTime(self):
    """Return date_action_taken from the record's 'last_action_record' entry."""
    return self.data['last_action_record'].date_action_taken
521 def takeAction(self, index=0):
522 pp = PersistSitePenalty(self.hostname, 0, db='persistpenalty_hostnames')
523 if 'improvement' in self.data['stage'] or self.improved() or \
524 'monitor-end-record' in self.data['stage']:
525 print "takeAction: decreasing penalty for %s"%self.hostname
529 print "takeAction: increasing penalty for %s"%self.hostname
532 print "takeAction: applying penalty to %s as index %s"% (self.hostname, index)
534 pp.apply(self.hostname)
537 def _format_diaginfo(self):
538 info = self.data['info']
539 print "FORMAT : STAGE: ", self.data['stage']
540 if self.data['stage'] == 'monitor-end-record':
541 if info[2] == "ALPHA": info = (info[0], info[1], "PROD")
542 hlist = " %s went from '%s' to '%s'\n" % (info[0], info[1], info[2])
544 hlist = " %s %s - %s\n" % (info[0], info[2], info[1]) #(node,ver,daysdn)
546 def saveAction(self):
547 if 'save_act_all' in self.data and self.data['save_act_all'] == True:
552 def getMessage(self, ticket_id=None):
553 self.data['args']['hostname'] = self.hostname
554 self.data['args']['loginbase'] = self.loginbase
555 self.data['args']['hostname_list'] = self._format_diaginfo()
556 #print self.data['message']
557 if self.data['message']:
558 message = PersistMessage(self.hostname,
559 self.data['message'][0] % self.data['args'],
560 self.data['message'][1] % self.data['args'],
561 True, db='monitor_persistmessages',
563 if self.data['stage'] == "improvement":
569 def getContacts(self):
570 roles = self.data['email']
572 if not config.mail and not config.debug and config.bcc:
574 if config.mail and config.debug:
580 contacts += [config.email]
582 #contacts += [TECHEMAIL % self.loginbase]
583 contacts += plc.getTechEmails(self.loginbase)
585 #contacts += [PIEMAIL % self.loginbase]
586 contacts += plc.getPIEmails(self.loginbase)
588 contacts += plc.getSliceUserEmails(self.loginbase)
589 slices = plc.slices(self.loginbase)
591 #for slice in slices:
592 # contacts += [SLICEMAIL % slice]
593 print "SLIC: %20s : %d slices" % (self.loginbase, len(slices))
595 print "SLIC: %20s : 0 slices" % self.loginbase
601 def __init__(self, hostname, target):
602 self.hostname = hostname
607 class MonRecord(object):
608 def __init__(self, data):
609 self.keys = data.keys()
611 self.__dict__.update(data)
617 ret[k] = self.__dict__[k]
622 str += self.host + "\n"
624 if "message" in k or "msg" in k:
627 s_time=time.strftime("%Y/%m/%d %H:%M:%S",
628 time.gmtime(self.__dict__[k]))
629 str += "\t'%s' : %s\n" % (k, s_time)
631 str += "\t'%s' : %s\n" % (k, self.__dict__[k])
635 def delField(self, field):
636 if field in self.__dict__:
637 del self.__dict__[field]
639 if field in self.keys:
640 for i in range(0,len(self.keys)):
641 if self.keys[i] == field:
645 class Action(MonRecord):
646 def __init__(self, host, data):
648 MonRecord.__init__(self, data)
def deltaDays(self, delta):
    """Shift this record's 'time' entry by delta days, in place.

    The stored epoch value is turned into a local datetime, moved by
    timedelta(delta), and written back through time.mktime.
    """
    fields = self.__dict__
    shifted = datetime.fromtimestamp(fields['time']) + timedelta(delta)
    fields['time'] = time.mktime(shifted.timetuple())
656 def node_end_record(node):
657 act_all = database.dbLoad("act_all")
658 if node not in act_all:
662 if len(act_all[node]) == 0:
666 pm = database.dbLoad("monitor_persistmessages")
671 print "deleting node record"
673 database.dbDump("monitor_persistmessages", pm)
675 a = Action(node, act_all[node][0])
677 a.delField('found_rt_ticket')
678 a.delField('second-mail-at-oneweek')
679 a.delField('second-mail-at-twoweeks')
680 a.delField('first-found')
682 rec['action'] = ["close_rt"]
683 rec['category'] = "ALPHA" # assume that it's up...
684 rec['stage'] = "monitor-end-record"
685 rec['ticket_id'] = None
686 rec['time'] = time.time() - 7*60*60*24
687 act_all[node].insert(0,rec)
688 database.dbDump("act_all", act_all)
694 def __init__(self, list=None):
696 if self.list == None:
699 def find(self, host, filter, timerange):
700 if host not in self.list:
703 host_log_list = self.list[host]
704 for log in host_log_list:
705 for key in filter.keys():
706 #print "searching key %s in log keys" % key
708 #print "%s in log.keys" % key
709 cmp = re.compile(filter[key])
710 res = cmp.search(log.__getattribute__(key))
712 #print "found match in log: %s %s ~=~ %s" % (log, key, filter[key])
713 if log.time > time.time() - timerange:
714 print "returning log b/c it occured within time."
720 if host in self.list:
721 return self.list[host][0]
726 if log.host not in self.list:
727 self.list[log.host] = []
729 self.list[log.host].insert(0,log)
731 class Log(MonRecord):
732 def __init__(self, host, data):
734 MonRecord.__init__(self, data)
739 str += self.host + " : { "
741 if "message" in k or "msg" in k:
744 s_time=time.strftime("%Y/%m/%d %H:%M:%S",
745 time.gmtime(self.__dict__[k]))
746 #str += " '%s' : %s, " % (k, s_time)
748 str += "'%s' : %s, " % (k, self.__dict__[k])
753 class Diagnose(MonRecord):
754 def __init__(self, host):
756 MonRecord.__init__(self, data)
761 if __name__ == "__main__":
763 #r.email("test", "body of test message", ['database@cs.princeton.edu'])
764 #from emailTxt import mailtxt
766 #database.dbDump("persistmessages", {});
767 #args = {'url_list': 'http://www.planet-lab.org/bootcds/planet1.usb\n','hostname': 'planet1','hostname_list': ' blahblah - days down\n'}
768 #m = PersistMessage("blue", "test 1", mailtxt.newdown_one[1] % args, True)
769 #m.send(['soltesz@cs.utk.edu'])
770 #m = PersistMessage("blue", "test 1 - part 2", mailtxt.newalphacd_one[1] % args, True)
771 # TRICK timer to thinking some time has passed.
772 #m.actiontracker.time = time.time() - 6*60*60*24
773 #m.send(['soltesz@cs.utk.edu'])