805dd0e5cb9077879e46598160e266c39848405b
[monitor.git] / unified_model.py
1 #!/usr/bin/python
2
3 from monitor import database
4
5 from monitor.wrapper import plc, plccache
6 from monitor.wrapper import mailer
7 import time
8
9 from model import *
10 from monitor.const import *
11 from monitor import util
12 from monitor import config
13
def gethostlist(hostlist_file):
        """Return whatever util.file.getListFromFile() reads from hostlist_file."""
        hostnames = util.file.getListFromFile(hostlist_file)
        return hostnames
16
def array_to_priority_map(array):
        """ Create a mapping where each entry of array is given a priority equal
        to its position in the array.  This is useful for subsequent use in the
        cmpValMap() function.

        If an entry occurs more than once, its last position wins.  An empty
        array yields an empty mapping."""
        # enumerate() supplies the position, and building the dict directly
        # avoids a local named 'map' that would shadow the builtin.
        return dict((entry, position) for position, entry in enumerate(array))
27
def cmpValMap(v1, v2, map):
        """Compare v1 and v2 by the priorities that map records for them.

        Returns 1 when v1 has the smaller priority value, -1 when it has the
        larger one, and 0 when the two priorities are equal.  Raises Exception
        when either value is not a key of map."""
        if not (v1 in map and v2 in map):
                raise Exception("No index %s or %s in map" % (v1, v2))

        first, second = map[v1], map[v2]
        if first < second:
                return 1
        if first > second:
                return -1
        return 0
37
def cmpCategoryVal(v1, v2):
        """Compare two category values by their rank in a fixed ordering.

        The result is that of cmpValMap(): positive when v1 appears earlier in
        the ordering below than v2, negative when later, 0 when equal."""
        ordering = [ None, 'ALPHA', 'PROD', 'OLDBOOTCD', 'UNKNOWN', 'FORCED', 'ERROR', ]
        priorities = array_to_priority_map(ordering)
        return cmpValMap(v1, v2, priorities)
41
42
class PCU:
        """Stub object identified by a hostname.

        Every operation is a placeholder that reports success without doing
        any work.  (PCU presumably stands for power control unit -- confirm.)"""

        def __init__(self, hostname):
                self.hostname = hostname

        def reboot(self):
                """Placeholder: always returns True."""
                return True

        def available(self):
                """Placeholder: always returns True."""
                return True

        def previous_attempt(self):
                """Placeholder: always returns True."""
                return True

        def setValidMapping(self):
                """Placeholder: does nothing and returns None."""
                return None
55
class Penalty:
        """Unimplemented placeholder; nothing is stored on the instance."""
        def __init__(self, key, valuepattern, action):
                """Accept key, valuepattern and action and ignore all three."""
59
class PenaltyMap:
        """Unimplemented placeholder; construction stores nothing."""
        def __init__(self):
                """Create an empty PenaltyMap."""

        # Intended design: connect one penalty to another, as in an FSM diagram.
        #       After one condition/penalty is applied, move to the next phase.
66
67
class RT(object):
        """Handle on one ticket, backed by the functions of the mailer module.

        Attributes:
                ticket_id -- id of the ticket, or None until email() obtains one.
                status    -- status as last returned by mailer.getTicketStatus(),
                             or None while it has not been fetched.
        """
        def __init__(self, ticket_id = None):
                self.ticket_id = ticket_id
                # Define status unconditionally: without a ticket_id the lookup
                # below is skipped, and getTicketStatus() would otherwise fail
                # with AttributeError on its first read of self.status.
                self.status = None
                if self.ticket_id:
                        # Python 2 print statement; the trailing comma keeps the
                        # status printed below on the same output line.
                        print("getting ticket status"),
                        self.status = mailer.getTicketStatus(self.ticket_id)
                        print(self.status)

        def setTicketStatus(self, status):
                """Set the ticket status through mailer, re-read it, and return True."""
                mailer.setTicketStatus(self.ticket_id, status)
                self.status = mailer.getTicketStatus(self.ticket_id)
                return True

        def getTicketStatus(self):
                """Return the cached status, asking mailer for it first if it is empty."""
                if not self.status:
                        self.status = mailer.getTicketStatus(self.ticket_id)
                return self.status

        def closeTicket(self):
                """Close the ticket through mailer.closeTicketViaRT with a fixed note."""
                mailer.closeTicketViaRT(self.ticket_id, "Ticket CLOSED automatically by SiteAssist.")

        def email(self, subject, body, to):
                """Send a message through mailer.emailViaRT.

                The value returned by mailer becomes the new ticket_id and is
                also returned to the caller."""
                self.ticket_id = mailer.emailViaRT(subject, body, to, self.ticket_id)
                return self.ticket_id
92
class Message(object):
        """A subject/body pair delivered either through RT or by plain mail."""

        def __init__(self, subject, message, via_rt=True, ticket_id=None, **kwargs):
                """Store the message parts and create the RT handle for ticket_id.

                Extra keyword arguments are accepted and ignored."""
                self.via_rt = via_rt
                self.subject, self.message = subject, message
                self.rt = RT(ticket_id)

        def send(self, to):
                """Deliver the message to `to` and return the sender's result.

                With via_rt set the message goes through self.rt.email();
                otherwise through mailer.email()."""
                if not self.via_rt:
                        return mailer.email(self.subject, self.message, to)
                return self.rt.email(self.subject, self.message, to)
105
class Recent(object):
        """Remembers whether an action was taken within the last `withintime` seconds.

        Attributes:
                withintime   -- length of the window, in seconds.
                time         -- timestamp (time.time()) of the last set/unset.
                action_taken -- True once setRecent() was called and the window
                                has not yet been found expired.
        """
        def __init__(self, withintime):
                self.withintime = withintime

                # An instance may already carry these attributes when __init__
                # runs (PersistFlags.__new__ sets them, and objects loaded from
                # the database keep theirs), so existing values are preserved.
                # A fresh instance starts with a timestamp one week in the past.
                self.time = getattr(self, 'time', time.time() - 7*24*60*60)
                # Without this default isRecent() raised AttributeError whenever
                # the window was still open and no flag had ever been set.
                self.action_taken = getattr(self, 'action_taken', False)

        def isRecent(self):
                """Return True while the window is open and an action was recorded.

                Once the window has expired the recorded action is cleared."""
                # Read the clock once so both comparisons use the same instant.
                now = time.time()
                expires = self.time + self.withintime
                if expires < now:
                        self.action_taken = False

                return bool(expires > now and self.action_taken)

        def unsetRecent(self):
                """Clear the recorded action, restart the clock, and return True."""
                self.action_taken = False
                self.time = time.time()
                return True

        def setRecent(self):
                """Record an action at the current time and return True."""
                self.action_taken = True
                self.time = time.time()
                return True
136                 
class PersistFlags(Recent):
        """Named boolean flags plus a Recent timestamp, stored per id.

        Instances live in a mapping that is read with database.dbLoad(db) and
        written with database.dbDump(db, mapping).  Constructing an object for
        an id that is already in the mapping returns the stored object; call
        save() to write changes back.
        """
        def __new__(typ, id, *args, **kwargs):
                # kwargs is this call's private dict, so removing 'db' here does
                # not affect the keyword arguments that __init__ receives.
                db = kwargs.pop('db', "persistflags")

                try:
                        pm = database.dbLoad(db)
                except Exception:
                        # Loading failed (presumably because the database does
                        # not exist yet): start it as an empty mapping.
                        # 'except Exception' lets KeyboardInterrupt/SystemExit
                        # propagate instead of triggering this overwrite.
                        database.dbDump(db, {})
                        pm = database.dbLoad(db)
                #print pm
                if id in pm:
                        obj = pm[id]
                else:
                        # object.__new__() accepts no extra arguments (passing
                        # them is deprecated in Python 2.6+), so none are given.
                        obj = super(PersistFlags, typ).__new__(typ)
                        # Remaining keyword arguments become initial attributes.
                        for key in kwargs:
                                setattr(obj, key, kwargs[key])
                        obj.time = time.time()
                        obj.action_taken = False

                obj.db = db
                return obj

        def __init__(self, id, withintime, **kwargs):
                self.id = id
                Recent.__init__(self, withintime)

        def save(self):
                """Store this object under its id in the database it came from."""
                pm = database.dbLoad(self.db)
                pm[self.id] = self
                database.dbDump(self.db, pm)

        def resetFlag(self, name):
                """Set the flag called name to False."""
                setattr(self, name, False)

        def setFlag(self, name):
                """Set the flag called name to True."""
                setattr(self, name, True)

        def getFlag(self, name):
                """Return the flag called name, creating it as False if missing."""
                try:
                        return getattr(self, name)
                except AttributeError:
                        setattr(self, name, False)
                        return False

        def resetRecentFlag(self, name):
                """Clear the flag and the recorded action, restarting the clock."""
                self.resetFlag(name)
                self.unsetRecent()

        def setRecentFlag(self, name):
                """Set the flag and record an action at the current time."""
                self.setFlag(name)
                self.setRecent()

        def getRecentFlag(self, name):
                """Return isRecent() & flag; a missing flag is created as False.

                Only a missing attribute is treated as "flag not set".  Other
                errors propagate instead of overwriting the attribute."""
                # if recent and flag set -> true
                # else false
                recent = self.isRecent()
                try:
                        flag = getattr(self, name)
                except AttributeError:
                        setattr(self, name, False)
                        return False
                return recent & flag

        def checkattr(self, name):
                """Return True if this object has an attribute called name."""
                return hasattr(self, name)
206                 except:
207                         return False
208                 
209
class PersistMessage(Message):
        """A Message stored per id, sent at most once per actiontracker window.

        Instances live in a mapping read with database.dbLoad(db) and written
        with database.dbDump(db, mapping).  Constructing an object for an id
        already in the mapping returns the stored object, whose subject and
        message are then refreshed by __init__.

        Keyword arguments:
                db        -- name of the database (default "persistmessages").
                ticket_id -- when not None, replaces the object's ticket id.
        """
        def __new__(typ, id, subject, message, via_rt=True, **kwargs):
                # via_rt defaults to True here as well, matching __init__, so
                # PersistMessage(id, subject, message) no longer fails in __new__.
                db = kwargs.get('db', "persistmessages")

                try:
                        pm = database.dbLoad(db)
                except Exception:
                        # Loading failed (presumably because the database does
                        # not exist yet): start it as an empty mapping.
                        database.dbDump(db, {})
                        pm = database.dbLoad(db)

                #print pm
                if id in pm:
                        #print "Using existing object"
                        obj = pm[id]
                else:
                        #print "creating new object"
                        # object.__new__() accepts no extra arguments (passing
                        # them is deprecated in Python 2.6+), so none are given.
                        obj = super(PersistMessage, typ).__new__(typ)
                        obj.id = id
                        # One-day window between two sends of this message.
                        obj.actiontracker = Recent(1*60*60*24)
                        obj.ticket_id = None

                if kwargs.get('ticket_id') is not None:
                        obj.ticket_id = kwargs['ticket_id']

                obj.db = db
                return obj

        def __init__(self, id, subject, message, via_rt=True, **kwargs):
                # ticket_id was set in __new__, either fresh or from the stored object.
                print("initializing object: %s" % self.ticket_id)
                self.id = id
                Message.__init__(self, subject, message, via_rt, self.ticket_id)

        def reset(self):
                """Forget the last send so the next send() goes out immediately."""
                self.actiontracker.unsetRecent()

        def save(self):
                """Store this object under its id in the database it came from."""
                pm = database.dbLoad(self.db)
                pm[self.id] = self
                database.dbDump(self.db, pm)

        def send(self, to):
                """Send the message unless one was already sent within the window.

                A successful send records the ticket id returned by
                Message.send(), restarts the window, and saves the object."""
                if not self.actiontracker.isRecent():
                        self.ticket_id = Message.send(self, to)
                        self.actiontracker.setRecent()
                        self.save()
                else:
                        # NOTE: only one message is sent per actiontracker window (one day, as created in __new__), regardless.
                        # NOTE: can cause thank-you messages to be lost, for instance when node comes back online within window.
                        print("Not sending to host b/c not within window of %s days" % (self.actiontracker.withintime // (60*60*24)))
262
class MonitorMessage(object):
        """Per-id object that carries a PersistSitePenalty in its `sp` attribute.

        Instances live in a mapping read with database.dbLoad(db) and written
        with database.dbDump(db, mapping).

        Keyword arguments:
                db    -- name of the database (default "monitormessages").
                reset -- when True, the database is emptied before the lookup.
        """
        def __new__(typ, id, *args, **kwargs):
                db = kwargs.get('db', "monitormessages")

                try:
                        if kwargs.get('reset') == True:
                                database.dbDump(db, {})
                        pm = database.dbLoad(db)
                except Exception:
                        # Loading failed (presumably because the database does
                        # not exist yet): start it as an empty mapping.
                        database.dbDump(db, {})
                        pm = database.dbLoad(db)

                #print pm
                if id in pm:
                        print("Using existing object")
                        obj = pm[id]
                else:
                        print("creating new object")
                        # super() must be given this class: super(object, typ)
                        # has nothing after 'object' in the MRO and resolves
                        # __new__ on the super type itself, which raises
                        # TypeError.  object.__new__() takes no extra arguments.
                        obj = super(MonitorMessage, typ).__new__(typ)
                        obj.id = id
                        obj.sp = PersistSitePenalty(id, 0)

                obj.db = db
                return obj

        def __init__(self, id, message, **kwargs):
                # Python passes the constructor's keyword arguments to __init__
                # too, so the 'db' and 'reset' options handled by __new__ must
                # be accepted here; they are ignored.
                pass
293                 
294
class SitePenalty(object):
        """Escalating list of penalties, selected by self.index.

        penalty_map is ordered from mildest to harshest; each entry names a
        penalty and holds the callables that switch it on and off for a host.
        self.index is not set by this class; PersistSitePenalty.__new__
        assigns it."""
        penalty_map = [
                { 'name'    : 'noop',
                  'enable'  : lambda host: None,
                  'disable' : lambda host: None },
                { 'name'    : 'nocreate',
                  'enable'  : lambda host: plc.removeSliceCreation(host),
                  'disable' : lambda host: plc.enableSliceCreation(host) },
                { 'name'    : 'suspendslices',
                  'enable'  : lambda host: plc.suspendSlices(host),
                  'disable' : lambda host: plc.enableSlices(host) },
        ]

        #def __init__(self, index=0, **kwargs):
        #       self.index = index

        def get_penalties(self):
                """Return the names of all known penalties, in escalation order."""
                # TODO: get penalties actually applied to a node from PLC DB.
                return [ level['name'] for level in SitePenalty.penalty_map ]

        def increase(self):
                """Move one step up the penalty list, stopping at the last entry."""
                highest = len(SitePenalty.penalty_map) - 1
                self.index = min(self.index + 1, highest)
                return True

        def decrease(self):
                """Move one step down the penalty list, stopping at the first entry."""
                self.index = max(self.index - 1, 0)
                return True

        def apply(self, host):
                """Bring host to the current level.

                Penalties above self.index are disabled, highest first; then
                every penalty from the first up to self.index is enabled."""
                levels = SitePenalty.penalty_map

                for position in range(len(levels) - 1, self.index, -1):
                        level = levels[position]
                        print("\tdisabling %s on %s" % (level['name'], host))
                        level['disable'](host)

                for position in range(0, self.index + 1):
                        level = levels[position]
                        print("\tapplying %s on %s" % (level['name'], host))
                        level['enable'](host)

                return
330
331                 return
332
333
334
class PersistSitePenalty(SitePenalty):
        """A SitePenalty whose index is stored per id.

        Instances live in a mapping read with database.dbLoad(db) and written
        with database.dbDump(db, mapping).  Constructing an object for an id
        already in the mapping returns the stored object and its stored index;
        the index argument is only used for a new object.

        Keyword arguments:
                db    -- name of the database (default "persistpenalties").
                reset -- when True, the database is emptied before the lookup.
        """
        def __new__(typ, id, index, **kwargs):
                db = kwargs.get('db', "persistpenalties")

                try:
                        if kwargs.get('reset') == True:
                                database.dbDump(db, {})
                        pm = database.dbLoad(db)
                except Exception:
                        # Loading failed (presumably because the database does
                        # not exist yet): start it as an empty mapping.
                        database.dbDump(db, {})
                        pm = database.dbLoad(db)

                #print pm
                if id in pm:
                        print("Using existing object")
                        obj = pm[id]
                else:
                        print("creating new object")
                        # object.__new__() accepts no extra arguments (passing
                        # them is deprecated in Python 2.6+), so none are given.
                        obj = super(PersistSitePenalty, typ).__new__(typ)
                        obj.id = id
                        obj.index = index

                obj.db = db
                return obj

        def __init__(self, id, index, **kwargs):
                # index is deliberately not assigned here: __new__ already set
                # it, and a stored object must keep its stored value.
                self.id = id

        def save(self):
                """Store this object under its id in the database it came from."""
                pm = database.dbLoad(self.db)
                pm[self.id] = self
                database.dbDump(self.db, pm)
370
371
class Target:
        """
                Each host has a target set of attributes.  Some may be set manually,
                while others are set globally for the preferred target.

                For instance:
                        All nodes in the Alpha or Beta group would have constraints like:
                                [ { 'state' : 'BOOT', 'kernel' : '2.6.22' } ]
        """
        def __init__(self, constraints):
                self.constraints = constraints

        def verify(self, data):
                """
                        self.constraints is a list of dicts.  The keys within one
                        dict are ANDed, the dicts themselves are ORed:
                        # [ {... : ...}==AND , ... , ... , ] == OR

                        A key holds when the constraint's value is contained in
                        data[key]; a key missing from data fails its dict.
                """
                any_matched = False
                for constraint in self.constraints:
                        all_matched = True
                        # Every key is examined even after a failure, so each
                        # missing key is reported.
                        for key in constraint.keys():
                                if key not in data:
                                        print("missing key %s" % key)
                                        all_matched = False
                                else:
                                        contained = constraint[key] in data[key]
                                        all_matched = contained and all_matched

                        any_matched = any_matched or all_matched

                return any_matched
405
class Record(object):
        """One host's diagnostic record, with helpers to act on it.

        `data` is a mapping; the methods below read its 'stage', 'category',
        'prev_category', 'state', 'rt', 'last_action_record', 'info',
        'save_act_all', 'args', 'message' and 'email' entries.
        """

        def __init__(self, hostname, data):
                self.hostname = hostname
                self.data = data
                # hostname -> loginbase table provided by plccache
                self.plcdb_hn2lb = plccache.plcdb_hn2lb
                self.loginbase = self.plcdb_hn2lb[self.hostname]

        def stageIswaitforever(self):
                """Return True if 'waitforever' occurs in the record's stage."""
                return 'waitforever' in self.data['stage']

        def severity(self):
                """Compare the current category with the previous one (cmpCategoryVal)."""
                return cmpCategoryVal(self.data['category'], self.data['prev_category'])

        def improved(self):
                """Return True if severity() is positive."""
                return self.severity() > 0

        def end_record(self):
                """Run node_end_record() for this host and return its result."""
                return node_end_record(self.hostname)

        def reset_stage(self):
                """Put the record back into the 'findbad' stage; returns True."""
                self.data['stage'] = 'findbad'
                return True

        def getCategory(self):
                """Return the category in lower case."""
                return self.data['category'].lower()

        def getState(self):
                """Return the state in lower case."""
                return self.data['state'].lower()

        @classmethod
        def getDaysDown(cls, diag_record):
                """Return a day count derived from diag_record.

                With a usable comon uptime the result is zero or negative.
                Otherwise it is the number of days since the node's last
                contact, or -1 if it never made contact."""
                uptime = diag_record['comonstats']['uptime']
                if uptime not in ("null", "-1"):
                        # NOTE: the unary minus binds tighter than //, so the
                        # negated uptime is floor-divided (100 seconds -> -1).
                        return (-int(float(uptime))) // (60*60*24)
                #elif diag_record['comonstats']['sshstatus'] != "null":
                #       daysdown = int(diag_record['comonstats']['sshstatus']) // (60*60*24)
                #elif diag_record['comonstats']['lastcotop'] != "null":
                #       daysdown = int(diag_record['comonstats']['lastcotop']) // (60*60*24)

                last_contact = diag_record['plcnode']['last_contact']
                if last_contact is None:
                        # the node has never been up, so give it a break
                        return -1
                return (time.time() - last_contact) // (60*60*24)

        @classmethod
        def getStrDaysDown(cls, diag_record):
                """Return a human-readable up/down description for diag_record."""
                plcnode = diag_record['plcnode']
                last_contact = plcnode['last_contact']
                date_created = plcnode['date_created']

                uptime = diag_record['comonstats']['uptime']
                if uptime not in ("null", "-1"):
                        return "%d days up" % (int(float(uptime)) // (60*60*24))

                if last_contact is None:
                        if date_created is None:
                                return "Never contacted PLC"
                        age = (time.time() - date_created) // (60*60*24)
                        return "Never contacted PLC, created %s days ago" % age

                downtime = (time.time() - last_contact) // (60*60*24)
                return "%s days down" % downtime

        def getSendEmailFlag(self):
                """Return True if a mail should be sent for this record.

                No mail is sent when config.mail is off, or when the record has
                an open ticket that was created within the last 30 days."""
                if not config.mail:
                        return False

                # resend if open & created longer than 30 days ago.
                if 'rt' not in self.data:
                        return True
                ticket = self.data['rt']
                if 'Status' not in ticket or "open" not in ticket['Status']:
                        return True

                # a created-time later than thirty days before now suppresses the mail
                thirty_days_ago = int(time.time() - 60*60*24*30)
                return not (ticket['Created'] > thirty_days_ago)

        def getMostRecentStage(self):
                """Return the stage of the record's last action."""
                return self.data['last_action_record'].stage

        def getMostRecentTime(self):
                """Return the time at which the record's last action was taken."""
                return self.data['last_action_record'].date_action_taken

        def takeAction(self, index=0):
                """Adjust, apply and save the host's stored penalty.

                NOTE(review): the increase/decrease below is overwritten by
                `index` before the penalty is applied -- confirm this is
                intended."""
                pp = PersistSitePenalty(self.hostname, 0, db='persistpenalty_hostnames')
                stage = self.data['stage']
                if 'improvement' in stage or self.improved() or \
                        'monitor-end-record' in stage:
                        print("takeAction: decreasing penalty for %s"%self.hostname)
                        pp.decrease()
                        pp.decrease()
                else:
                        print("takeAction: increasing penalty for %s"%self.hostname)
                        pp.increase()
                pp.index = index
                pp.apply(self.hostname)
                pp.save()

        def _format_diaginfo(self):
                """Format data['info'] as one indented line for a message body."""
                info = self.data['info']
                # Python 2 print statement with two items: the label, then the stage.
                print("FORMAT : STAGE: "), self.data['stage']
                first, second, third = info[0], info[1], info[2]
                if self.data['stage'] != 'monitor-end-record':
                        return "    %s %s - %s\n" % (first, third, second) #(node,ver,daysdn)
                if third == "ALPHA":
                        third = "PROD"
                return "    %s went from '%s' to '%s'\n" % (first, second, third)

        def saveAction(self):
                """Return True if data['save_act_all'] is present and equal to True."""
                return bool('save_act_all' in self.data and self.data['save_act_all'] == True)

        def getMessage(self, ticket_id=None):
                """Build the PersistMessage for this record, or None without a template.

                data['message'] holds the subject and body templates, both of
                which are filled in from data['args']."""
                args = self.data['args']
                args['hostname'] = self.hostname
                args['loginbase'] = self.loginbase
                args['hostname_list'] = self._format_diaginfo()
                #print self.data['message']
                templates = self.data['message']
                if not templates:
                        return None

                message = PersistMessage(self.hostname,
                                         templates[0] % args,
                                         templates[1] % args,
                                         True, db='monitor_persistmessages',
                                         ticket_id=ticket_id)
                if self.data['stage'] == "improvement":
                        message.reset()
                return message

        def getContacts(self):
                """Return the list of addresses to mail for this record.

                The roles come from data['email'], but are replaced by ADMIN
                for the config.mail/debug/bcc combinations tested below."""
                roles = self.data['email']

                bcc_only = not config.mail and not config.debug and config.bcc
                debug_mail = config.mail and config.debug
                if bcc_only or debug_mail:
                        roles = ADMIN

                # build targets
                contacts = []
                if ADMIN & roles:
                        contacts.append(config.email)
                if TECH & roles:
                        contacts.append(TECHEMAIL % self.loginbase)
                if PI & roles:
                        contacts.append(PIEMAIL % self.loginbase)
                if USER & roles:
                        slices = plc.slices(self.loginbase)
                        if len(slices) >= 1:
                                contacts.extend([SLICEMAIL % slice_name for slice_name in slices])
                                print("SLIC: %20s : %d slices" % (self.loginbase, len(slices)))
                        else:
                                print("SLIC: %20s : 0 slices" % self.loginbase)

                return contacts
585
586
class NodeRecord:
        """Holds a hostname, its target, and a ticket slot that starts as None."""
        def __init__(self, hostname, target):
                self.hostname, self.target = hostname, target
                self.ticket = None
592
class Action(MonRecord):
        """A MonRecord that also remembers the host it belongs to."""
        def __init__(self, host, data):
                self.host = host
                MonRecord.__init__(self, data)

        def deltaDays(self, delta):
                """Shift the record's 'time' entry by delta days.

                The timestamp is rebuilt from a time tuple, so any fraction of
                a second is dropped."""
                # 'time' is read and written through __dict__ directly.
                # NOTE(review): datetime and timedelta are not imported by name
                # in this file; presumably a star-import supplies them -- confirm.
                shifted = datetime.fromtimestamp(self.__dict__['time']) + timedelta(delta)
                self.__dict__['time'] = time.mktime(shifted.timetuple())
603                 
def node_end_record(node):
        """Close out node's monitoring history; return True if a record was written.

        Returns False without changing anything when node has no entries in
        the "act_all" database or no stored message in
        "monitor_persistmessages".  Otherwise the stored message is deleted
        and a 'monitor-end-record' entry, derived from the node's newest
        action, is put at the front of its "act_all" list."""
        act_all = database.dbLoad("act_all")
        if node not in act_all or len(act_all[node]) == 0:
                return False

        pm = database.dbLoad("monitor_persistmessages")
        if node not in pm:
                return False

        print("deleting node record")
        del pm[node]
        database.dbDump("monitor_persistmessages", pm)

        # Start from the newest action and strip its ticket/mail bookkeeping.
        a = Action(node, act_all[node][0])
        for field in ('rt', 'found_rt_ticket', 'second-mail-at-oneweek',
                      'second-mail-at-twoweeks', 'first-found'):
                a.delField(field)

        rec = a.get()
        rec['action'] = ["close_rt"]
        rec['category'] = "ALPHA"       # assume that it's up...
        rec['stage'] = "monitor-end-record"
        rec['ticket_id'] = None
        # the end record is dated one week in the past
        rec['time'] = time.time() - 7*60*60*24
        act_all[node].insert(0,rec)
        database.dbDump("act_all", act_all)
        return True
639
if __name__ == "__main__":
        # Running the module directly only confirms that it loads.  The
        # commented lines are a manual RT / PersistMessage example kept for
        # reference.
        print("loaded")
        #r = RT()
        #r.email("test", "body of test message", ['database@cs.princeton.edu'])
        #from emailTxt import mailtxt
        #database.dbDump("persistmessages", {});
        #args = {'url_list': 'http://www.planet-lab.org/bootcds/planet1.usb\n','hostname': 'planet1','hostname_list': ' blahblah -  days down\n'}
        #m = PersistMessage("blue", "test 1", mailtxt.newdown_one[1] % args, True)
        #m.send(['soltesz@cs.utk.edu'])
        #m = PersistMessage("blue", "test 1 - part 2", mailtxt.newalphacd_one[1] % args, True)
        # TRICK timer to thinking some time has passed.
        #m.actiontracker.time = time.time() - 6*60*60*24
        #m.send(['soltesz@cs.utk.edu'])