-import reboot
-import soltesz
-import string
-from www.printbadnodes import cmpCategoryVal
-from config import config
-print "policy"
-config = config()
-
-DAT="./monitor.dat"
-
-logger = logging.getLogger("monitor")
-
-# Time to enforce policy
-POLSLEEP = 7200
-
-# Where to email the summary
-SUMTO = "soltesz@cs.princeton.edu"
-TECHEMAIL="tech-%s@sites.planet-lab.org"
-PIEMAIL="pi-%s@sites.planet-lab.org"
-SLICEMAIL="%s@slices.planet-lab.org"
-PLCEMAIL="support@planet-lab.org"
-
-#Thresholds (DAYS)
-SPERMIN = 60
-SPERHOUR = 60*60
-SPERDAY = 86400
-PITHRESH = 7 * SPERDAY
-SLICETHRESH = 7 * SPERDAY
-# Days before attempting rins again
-RINSTHRESH = 5 * SPERDAY
-
-# Days before calling the node dead.
-DEADTHRESH = 30 * SPERDAY
-# Minimum number of nodes up before squeezing
-MINUP = 2
-
-TECH=1
-PI=2
-USER=4
-ADMIN=8
-
-# IF:
-# no SSH, down.
-# bad disk, down
-# DNS, kinda down (sick)
-# clock, kinda down (sick)
-# Full disk, going to be down
-
-# Actions:
-# Email
-# suspend slice creation
-# kill slices
-def array_to_priority_map(array):
- """ Create a mapping where each entry of array is given a priority equal
- to its position in the array. This is useful for subsequent use in the
- cmpMap() function."""
- map = {}
- count = 0
- for i in array:
- map[i] = count
- count += 1
- return map
-
-def getdebug():
- return config.debug
-
-def print_stats(key, stats):
- if key in stats: print "%20s : %d" % (key, stats[key])
-
-class Merge(Thread):
- def __init__(self, l_merge, toRT):
- self.toRT = toRT
- self.merge_list = l_merge
- # the hostname to loginbase mapping
- self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")
-
- # Previous actions taken on nodes.
- self.act_all = soltesz.if_cached_else(1, "act_all", lambda : {})
- self.findbad = soltesz.if_cached_else(1, "findbad", lambda : {})
-
- self.cache_all = soltesz.if_cached_else(1, "act_all", lambda : {})
- self.sickdb = {}
- self.mergedb = {}
- Thread.__init__(self)
-
- def run(self):
- # populate sickdb
- self.accumSickSites()
- # read data from findbad and act_all
- self.mergeActionsAndBadDB()
- # pass node_records to RT
- self.sendToRT()
-
- def accumSickSites(self):
- """
- Take all nodes, from l_diagnose, look them up in the act_all database,
- and insert them into sickdb[] as:
-
- sickdb[loginbase][nodename] = fb_record
- """
- # look at all problems reported by findbad
- l_nodes = self.findbad['nodes'].keys()
- count = 0
- for nodename in l_nodes:
- if nodename not in self.merge_list:
- continue # skip this node, since it's not wanted
-
- count += 1
- loginbase = self.plcdb_hn2lb[nodename]
- values = self.findbad['nodes'][nodename]['values']
-
- fb_record = {}
- fb_record['nodename'] = nodename
- fb_record['category'] = values['category']
- fb_record['state'] = values['state']
- fb_record['comonstats'] = values['comonstats']
- fb_record['plcnode'] = values['plcnode']
- fb_record['kernel'] = self.getKernel(values['kernel'])
- fb_record['stage'] = "findbad"
- fb_record['message'] = None
- fb_record['bootcd'] = values['bootcd']
- fb_record['args'] = None
- fb_record['info'] = None
- fb_record['time'] = time.time()
- fb_record['date_created'] = time.time()
-
- if loginbase not in self.sickdb:
- self.sickdb[loginbase] = {}
-
- self.sickdb[loginbase][nodename] = fb_record
-
- print "Found %d nodes" % count
-
- def getKernel(self, unamestr):
- s = unamestr.split()
- if len(s) > 2:
- return s[2]
- else:
- return ""
-
- def mergeActionsAndBadDB(self):
- """
- - Look at the sick node_records as reported in findbad,
- - Then look at the node_records in act_all.
-
- There are four cases:
- 1) Problem in findbad, no problem in act_all
- this ok, b/c it just means it's a new problem
- 2) Problem in findbad, problem in act_all
- -Did the problem get better or worse?
- -If Same, or Worse, then continue looking for open tickets.
- -If Better, or No problem, then "back-off" penalties.
- This judgement may need to wait until 'Diagnose()'
-
- 3) No problem in findbad, problem in act_all
- The the node is operational again according to Findbad()
-
- 4) No problem in findbad, no problem in act_all
- There won't be a record in either db, so there's no code.
- """
-
- sorted_sites = self.sickdb.keys()
- sorted_sites.sort()
- # look at all problems reported by findbad
- for loginbase in sorted_sites:
- d_fb_nodes = self.sickdb[loginbase]
- sorted_nodes = d_fb_nodes.keys()
- sorted_nodes.sort()
- for nodename in sorted_nodes:
- fb_record = self.sickdb[loginbase][nodename]
- x = fb_record
- if loginbase not in self.mergedb:
- self.mergedb[loginbase] = {}
-
- # We must compare findbad state with act_all state
- if nodename not in self.act_all:
- # 1) ok, b/c it's a new problem. set ticket_id to null
- self.mergedb[loginbase][nodename] = {}
- self.mergedb[loginbase][nodename].update(x)
- self.mergedb[loginbase][nodename]['ticket_id'] = ""
- self.mergedb[loginbase][nodename]['prev_category'] = None
- else:
- if len(self.act_all[nodename]) == 0:
- print "len(act_all[%s]) == 0, skipping %s %s" % (nodename, loginbase, nodename)
- continue
-
- y = self.act_all[nodename][0]
-
- # skip if end-stage
- if 'stage' in y and "monitor-end-record" in y['stage']:
- # 1) ok, b/c it's a new problem. set ticket_id to null
- self.mergedb[loginbase][nodename] = {}
- self.mergedb[loginbase][nodename].update(x)
- self.mergedb[loginbase][nodename]['ticket_id'] = ""
- self.mergedb[loginbase][nodename]['prev_category'] = None
- continue
-
- ## for legacy actions
- #if 'bucket' in y and y['bucket'][0] == 'dbg':
- # # Only bootcd debugs made it to the act_all db.
- # y['prev_category'] = "OLDBOOTCD"
- #elif 'bucket' in y and y['bucket'][0] == 'down':
- # y['prev_category'] = "ERROR"
- #elif 'bucket' not in y:
- # # for all other actions, just carry over the
- # # previous category
- # y['prev_category'] = y['category']
- #else:
- # print "UNKNOWN state for record: %s" % y
- # sys.exit(1)
-
- # determine through translation, if the buckets match
- #if 'category' in y and x['category'] == y['category']:
- # b_match = True
- #elif x['category'] == "OLDBOOTCD" and y['bucket'][0] == 'dbg':
- # b_match = True
- #elif x['category'] == "ERROR" and y['bucket'][0] == 'down':
- # b_match = True
- #else:
- # b_match = False
-
- #if b_match:
- # # 2b) ok, b/c they agree that there's still a problem..
- # # 2b) Comon & Monitor still agree; RT ticket?
- # y['prev_category'] = y['category']
- #else:
- # # 2a) mismatch, need a policy for how to resolve
- # # resolution will be handled in __diagnoseNode()
- # # for now just record the two categories.
- # #if x['category'] == "PROD" and x['state'] == "BOOT" and \
- # # ( y['bucket'][0] == 'down' or y['bucket'][0] == 'dbg'):
- # print "FINDBAD and MONITOR have a mismatch: %s vs %s" % \
- # (x['category'], y['bucket'])
-
-
- self.mergedb[loginbase][nodename] = {}
- self.mergedb[loginbase][nodename].update(y)
- self.mergedb[loginbase][nodename]['comonstats'] = x['comonstats']
- self.mergedb[loginbase][nodename]['category'] = x['category']
- self.mergedb[loginbase][nodename]['state'] = x['state']
- self.mergedb[loginbase][nodename]['kernel']=x['kernel']
- self.mergedb[loginbase][nodename]['bootcd']=x['bootcd']
- self.mergedb[loginbase][nodename]['plcnode']=x['plcnode']
- # delete the entry from cache_all to keep it out of case 3)
- del self.cache_all[nodename]
-
- # 3) nodes that remin in cache_all were not identified by findbad.
- # Do we keep them or not?
- # NOTE: i think that since the categories are performed before this
- # step now, and by a monitor-controlled agent.
-
- # TODO: This does not work correctly. Do we need this?
- #for hn in self.cache_all.keys():
- # y = self.act_all[hn][0]
- # if 'monitor' in y['bucket']:
- # loginbase = self.plcdb_hn2lb[hn]
- # if loginbase not in self.sickdb:
- # self.sickdb[loginbase] = {}
- # self.sickdb[loginbase][hn] = y
- # else:
- # del self.cache_all[hn]
-
- print "len of cache_all: %d" % len(self.cache_all.keys())
- return
+from optparse import OptionParser
+
+import bootman # debug nodes
+
+from monitor import util
+from monitor import const
+from monitor import reboot
+from monitor import config
+from monitor import database
+from monitor import parser as parsermodule
+from monitor.common import *
+from monitor.model import *
+from monitor.wrapper import plc
+from monitor.wrapper import plccache
+from monitor.wrapper.emailTxt import mailtxt
+from monitor.database.info.model import *
+
+from nodequery import verify,query_to_dict,node_select
+
+# Module-level handle returned by plc.getAuthAPI(); it is created once, as a
+# side effect of importing this module.
+# NOTE(review): presumably an authenticated PLC API proxy -- confirm against
+# monitor.wrapper.plc, and confirm it is safe to build at import time.
+api = plc.getAuthAPI()
+
+
+class SiteInterface(HistorySiteRecord):
+ @classmethod
+ def get_or_make(cls, if_new_set={}, **kwargs):
+ if 'hostname' in kwargs:
+ kwargs['loginbase'] = plccache.plcdb_hn2lb[kwargs['hostname']]
+ del kwargs['hostname']
+ res = HistorySiteRecord.findby_or_create(if_new_set, **kwargs)
+ return SiteInterface(res)
+
+ # Wrap an existing site history record.
+ # sitehist: the record to wrap (get_or_make passes the result of
+ # HistorySiteRecord.findby_or_create()).  It is kept as self.db, which is
+ # the object the penalty methods of this class read and update.
+ # Note: the HistorySiteRecord base-class initializer is not called here.
+ def __init__(self, sitehist):
+ self.db = sitehist
+
+ def getRecentActions(self, **kwargs):
+ # TODO: make query only return records within a certin time range,
+ # i.e. greater than 0.5 days ago. or 5 days, etc.
+
+ #print "kwargs: ", kwargs
+
+ recent_actions = []
+ if 'loginbase' in kwargs:
+ recent_actions = ActionRecord.query.filter_by(loginbase=kwargs['loginbase']).order_by(ActionRecord.date_created.desc())
+ elif 'hostname' in kwargs:
+ recent_actions = ActionRecord.query.filter_by(hostname=kwargs['hostname']).order_by(ActionRecord.date_created.desc())
+ return recent_actions
+
+ # Raise the wrapped record's penalty level by one step, capped at 2, and set
+ # its penalty_applied flag.  Only self.db is modified here; no PLC call is
+ # made in this method (applyPenalty is what acts on the level).
+ def increasePenalty(self):
+ #act = ActionRecord(loginbase=self.db.loginbase, action='penalty', action_type='increase_penalty',)
+ self.db.penalty_level += 1
+ # NOTE: this is to prevent overflow or index errors in applyPenalty.
+ # there's probably a better approach to this.
+ # The cap of 2 is the last valid index of the three-entry penalty_map that
+ # applyPenalty builds.
+ if self.db.penalty_level >= 2:
+ self.db.penalty_level = 2
+ # NOTE(review): indentation is lost in this view, so it is unclear whether
+ # the flag below is set on every call or only when the cap is reached --
+ # confirm against the real file.
+ self.db.penalty_applied = True
+
+ # Bring the site's PLC restrictions in line with self.db.penalty_level.
+ # Every penalty above the current level is disabled, then every penalty from
+ # level 0 up to and including the current level is enabled.  Each step is
+ # printed and applied to self.db.loginbase.
+ def applyPenalty(self):
+ # penalty_map is indexed by penalty level; each entry names the penalty and
+ # supplies the callables that switch it on ('enable') and off ('disable'):
+ #   0 'noop'          -- both callables do nothing
+ #   1 'nocreate'      -- plc.removeSiteSliceCreation / plc.enableSiteSliceCreation
+ #   2 'suspendslices' -- plc.suspendSiteSlices / plc.enableSiteSlices
+ penalty_map = []
+ penalty_map.append( { 'name': 'noop', 'enable' : lambda site: None,
+ 'disable' : lambda site: None } )
+ penalty_map.append( { 'name': 'nocreate', 'enable' : lambda site: plc.removeSiteSliceCreation(site),
+ 'disable' : lambda site: plc.enableSiteSliceCreation(site) } )
+ penalty_map.append( { 'name': 'suspendslices', 'enable' : lambda site: plc.suspendSiteSlices(site),
+ 'disable' : lambda site: plc.enableSiteSlices(site) } )
+
+ # Walk down from the highest level to just above the current one, lifting
+ # every penalty the site should no longer have.
+ for i in range(len(penalty_map)-1,self.db.penalty_level,-1):
+ print "\tdisabling %s on %s" % (penalty_map[i]['name'], self.db.loginbase)
+ penalty_map[i]['disable'](self.db.loginbase)
+
+ # Enable every level from 0 through the current one, so penalties stack.
+ # A penalty_level above 2 would index past the end of penalty_map here;
+ # increasePenalty caps the level at 2.
+ for i in range(0,self.db.penalty_level+1):
+ print "\tapplying %s on %s" % (penalty_map[i]['name'], self.db.loginbase)
+ penalty_map[i]['enable'](self.db.loginbase)