from findbad import main as findbad_main
from findbadpcu import main as findbadpcu_main
from sitebad import main as sitebad_main
+from nodebad import main as nodebad_main
+from pcubad import main as pcubad_main
import sys
if __name__ == '__main__':
parser = parsermodule.getParser(['nodesets'])
parser.set_defaults( increment=False, dbname="findbad", cachenodes=False,
- force=False, pcuselect=None, pcuid=None)
+ force=False, pcuselect=None, pcuid=None, pcu=None)
parser.add_option("", "--cachenodes", action="store_true",
help="Cache node lookup from PLC")
parser.add_option("", "--dbname", dest="dbname", metavar="FILE",
cfg = parsermodule.parse_args(parser)
try:
+ print "findbad"
findbad_main()
+ print "findbadpcu"
findbadpcu_main()
+ print "nodebad"
+ nodebad_main()
+ print "pcubad"
+ pcubad_main()
+ print "sitebad"
sitebad_main()
except Exception, err:
import traceback
import time
import struct
-from pcucontrol import reboot
-
+from monitor import reboot
from monitor import util
from monitor import database
from monitor.wrapper import plc, plccache
-from datetime import datetime
+from datetime import datetime, timedelta
from monitor.model import PersistFlags, Message
esc = struct.pack('i', 27)
m=Message("exception running monitor", msg, False)
m.send([config.cc_email])
return
+
+def changed_lessthan(last_changed, days):  # Return True when last_changed lies at most `days` days before now; days may be fractional (callers pass 0.5).
+ if datetime.now() - last_changed <= timedelta(days):  # timedelta's first positional argument is days; an elapsed time exactly equal to the window lands in this branch
+ print "last changed less than %s" % timedelta(days)  # side effect: every call reports its verdict on stdout
+ return True
+ else:
+ print "last changed more than %s" % timedelta(days)
+ return False
+
+def changed_greaterthan(last_changed, days):  # Return True when strictly more than `days` days have elapsed since last_changed; days may be fractional.
+ if datetime.now() - last_changed > timedelta(days):  # strict comparison: an elapsed time exactly equal to the window returns False
+ print "last changed more than %s" % timedelta(days)  # side effect: every call reports its verdict on stdout
+ return True
+ else:
+ print "last changed less than %s" % timedelta(days)  # also printed when the elapsed time equals the window exactly
+ return False
+
# ACCOUNTING
date_created = Field(DateTime,default=datetime.now)
+ loginbase = Field(String,default=None)
hostname = Field(String,default=None)
- loginbase = Field(String)
+ # NOTE:
+ # the expected kinds of actions are:
+ # * reboot node
+ # * open ticket, send notice
+ # * close ticket
+ # * apply penalty to site
+ # * backoff penalty to site
+ action = Field(String)
+
+ # NOTE: describes the kind of action, e.g. online-notice, offline-notice,
+ # reboot-first-try, reboot-second-try, penalty-pause, penalty-warning, penalty-no-create,
+ # penalty-disable-slices.
+ action_type = Field(String, default=None)
+
+ message_id = Field(Integer, default=0)
+ penalty_level = Field(Integer, default=0)
+
+ # NOTE: in case an exception is thrown while trying to perform an action.
+ error_string = Field(String, default=None)
#issue = ManyToOne('IssueRecord')
# NOTE: this is the parent relation to fb records. first create the
# OR
# - find fbnode records
# - create action record with fbnodes as argument
- findbad_records = OneToMany('FindbadNodeRecord', order_by='-date_checked')
+ # findbad_records = OneToMany('FindbadNodeRecord', order_by='-date_checked')
# NOTE: can I move 'message_index, escellation_level, and penalty_level'
# into the same value? Maybe not penalty level, since there are only two;
# and, there may be additional message and escellation levels.
- send_email_to = Field(PickleType, default=None)
- action_description = Field(PickleType, default=None)
- message_arguments = Field(PickleType, default=None)
+ #send_email_to = Field(PickleType, default=None)
+ #action_description = Field(PickleType, default=None)
+ #message_arguments = Field(PickleType, default=None)
# NOTE: not sure this needs to be in the db.
- escellation_level = Field(Integer, default=0)
- stage = Field(String, default=None)
+ #escellation_level = Field(Integer, default=0)
+ #stage = Field(String, default=None)
observed_status = Field(String,default=None)
# NOTE: this is the child relation
- action = ManyToOne('ActionRecord', required=False)
+ #action = ManyToOne('ActionRecord', required=False)
class FindbadPCURecord(Entity):
@classmethod
status = Field(String,default="unknown")
+ message_id = Field(Int, default=0)
+ message_status = Field(String, default=None)
+ message_queue = Field(String, default=None)
+ message_created = Field(DateTime, default=None)
+
+ penalty_level = Field(Int, default=0)
+
@classmethod
def by_loginbase(cls, loginbase):
return cls.query.filter_by(loginbase=loginbase).first()
if not pcu:
logger.debug("no pcu for %s" % nodename)
print "no pcu for %s" % nodename
- return False # "%s has no pcu" % nodename
+ return "%s has no pcu" % nodename
values = get_pcu_values(pcu['pcu_id'])
if values == None:
logger.debug("No values for pcu probe %s" % nodename)
print "No values for pcu probe %s" % nodename
- return False #"no info for pcu_id %s" % pcu['pcu_id']
+ return "no info for pcu_id %s" % pcu['pcu_id']
# Try the PCU first
logger.debug("Trying PCU %s %s" % (pcu['hostname'], pcu['model']))
with PlanetLab.
""")
+ pcufailed_notice =("""Could not use PCU to reboot %(hostname)s""",
+
+"""As part of PlanetLab node monitoring and maintenance, we tried to use the PCU
+registered for %(hostname)s, but could not for some reason.
+
+Please help.
+
+Thank you very much for your help,
+ -- PlanetLab Central (support@planet-lab.org)
+""")
+ online_notice=("""Host %(hostname)s is online""",
+ """
+This notice is simply to let you know that:
+ %(hostname)s
+
+is online and operational. Thank you very much for your help!
+ """)
+ test_notice=("""Host %(hostname)s is testing""",
+ """
+This notice is simply to test whether notices work.
+ %(hostname)s
+
+Thank you very much for your help!
+ """)
+ offline_notice=("""Host %(hostname)s is offline""",
+ """
+This notice is simply to let you know that:
+ %(hostname)s
+
+is offline and or non-operational. Please investigate, thank you very much for your help!
+ """)
+
+ clear_penalty=("""All penalties have been cleared from site %(loginbase)s""",
+ """
+This notice is to let you know that any penalties previously applied to your site have
+been removed: %(penalty_level)s.
+
+All privileges have been restored. If your slices were disabled, please allow
+up to 30 minutes for them to return to enabled.
+
+Legend:
+
+ 0 - no penalties applied
+ 1 - site is disabled. no new slices can be created.
+ 2+ - all existing slices will be disabled.
+ """)
+
+ increase_penalty=("""Penalty increased for site %(loginbase)s""",
+ """
+This notice is to let you know that the penalty applied to your site has
+increased: %(penalty_level)s.
+
+legend:
+
+ 0 - no penalty applied
+ 1 - site is disabled. no new slices can be created.
+ 2+ - all existing slices will be disabled.
+ """)
+
nmreset =("""NM Reset at %(loginbase)s""",
"""
Monitor restarted NM on the following machines:
api = plc.getCachedAuthAPI()
l_sites = api.GetSites({'peer_id':None},
['login_base', 'site_id', 'abbreviated_name', 'latitude',
- 'longitude', 'max_slices', 'slice_ids', 'node_ids', 'enabled' ])
+ 'longitude', 'max_slices', 'slice_ids', 'node_ids', 'enabled', 'date_created' ])
l_nodes = api.GetNodes({'peer_id':None},
['hostname', 'node_id', 'ports', 'site_id', 'version', 'last_updated',
'date_created', 'last_contact', 'pcu_ids', 'nodenetwork_ids'])
round = 1
count = 0
+def main():  # Argument-less entry point; presumably for the driver script that imports main from each checker and calls it with no arguments -- confirm.
+ main2(config)  # `config` is not a parameter here, so it is resolved from module scope at call time
-def main(config):
+def main2(config):
l_plcnodes = plccache.l_nodes
l_nodes = get_nodeset(config)
checkAndRecordState(l_nodes, l_plcnodes)
+# Node states:
+
+def check_node_state(rec, node):  # Update node.status and node.last_changed in place from the observation in rec; there is no return statement.
+ # rec is read for observed_status and plc_node_stats; node is read and written for status and last_changed.
+ node_state = rec.observed_status
+ if rec.plc_node_stats:
+ boot_state = rec.plc_node_stats['boot_state']
+ last_contact = rec.plc_node_stats['last_contact']
+ else:  # record carries no PLC stats: fall back to placeholders so the checks below can still run
+ boot_state = "unknown"
+ last_contact = None
+
+ if node_state == 'DOWN' and ( node.status == 'online' or node.status == 'good' ):  # an up node observed DOWN becomes 'offline' and its clock restarts
+ print "changed status from %s to offline" % node.status
+ node.status = 'offline'
+ node.last_changed = datetime.now()
+
+ if node_state == 'BOOT' and changed_lessthan(node.last_changed, 0.5) and node.status != 'online':  # NOTE(review): needs the last change to be within 0.5 days, so a node whose status changed longer ago is not moved to 'online' here even when observed BOOT -- confirm this is intended
+ print "changed status from %s to online" % node.status
+ node.status = 'online'
+ node.last_changed = datetime.now()
+
+ if node.status == 'online' and changed_greaterthan(node.last_changed, 0.5):  # 'online' for more than 0.5 days is promoted to 'good'
+ #send thank you notice, or on-line notice.
+ print "changed status from %s to good" % node.status
+ node.status = 'good'
+ # NOTE: do not reset last_changed, or you lose how long it's been up.
+
+ #if node.status == 'offline' and changed_greaterthan(node.last_changed, 1): # and pcu.status == 'good'
+ # # attempt reboots
+ # pass
+ #if node.status == 'offline' and changed_greaterthan(node.last_changed, 1.5): # and node.has_pcu
+ # # send PCU failure message
+ # pass
+
+ if node.status == 'offline' and changed_greaterthan(node.last_changed, 2):  # 'offline' for more than 2 days becomes 'down'
+ print "changed status from %s to down" % node.status
+ # send down node notice
+ node.status = 'down'
+ node.last_changed = datetime.now()
+
+ if ( boot_state == 'disabled' or last_contact == None ) and \
+ changed_greaterthan(node.last_changed, 2*30) and \
+ node.status != 'down':  # disabled nodes, or nodes with no last_contact value (including records without plc_node_stats), unchanged for more than 60 days (2*30) are marked 'down'
+ print "changed status from %s to down" % node.status
+ node.status = 'down'
+ node.last_changed = datetime.now()
+
def checkAndRecordState(l_nodes, l_plcnodes):
global count
for nodename in l_nodes:
- d_node = None
- for node in l_plcnodes:
- if node['hostname'] == nodename:
- d_node = node
- break
- if not d_node:
- continue
- pf = HistoryNodeRecord.findby_or_create(hostname=nodename)
- pf.last_checked = datetime.now()
+ nodehist = HistoryNodeRecord.findby_or_create(hostname=nodename,
+ if_new_set={'status' : 'offline',
+ 'last_changed' : datetime.now()})
+ nodehist.last_checked = datetime.now()
try:
# Find the most recent record
noderec = FindbadNodeRecord.query.filter(FindbadNodeRecord.hostname==nodename).order_by(FindbadNodeRecord.date_checked.desc()).first()
- #print "NODEREC: ", noderec.date_checked
except:
print "COULD NOT FIND %s" % nodename
import traceback
print "none object for %s"% nodename
continue
- node_state = noderec.observed_status
- if noderec.plc_node_stats:
- boot_state = noderec.plc_node_stats['boot_state']
- else:
- boot_state = "unknown"
-
- if node_state == "BOOT":
- if pf.status != "good":
- pf.last_changed = datetime.now()
- pf.status = "good"
- elif node_state == "DEBUG":
- if pf.status != boot_state:
- pf.last_changed = datetime.now()
- pf.status = boot_state
- else:
- if pf.status != "down":
- pf.last_changed = datetime.now()
- pf.status = "down"
+ check_node_state(noderec, nodehist)
count += 1
- print "%d %35s %s since(%s)" % (count, nodename, pf.status, diff_time(time.mktime(pf.last_changed.timetuple())))
+ print "%d %35s %s since(%s)" % (count, nodename, nodehist.status, diff_time(time.mktime(nodehist.last_changed.timetuple())))
# NOTE: this commits all pending operations to the DB. Do not remove, or
# replace with another operations that also commits all pending ops, such
# as session.commit() or flush() or something
- print HistoryNodeRecord.query.count()
session.flush()
+ print HistoryNodeRecord.query.count()
return True
config = parsermodule.parse_args(parser)
try:
- main(config)
+ main2(config)
except Exception, err:
import traceback
print traceback.print_exc()
import sys
import string
import time
+import sets
from datetime import datetime,timedelta
from monitor import database
api = plc.getAuthAPI()
-def main(config):
+def main():  # Argument-less entry point; presumably for the driver script that imports main from each checker and calls it with no arguments -- confirm.
+ main2(config)  # `config` is not a parameter here, so it is resolved from module scope at call time
+
+def main2(config):
l_plcpcus = plccache.l_pcus
l_pcus = None
- if config.pcu:
+ if config.site is not None:
+ site = api.GetSites(config.site)
+ l_nodes = api.GetNodes(site[0]['node_ids'], ['pcu_ids'])
+ pcus = []
+ for node in l_nodes:
+ pcus += node['pcu_ids']
+ # clear out dups.
+ l_pcus = [pcu for pcu in sets.Set(pcus)]
+ elif config.pcu:
for pcu in l_plcpcus:
if ( pcu['hostname'] is not None and config.pcu in pcu['hostname'] ) or \
( pcu['ip'] is not None and config.pcu in pcu['ip'] ):
hn2lb = plccache.plcdb_hn2lb
+def check_pcu_state(rec, pcu):  # Update pcu.status and pcu.last_changed in place from rec.reboot_trial_status; there is no return statement.
+ # A reboot_trial_status of 0 or "0" is treated as success; every other value is treated as a failure.
+ pcu_state = rec.reboot_trial_status
+
+ if ( pcu_state == 'NetDown' or pcu_state == 'Not_Run' or not ( pcu_state == 0 or pcu_state == "0" ) ) and \
+ ( pcu.status == 'online' or pcu.status == 'good' ):  # an up PCU with a failed trial becomes 'offline'; the 'NetDown'/'Not_Run' tests are already implied by the final not-0 clause
+ print "changed status from %s to offline" % pcu.status
+ pcu.status = 'offline'
+ pcu.last_changed = datetime.now()
+
+ if ( pcu_state == 0 or pcu_state == "0" ) and changed_lessthan(pcu.last_changed, 0.5) and pcu.status != 'online':  # NOTE(review): needs the last change to be within 0.5 days, so a PCU whose status changed longer ago is not moved to 'online' here even on a successful trial -- confirm this is intended
+ print "changed status from %s to online" % pcu.status
+ pcu.status = 'online'
+ pcu.last_changed = datetime.now()
+
+ if pcu.status == 'online' and changed_greaterthan(pcu.last_changed, 0.5):  # 'online' for more than 0.5 days is promoted to 'good'
+ #send thank you notice, or on-line notice.
+ print "changed status from %s to good" % pcu.status
+ pcu.status = 'good'
+ # NOTE: do not reset last_changed, or you lose how long it's been up.
+
+ if pcu.status == 'offline' and changed_greaterthan(pcu.last_changed, 2):  # 'offline' for more than 2 days becomes 'down'
+ # send down pcu notice
+ print "changed status from %s to down" % pcu.status
+ pcu.status = 'down'
+ pcu.last_changed = datetime.now()
+
+ if ( pcu.status == 'offline' or pcu.status == 'down' ) and changed_greaterthan(pcu.last_changed, 2*30):  # NOTE(review): an 'offline' PCU older than 2 days was already handled just above (which resets last_changed), so this appears to fire only for PCUs already 'down', refreshing last_changed every 60 days -- confirm intent
+ print "changed status from %s to down" % pcu.status
+ pcu.status = 'down'
+ pcu.last_changed = datetime.now()
+
def checkAndRecordState(l_pcus, l_plcpcus):
count = 0
for pcuname in l_pcus:
if not d_pcu:
continue
- pf = HistoryPCURecord.findby_or_create(plc_pcuid=d_pcu['pcu_id'])
- pf.last_checked = datetime.now()
+ pcuhist = HistoryPCURecord.findby_or_create(plc_pcuid=d_pcu['pcu_id'],
+ if_new_set={'status' : 'offline',
+ 'last_changed' : datetime.now()})
+ pcuhist.last_checked = datetime.now()
try:
# Find the most recent record
pcurec = FindbadPCURecord.query.filter(FindbadPCURecord.plc_pcuid==pcuname).order_by(FindbadPCURecord.date_checked.desc()).first()
- print "NODEREC: ", pcurec.date_checked
except:
- print "COULD NOT FIND FB record for %s" % reboot.pcu_name(pcu)
+ print "COULD NOT FIND FB record for %s" % reboot.pcu_name(d_pcu)
import traceback
print traceback.print_exc()
# don't have the info to create a new entry right now, so continue.
continue
- pcu_state = pcurec.reboot_trial_status
- current_state = pcu_state
-
- if current_state == 0 or current_state == "0":
- if pf.status != "good":
- pf.last_changed = datetime.now()
- pf.status = "good"
- elif current_state == 'NetDown':
- if pf.status != "netdown":
- pf.last_changed = datetime.now()
- pf.status = "netdown"
- elif current_state == 'Not_Run':
- if pf.status != "badconfig":
- pf.last_changed = datetime.now()
- pf.status = "badconfig"
- else:
- if pf.status != "error":
- pf.last_changed = datetime.now()
- pf.status = "error"
+ if not pcurec:
+ print "none object for pcu %s"% reboot.pcu_name(d_pcu)
+ continue
+
+ check_pcu_state(pcurec, pcuhist)
count += 1
- print "%d %35s %s since(%s)" % (count, reboot.pcu_name(d_pcu), pf.status, diff_time(time.mktime(pf.last_changed.timetuple())))
+ print "%d %35s %s since(%s)" % (count, reboot.pcu_name(d_pcu), pcuhist.status, diff_time(time.mktime(pcuhist.last_changed.timetuple())))
# NOTE: this commits all pending operations to the DB. Do not remove, or
# replace with another operations that also commits all pending ops, such
# as session.commit() or flush() or something
- print HistoryPCURecord.query.count()
session.flush()
+ print HistoryPCURecord.query.count()
return True
if __name__ == '__main__':
parser = parsermodule.getParser()
- parser.set_defaults(filename=None, pcu=None, pcuselect=False, pcugroup=None, cachepcus=False)
+ parser.set_defaults(filename=None, pcu=None, site=None, pcuselect=False, pcugroup=None, cachepcus=False)
parser.add_option("", "--pcu", dest="pcu", metavar="hostname",
help="Provide a single pcu to operate on")
+ parser.add_option("", "--site", dest="site", metavar="sitename",
+ help="Provide a single sitename to operate on")
parser.add_option("", "--pculist", dest="pculist", metavar="file.list",
help="Provide a list of files to operate on")
config = parsermodule.parse_args(parser)
try:
- main(config)
+ main2(config)
except Exception, err:
import traceback
print traceback.print_exc()
from monitor import database
from monitor import parser as parsermodule
from monitor import config
-from monitor.database.info.model import HistorySiteRecord, FindbadNodeRecord, session
+from monitor.database.info.model import HistorySiteRecord, HistoryNodeRecord, session
from monitor.wrapper import plc, plccache
from monitor.const import MINUP
checkAndRecordState(l_sites, l_plcsites)
-def getnewsite(nodelist):
- new = True
- for node in nodelist:
- try:
- noderec = FindbadNodeRecord.query.filter(FindbadNodeRecord.hostname==node['hostname']).order_by(FindbadNodeRecord.date_checked.desc()).first()
- if noderec is not None and \
- noderec.plc_node_stats['last_contact'] != None:
- new = False
- except:
- import traceback
- print traceback.print_exc()
- return new
-
def getnodesup(nodelist):
up = 0
for node in nodelist:
try:
- noderec = FindbadNodeRecord.query.filter(FindbadNodeRecord.hostname==node['hostname']).order_by(FindbadNodeRecord.date_checked.desc()).first()
- #noderec = FindbadNodeRecord.select(FindbadNodeRecord.q.hostname==node['hostname'],
- # orderBy='date_checked').reversed()[0]
- if noderec is not None and noderec.observed_status == "BOOT":
+ nodehist = HistoryNodeRecord.findby_or_create(hostname=node['hostname'])  # counts nodes from their history record instead of the latest findbad observation; NOTE(review): the helper's name suggests a record is created when none exists -- confirm
+ if nodehist is not None and nodehist.status == "good":  # only 'good' counts as up; NOTE(review): nodes in the 'online' state set by check_node_state are not counted -- confirm this is intended
up = up + 1
except:
import traceback
print traceback.print_exc()
return up
+def check_site_state(rec, sitehist):  # Update sitehist.status and sitehist.last_changed in place from sitehist.new and sitehist.nodes_up; rec is accepted but never referenced in this body.
+ # There is no return statement: the mutated sitehist is the result.
+ if sitehist.new and sitehist.status != 'new':  # a site flagged new is labelled 'new' once, timestamping the change
+ sitehist.status = 'new'
+ sitehist.last_changed = datetime.now()
+
+ if not sitehist.new:  # the nodes_up evaluation that follows is for sites not flagged new
+
+ if sitehist.nodes_up >= MINUP:  # enough nodes up: move toward 'online', then 'good'
+
+ if sitehist.status != 'online' and sitehist.status != 'good':  # arriving from any other status restarts the clock
+ sitehist.last_changed = datetime.now()
+
+ if changed_lessthan(sitehist.last_changed, 0.5) and sitehist.status != 'online':  # within 0.5 days of the last change the site is 'online'
+ print "changed status from %s to online" % sitehist.status
+ sitehist.status = 'online'
+
+ if changed_greaterthan(sitehist.last_changed, 0.5) and sitehist.status != 'good':  # after more than 0.5 days it is promoted to 'good'; last_changed is left untouched
+ print "changed status from %s to good" % sitehist.status
+ sitehist.status = 'good'
+
+ else: # sitehist.nodes_up < MINUP:
+
+ if sitehist.status != 'offline' and sitehist.status != 'down':  # arriving from any other status restarts the clock
+ sitehist.last_changed = datetime.now()
+
+ if changed_lessthan(sitehist.last_changed, 0.5) and sitehist.status != 'offline':  # within 0.5 days of the last change the site is 'offline'
+ print "changed status from %s to offline" % sitehist.status
+ sitehist.status = 'offline'
+
+ if changed_greaterthan(sitehist.last_changed, 0.5) and sitehist.status != 'down':  # after more than 0.5 days it is demoted to 'down'; last_changed is left untouched
+ print "changed status from %s to down" % sitehist.status
+ sitehist.status = 'down'
+
def checkAndRecordState(l_sites, l_plcsites):
count = 0
lb2hn = plccache.plcdb_lb2hn
continue
if sitename in lb2hn:
- pf = HistorySiteRecord.findby_or_create(loginbase=sitename)
-
- pf.last_checked = datetime.now()
- pf.slices_total = d_site['max_slices']
- pf.slices_used = len(d_site['slice_ids'])
- pf.nodes_total = len(lb2hn[sitename])
- pf.nodes_up = getnodesup(lb2hn[sitename])
- pf.new = getnewsite(lb2hn[sitename])
- pf.enabled = d_site['enabled']
-
- if pf.nodes_up >= MINUP:
- if pf.status != "good": pf.last_changed = datetime.now()
- pf.status = "good"
- else:
- if pf.status != "down": pf.last_changed = datetime.now()
- pf.status = "down"
+ sitehist = HistorySiteRecord.findby_or_create(loginbase=sitename,
+ if_new_set={'status' : 'unknown',
+ 'last_changed' : datetime.now(),
+ 'message_id': 0,
+ 'penalty_level' : 0})
+ sitehist.last_checked = datetime.now()
+
+ sitehist.slices_total = d_site['max_slices']
+ sitehist.slices_used = len(d_site['slice_ids'])
+ sitehist.nodes_total = len(lb2hn[sitename])
+ if sitehist.message_id != 0:
+ rtstatus = mailer.getTicketStatus(sitehist.message_id)
+ sitehist.message_status = rtstatus['Status']
+ sitehist.message_queue = rtstatus['Queue']
+ sitehist.message_created = datetime.fromtimestamp(rtstatus['Created'])
+
+ sitehist.nodes_up = getnodesup(lb2hn[sitename])
+ sitehist.new = changed_lessthan(datetime.fromtimestamp(d_site['date_created']), 30) # created < 30 days ago
+ sitehist.enabled = d_site['enabled']
+
+ check_site_state(d_site, sitehist)
count += 1
- print "%d %15s slices(%2s) nodes(%2s) up(%2s) %s" % (count, sitename, pf.slices_used,
- pf.nodes_total, pf.nodes_up, pf.status)
- pf.flush()
+ print "%d %15s slices(%2s) nodes(%2s) up(%2s) %s" % (count, sitename, sitehist.slices_used,
+ sitehist.nodes_total, sitehist.nodes_up, sitehist.status)
+ sitehist.flush()
print HistorySiteRecord.query.count()
session.flush()
@expose(template="monitorweb.templates.sitelist")
def site(self, filter='all'):
- filtercount = {'good' : 0, 'down': 0, 'new' : 0, 'pending' : 0, 'all' : 0}
+ filtercount = {'good' : 0, 'down': 0, 'online':0, 'offline' : 0, 'new' : 0, 'pending' : 0, 'all' : 0}
fbquery = HistorySiteRecord.query.all()
query = []
for site in fbquery:
#status-error { background-color: indianred; }\r
#status-none { background-color: white; }\r
\r
+#site-new { background-color: gold; }\r
#site-good { background-color : darkseagreen; }\r
+#site-online { background-color : lightgreen; }\r
+#site-offline { background-color: red; }\r
#site-down { background-color: indianred; }\r
\r
+#site-0 { background-color : white; }\r
+#site-1 { background-color: gold; }\r
+#site-2 { background-color: indianred; }\r
+\r
#node-BOOT { background-color: darkseagreen; }\r
#node-DOWN { background-color: indianred; }\r
#node-DEBUG { background-color: gold; }\r
<span class="icon">${site.loginbase}</span></a>
</td>
<td py:content="site.enabled"></td>
- <td>n/a</td>
+ <td id="site-${site.penalty_level}">${site.penalty_level}</td>
<td>${site.slices_used}/${site.slices_total}</td>
<td>${site.nodes_up} / ${site.nodes_total}</td>
<td id="site-${site.status}" py:content="diff_time(mktime(site.last_changed.timetuple()))"></td>
</div>
</td>
<td py:content="site.enabled"></td>
- <td>n/a</td>
+ <td id="site-${site.penalty_level}">${site.penalty_level}</td>
<td>${site.slices_used}/${site.slices_total}</td>
<td>${site.nodes_up} / ${site.nodes_total}</td>
<td id="site-${site.status}" py:content="diff_time(mktime(site.last_changed.timetuple()))"></td>