Group: Applications/System
Requires: python
-Requires: python-sqlalchemy
-Requires: python-elixir
+#Requires: python-sqlalchemy
+#Requires: python-elixir
Requires: openssh-clients
Requires: perl-libwww-perl
Requires: PLCWWW >= 4.2
Requires: bootcd-planetlab-i386 >= 4.2
-Requires: zabbix-client
-Requires: zabbix-gui
-Requires: zabbix-server
+#Requires: zabbix-client
+#Requires: zabbix-gui
+#Requires: zabbix-server
%description server
The server side include all python modules and scripts needed to fully
php /usr/share/%{name}/zabbix/getdefines.php > %{python_sitearch}/monitor/database/zabbixapi/defines.py
# apply patches to zabbix
-patch -d /var/www/html/zabbix/ -p0 < /usr/share/%{name}/zabbix/zabbix-auto-login.diff
+#patch -d /var/www/html/zabbix/ -p0 < /usr/share/%{name}/zabbix/zabbix-auto-login.diff
#chkconfig --add monitor-server
#chkconfig monitor-server on
from monitor.model import *
from monitor.common import email_exception, found_within
from monitor.database.info.model import *
+from monitor.database.info.interface import *
from monitor.wrapper import plc
from monitor.wrapper import plccache
from monitor.wrapper.emailTxt import mailtxt
traceback.print_exc()
print self.c.modules.sys.path
except:
+ email_exception()
traceback.print_exc()
return "unknown"
def get_bootmanager_log(self):
download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node)
- os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
+ #os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
+ os.system("cp log/bm.%s.log.gz log/bm.%s.log" % (self.node, self.node))
log = open("log/bm.%s.log" % self.node, 'r')
return log
sys.exit(1)
for node in nodes:
- reboot(node, config)
+ # get sitehist
+ lb = plccache.plcdb_hn2lb[node]
+ sitehist = SiteInterface.get_or_make(loginbase=lb)
+ #reboot(node, config)
+ restore(sitehist, node, config=None, forced_action=None)
if __name__ == "__main__":
main()
from nodebad import main as nodebad_main
from pcubad import main as pcubad_main
from monitor.wrapper import plccache
+from monitor.database.info.model import *
+from monitor.common import *
import sys
if __name__ == '__main__':
cfg = parsermodule.parse_args(parser)
try:
- print "sync with plc"
- plccache.sync()
print "findbad"
findbad_main()
+ print "before-len: ", len( [ i for i in session] )
+ session.flush(); session.clear()
+ print "after-len: ", len( [ i for i in session] )
print "findbadpcu"
findbadpcu_main()
+ print "before-len: ", len( [ i for i in session] )
+ session.flush(); session.clear()
+ print "after-len: ", len( [ i for i in session] )
print "nodebad"
nodebad_main()
+ print "before-len: ", len( [ i for i in session] )
+ session.flush(); session.clear()
+ print "after-len: ", len( [ i for i in session] )
print "pcubad"
pcubad_main()
+ print "before-len: ", len( [ i for i in session] )
+ session.flush(); session.clear()
+ print "after-len: ", len( [ i for i in session] )
print "sitebad"
sitebad_main()
+ print "before-len: ", len( [ i for i in session] )
+ session.flush(); session.clear()
+ print "after-len: ", len( [ i for i in session] )
except Exception, err:
import traceback
+ email_exception()
print traceback.print_exc()
print "Exception: %s" % err
print "Saving data... exitting."
l_pcus = [pcu for pcu in sets.Set(pcus)]
elif config.node is not None:
- l_nodes = plcacche.GetNodeByName(config.node)
- pcus = []
- for node in l_nodes:
- pcus += node['pcu_ids']
+ node = plccache.GetNodeByName(config.node)
+ print node
+ pcus = node['pcu_ids']
# clear out dups.
l_pcus = [pcu for pcu in sets.Set(pcus)]
check_monitor_conf
check_monitor_schema_and_data
- check_pg_hba $ZABBIX_DB_NAME $ZABBIX_DB_USER
- check_user_and_db $ZABBIX_DB_NAME $ZABBIX_DB_USER
+ #check_pg_hba $ZABBIX_DB_NAME $ZABBIX_DB_USER
+ #check_user_and_db $ZABBIX_DB_NAME $ZABBIX_DB_USER
if [ -n "$WROTE_PG_CONFIG" ] ; then
# NOTE: restart db to enable access by users granted above.
dialog "$MESSAGE"
fi
- check_zabbix_schema_and_data
- check_zabbix_templates_and_import
+ #check_zabbix_schema_and_data
+ #check_zabbix_templates_and_import
# create /etc/httpd/conf.d/monitorweb.conf
start_tg_server
# START zabbix services. SETUP default config files.
- check_zab_server
- check_zab_agentd
- check_zab_webconfig
+ #check_zab_server
+ #check_zab_agentd
+ #check_zab_webconfig
result "$MESSAGE"
;;
dialog "$MESSAGE"
stop_tg_server
- service zabbix_server stop
- service zabbix_agentd stop
+ #service zabbix_server stop
+ #service zabbix_agentd stop
# TODO: is there anything to stop?
result "$MESSAGE"
;;
__session__ = mon_session
-#class FindbadNodeRecordSync(Entity):
-# hostname = Field(String(250),primary_key=True) #,alternateMethodName='by_hostname')
-# round = Field(Int,default=0)
-
-#class FindbadPCURecordSync(Entity):
-# plc_pcuid = Field(Int,primary_key=True) #,alternateMethodName='by_pcuid')
-# round = Field(Int,default=0)
-
class FindbadNodeRecord(Entity):
@classmethod
def get_all_latest(cls):
return cls.query.all()
- #fbsync = FindbadNodeRecordSync.get_by(hostname="global")
- #if fbsync:
- # return cls.query.filter_by(round=fbsync.round)
- #else:
- # return []
@classmethod
def get_latest_by(cls, **kwargs):
return cls.query.filter_by(**kwargs).first()
- #fbsync = FindbadNodeRecordSync.get_by(hostname="global")
- #if fbsync:
- # kwargs['round'] = fbsync.round
- # return cls.query.filter_by(**kwargs).order_by(FindbadNodeRecord.date_checked.desc())
- #else:
- # return []
@classmethod
def get_latest_n_by(cls, n=3, **kwargs):
return cls.query.filter_by(**kwargs)
- #fbsync = FindbadNodeRecordSync.get_by(hostname="global")
- #kwargs['round'] = fbsync.round
- #ret = []
- #for i in range(0,n):
- # kwargs['round'] = kwargs['round'] - i
- # f = cls.query.filter_by(**kwargs).first()
- # if f:
- # ret.append(f)
- #return ret
# ACCOUNTING
date_checked = Field(DateTime,default=datetime.now)
# ACCOUNTING
date_checked = Field(DateTime)
round = Field(Int,default=0)
- plc_pcuid = Field(Int) #alternateID=True,alternateMethodName='by_pcuid')
+ plc_pcuid = Field(Int)
# EXTERNAL
plc_pcu_stats = Field(PickleType,default=None)
last_checked = Field(DateTime,default=datetime.now)
last_changed = Field(DateTime,default=datetime.now)
status = Field(String,default="unknown")
+ haspcu = Field(Boolean,default=False)
acts_as_versioned(ignore=['last_changed', 'last_checked'])
@classmethod
act = ActionRecord(loginbase=self.db.loginbase,
hostname=hostname,
action='reboot',
- action_type='first_try_reboot',
+ action_type='try_reboot',
error_string=err)
from monitor.wrapper import plc, plccache
import traceback
-from monitor.common import nmap_port_status
+from monitor.common import nmap_port_status, email_exception
COMON_COTOPURL= "http://summer.cs.princeton.edu/status/tabulator.cgi?" + \
"table=table_nodeview&" + \
except:
print "ERROR:"
+ email_exception(nodename)
print traceback.print_exc()
pass
plc_lock.acquire()
d_node = None
try:
- d_node = plc.getNodes({'hostname': nodename}, ['pcu_ids', 'site_id',
- 'date_created', 'last_updated',
- 'last_contact', 'boot_state', 'nodegroup_ids'])[0]
+ d_node = plccache.GetNodeByName(nodename)
+ #d_node = plc.getNodes({'hostname': nodename}, ['pcu_ids', 'site_id',
+ # 'date_created', 'last_updated',
+ # 'last_contact', 'boot_state', 'nodegroup_ids'])[0]
except:
traceback.print_exc()
plc_lock.release()
d_site = None
values['loginbase'] = ""
try:
- d_site = plc.getSites({'site_id': site_id},
- ['max_slices', 'slice_ids', 'node_ids', 'login_base'])[0]
+ d_site = plccache.GetSitesById([ site_id ])[0]
+ #d_site = plc.getSites({'site_id': site_id},
+ # ['max_slices', 'slice_ids', 'node_ids', 'login_base'])[0]
values['loginbase'] = d_site['login_base']
except:
traceback.print_exc()
with PlanetLab.
""")
+ pcumissing_notice =("""MONTEST: No PCU available to reboot %(hostname)s""",
+"""As part of PlanetLab node monitoring and maintenance, we noticed that there is no PCU
+associated with %(hostname)s, so we could not reboot it ourselves.
+
+To save you time in the future, please take a moment to register the PCU functionality for
+your machines here:
+
+ http://www.planet-lab.org/db/sites/pcu.php
+
+Thank you very much for your help,
+ -- PlanetLab Central (support@planet-lab.org)
+""")
pcufailed_notice =("""MONTEST: Could not use PCU to reboot %(hostname)s""",
"""As part of PlanetLab node monitoring and maintenance, we tried to use the PCU
This notice is simply to let you know that:
%(hostname)s
-is down, disconnected from the network and/or non-operational. Please investigate, thank you very much for your help!
+is down, disconnected from the network and/or non-operational.
+
+Please investigate, thank you very much for your help!
+
+ http://monitor.planet-lab.org:8082/pcuview?loginbase=%(loginbase)s
""")
clear_penalty=("""MONTEST: All penalties have been cleared from site %(loginbase)s""",
import sys
from monitor.wrapper import plc
from monitor.database.info.model import *
+import profile
def dsites_from_lsites(l_sites):
d_sites = {}
global plcdb_hn2lb
global plcdb_lb2hn
global plcdb_id2lb
+ print "initing plccache"
dbsites = PlcSite.query.all()
l_sites = [ s.plc_site_stats for s in dbsites ]
+ print "plcnode"
dbnodes = PlcNode.query.all()
l_nodes = [ s.plc_node_stats for s in dbnodes ]
+ print "plcpcu"
dbpcus = PlcPCU.query.all()
l_pcus = [ s.plc_pcu_stats for s in dbpcus ]
+ print "dsites_from_lsites"
(d_sites,id2lb) = dsites_from_lsites(l_sites)
+ print "dsn_from_dsln"
(plcdb, hn2lb, lb2hn) = dsn_from_dsln(d_sites, id2lb, l_nodes)
plcdb_hn2lb = hn2lb
ret.append(site.plc_site_stats)
return ret
+def GetSitesById(idlist):
+ ret = []
+ for site_id in idlist:
+ site = PlcSite.get_by(site_id=site_id)
+ ret.append(site.plc_site_stats)
+ return ret
+
+def deleteExtra(l_plc, objectClass=PlcSite, dbKey='loginbase', plcKey='login_base'):
+ dbobjs = objectClass.query.all()
+ dbobj_key = [ getattr(s, dbKey) for s in dbobjs ]
+ plcobj_key = [ s[plcKey] for s in l_plc ]
+ extra_key = set(dbobj_key) - set(plcobj_key)
+ for obj in extra_key:
+ print "deleting %s" % obj
+ dbobj = objectClass.get_by(**{dbKey : obj})
+ dbobj.delete()
+
def sync():
l_sites = plc.api.GetSites({'peer_id':None},
['login_base', 'site_id', 'abbreviated_name', 'latitude',
'longitude', 'max_slices', 'slice_ids', 'node_ids',
'enabled', 'date_created' ])
l_nodes = plc.api.GetNodes({'peer_id':None},
- ['hostname', 'node_id', 'ports', 'site_id',
- 'version', 'last_updated', 'date_created',
+ ['hostname', 'node_id', 'ports', 'site_id', 'boot_state',
+ 'version', 'last_updated', 'date_created', 'key',
'last_contact', 'pcu_ids', 'nodenetwork_ids'])
l_pcus = plc.api.GetPCUs()
dbsite.loginbase = site['login_base']
dbsite.date_checked = datetime.now()
dbsite.plc_site_stats = site
- #dbsite.flush()
- # TODO: delete old records.
+ deleteExtra(l_sites, PlcSite, 'loginbase', 'login_base')
+ deleteExtra(l_sites, HistorySiteRecord, 'loginbase', 'login_base')
+ session.flush()
+
+ print "sync pcus"
+ for pcu in l_pcus:
+ dbpcu = PlcPCU.findby_or_create(pcu_id=pcu['pcu_id'])
+ dbpcu.date_checked = datetime.now()
+ dbpcu.plc_pcu_stats = pcu
+ deleteExtra(l_pcus, PlcPCU, 'pcu_id', 'pcu_id')
+ deleteExtra(l_pcus, HistoryPCURecord, 'plc_pcuid', 'pcu_id')
session.flush()
print "sync nodes"
dbnode.hostname = node['hostname']
dbnode.date_checked = datetime.now()
dbnode.plc_node_stats = node
- #dbnode.flush()
- # TODO: delete old records.
- session.flush()
-
- print "sync pcus"
- for pcu in l_pcus:
- dbpcu = PlcPCU.findby_or_create(pcu_id=pcu['pcu_id'])
- dbpcu.date_checked = datetime.now()
- dbpcu.plc_pcu_stats = pcu
- #dbpcu.flush()
- # TODO: delete old records.
+ deleteExtra(l_nodes, PlcNode, 'hostname', 'hostname')
+ deleteExtra(l_nodes, HistoryNodeRecord, 'hostname', 'hostname')
session.flush()
init()
return
if __name__ == '__main__':
- sync()
+ profile.run('sync()')
else:
init()
node_state = rec.observed_status
if rec.plc_node_stats:
+ print rec.plc_node_stats
boot_state = rec.plc_node_stats['boot_state']
last_contact = rec.plc_node_stats['last_contact']
else:
if boot_state == 'disable': boot_state = 'disabled'
if boot_state == 'diag': boot_state = 'diagnose'
+ if len(rec.plc_node_stats['pcu_ids']) > 0:
+ node.haspcu = True
+ else:
+ node.haspcu = False
+
# NOTE: 'DOWN' and 'DEBUG' are temporary states, so only need
# 'translations' into the node.status state
# 'BOOT' is a permanent state, but we want it to have a bit of
except:
print "COULD NOT FIND %s" % nodename
import traceback
+ email_exception()
print traceback.print_exc()
continue
count += 1
print "%d %35s %s since(%s)" % (count, nodename, nodehist.status, diff_time(time.mktime(nodehist.last_changed.timetuple())))
- # NOTE: this commits all pending operations to the DB. Do not remove, or
- # replace with another operations that also commits all pending ops, such
- # as session.commit() or flush() or something
+ # NOTE: this commits all pending operations to the DB. Do not remove.
session.flush()
- print HistoryNodeRecord.query.count()
return True
# print k, "==" , net[k]
except:
print "Error with %s" % node
+ email_exception()
import traceback; print traceback.print_exc()
pass
l_pcus = [pcu for pcu in sets.Set(pcus)]
elif config.node:
- l_nodes = plccache.GetNodeByName(config.node)
- pcus = []
- for node in l_nodes:
- pcus += node['pcu_ids']
+ node = plccache.GetNodeByName(config.node)
+ pcus = node['pcu_ids']
# clear out dups.
l_pcus = [pcu for pcu in sets.Set(pcus)]
except:
print "COULD NOT FIND FB record for %s" % reboot.pcu_name(d_pcu)
import traceback
+ email_exception()
print traceback.print_exc()
# don't have the info to create a new entry right now, so continue.
continue
try:
# TODO: make sleep backoff, before stopping.
- time.sleep(4)
+ time.sleep(8)
ret = s.recv(count, socket.MSG_DONTWAIT)
except socket.error, e:
if e[0] == errno.EAGAIN:
lb = plccache.plcdb_hn2lb[host]
except:
print "unknown host in plcdb_hn2lb %s" % host
+ email_exception(host)
continue
nodeblack = BlacklistRecord.get_by(hostname=host)
print "%s %s %s" % (i, nodehist.hostname, nodehist.status)
if nodehist.status == 'good' and \
changed_lessthan(nodehist.last_changed, 1.0) and \
+ found_within(recent_actions, 'down_notice', 7.0) and \
not found_within(recent_actions, 'online_notice', 0.5):
+ # NOTE: searching for down_notice proves that the node has
+ # gone through a 'down' state first, rather than just
+ # flapping through: good, offline, online, ...
+ #
# NOTE: there is a narrow window in which this command must be
- # evaluated, otherwise the notice will not go out. this is not ideal.
+ # evaluated, otherwise the notice will not go out.
+ # this is not ideal.
sitehist.sendMessage('online_notice', hostname=host, viart=False)
print "send message for host %s online" % host
- pass
- if ( nodehist.status == 'offline' or nodehist.status == 'down' ) and \
+ # if a node is offline and doesn't have a PCU, remind the user that they should have one.
+ if not nodehist.haspcu and nodehist.status in ['offline', 'down'] and \
changed_greaterthan(nodehist.last_changed,1.0) and \
- not found_between(recent_actions, 'first_try_reboot', 3.5, 1):
+ not found_within(recent_actions, 'pcumissing_notice', 7.0):
+
+ sitehist.sendMessage('pcumissing_notice', hostname=host)
+ print "send message for host %s pcumissing_notice" % host
+
+ # if it is offline and HAS a PCU, then try to use it.
+ if nodehist.haspcu and nodehist.status in ['offline', 'down'] and \
+ changed_greaterthan(nodehist.last_changed,1.0) and \
+ not found_between(recent_actions, 'try_reboot', 3.5, 1):
sitehist.attemptReboot(host)
- print "send message for host %s first_try_reboot" % host
- pass
+ print "send message for host %s try_reboot" % host
- # NOTE: non-intuitive is that found_between(first_try_reboot, 3.5, 1)
+ # NOTE: non-intuitive is that found_between(try_reboot, 3.5, 1)
# will be false for a day after the above condition is satisfied
- if ( nodehist.status == 'offline' or nodehist.status == 'down' ) and \
+ if nodehist.haspcu and nodehist.status in ['offline', 'down'] and \
changed_greaterthan(nodehist.last_changed,1.5) and \
- found_between(recent_actions, 'first_try_reboot', 3.5, 1) and \
+ found_between(recent_actions, 'try_reboot', 3.5, 1) and \
not found_within(recent_actions, 'pcufailed_notice', 3.5):
- # found_within(recent_actions, 'first_try_reboot', 3.5) and \
# send pcu failure message
#act = ActionRecord(**kwargs)
sitehist.sendMessage('pcufailed_notice', hostname=host)
print "send message for host %s PCU Failure" % host
- pass
if nodehist.status == 'monitordebug' and \
changed_greaterthan(nodehist.last_changed, 1) and \
sitehist.sendMessage('down_notice', hostname=host)
print "send message for host %s down" % host
- pass
node_count = node_count + 1
+ print "time: ", time.strftime('%Y-%m-%d %H:%M:%S')
+ sys.stdout.flush()
session.flush()
for i,site in enumerate(sitenames):
# find all ticket ids for site ( could be on the site record? )
# determine if there are penalties within the last 30 days?
# if so, add a 'pause_penalty' action.
- if sitehist.db.message_id != 0 and sitehist.db.message_status == 'open' and sitehist.db.penalty_level > 0:
+ if sitehist.db.message_id != 0 and sitehist.db.message_status == 'open' and \
+ sitehist.db.penalty_level > 0 and not found_within(recent_actions, 'pause_penalty', 30):
# pause escalation
print "Pausing penalties for %s" % site
sitehist.pausePenalty()
site_count = site_count + 1
+ print "time: ", time.strftime('%Y-%m-%d %H:%M:%S')
+ sys.stdout.flush()
session.flush()
session.flush()
try:
main(hostnames, sitenames)
+ session.flush()
except KeyboardInterrupt:
print "Killed by interrupt"
session.flush()
import database
import comon
-from monitor.common import color_pcu_state, datetime_fromstr
+from monitor.common import color_pcu_state, datetime_fromstr, email_exception
from nodehistory import get_filefromglob
import time
import traceback
main()
except IOError:
pass
+ except:
+ email_exception()
up = 0
for node in nodelist:
try:
+ # NOTE: adding a condition for nodehist.haspcu would include pcus
+ # in the calculation
nodehist = HistoryNodeRecord.findby_or_create(hostname=node['hostname'])
nodebl = BlacklistRecord.get_by(hostname=node['hostname'])
if (nodehist is not None and nodehist.status != 'down') or \
up = up + 1
except:
import traceback
+ email_exception(node['hostname'])
print traceback.print_exc()
return up
--- /dev/null
+#!/usr/bin/python
+
+from monitor.wrapper import plccache
+
+if __name__ == "__main__":
+ plccache.sync()
@exception_handler(nodeaction_handler,"isinstance(tg_exceptions,RuntimeError)")
def pcuview(self, loginbase=None, pcuid=None, hostname=None, **data):
print "PCUVIEW------------------"
- session.clear()
+ print "before-len: ", len( [ i for i in session] )
+ session.flush(); session.clear()
+ print "after-len: ", len( [ i for i in session] )
sitequery=[]
pcuquery=[]
nodequery=[]
if loginbase:
actions = ActionRecord.query.filter_by(loginbase=loginbase
- ).filter(ActionRecord.date_created >= datetime.now() - timedelta(7)
+ ).filter(ActionRecord.date_created >= datetime.now() - timedelta(14)
).order_by(ActionRecord.date_created.desc())
actions = [ a for a in actions ]
sitequery = [HistorySiteRecord.by_loginbase(loginbase)]
def nodehistory(self, hostname=None):
query = []
if hostname:
- fbnode = FindbadNodeRecord.get_by(hostname=hostname)
- # TODO: add links for earlier history if desired.
+ #fbnode = FindbadNodeRecord.get_by(hostname=hostname)
+ ## TODO: add links for earlier history if desired.
+ #l = fbnode.versions[-100:]
+ #l.reverse()
+ #for node in l:
+ # prep_node_for_display(node)
+ # query.append(node)
+
+ fbnode = HistoryNodeRecord.get_by(hostname=hostname)
l = fbnode.versions[-100:]
l.reverse()
for node in l:
- prep_node_for_display(node)
+ #prep_node_for_display(node)
query.append(node)
+
return dict(query=query, hostname=hostname)
@expose(template="monitorweb.templates.sitehistory")
py:content="node.pcu_short_status">Reboot Status</span>
</div>
</td-->
- <td id="node-${node.observed_status}" nowrap="true">
+ <!--td id="node-${node.observed_status}" nowrap="true">
+ <a target="_top" href="${link('pcuview', hostname=node.hostname)}" py:content="node.hostname">your.host.org</a></td-->
+ <!--td nowrap="true" py:content="node.kernel"></td-->
+ <!--td py:content="node.date_checked"></td-->
+ <td py:content="node.last_checked"></td>
+ <td nowrap="true">
<a target="_top" href="${link('pcuview', hostname=node.hostname)}" py:content="node.hostname">your.host.org</a></td>
- <td nowrap="true" py:content="node.kernel"></td>
- <td py:content="node.date_checked"></td>
+ <td py:content="node.status"></td>
</tr>
</tbody>
</table>