From: Stephen Soltesz Date: Sat, 6 Dec 2008 01:36:21 +0000 (+0000) Subject: added a variety of updates to templates, to reference each other. X-Git-Tag: Monitor-2.0-0~11 X-Git-Url: http://git.onelab.eu/?p=monitor.git;a=commitdiff_plain;h=21f8190281cfb38c3e24dfba6c89420830435f1f added a variety of updates to templates, to reference each other. updates to init script to work better after install. updated automate fxn, and monitor-server.cron will run automate & sync from install time. --- diff --git a/automate-default.sh b/automate-default.sh index ef1cc2f..d01650e 100755 --- a/automate-default.sh +++ b/automate-default.sh @@ -64,26 +64,16 @@ source ${MONITOR_SCRIPT_ROOT}/agent.sh echo "Performing Findbad Nodes" ######################### # 1. FINDBAD NODES -rm -f ${MONITOR_DATA_ROOT}/production.findbad2.pkl -${MONITOR_SCRIPT_ROOT}/findbad.py --increment --cachenodes --debug=0 --dbname="findbad2" $DATE || : -cp ${MONITOR_DATA_ROOT}/production.findbad2.pkl ${MONITOR_DATA_ROOT}/production.findbad.pkl +${MONITOR_SCRIPT_ROOT}/findbad.py --increment $DATE || : ps ax | grep BatchMode | grep -v grep | awk '{print $1}' | xargs -r kill || : echo "Performing Findbad PCUs" ######################### # 2. FINDBAD PCUS -rm -f ${MONITOR_DATA_ROOT}/production.findbadpcus2.pkl -${MONITOR_SCRIPT_ROOT}/findbadpcu.py --increment --refresh --debug=0 --dbname=findbadpcus2 $DATE || : -cp ${MONITOR_DATA_ROOT}/production.findbadpcus2.pkl ${MONITOR_DATA_ROOT}/production.findbadpcus.pkl +${MONITOR_SCRIPT_ROOT}/findbadpcu.py --increment $DATE || : # clean up stray 'locfg' processes that hang around inappropriately... ps ax | grep locfg | grep -v grep | awk '{print $1}' | xargs -r kill || : -#echo "Generating web data" -# badcsv.txt -#${MONITOR_SCRIPT_ROOT}/printbadcsv.py | grep -v loading | tr -d ' ' > badcsv.txt -#cp badcsv.txt /plc/data/var/www/html/monitor/ -#${MONITOR_SCRIPT_ROOT}/showlatlon.py | head -9 | awk 'BEGIN {print ""} { print ""} END{print "
", $0, "
"}' | sed -e 's\|\\g' > /plc/data/var/www/html/monitor/regions.html - echo "Performing uptime changes for sites, nodes, and pcus" ######################## # 3. record last-changed for sites, nodes and pcus. @@ -91,21 +81,6 @@ ${MONITOR_SCRIPT_ROOT}/sitebad.py --increment || : ${MONITOR_SCRIPT_ROOT}/nodebad.py --increment || : ${MONITOR_SCRIPT_ROOT}/pcubad.py --increment || : -echo "Converting pkl files to phpserial" -######################### -# 4. convert pkl to php serialize format. -${MONITOR_SCRIPT_ROOT}/pkl2php.py -i findbadpcus2 -o findbadpcus -for f in act_all plcdb_hn2lb ; do - if [ -f ${MONITOR_DATA_ROOT}/production.$f.pkl ]; then - ${MONITOR_SCRIPT_ROOT}/pkl2php.py -i $f -o $f - else - echo "Warning: ${MONITOR_DATA_ROOT}/production.$f.pkl does not exist." - fi -done -${MONITOR_SCRIPT_ROOT}/pkl2php.py -i findbad -o findbadnodes -#${MONITOR_SCRIPT_ROOT}/pkl2php.py -i ad_dbTickets -o ad_dbTickets -#${MONITOR_SCRIPT_ROOT}/pkl2php.py -i idTickets -o idTickets - echo "Archiving pkl files" ######################### # Archive pkl files. @@ -117,11 +92,11 @@ for f in findbad act_all findbadpcus l_plcnodes site_persistflags node_persistfl fi done -echo "Running grouprins on all dbg nodes" +#echo "Running grouprins on all dbg nodes" ############################ # 5. Check if there are any nodes in dbg state. Clean up afterward. -${MONITOR_SCRIPT_ROOT}/grouprins.py --mail=1 --reboot --nodeselect 'state=DOWN&&boot_state=(boot|rins|dbg|diag)' --stopselect "state=BOOT" || : -${MONITOR_SCRIPT_ROOT}/grouprins.py --mail=1 --reboot --nodeselect 'state=DEBUG&&boot_state=(rins|dbg|boot)' --stopselect 'state=BOOT' || : +#${MONITOR_SCRIPT_ROOT}/grouprins.py --mail=1 --reboot --nodeselect 'state=DOWN&&boot_state=(boot|rins|dbg|diag)' --stopselect "state=BOOT" || : +#${MONITOR_SCRIPT_ROOT}/grouprins.py --mail=1 --reboot --nodeselect 'state=DEBUG&&boot_state=(rins|dbg|boot)' --stopselect 'state=BOOT' || : cp ${MONITOR_SCRIPT_ROOT}/monitor.log ${MONITOR_ARCHIVE_ROOT}/`date +%F-%H:%M`.monitor.log rm -f $MONITOR_PID diff --git a/findbadpcu.py b/findbadpcu.py index 8ebd891..070ddac 100755 --- a/findbadpcu.py +++ b/findbadpcu.py @@ -361,12 +361,6 @@ def main(): global_round = fbsync.round - if config.increment: - # update global round number to force refreshes across all nodes - global_round += 1 - fbsync.round = global_round - - fbsync.flush() if config.site is not None: api = plc.getAuthAPI() @@ -379,6 +373,7 @@ def main(): l_pcus = [pcu for pcu in sets.Set(pcus)] elif config.pcuselect is not None: n, pcus = pcu_select(config.pcuselect) + print pcus # clear out dups. l_pcus = [pcu for pcu in sets.Set(pcus)] @@ -392,6 +387,12 @@ def main(): l_pcus = [ config.pcuid ] l_pcus = [int(pcu) for pcu in l_pcus] + if config.increment: + # update global round number to force refreshes across all nodes + global_round += 1 + fbsync.round = global_round + fbsync.flush() + checkAndRecordState(l_pcus, cohash) return 0 diff --git a/monitor-server.cron b/monitor-server.cron index f6062d6..ddcb076 100644 --- a/monitor-server.cron +++ b/monitor-server.cron @@ -2,5 +2,5 @@ # automated actions for debug nodes. 01 * * * * root /usr/share/monitor/automate.sh 2>&1 > /usr/share/monitor/monitor.log -01 * * * * root /usr/share/monitor/zabbix/zabbixsync.py 2>&1 > /usr/share/monitor/zabbixsync.log +30 * * * * root /etc/plc.d/monitor sync 2>&1 >> /var/log/monitorsync.log diff --git a/monitor-server.init b/monitor-server.init index 28a7df8..3fb1728 100644 --- a/monitor-server.init +++ b/monitor-server.init @@ -77,7 +77,7 @@ function check_user_and_db() fi # Create/update the unprivileged database user and password - if [ -z "$PLC_MONITOR_DBPASSWORD" ] ; then + if [ -z "$PLC_MONITOR_DBPASSWORD" || "$PLC_MONITOR_DBPASSWORD" = "None" ] ; then # Zabbix doesn't like plain uuidgen passwords PLC_MONITOR_DBPASSWORD=$( uuidgen | md5sum - | awk '{print $1}' ) plc-config --category=plc_monitor --variable=dbpassword --value="$PLC_MONITOR_DBPASSWORD" --save=$local_config $local_config @@ -196,6 +196,8 @@ MONITOR_ARCHIVE_ROOT=/var/lib/monitor/archive-pdb MONITOR_HOSTNAME=${PLC_MONITOR_HOST} MONITOR_IP=${PLC_MONITOR_IP} +PLC_WWW_HOSTNAME=${PLC_WWW_HOST} + # used for debug mode email= @@ -209,7 +211,7 @@ support_email=${PLC_MAIL_SUPPORT_ADDRESS} cc_email= [monitordatabase] -monitor_dburi=postgres://${MONITOR_DB_NAME}:${PLC_MONITOR_DBPASSWORD}@localhost:5432/${MONITOR_DB_NAME} +monitor_dburi=postgres://${MONITOR_DB_USER}:${PLC_MONITOR_DBPASSWORD}@localhost:5432/${MONITOR_DB_NAME} zabbix_dburi=postgres://${ZABBIX_DB_USER}:${PLC_MONITOR_DBPASSWORD}@localhost:5432/${ZABBIX_DB_NAME} cachetime=60 @@ -275,7 +277,7 @@ EOF function start_tg_server () { pushd ${MONITORPATH}/web/MonitorWeb/ - ./start-monitorweb.py ${MONITORPATH}/web/MonitorWeb/prod.cfg &> /var/log/monitorweb.log + ./start-monitorweb.py ${MONITORPATH}/web/MonitorWeb/prod.cfg &> /var/log/monitorweb.log & popd } @@ -350,12 +352,14 @@ fi case "$1" in start) - MESSAGE=$"Bootstrap Monitoring" + MESSAGE=$"Bootstrap Monitoring (please wait...)" dialog "$MESSAGE" # DATABASE acces, creation, and data loading check_pg_hba $MONITOR_DB_NAME $MONITOR_DB_USER check_user_and_db $MONITOR_DB_NAME $MONITOR_DB_USER + # WRITE default /etc/monitor.conf + check_monitor_conf check_monitor_schema_and_data check_pg_hba $ZABBIX_DB_NAME $ZABBIX_DB_USER @@ -365,19 +369,21 @@ case "$1" in # NOTE: restart db to enable access by users granted above. service plc restart postgresql service plc restart httpd + MESSAGE=$"Bootstrap Monitoring 2 (please wait...)" + dialog "$MESSAGE" fi check_zabbix_schema_and_data check_zabbix_templates_and_import - # WRITE default /etc/monitor.conf - check_monitor_conf # create /etc/httpd/conf.d/monitorweb.conf create_httpd_conf if [ -n "$WROTE_HTTP_CONFIG" ] ; then # NOTE: restart web server to enable access web cfg service plc restart httpd + MESSAGE=$"Bootstrap Monitoring 3 (please wait...)" + dialog "$MESSAGE" fi start_tg_server @@ -389,6 +395,16 @@ case "$1" in result "$MESSAGE" ;; + restartweb) + MESSAGE=$"Restarting monitor web app..." + dialog "$MESSAGE" + + stop_tg_server + start_tg_server + + result "$MESSAGE" + ;; + sync) MESSAGE=$"Syncing PLC db with Zabbix DB" dialog "$MESSAGE" diff --git a/nodebad.py b/nodebad.py index 5543f90..baa016c 100755 --- a/nodebad.py +++ b/nodebad.py @@ -13,7 +13,7 @@ from nodecommon import * from monitor import config from monitor.wrapper import plc,plccache from monitor.const import MINUP -from monitor.database import FindbadNodeRecord, HistoryNodeRecord +from monitor.database.info.model import FindbadNodeRecord, HistoryNodeRecord from monitor.model import * diff --git a/nodequery.py b/nodequery.py index 3c5428a..48a5f73 100755 --- a/nodequery.py +++ b/nodequery.py @@ -17,7 +17,7 @@ from pcucontrol import reboot from monitor.wrapper import plc, plccache api = plc.getAuthAPI() -from monitor.database.info.model import FindbadNodeRecordSync, FindbadNodeRecord, session +from monitor.database.info.model import FindbadNodeRecordSync, FindbadNodeRecord, FindbadPCURecord, session from monitor import util from monitor import config @@ -75,10 +75,10 @@ def get(fb, path): indexes = path.split(".") values = fb for index in indexes: - if index in values: - values = values[index] - else: - raise NoKeyException(index) + if values and index in values: + values = values[index] + else: + raise NoKeyException(index) return values def verifyType(constraints, data): @@ -254,9 +254,10 @@ def query_to_dict(query): return ad def pcu_in(fbdata): - if 'plcnode' in fbdata: - if 'pcu_ids' in fbdata['plcnode']: - if len(fbdata['plcnode']['pcu_ids']) > 0: + #if 'plcnode' in fbdata: + if 'plc_node_stats' in fbdata: + if 'pcu_ids' in fbdata['plc_node_stats']: + if len(fbdata['plc_node_stats']['pcu_ids']) > 0: return True return False @@ -273,6 +274,7 @@ def pcu_select(str_query, nodelist=None): fbpcu_list = [ p.plc_pcuid for p in fbpcuquery ] dict_query = query_to_dict(str_query) + print "dict_query", dict_query for noderec in fbquery: if nodelist is not None: @@ -280,13 +282,12 @@ def pcu_select(str_query, nodelist=None): fb_nodeinfo = noderec.to_dict() if pcu_in(fb_nodeinfo): - pcurec = FindbadPCURecord.get_latest_by(plc_pcuid=get(fb_nodeinfo, 'plc_node_stats.pcu_ids')[0]) - pcuinfo = pcurec.to_dict() - if verify(dict_query, pcuinfo): - nodenames.append(noderec.hostname) - str = "cmdhttps/locfg.pl -s %s -f iloxml/License.xml -u %s -p '%s' | grep MESSAGE" % \ - (reboot.pcu_name(pcuinfo), pcuinfo['username'], pcuinfo['password']) - pcunames.append(pcuinfo['plc_pcuid']) + pcurec = FindbadPCURecord.get_latest_by(plc_pcuid=get(fb_nodeinfo, 'plc_node_stats.pcu_ids')[0]).first() + if pcurec: + pcuinfo = pcurec.to_dict() + if verify(dict_query, pcuinfo): + nodenames.append(noderec.hostname) + pcunames.append(pcuinfo['plc_pcuid']) return (nodenames, pcunames) def node_select(str_query, nodelist=None, fb=None): diff --git a/pcubad.py b/pcubad.py index 04118e2..b31599f 100755 --- a/pcubad.py +++ b/pcubad.py @@ -10,7 +10,7 @@ from monitor import database from pcucontrol import reboot from monitor import parser as parsermodule from monitor import config -from monitor.database import HistoryPCURecord, FindbadPCURecord +from monitor.database.info.model import HistoryPCURecord, FindbadPCURecord from monitor.wrapper import plc,plccache from monitor.const import MINUP @@ -22,7 +22,6 @@ api = plc.getAuthAPI() def main(config): - #l_plcpcus = database.if_cached_else_refresh(1, 1, "pculist", lambda : plc.GetPCUs()) l_plcpcus = plccache.l_pcus l_pcus = None diff --git a/pcucontrol/reboot.py b/pcucontrol/reboot.py index 5b322d3..bfb7f3c 100755 --- a/pcucontrol/reboot.py +++ b/pcucontrol/reboot.py @@ -24,6 +24,7 @@ sys.path.insert(0, os.path.dirname(sys.argv[0])) import pcucontrol.transports.telnetlib as telnetlib sys.path.insert(0, os.path.dirname(sys.argv[0]) + "/pyssh") import pcucontrol.transports.pyssh as pyssh +from monitor import config # Timeouts in seconds TELNET_TIMEOUT = 45 @@ -601,7 +602,7 @@ class IntelAMT(PCUControl): cmd = command.CMD() # TODO: need to make this path universal; not relative to pwd. - cmd_str = "pcucontrol/models/intelamt/remoteControl" + cmd_str = config.MONITOR_SCRIPT_ROOT + "/pcucontrol/models/intelamt/remoteControl" if dryrun: # NOTE: -p checks the power state of the host. @@ -668,25 +669,29 @@ class HPiLOHttps(PCUControl): def run(self, node_port, dryrun): locfg = command.CMD() - cmd = "cmdhttps/locfg.pl -s %s -f %s -u %s -p '%s' | grep 'MESSAGE' | grep -v 'No error'" % ( - self.host, "iloxml/Get_Network.xml", + + cmd_str = config.MONITOR_SCRIPT_ROOT + "/pcucontrol/models/hpilo/" + + cmd = cmd_str + "locfg.pl -s %s -f %s -u %s -p '%s' | grep 'MESSAGE' | grep -v 'No error'" % ( + self.host, cmd_str+"iloxml/Get_Network.xml", self.username, self.password) sout, serr = locfg.run_noexcept(cmd) - if sout.strip() != "": + if sout.strip() != "" or serr.strip() != "": print "sout: %s" % sout.strip() - return sout.strip() + return sout.strip() + serr.strip() if not dryrun: locfg = command.CMD() - cmd = "cmdhttps/locfg.pl -s %s -f %s -u %s -p '%s' | grep 'MESSAGE' | grep -v 'No error'" % ( - self.host, "iloxml/Reset_Server.xml", + cmd = cmd_str + "locfg.pl -s %s -f %s -u %s -p '%s' | grep 'MESSAGE' | grep -v 'No error'" % ( + self.host, cmd_str+"iloxml/Reset_Server.xml", self.username, self.password) sout, serr = locfg.run_noexcept(cmd) if sout.strip() != "": print "sout: %s" % sout.strip() #return sout.strip() + return 0 class BayTechAU(PCUControl): diff --git a/shconfig.py b/shconfig.py index 0b28577..0f97b71 100755 --- a/shconfig.py +++ b/shconfig.py @@ -1,6 +1,6 @@ #!/usr/bin/python -import config +from monitor import config for attr in dir(config): val = config.__getattribute__(attr) diff --git a/web/MonitorWeb/monitorweb/controllers.py b/web/MonitorWeb/monitorweb/controllers.py index 9202cac..2f7b3a6 100644 --- a/web/MonitorWeb/monitorweb/controllers.py +++ b/web/MonitorWeb/monitorweb/controllers.py @@ -33,12 +33,13 @@ def format_ports(pcu): def format_pcu_shortstatus(pcu): status = "error" - if pcu.reboot_trial_status == str(0): - status = "ok" - elif pcu.reboot_trial_status == "NetDown" or pcu.reboot_trial_status == "Not_Run": - status = pcu.reboot_trial_status - else: - status = "error" + if pcu: + if pcu.reboot_trial_status == str(0): + status = "ok" + elif pcu.reboot_trial_status == "NetDown" or pcu.reboot_trial_status == "Not_Run": + status = pcu.reboot_trial_status + else: + status = "error" return status diff --git a/web/MonitorWeb/monitorweb/templates/nodelist.kid b/web/MonitorWeb/monitorweb/templates/nodelist.kid index c9ec477..cb62ec1 100644 --- a/web/MonitorWeb/monitorweb/templates/nodelist.kid +++ b/web/MonitorWeb/monitorweb/templates/nodelist.kid @@ -1,6 +1,6 @@ @@ -40,14 +40,13 @@ from time import mktime - ${node.loginbase} - - - - - - - + ${node.loginbase} + + + + + + diff --git a/web/MonitorWeb/monitorweb/templates/nodeview.kid b/web/MonitorWeb/monitorweb/templates/nodeview.kid index 704ec24..f1acbae 100644 --- a/web/MonitorWeb/monitorweb/templates/nodeview.kid +++ b/web/MonitorWeb/monitorweb/templates/nodeview.kid @@ -1,7 +1,8 @@ - ${node.loginbase} - + + ${node.loginbase} + + + + ${node.hostname} + - + + ${node.pcu_short_status} + + ${node.pcu_short_status} diff --git a/web/MonitorWeb/monitorweb/templates/pculist.kid b/web/MonitorWeb/monitorweb/templates/pculist.kid index d37be12..510218e 100644 --- a/web/MonitorWeb/monitorweb/templates/pculist.kid +++ b/web/MonitorWeb/monitorweb/templates/pculist.kid @@ -1,15 +1,9 @@ - ${node.loginbase} + ${node.loginbase} - ${pcu_name(node.plc_pcu_stats)} + ${pcu_name(node.plc_pcu_stats)} diff --git a/web/MonitorWeb/monitorweb/templates/pcuview.kid b/web/MonitorWeb/monitorweb/templates/pcuview.kid index 4946fc9..013714e 100644 --- a/web/MonitorWeb/monitorweb/templates/pcuview.kid +++ b/web/MonitorWeb/monitorweb/templates/pcuview.kid @@ -1,13 +1,10 @@ - + ${node.loginbase} - + ${pcu_name(node.plc_pcu_stats)} diff --git a/web/MonitorWeb/monitorweb/templates/sitelist.kid b/web/MonitorWeb/monitorweb/templates/sitelist.kid index ab24355..299daa8 100644 --- a/web/MonitorWeb/monitorweb/templates/sitelist.kid +++ b/web/MonitorWeb/monitorweb/templates/sitelist.kid @@ -1,9 +1,10 @@ + xmlns:py="http://purl.org/kid/ns#" + xmlns:mochi="http://www.mochi.org">
@@ -19,21 +20,23 @@ layout_params['page_title'] = "Monitor Site View"
- +
+ - - + + - - - - + + + + +
Site name StatusSlices (created / max)Nodes (online / registered)Slices (created / max)Nodes (online / registered)
${site.loginbase}${site.slices_used}/${site.slices_total}${site.nodes_up} / ${site.nodes_total}${site.loginbase}${site.slices_used}/${site.slices_total}${site.nodes_up} / ${site.nodes_total}
diff --git a/web/MonitorWeb/monitorweb/templates/sitemenu.kid b/web/MonitorWeb/monitorweb/templates/sitemenu.kid index 262005a..73af304 100644 --- a/web/MonitorWeb/monitorweb/templates/sitemenu.kid +++ b/web/MonitorWeb/monitorweb/templates/sitemenu.kid @@ -3,7 +3,7 @@ App Name - ${page_title} - + @@ -16,9 +16,9 @@ - - + + diff --git a/web/MonitorWeb/monitorweb/templates/siteview.kid b/web/MonitorWeb/monitorweb/templates/siteview.kid index abbcfe2..7b56393 100644 --- a/web/MonitorWeb/monitorweb/templates/siteview.kid +++ b/web/MonitorWeb/monitorweb/templates/siteview.kid @@ -2,6 +2,7 @@ - + @@ -47,7 +50,10 @@ from monitor.util import diff_time - + + diff --git a/zabbix/zabbixsite.py b/zabbix/zabbixsite.py index e3ddc21..419b84b 100755 --- a/zabbix/zabbixsite.py +++ b/zabbix/zabbixsite.py @@ -62,6 +62,11 @@ def delete_site(loginbase): return +# NOTE: !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +## These functions can ONLY be run when the server and gui are offline. +## Any changes to the db while this is running risks introducing a failure +## to commit, and therefore error. +# NOTE: !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! def setup_global(): # GLOBAL: # update mediatype for email. @@ -100,7 +105,7 @@ def setup_global(): # copying that the php code does during a host add. # NOTE: Instead, reformat any *xml.in templates and import those # during /etc/plc.d/monitor sync - for file in glob.glob("%s/zabbix/templates/*.xml.in" config.MONITOR_SCRIPT_ROOT): + for file in glob.glob("%s/zabbix/templates/*.xml.in" % config.MONITOR_SCRIPT_ROOT): if 'zabbix_server' in file: buf = loadFile(file) args = {'hostname' : config.MONITOR_HOSTNAME, 'ip' : config.MONITOR_IP}
NodesPCUs SitesPCUsNodes Actions
${site.loginbase} + ${site.loginbase} + ${site.slices_used}/${site.slices_total} your.host.org + ${node.pcu_short_status} + ${node.pcu_short_status}