From ecea36bbf6aad3bbd3e544d336b89a45cff4ab34 Mon Sep 17 00:00:00 2001 From: Stephen Soltesz Date: Fri, 20 Nov 2009 22:36:17 +0000 Subject: [PATCH] added templating to google gadget xml file in monitor-server; previously it was hard-coded to monitor.planet-lab.org ; now PLE can have their own google gadget. added policy to close tickets if all nodes & pcus at a site are ok, to prevent some leaking tickets. --- comonquery.py | 2 +- cron.d/copy-logs.sh | 1 + monitor-server.init | 38 ++++ monitor/common.py | 5 + monitor/wrapper/mailer.py | 18 +- policy.py | 48 ++++- todo | 196 ------------------ .../static/xml/{gadget.xml => gadget.xml.in} | 4 +- 8 files changed, 99 insertions(+), 213 deletions(-) delete mode 100644 todo rename web/MonitorWeb/monitorweb/static/xml/{gadget.xml => gadget.xml.in} (65%) diff --git a/comonquery.py b/comonquery.py index 72e5d13..db0bafe 100755 --- a/comonquery.py +++ b/comonquery.py @@ -94,7 +94,7 @@ def main(): # lastcotop measures whether cotop is actually running. this is a better # metric than sshstatus, or other values from CoMon - COMON_COTOPURL= "http://summer.cs.princeton.edu/status/tabulator.cgi?" + \ + COMON_COTOPURL= "http://comon.cs.princeton.edu/status/tabulator.cgi?" + \ "table=table_nodeview&formatcsv" if config.dns: config.fields = "name,dns1udp,dns1tcp,dns2udp,dns2tcp" diff --git a/cron.d/copy-logs.sh b/cron.d/copy-logs.sh index 61754b5..5c13a00 100755 --- a/cron.d/copy-logs.sh +++ b/cron.d/copy-logs.sh @@ -3,6 +3,7 @@ cd /usr/share/monitor source agent.sh &> /dev/null +rsync -qv -az -e ssh root@chloe.cs.princeton.edu:/vservers/www-current/var/log/php.log /var/lib/monitor/httpd-log rsync -qv -az -e ssh root@chloe.cs.princeton.edu:/vservers/www-current/var/log/httpd/*-* /var/lib/monitor/httpd-log rsync -qv -az -e ssh root@chloe.cs.princeton.edu:/vservers/www-current/var/log/httpd/*error* /var/lib/monitor/httpd-log diff --git a/monitor-server.init b/monitor-server.init index 8c26416..424c362 100644 --- a/monitor-server.init +++ b/monitor-server.init @@ -18,6 +18,8 @@ local_config=/etc/planetlab/configs/site.xml MONITORPATH=/usr/share/monitor +WEB_ROOT_PATH=web/MonitorWeb/monitorweb +WEB_XML_PATH=static/xml # Be verbose set -x @@ -40,6 +42,39 @@ if [ -z "$PLC_MONITOR_IP" ] ; then PLC_MONITOR_IP=$( gethostbyname $PLC_MONITOR_HOST ) fi +function update_config () +{ + pattern=$1 + with=$2 + file=$3 + sed -i -e "s/$pattern/$with/g" $file +} +function apply_template () +{ + TEMPLATE=$1 + DESTFILE=$2 + + tmp_file=$(mktemp) + cp $TEMPLATE $tmp_file + + update_config PLC_NAME "$PLC_NAME" $tmp_file + update_config PLC_WWW_HOSTNAME $PLC_WWW_HOST $tmp_file + update_config MONITOR_HOSTNAME $PLC_MONITOR_HOST $tmp_file + + cp $tmp_file $DESTFILE + rm -f $tmp_file +} + +function check_gadget_config () +{ + for input_file in $MONITORPATH/$WEB_ROOT_PATH/$WEB_XML_PATH/*.in ; do + output_file=$MONITORPATH/$WEB_ROOT_PATH/$WEB_XML_PATH/`basename $input_file | sed -e 's/.in\$//'` + if [ $input_file -nt $output_file ] ; then + apply_template $input_file $output_file + fi + done +} + function check_monitor_schema_and_data() { # NOTE: call create_all() to setup the database from the info model. @@ -150,6 +185,7 @@ function create_httpd_conf () # NOTE: redirect path without trailing '/' to path with. Favor SSL. Redirect /monitor https://${PLC_MONITOR_HOST}:${PLC_WWW_SSL_PORT}/monitor/ +#RedirectMatch ^/$ https://${PLC_MONITOR_HOST}:${PLC_WWW_SSL_PORT}/monitor # NOTE: this directive strips '/monitor/' from the requested path and pastes # the remaining part to the end of the ProxyPass url below. All TG urls @@ -201,6 +237,8 @@ case "$1" in # WRITE default /etc/monitor.conf check_monitor_conf + check_gadget_config + if [ -n "$WROTE_PG_CONFIG" ] ; then # NOTE: restart db to enable access by users granted above. service plc restart postgresql diff --git a/monitor/common.py b/monitor/common.py index 05a4ec2..850d36b 100644 --- a/monitor/common.py +++ b/monitor/common.py @@ -43,12 +43,17 @@ def get_current_state(fbnode): return l def color_pcu_state(fbnode): + if fbnode['plc_pcuid'] is None: + return 'NOPCU' + else: + return 'PCU' if 'plcnode' in fbnode and 'pcu_ids' in fbnode['plcnode'] and len(fbnode['plcnode']['pcu_ids']) > 0 : values = reboot.get_pcu_values(fbnode['plcnode']['pcu_ids'][0]) if values == None: return fbnode['pcu'] else: + print fbnode.keys() if 'pcu' not in fbnode: return 'NOPCU' else: diff --git a/monitor/wrapper/mailer.py b/monitor/wrapper/mailer.py index 1b45f0e..9f22c96 100755 --- a/monitor/wrapper/mailer.py +++ b/monitor/wrapper/mailer.py @@ -65,7 +65,11 @@ def getTicketStatus(ticket_id): r_values[key] = ":".join(vals[1:]) r_values[key] = r_values[key].strip() - r_values['Created'] = calendar.timegm(time.strptime(r_values['Created'])) + if 'Created' in r_values: + r_values['Created'] = calendar.timegm(time.strptime(r_values['Created'])) + else: + r_values['Created'] = calendar.timegm(time.localtime()) + #r_values['Told'] = calendar.timegm(time.strptime(r_values['Told'])) return r_values @@ -339,12 +343,7 @@ def email(subject, text, to): for mta in [MTA, 'golf.cs.princeton.edu']: try: # This is normal operation - #print MTA - #print FROM - #print to - #print msg server = smtplib.SMTP(mta) - #server = smtplib.SMTP('golf.cs.princeton.edu') server.sendmail(FROM, to, msg) if config.bcc and not config.debug: server.sendmail(FROM, config.email, msg) @@ -361,17 +360,10 @@ def email(subject, text, to): except Exception, err: print "Mailer error2: failed using MTA(%s) with: %s" % (mta, err) else: - #print "Would mail %s" %to logger.debug("Would send mail to %s" % to) if __name__=="__main__": import smtplib import emailTxt import plc - #email("[spam] bcc test from golf.cs.princeton.edu", - # "It gets to both recipients", - # "soltesz@cs.utk.edu") emailViaRT("mail via RT", "Let's see if this succeeds...", [FROM]) - #email("Re: [PL #21323] TEST 7", - # mailtxt.newbootcd_one[1] % {'hostname_list':"hostname list..."}, - # [FROM]) diff --git a/policy.py b/policy.py index cdd311c..992e578 100755 --- a/policy.py +++ b/policy.py @@ -21,6 +21,7 @@ from optparse import OptionParser from monitor import config from monitor import parser as parsermodule from monitor.common import * +from monitor.const import MINUP from monitor.model import * from monitor.wrapper import plc from monitor.wrapper import plccache @@ -36,6 +37,41 @@ def logic(): plc.nodeBootState(host, 'reinstall') node_end_record(host) +def check_node_and_pcu_status_for(loginbase): + """ + this function checks whether all the nodes and associated pcus for a + given site are considered 'good'. + + If so, the function returns True. + Otherwise, the function returns False. + """ + + results = [] + for node in plccache.plcdb_lb2hn[loginbase]: + + noderec = FindbadNodeRecord.findby_or_create(hostname=node['hostname']) + nodehist = HistoryNodeRecord.findby_or_create(hostname=node['hostname']) + nodebl = BlacklistRecord.get_by(hostname=node['hostname']) + pcuhist = HistoryPCURecord.get_by(plc_pcuid=noderec.plc_pcuid) + + if (nodehist is not None and nodehist.status == 'good' and \ + ((pcuhist is not None and pcuhist.status == 'good') or (pcuhist is None)) ): + if nodebl is None: # no entry in blacklist table + results.append(True) + elif nodebl is not None and nodebl.expired(): # expired entry in blacklist table + results.append(True) + else: + results.append(False) # entry that is not expired. + else: + results.append(False) + + try: + print "test: %s" % results + # NOTE: incase results is empty, reduce does not work on an empty set. + return reduce(lambda x,y: x&y, results) and len(results) > MINUP + except: + return False + def main(hostnames, sitenames): # commands: i = 1 @@ -231,7 +267,17 @@ def main(hostnames, sitenames): sitehist.closeTicket() print "send message for site %s penalty cleared" % site - + + # check all nodes and pcus for this site; if they're all ok, + # close the ticket, else leave it open. + # NOTE: in the case where a PCU reboots and fails, a message is + # sent, but the PCU may appear to be ok according to tests. + # NOTE: Also, bootmanager sends messages regarding disks, + # configuration, etc. So, the conditions here are 'good' + # rather than 'not down' as it is in sitebad. + close_ticket = check_node_and_pcu_status_for(site) + if close_ticket: + sitehist.closeTicket() site_count = site_count + 1 diff --git a/todo b/todo deleted file mode 100644 index f69785f..0000000 --- a/todo +++ /dev/null @@ -1,196 +0,0 @@ -Structure: - -monitor module - plc wrapper - util functions - pkl database access - database models - third-party data sources - -pcucontrol - maps types to code - reboot.py - interface.py - - transport: - pyssh - ssh - telnetlib - models: - hpilo cmds - intelamt cmds - racadm cmd - ipmitool cmd - -web - cgi scripts - tgweb - project... - -cmds - py scripts - node - site - pcu - query - grouprins - -bootman - rpyc - - - - - -############################### -for each node: - Check Status -> - if Pass Threshold -> - Create Issue -> - Take Action -> - email - bm - pcu - plc reset - apply penalties - flag for admin - -for each issue - check issue.status - if issue.status is "open": - issue.take_next_action() - if issue.closed: - issue.shutdown() - if issue.paused: - pass - -action_list for issuetype (pcudown) - send email - yield - send email, apply penalty - yield - send email, apply second penalty - yield - send email - -action_list for issuetype (badhardware) -action_list for issuetype (dnserror) -action_list for issuetype (nodeconfig) -action_list for issuetype (oldbootcd) - -action_list for issuetype (nodedown) - if pcuok, reboot - yield - if pcuok, and reboot failed, set rins, reboot - yield - create_issue pcubroken - send email - yield - send email, apply penalty - yield - send email, apppy second penalty - yield - send email - - -TOOLS: - * add a '--nocache' to the default set of options. - * add a cache parameter in the monitor.conf file. - - - -TODO: - * install openssh-server, passwd, perl-libwww-perl (for rt), rt-3.4.1, MySQL-python - * had to mount -t devpts devpts /dev/pts to get ssh to work inside the - chroot. also, disable the pam modules in /etc/pam.d/sshd - - * blue - * auto configuration for php configuration. - maybe run translation of monitor.conf before loading monitorconfig.php? - * blue2 - - * A setup script of some kind would be nice that walked through : - - writing monitorconfig.py - - creation of monitorconfig.php - - run syncplcdb.py - - testapi.py - - findbad.py on sample site. - - nodebad.py - - findbadpcus.py - - nodequery.py - - nodegroups.py - - loads webpage for those retreived values to confirm setup succeeded. - - * reimplement the config.py / .config mechanism. I'd like for many commands - to share very similar argument or argument sets, as well as have some - common config options. I'm not sure the best way to do this. - - - features of config.py - * parse arguments and return an object with attributes equal to the - parser values. - * maintain values consistently across modules at run time. - * have default values that are not specified at each run time. - * easy to import and use - - - config module is available via 'import config' or as returned by - parsermodule.parse_args() - - python supports load-once modules, so subsequent imports refer to the - same module object. - - * have package pull in threadpool from easy_install - - * place PKL files in a real database - - * clean up plc.py; there's a lot of redundent code. - - * figure out python paths for user commands. - - directories for pickle files. - - add user in rpm install - - user permissions for data files for day-to-day operations. - - * fix BayTechCtrlCUnibe expect script. - - * separate modules into different, logical categories, and create a python - module as part of the install: - command line, - configuration, - policy, - data model, - data access, - object interfaces. - -Lower priority: - * Add a more structured, 'automate' library of scripts and means of making - batch calls, etc. - - * add a third package for user tools that will interact with the Monitor - service. Mostly, I'm guessing this would be queries for the live status of - nodes and a more reliable 'reboot' and 'reinstall' mechanism than currently - availble with PLC. - -Done: - * Find a better location to place and pull the PKL files currently in the pdb - directory. Ultimately, these should be stored in a real DB. Until then, - they should sit in a location that is accessible from the www scripts, - backend scripts, and user utilities. - * nodebad loads plc_hn2lb unconditionally - * nodeinfo loads act_all unconditionally - * change findbad.py default db name - * remove deps on www.printbadnodes - * reboot.py loads findbadpcus unconditionally. - * nodequery loads findbad unconditionally - * unified_model loads findbad unconditionally - - * threadpool package. - * build cmdamt with g++ prior to packaging - - * www/*.py need appropriate access to database.py, config.py, monitorconfig.py, etc. - - need to convert monitor.conf into monitorconf.sh and monitorconf.php - - * pull out global configuration information from various files, like rt_db, - mailer.py, auth.py, and any others. Create a single configuration file - from which all others pull. - - - convert plc and other files to use the new monitorconfig.py rather than - auth, or plc.* - - need to alter all import 'auth' statements. diff --git a/web/MonitorWeb/monitorweb/static/xml/gadget.xml b/web/MonitorWeb/monitorweb/static/xml/gadget.xml.in similarity index 65% rename from web/MonitorWeb/monitorweb/static/xml/gadget.xml rename to web/MonitorWeb/monitorweb/static/xml/gadget.xml.in index bae4356..66ec12e 100644 --- a/web/MonitorWeb/monitorweb/static/xml/gadget.xml +++ b/web/MonitorWeb/monitorweb/static/xml/gadget.xml.in @@ -1,6 +1,6 @@ - + ]]> -- 2.43.0