From: Stephen Soltesz Date: Wed, 29 Oct 2008 15:55:12 +0000 (+0000) Subject: updates to improve generalization and auto-installation. X-Git-Tag: Monitor-1.0-12~2 X-Git-Url: http://git.onelab.eu/?a=commitdiff_plain;ds=sidebyside;h=1fe8e24fa4180892d0dd6aa30135ce9f137dec03;p=monitor.git updates to improve generalization and auto-installation. fixed links in web pages added config option for monitor-default for RT_QUEUE changed unified_model to return literal email addresses not to use the tech- & pi- aliases. --- diff --git a/automate-default.sh b/automate-default.sh index 8e7be9c..d23cf38 100755 --- a/automate-default.sh +++ b/automate-default.sh @@ -8,7 +8,7 @@ source $INSTALLPATH/monitorconfig.sh cd ${MONITOR_SCRIPT_ROOT} set -e DATE=`date +%Y-%m-%d-%T` -MONITOR_PID="$HOME/monitor/SKIP" +MONITOR_PID="${MONITOR_SCRIPT_ROOT}/SKIP" echo "Performing API test" API=$(./testapi.py) @@ -41,7 +41,9 @@ if [ ! -f ${MONITOR_SCRIPT_ROOT}/actallsetup.flag ]; then fi +set +e AGENT=`ps ax | grep ssh-agent | grep -v grep` +set -e if [ -z "$AGENT" ] ; then echo "starting ssh agent" # if no agent is running, set it up. @@ -71,11 +73,11 @@ cp ${MONITOR_DATA_ROOT}/production.findbadpcus2.pkl ${MONITOR_DATA_ROOT}/product # clean up stray 'locfg' processes that hang around inappropriately... ps ax | grep locfg | grep -v grep | awk '{print $1}' | xargs kill || : -echo "Generating web data" +#echo "Generating web data" # badcsv.txt -${MONITOR_SCRIPT_ROOT}/printbadcsv.py | grep -v loading | tr -d ' ' > badcsv.txt -cp badcsv.txt /plc/data/var/www/html/monitor/ -${MONITOR_SCRIPT_ROOT}/showlatlon.py | head -9 | awk 'BEGIN {print ""} { print ""} END{print "
", $0, "
"}' | sed -e 's\|\\g' > /plc/data/var/www/html/monitor/regions.html +#${MONITOR_SCRIPT_ROOT}/printbadcsv.py | grep -v loading | tr -d ' ' > badcsv.txt +#cp badcsv.txt /plc/data/var/www/html/monitor/ +#${MONITOR_SCRIPT_ROOT}/showlatlon.py | head -9 | awk 'BEGIN {print ""} { print ""} END{print "
", $0, "
"}' | sed -e 's\|\\g' > /plc/data/var/www/html/monitor/regions.html echo "Performing uptime changes for sites, nodes, and pcus" ######################## @@ -91,8 +93,8 @@ ${MONITOR_SCRIPT_ROOT}/pkl2php.py -i findbadpcus2 -o findbadpcus ${MONITOR_SCRIPT_ROOT}/pkl2php.py -i act_all -o act_all ${MONITOR_SCRIPT_ROOT}/pkl2php.py -i plcdb_hn2lb -o plcdb_hn2lb ${MONITOR_SCRIPT_ROOT}/pkl2php.py -i findbad -o findbadnodes -${MONITOR_SCRIPT_ROOT}/pkl2php.py -i ad_dbTickets -o ad_dbTickets -${MONITOR_SCRIPT_ROOT}/pkl2php.py -i idTickets -o idTickets +#${MONITOR_SCRIPT_ROOT}/pkl2php.py -i ad_dbTickets -o ad_dbTickets +#${MONITOR_SCRIPT_ROOT}/pkl2php.py -i idTickets -o idTickets echo "Archiving pkl files" ######################### @@ -104,15 +106,8 @@ done echo "Running grouprins on all dbg nodes" ############################ # 5. Check if there are any nodes in dbg state. Clean up afterward. -${MONITOR_SCRIPT_ROOT}/grouprins.py --mail=1 \ - --nodeselect 'state=DEBUG&&boot_state=(rins|dbg|boot)' \ - --stopselect 'state=BOOT&&kernel=2.6.22.19-vs2.3.0.34.9.planetlab' \ - --reboot || : -${MONITOR_SCRIPT_ROOT}/findbad.py --increment --cachenodes --debug=0 --dbname="findbad" --nodeselect 'state=DEBUG&&boot_state=dbg||state=DEBUG&&boot_state=boot' || : - -echo "Collecting RT database dump" -########################## -# 6. cache the RT db locally. -python ${MONITOR_SCRIPT_ROOT}/rt.py +${MONITOR_SCRIPT_ROOT}/grouprins.py --mail=1 --reboot --nodeselect 'state=DOWN&&boot_state=(boot|rins|dbg|diag)' --stopselect "state=BOOT" || : +${MONITOR_SCRIPT_ROOT}/grouprins.py --mail=1 --reboot --nodeselect 'state=DEBUG&&boot_state=(rins|dbg|boot)' --stopselect 'state=BOOT' || : +cp ${MONITOR_SCRIPT_ROOT}/monitor.log ${MONITOR_ARCHIVE_ROOT}/`date +%F-%H:%M`.monitor.log rm -f $MONITOR_PID diff --git a/clean_policy.py b/clean_policy.py index aa8f7de..34099be 100644 --- a/clean_policy.py +++ b/clean_policy.py @@ -2,7 +2,6 @@ import config import database import time import mailer -from unified_model import cmpCategoryVal import sys import emailTxt import string @@ -240,7 +239,7 @@ class MonitorMergeDiagnoseSendEscellate: diag.save() else: - print "NOT sending email : %s %s" % (config.mail, record.data['rt']) + print "NOT sending email : %s" % config.mail return diff --git a/config.py b/config.py index b37e04a..feef515 100644 --- a/config.py +++ b/config.py @@ -48,8 +48,14 @@ if not config.imported: #from config import options as config options = Options() - update_section(options, 'commandline', True) - update_section(options, 'monitorconfig') + try: + update_section(options, 'commandline', True) + except: + pass + try: + update_section(options, 'monitorconfig') + except: + pass #for i in dir(config): # if "__" not in i: diff --git a/mailer.py b/mailer.py index d80d5d7..7c9ff11 100755 --- a/mailer.py +++ b/mailer.py @@ -218,7 +218,7 @@ def emailViaRT_NoTicket(subject, text, to): input_text = "Subject: %s\n" input_text += "Requestor: %s\n"% FROM input_text += "id: ticket/new\n" - input_text += "Queue: Monitor\n" + input_text += "Queue: %s\n" % config.RT_QUEUE for recipient in to: input_text += "AdminCc: %s\n" % recipient input_text += "Text: %s" diff --git a/monitor-default.conf b/monitor-default.conf index 74a0c18..d525196 100644 --- a/monitor-default.conf +++ b/monitor-default.conf @@ -10,6 +10,7 @@ RT_WEB_TOOLS_PATH= RT_WEB_USER= RT_WEB_PASSWORD= RT_WEB_DEBUG=0 +RT_QUEUE= # PLC admin account API_SERVER=https://boot.planet-lab.org/PLCAPI/ @@ -20,7 +21,7 @@ API_AUTH_PASSWORD= MONITOR_HOSTNAME=monitor.planet-lab.org MONITOR_SCRIPT_ROOT=/usr/share/monitor-server MONITOR_DATA_ROOT=/var/lib/monitor-server -MONITOR_ARCHIVE_ROOT=/usr/share/monitor-server/archive-pdb +MONITOR_ARCHIVE_ROOT=/var/lib/monitor-server/archive-pdb email=monitor@another-lab.org diff --git a/nodeinfo.py b/nodeinfo.py index fee8eb3..5aecea3 100755 --- a/nodeinfo.py +++ b/nodeinfo.py @@ -46,7 +46,10 @@ def plc_print_nodeinfo(plcnode): def fb_print_nodeinfo(fbnode): pf = PersistFlags(fbnode['hostname'], 1, db='node_persistflags') - fbnode['last_change'] = diff_time(pf.last_changed) + try: + fbnode['last_change'] = diff_time(pf.last_changed) + except: + fbnode['last_change'] = diff_time(time.time()) print " Checked: ", if 'checked' in fbnode: print "%11.11s " % diff_time(fbnode['checked']) diff --git a/plc.py b/plc.py index db14f50..3ef546e 100644 --- a/plc.py +++ b/plc.py @@ -76,7 +76,7 @@ def getTechEmails(loginbase): # get site details. s = api.GetSites(loginbase)[0] # get people at site - p = api.GetPersons(s['person_ids'])[0] + p = api.GetPersons(s['person_ids']) # pull out those with the right role. emails = [ person['email'] for person in filter(lambda x: 'tech' in x['roles'], p) ] return emails @@ -89,7 +89,7 @@ def getPIEmails(loginbase): # get site details. s = api.GetSites(loginbase)[0] # get people at site - p = api.GetPersons(s['person_ids'])[0] + p = api.GetPersons(s['person_ids']) # pull out those with the right role. emails = [ person['email'] for person in filter(lambda x: 'pi' in x['roles'], p) ] return emails diff --git a/printbadcsv.py b/printbadcsv.py index f064c11..efec3ca 100755 --- a/printbadcsv.py +++ b/printbadcsv.py @@ -117,13 +117,13 @@ def main(): print str keys = categories.keys() - for cat in ['BOOT-ALPHA', 'BOOT-PROD', 'BOOT-OLDBOOTCD', 'DEBUG-ALPHA', - 'DEBUG-PROD', 'DEBUG-OLDBOOTCD', 'DOWN-ERROR']: + for cat in ['BOOT-PROD', 'BOOT-OLDPROD', 'BOOT-OLDBOOTCD', 'DEBUG-PROD', + 'DEBUG-OLDPROD', 'DEBUG-OLDBOOTCD', 'DOWN-ERROR']: if cat not in keys: categories[cat] = 0 keys = categories.keys() - for cat in ['BOOT-ALPHA', 'BOOT-PROD', 'BOOT-OLDBOOTCD', 'DEBUG-ALPHA', - 'DEBUG-PROD', 'DEBUG-OLDBOOTCD', 'DOWN-ERROR']: + for cat in ['BOOT-PROD', 'BOOT-OLDPROD', 'BOOT-OLDBOOTCD', 'DEBUG-PROD', + 'DEBUG-OLDPROD', 'DEBUG-OLDBOOTCD', 'DOWN-ERROR']: if cat in keys: print "%d," % categories[cat], print "" diff --git a/unified_model.py b/unified_model.py index d79470c..76fb705 100755 --- a/unified_model.py +++ b/unified_model.py @@ -36,7 +36,11 @@ def cmpValMap(v1, v2, map): raise Exception("No index %s or %s in map" % (v1, v2)) def cmpCategoryVal(v1, v2): - map = array_to_priority_map([ None, 'PROD', 'ALPHA', 'OLDBOOTCD', 'UNKNOWN', 'FORCED', 'ERROR', ]) + # Terrible hack to manage migration to no more 'ALPHA' states. + if v1 == 'ALPHA': v1 = "PROD" + if v2 == 'ALPHA': v2 = "PROD" + #map = array_to_priority_map([ None, 'PROD', 'ALPHA', 'OLDBOOTCD', 'UNKNOWN', 'FORCED', 'ERROR', ]) + map = array_to_priority_map([ None, 'ALPHA', 'PROD', 'OLDBOOTCD', 'UNKNOWN', 'FORCED', 'ERROR', ]) return cmpValMap(v1,v2,map) @@ -559,14 +563,17 @@ class Record(object): if ADMIN & roles: contacts += [config.email] if TECH & roles: - contacts += [TECHEMAIL % self.loginbase] + #contacts += [TECHEMAIL % self.loginbase] + contacts += plc.getTechEmails(loginbase) if PI & roles: - contacts += [PIEMAIL % self.loginbase] + #contacts += [PIEMAIL % self.loginbase] + contacts += plc.getSliceUserEmails(loginbase) if USER & roles: + contacts += plc.getSliceUserEmails(loginbase) slices = plc.slices(self.loginbase) if len(slices) >= 1: - for slice in slices: - contacts += [SLICEMAIL % slice] + #for slice in slices: + # contacts += [SLICEMAIL % slice] print "SLIC: %20s : %d slices" % (self.loginbase, len(slices)) else: print "SLIC: %20s : 0 slices" % self.loginbase diff --git a/www/printbadnodes.py b/www/printbadnodes.py index 5525eff..24a6dc4 100755 --- a/www/printbadnodes.py +++ b/www/printbadnodes.py @@ -62,11 +62,11 @@ def cmpState(l1, l2): return cmpMap(l1,l2,'state', map) def cmpCategoryVal(v1, v2): - map = array_to_priority_map([ None, 'ALPHA', 'PROD', 'OLDBOOTCD', 'UNKNOWN', 'FORCED', 'ERROR', ]) + map = array_to_priority_map([ None, 'ALPHA', 'PROD', 'OLDPROD', 'OLDBOOTCD', 'UNKNOWN', 'FORCED', 'ERROR', ]) return cmpValMap(v1,v2,map) def cmpCategory(l1, l2): - map = array_to_priority_map([ 'ALPHA', 'PROD', 'OLDBOOTCD', 'UNKNOWN', 'ERROR', ]) + map = array_to_priority_map([ 'ALPHA', 'PROD', 'OLDPROD', 'OLDBOOTCD', 'UNKNOWN', 'ERROR', ]) return cmpMap(l1,l2,'category', map) def cmpPCU(l1, l2): diff --git a/www/printbadpcus.php b/www/printbadpcus.php index 7db3e8e..500be1f 100644 --- a/www/printbadpcus.php +++ b/www/printbadpcus.php @@ -2,12 +2,12 @@ function plc_site_link($site_name) { - return "https://www.planet-lab.org/db/sites/index.php?site_pattern=" . $site_name; + return "https://" . MONITOR_HOSTNAME . "/db/sites/index.php?site_pattern=" . $site_name; } function pcu_link($pcu) { - return "https://www.planet-lab.org/db/sites/pcu.php?id=" . $pcu['pcu_id']; + return "https://" . MONITOR_HOSTNAME . "/db/sites/pcu.php?id=" . $pcu['pcu_id']; } function pcu_site($pcu)