# lastcotop measures whether cotop is actually running. this is a better
# metric than sshstatus, or other values from CoMon
- COMON_COTOPURL= "http://summer.cs.princeton.edu/status/tabulator.cgi?" + \
+ COMON_COTOPURL= "http://comon.cs.princeton.edu/status/tabulator.cgi?" + \
"table=table_nodeview&formatcsv"
if config.dns:
config.fields = "name,dns1udp,dns1tcp,dns2udp,dns2tcp"
cd /usr/share/monitor
source agent.sh &> /dev/null
+rsync -qv -az -e ssh root@chloe.cs.princeton.edu:/vservers/www-current/var/log/php.log /var/lib/monitor/httpd-log
rsync -qv -az -e ssh root@chloe.cs.princeton.edu:/vservers/www-current/var/log/httpd/*-* /var/lib/monitor/httpd-log
rsync -qv -az -e ssh root@chloe.cs.princeton.edu:/vservers/www-current/var/log/httpd/*error* /var/lib/monitor/httpd-log
local_config=/etc/planetlab/configs/site.xml
MONITORPATH=/usr/share/monitor
+WEB_ROOT_PATH=web/MonitorWeb/monitorweb
+WEB_XML_PATH=static/xml
# Be verbose
set -x
PLC_MONITOR_IP=$( gethostbyname $PLC_MONITOR_HOST )
fi
# update_config PATTERN REPLACEMENT FILE
#   Replace every occurrence of PATTERN with REPLACEMENT in FILE, in place.
#   NOTE(review): PATTERN/REPLACEMENT are spliced raw into a sed 's///'
#   expression, so they must not contain '/' or other sed metacharacters;
#   the current callers (apply_template) only pass plain hostnames/names.
function update_config ()
{
	pattern=$1
	with=$2
	file=$3
	# Quote the file argument so paths containing spaces do not word-split.
	sed -i -e "s/$pattern/$with/g" "$file"
}
# apply_template TEMPLATE DESTFILE
#   Copy TEMPLATE to a scratch file, substitute the PLC_* placeholders with
#   the values from the environment (PLC_NAME, PLC_WWW_HOST,
#   PLC_MONITOR_HOST), then install the result as DESTFILE.
function apply_template ()
{
	TEMPLATE=$1
	DESTFILE=$2

	# Work on a temp copy so a failure mid-way never leaves a
	# half-substituted DESTFILE behind.
	tmp_file=$(mktemp)
	cp "$TEMPLATE" "$tmp_file"

	update_config PLC_NAME "$PLC_NAME" "$tmp_file"
	update_config PLC_WWW_HOSTNAME "$PLC_WWW_HOST" "$tmp_file"
	update_config MONITOR_HOSTNAME "$PLC_MONITOR_HOST" "$tmp_file"

	cp "$tmp_file" "$DESTFILE"
	rm -f "$tmp_file"
}
+
# check_gadget_config
#   Regenerate each gadget XML file from its *.in template whenever the
#   template is newer than the generated file (or the file doesn't exist).
function check_gadget_config ()
{
	for input_file in "$MONITORPATH/$WEB_ROOT_PATH/$WEB_XML_PATH"/*.in ; do
		# BUGFIX: the suffix-strip regex was 's/.in\$//'.  Inside single
		# quotes '\$' is a *literal* dollar sign to sed, so nothing ever
		# matched and output_file kept the '.in' suffix.  Use an escaped
		# dot and a bare '$' end-of-line anchor instead.
		output_file=$MONITORPATH/$WEB_ROOT_PATH/$WEB_XML_PATH/$(basename "$input_file" | sed -e 's/\.in$//')
		# -nt is true when input is newer, and also when output is missing.
		if [ "$input_file" -nt "$output_file" ] ; then
			apply_template "$input_file" "$output_file"
		fi
	done
}
+
function check_monitor_schema_and_data()
{
# NOTE: call create_all() to setup the database from the info model.
# NOTE: redirect path without trailing '/' to path with. Favor SSL.
Redirect /monitor https://${PLC_MONITOR_HOST}:${PLC_WWW_SSL_PORT}/monitor/
+#RedirectMatch ^/$ https://${PLC_MONITOR_HOST}:${PLC_WWW_SSL_PORT}/monitor
# NOTE: this directive strips '/monitor/' from the requested path and pastes
# the remaining part to the end of the ProxyPass url below. All TG urls
# WRITE default /etc/monitor.conf
check_monitor_conf
+ check_gadget_config
+
if [ -n "$WROTE_PG_CONFIG" ] ; then
# NOTE: restart db to enable access by users granted above.
service plc restart postgresql
return l
def color_pcu_state(fbnode):
+ if fbnode['plc_pcuid'] is None:
+ return 'NOPCU'
+ else:
+ return 'PCU'
if 'plcnode' in fbnode and 'pcu_ids' in fbnode['plcnode'] and len(fbnode['plcnode']['pcu_ids']) > 0 :
values = reboot.get_pcu_values(fbnode['plcnode']['pcu_ids'][0])
if values == None:
return fbnode['pcu']
else:
+ print fbnode.keys()
if 'pcu' not in fbnode:
return 'NOPCU'
else:
r_values[key] = ":".join(vals[1:])
r_values[key] = r_values[key].strip()
- r_values['Created'] = calendar.timegm(time.strptime(r_values['Created']))
+ if 'Created' in r_values:
+ r_values['Created'] = calendar.timegm(time.strptime(r_values['Created']))
+ else:
+ r_values['Created'] = calendar.timegm(time.localtime())
+
#r_values['Told'] = calendar.timegm(time.strptime(r_values['Told']))
return r_values
for mta in [MTA, 'golf.cs.princeton.edu']:
try:
# This is normal operation
- #print MTA
- #print FROM
- #print to
- #print msg
server = smtplib.SMTP(mta)
- #server = smtplib.SMTP('golf.cs.princeton.edu')
server.sendmail(FROM, to, msg)
if config.bcc and not config.debug:
server.sendmail(FROM, config.email, msg)
except Exception, err:
print "Mailer error2: failed using MTA(%s) with: %s" % (mta, err)
else:
- #print "Would mail %s" %to
logger.debug("Would send mail to %s" % to)
if __name__=="__main__":
import smtplib
import emailTxt
import plc
- #email("[spam] bcc test from golf.cs.princeton.edu",
- # "It gets to both recipients",
- # "soltesz@cs.utk.edu")
emailViaRT("mail via RT", "Let's see if this succeeds...", [FROM])
- #email("Re: [PL #21323] TEST 7",
- # mailtxt.newbootcd_one[1] % {'hostname_list':"hostname list..."},
- # [FROM])
from monitor import config
from monitor import parser as parsermodule
from monitor.common import *
+from monitor.const import MINUP
from monitor.model import *
from monitor.wrapper import plc
from monitor.wrapper import plccache
plc.nodeBootState(host, 'reinstall')
node_end_record(host)
+def check_node_and_pcu_status_for(loginbase):
+ """
+ this function checks whether all the nodes and associated pcus for a
+ given site are considered 'good'.
+
+ If so, the function returns True.
+ Otherwise, the function returns False.
+ """
+
+ results = []
+ for node in plccache.plcdb_lb2hn[loginbase]:
+
+ noderec = FindbadNodeRecord.findby_or_create(hostname=node['hostname'])
+ nodehist = HistoryNodeRecord.findby_or_create(hostname=node['hostname'])
+ nodebl = BlacklistRecord.get_by(hostname=node['hostname'])
+ pcuhist = HistoryPCURecord.get_by(plc_pcuid=noderec.plc_pcuid)
+
+ if (nodehist is not None and nodehist.status == 'good' and \
+ ((pcuhist is not None and pcuhist.status == 'good') or (pcuhist is None)) ):
+ if nodebl is None: # no entry in blacklist table
+ results.append(True)
+ elif nodebl is not None and nodebl.expired(): # expired entry in blacklist table
+ results.append(True)
+ else:
+ results.append(False) # entry that is not expired.
+ else:
+ results.append(False)
+
+ try:
+ print "test: %s" % results
+ # NOTE: incase results is empty, reduce does not work on an empty set.
+ return reduce(lambda x,y: x&y, results) and len(results) > MINUP
+ except:
+ return False
+
def main(hostnames, sitenames):
# commands:
i = 1
sitehist.closeTicket()
print "send message for site %s penalty cleared" % site
-
+
+ # check all nodes and pcus for this site; if they're all ok,
+ # close the ticket, else leave it open.
+ # NOTE: in the case where a PCU reboots and fails, a message is
+ # sent, but the PCU may appear to be ok according to tests.
+ # NOTE: Also, bootmanager sends messages regarding disks,
+ # configuration, etc. So, the conditions here are 'good'
+ # rather than 'not down' as it is in sitebad.
+ close_ticket = check_node_and_pcu_status_for(site)
+ if close_ticket:
+ sitehist.closeTicket()
site_count = site_count + 1
+++ /dev/null
-Structure:
-
-monitor module
- plc wrapper
- util functions
- pkl database access
- database models
- third-party data sources
-
-pcucontrol
- maps types to code
- reboot.py
- interface.py
-
- transport:
- pyssh
- ssh
- telnetlib
- models:
- hpilo cmds
- intelamt cmds
- racadm cmd
- ipmitool cmd
-
-web
- cgi scripts
- tgweb
- project...
-
-cmds
- py scripts
- node
- site
- pcu
- query
- grouprins
-
-bootman
- rpyc
-
-
-
-
-
-###############################
-for each node:
- Check Status ->
- if Pass Threshold ->
- Create Issue ->
- Take Action ->
- email
- bm
- pcu
- plc reset
- apply penalties
- flag for admin
-
-for each issue
- check issue.status
- if issue.status is "open":
- issue.take_next_action()
- if issue.closed:
- issue.shutdown()
- if issue.paused:
- pass
-
-action_list for issuetype (pcudown)
- send email
- yield
- send email, apply penalty
- yield
- send email, apply second penalty
- yield
- send email
-
-action_list for issuetype (badhardware)
-action_list for issuetype (dnserror)
-action_list for issuetype (nodeconfig)
-action_list for issuetype (oldbootcd)
-
-action_list for issuetype (nodedown)
- if pcuok, reboot
- yield
- if pcuok, and reboot failed, set rins, reboot
- yield
- create_issue pcubroken
- send email
- yield
- send email, apply penalty
- yield
-	send email, apply second penalty
- yield
- send email
-
-
-TOOLS:
- * add a '--nocache' to the default set of options.
- * add a cache parameter in the monitor.conf file.
-
-
-
-TODO:
- * install openssh-server, passwd, perl-libwww-perl (for rt), rt-3.4.1, MySQL-python
- * had to mount -t devpts devpts /dev/pts to get ssh to work inside the
- chroot. also, disable the pam modules in /etc/pam.d/sshd
-
- * blue
- * auto configuration for php configuration.
- maybe run translation of monitor.conf before loading monitorconfig.php?
- * blue2
-
- * A setup script of some kind would be nice that walked through :
- - writing monitorconfig.py
- - creation of monitorconfig.php
- - run syncplcdb.py
- - testapi.py
- - findbad.py on sample site.
- - nodebad.py
- - findbadpcus.py
- - nodequery.py
- - nodegroups.py
-	 - loads webpage for those retrieved values to confirm setup succeeded.
-
- * reimplement the config.py / .config mechanism. I'd like for many commands
- to share very similar argument or argument sets, as well as have some
- common config options. I'm not sure the best way to do this.
-
- - features of config.py
- * parse arguments and return an object with attributes equal to the
- parser values.
- * maintain values consistently across modules at run time.
- * have default values that are not specified at each run time.
- * easy to import and use
-
- - config module is available via 'import config' or as returned by
- parsermodule.parse_args()
- - python supports load-once modules, so subsequent imports refer to the
- same module object.
-
- * have package pull in threadpool from easy_install
-
- * place PKL files in a real database
-
- * clean up plc.py; there's a lot of redundant code.
-
- * figure out python paths for user commands.
- - directories for pickle files.
- - add user in rpm install
- - user permissions for data files for day-to-day operations.
-
- * fix BayTechCtrlCUnibe expect script.
-
- * separate modules into different, logical categories, and create a python
- module as part of the install:
- command line,
- configuration,
- policy,
- data model,
- data access,
- object interfaces.
-
-Lower priority:
- * Add a more structured, 'automate' library of scripts and means of making
- batch calls, etc.
-
- * add a third package for user tools that will interact with the Monitor
- service. Mostly, I'm guessing this would be queries for the live status of
- nodes and a more reliable 'reboot' and 'reinstall' mechanism than currently
-   available with PLC.
-
-Done:
- * Find a better location to place and pull the PKL files currently in the pdb
- directory. Ultimately, these should be stored in a real DB. Until then,
- they should sit in a location that is accessible from the www scripts,
- backend scripts, and user utilities.
- * nodebad loads plc_hn2lb unconditionally
- * nodeinfo loads act_all unconditionally
- * change findbad.py default db name
- * remove deps on www.printbadnodes
- * reboot.py loads findbadpcus unconditionally.
- * nodequery loads findbad unconditionally
- * unified_model loads findbad unconditionally
-
- * threadpool package.
- * build cmdamt with g++ prior to packaging
-
- * www/*.py need appropriate access to database.py, config.py, monitorconfig.py, etc.
- - need to convert monitor.conf into monitorconf.sh and monitorconf.php
-
- * pull out global configuration information from various files, like rt_db,
- mailer.py, auth.py, and any others. Create a single configuration file
- from which all others pull.
-
- - convert plc and other files to use the new monitorconfig.py rather than
- auth, or plc.*
- - need to alter all import 'auth' statements.
<?xml version="1.0" encoding="UTF-8"?>
<Module>
-<ModulePrefs title="MyOps Summary" title_url="http://www.planet-lab.org">
+<ModulePrefs title="MyOps Summary PLC_NAME" title_url="http://PLC_WWW_HOSTNAME">
<Require feature="dynamic-height"/>
</ModulePrefs>
<Content type="html"><![CDATA[
_gel('content_div').innerHTML = responseText;
_IG_AdjustIFrameHeight();
};
-_IG_FetchContent('http://monitor.planet-lab.org/monitor/summary', displaycontent, { refreshInterval: 300 });
+_IG_FetchContent('http://MONITOR_HOSTNAME/monitor/summary', displaycontent, { refreshInterval: 300 });
</script>
]]></Content>
</Module>