added tests directory.
updated zabbix.spec
if len(l_nodes) == 0:
raise Exception("Host removed via blacklist: %s" % hostname)
- #ad_dbTickets = database.if_cached_else_refresh(True, False, "ad_dbTickets", lambda : [])
- #if ad_dbTickets == None:
- # raise Exception("Could not find cached dbTickets")
-
- #print "starting new thing"
mon = MonitorMergeDiagnoseSendEscellate(hostname, True)
mon.run()
- #print "merge"
- #merge = Merge( [node['hostname'] for node in l_nodes])
- #record_list = merge.run()
- ##print "rt"
- #rt = RT(record_list, ad_dbTickets, l_ticket_blacklist)
- #record_list = rt.run()
- ##print "diagnose"
- #diag = Diagnose(record_list)
- #diagnose_out = diag.run()
- #print diagnose_out
- #print "action"
- #action = Action(diagnose_out)
- #action.run()
-
return True
-#def reboot2(hostname):
-# l_nodes = api.GetNodes(hostname)
-# if len(l_nodes) == 0:
-# raise Exception("No such host: %s" % hostname)
-#
-# l_blacklist = database.if_cached_else(1, "l_blacklist", lambda : [])
-# l_ticket_blacklist = database.if_cached_else(1,"l_ticket_blacklist",lambda : [])
-#
-# l_nodes = filter(lambda x : not x['hostname'] in l_blacklist, l_nodes)
-# if len(l_nodes) == 0:
-# raise Exception("Host removed via blacklist: %s" % hostname)
-#
-# ad_dbTickets = database.if_cached_else_refresh(True, False, "ad_dbTickets", lambda : None)
-# if ad_dbTickets == None:
-# raise Exception("Could not find cached dbTickets")
-#
-#
-# args = {}
-# args['hostname'] = "%s" % hostname
-# args['hostname_list'] = "%s" % hostname
-# args['loginbase'] = plc.siteId(hostname)
-#
-# m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname,
-# mailtxt.newalphacd_one[1] % args, True, db='bootcd_persistmessages')
-#
-# #print "merge"
-# merge = Merge( [node['hostname'] for node in l_nodes])
-# record_list = merge.run()
-# #print "rt"
-# rt = RT(record_list, ad_dbTickets, l_ticket_blacklist)
-# record_list = rt.run()
-# #print "diagnose"
-# diag = Diagnose(record_list)
-# diagnose_out = diag.run()
-# #print diagnose_out
-# #print "action"
-# action = Action(diagnose_out)
-# action.run()
-#
-# return True
-
-
def main():
for host in sys.argv[1:]:
reboot(host)
--- /dev/null
+#!/bin/bash
+#
+# priority: 850
+#
+# Manage settings for the Zabbix installation and
+# other monitor-related things
+#
+# Stephen Soltesz <soltesz@cs.princeton.edu>
+# Copyright (C) 2008 The Trustees of Princeton University
+#
+# $Id$
+#
+
+# Source function library and configuration
+. /etc/plc.d/functions
+. /etc/planetlab/plc_config
+local_config=/etc/planetlab/configs/site.xml
+
+# Be verbose
+set -x
+
+# Default locations
+PGDATA=/var/lib/pgsql/data
+postgresql_conf=$PGDATA/postgresql.conf
+pghba_conf=$PGDATA/pg_hba.conf
+
+# Export so that we do not have to specify -p to psql invocations
+export PGPORT=$PLC_DB_PORT
+
+
+# TODO:
+## setup database
+### import schema & data
+### tweak values
+### add zabbix to pg_hba.conf
+## setup zabbix.conf.php
+##
+# Role and database used by Zabbix.
+# The helper functions and the "start" action in this script read
+# $ZABBIX_DB_USER / $ZABBIX_DB_NAME, so the settings are exported under
+# those names as well; DB_USER / DB_NAME are kept for compatibility.
+DB_USER="zabbixuser"
+DB_NAME="zabbix"
+ZABBIX_DB_USER="$DB_USER"
+ZABBIX_DB_NAME="$DB_NAME"
+
+function check_user_and_db()
+{
+    # Make sure the Zabbix database role and database exist, and generate
+    # a database password when none is configured yet.
+    #
+    # Reads:  ZABBIX_DB_USER, ZABBIX_DB_NAME, PLC_MONITOR_DBPASSWORD, local_config
+    # Writes: user_present, db_present, PLC_MONITOR_DBPASSWORD (when generated)
+    # Returns 1 without touching the database if the role or database
+    # name is not set.
+    if [ -z "$ZABBIX_DB_USER" ] || [ -z "$ZABBIX_DB_NAME" ] ; then
+        echo "check_user_and_db: ZABBIX_DB_USER / ZABBIX_DB_NAME must be set" >&2
+        return 1
+    fi
+
+    # confirm user is present or create it
+    # (the role is $ZABBIX_DB_USER: that is the name every other psql call
+    # in this script connects with)
+    user_present=$( psql -U postgres -c "select * from pg_user;" -d template1 | grep "$ZABBIX_DB_USER" )
+    if [ -z "$user_present" ] ; then
+        createuser --no-superuser --no-createdb --no-createrole --login --unencrypted --echo "$ZABBIX_DB_USER" -U postgres
+    fi
+
+    # confirm database is present or create it, owned by the Zabbix role
+    db_present=$( psql -U postgres -c "select * from pg_database;" -d template1 | grep "$ZABBIX_DB_NAME" )
+    if [ -z "$db_present" ] ; then
+        createdb --owner="$ZABBIX_DB_USER" "$ZABBIX_DB_NAME" -U postgres
+    fi
+
+    # Create/update the unprivileged database user and password
+    if [ -z "$PLC_MONITOR_DBPASSWORD" ] ; then
+        PLC_MONITOR_DBPASSWORD=$(uuidgen)
+        # persist the generated password in the local PLC configuration
+        plc-config --category=plc_monitor --variable=dbpassword --value="$PLC_MONITOR_DBPASSWORD" --save=$local_config $local_config
+        psql -d template1 -U postgres -c "ALTER USER $ZABBIX_DB_USER WITH PASSWORD '$PLC_MONITOR_DBPASSWORD';"
+        service plc reload
+    fi
+}
+
+function if_present_load ()
+{
+    # Feed the SQL file given as $1 to psql (database $ZABBIX_DB_NAME,
+    # role $ZABBIX_DB_USER) if the file exists; otherwise do nothing.
+    #
+    # The path is quoted so that an empty or whitespace-containing argument
+    # is treated as "no such file" instead of breaking the test/redirect.
+    local file="$1"
+    if [ -f "$file" ] ; then
+        psql -d $ZABBIX_DB_NAME -U $ZABBIX_DB_USER < "$file"
+    fi
+}
+
+function check_schema_and_data()
+{
+    # Load the Zabbix schema and initial data unless the database already
+    # lists a relation whose name contains "hosts".
+    #
+    # Reads:  ZABBIX_DB_USER, ZABBIX_DB_NAME
+    # Writes: schema_present
+    schema_present=$( psql -U $ZABBIX_DB_USER $ZABBIX_DB_NAME -c "\d;" < /dev/null | grep hosts )
+    # Quoted: the grep output is normally several words/lines, which an
+    # unquoted test would reject with a "too many arguments" error.
+    if [ -z "$schema_present" ] ; then
+        if_present_load "/usr/local/zabbix/misc/create/schema/postgresql.sql"
+        if_present_load "/usr/local/zabbix/misc/create/data/data.sql"
+        if_present_load "/usr/local/zabbix/misc/create/data/images_pgsql.sql"
+        ## TODO: update ZABBIX Server entry, "update hosts set status=0, host='MyPLC Server' where hostid=10017"
+    fi
+}
+
+
+case "$1" in
+ start)
+ if [ "$PLC_MONITOR_ENABLED" != "1" ] ; then
+ exit 0
+ fi
+ MESSAGE=$"Bootstrap Monitoring"
+ dialog "$MESSAGE"
+
+ check_user_and_db
+ check_schema_and_data
+
+ mkdir -p /var/lib/pgsql/data/pg_hba.conf.d
+ ZABCONF=/var/lib/pgsql/data/pg_hba.conf.d/zabbix.conf
+ if [ ! -f $ZABCONF ] ; then
+ echo "host $ZABBIX_DB_NAME $ZABBIX_DB_USER 127.0.0.1/32 password" > $ZAB
+ echo "host $ZABBIX_DB_NAME $ZABBIX_DB_USER $PLC_MONITOR_IP/32 password" >> $ZAB
+ fi
+
+ # UPDATE /etc/zabbix/*.conf
+ ZABBIXCFG=/etc/zabbix
+ TMP_FILE=`mktemp /tmp/zbxtmpXXXXXX`
+ # TODO: How to know if I need to restart the services?
+
+ if [ -f ${ZABBIXCFG}/zabbix_server.conf ] ; then
+ sed -e "s/#DBHost=.*/DBHost=$PLC_MONITOR_HOST/g" \
+ -e "s#DBName=.*#DBName=$ZABBIX_DB_NAME#g" \
+ -e "s#DBUser=.*#DBUser=$ZABBIX_DB_USER#g" \
+ -e "s#DBPassword=.*#$PLC_MONITOR_DBPASSWORD#g" \
+ ${ZABBIXCFG}/zabbix_server.conf > $TMP_FILE
+ cat $TMP_FILE > ${ZABBIXCFG}/zabbix_server.conf
+ fi
+ if [ -f ${ZABBIXCFG}/zabbix_agentd.conf ] ; then
+ HOST=`hostname`
+ sed -e "s#Server=.*#Server=$PLC_MONITOR_HOST#g" \
+ -e "s#Hostname=.*#Hostname=$HOST#g" \
+ ${ZABBIXCFG}/zabbix_agentd.conf > $TMP_FILE
+ cat $TMP_FILE > ${ZABBIXCFG}/zabbix_agentd.conf
+ fi
+ service zabbix_server start
+ service zabbix_agentd start
+
+ # SETUP zabbix gui configuration
+ ZABBIX_WEB_CFG=/var/www/html/zabbix/conf/zabbix.conf.php
+ if [ ! -f $ZABBIX_WEB_CFG ] ; then
+ touch $ZABBIX_WEB_CFG
+ cat <<EOF > $ZABBIX_WEB_CFG
+<?php
+global \$DB;
+
+\$DB["TYPE"] = "POSTGRESQL";
+\$DB["SERVER"] = "localhost";
+\$DB["PORT"] = "0";
+\$DB["DATABASE"] = "$ZABBIX_DB_NAME";
+\$DB["USER"] = "$ZABBIX_DB_USER";
+\$DB["PASSWORD"] = "$PLC_MONITOR_DBPASSWORD";
+\$ZBX_SERVER = "$PLC_MONITOR_HOST";
+\$ZBX_SERVER_PORT = "10051";
+\$IMAGE_FORMAT_DEFAULT = IMAGE_FORMAT_PNG;
+?>
+EOF
+ chmod 644 $ZABBIX_WEB_CFG
+ fi
+
+ result "$MESSAGE"
+ ;;
+
+ stop)
+ MESSAGE=$"Stopping Monitor"
+ dialog "$MESSAGE"
+
+ service zabbix_server stop
+ service zabbix_agentd stop
+ # TODO: is there anything to stop?
+ result "$MESSAGE"
+ ;;
+esac
+
+exit $ERRORS
--- /dev/null
+#!/usr/bin/python
+
+
+import emailTxt
+# Base class for the constraint markers used in the templates below:
+# each instance simply carries the value it was constructed with.
+class Type:
+ # value: stored unchanged on the instance as self.value
+ def __init__(self, value):
+ self.value = value
+ # Returns the name of the instance's class, e.g. 'Match' for Match(...)
+ def name(self):
+ return self.__class__.__name__
+
+# Marker subclasses of Type. They add no behaviour of their own; only the
+# class name (see Type.name) distinguishes them.
+# NOTE(review): how each kind is evaluated is decided by code outside this
+# file - confirm against the consumer of these templates.
+class Is(Type): pass
+class Match(Type): pass
+class ListMatch(Type): pass
+class FilledIn(Type): pass
+class PortOpen(Type): pass
+class NodesUp(Type): pass
+
+# a failed constraint leads to a message-escalation process.
+# so, define a constraint, which defines the set of nodes it operates on, the
+# message to send if it fails, and maybe a thank you message when it's
+# satisfied (if previously failed).
+# Constraint template for ordinary nodes.
+# Top-level keys: 'membership', 'site', 'node' and 'pcu'. The 'node' and
+# 'pcu' entries pair a 'constraint' list (dicts mapping a field name to a
+# Type instance) with the messages for the failed and resolved cases.
+# NOTE(review): presumably each dict in a 'constraint' list is one acceptable
+# alternative - confirm against the code that evaluates these templates.
+standardnode = {
+ 'membership' : [ { 'plcnode/nodegroups' : Match('.*') } ],
+ 'site' : [ { 'nodes' : NodesUp(2), } ],
+ 'node' : { 'constraint' :
+ [ { 'state' : Match('BOOT'),
+ 'kernel' : Match('2.6.22.19-vs2.3.0.34'), } ],
+ 'failed_message' : [ emailTxt.mailtxt.newdown ],
+ 'resolved_message' : [ emailTxt.mailtxt.newthankyou ],
+ },
+ 'pcu' : {
+ 'constraint' : [ { 'hostname' : FilledIn(True),
+ 'password' : FilledIn(True), },
+ { 'ip' : FilledIn(True),
+ 'password' : FilledIn(True), },
+ ],
+ 'failed_message' : [ emailTxt.mailtxt.pcudown ],
+ 'resolved_message' : [ emailTxt.mailtxt.pcuthankyou ],
+ },
+}
+
+# Constraint template for nodes in the 'DC7800Deployment' node group.
+# Same layout as the other template in this file ('membership', 'pcu',
+# 'node'), with its own failure messages.
+dc7800 = {
+ # if the membership constraint is true, then apply the other constraints.
+ 'membership' : [ { 'plcnode/nodegroups' : ListMatch('DC7800Deployment'), } ],
+
+ 'pcu' : { 'constraint' : [ { 'hostname' : FilledIn(True),
+ 'ip' : FilledIn(True),
+ 'password' : FilledIn(True),
+ 'model' : Match('AMT'),
+ 'username' : Match('admin'),
+ 'portstatus': PortOpen(16992),
+ 'reboot' : Is(0),
+ },
+ { 'hostname' : FilledIn(True),
+ 'ip' : FilledIn(True),
+ 'password' : FilledIn(True),
+ 'reboot' : Is(0),
+ #'valid' : Is(True),
+ },],
+ 'failed_message' : [ emailTxt.mailtxt.donation_nopcu],
+ 'resolved_message' : [ emailTxt.mailtxt.pcuthankyou ],
+ },
+ 'node' : { 'constraint' :
+ [ { 'state' : Match('BOOT'),
+ 'kernel' : Match('2.6.22.19-vs2.3.0.34'), } ],
+ 'failed_message' : [ emailTxt.mailtxt.donation_down],
+ 'resolved_message' : [ emailTxt.mailtxt.newthankyou ],
+ },
+}
+
+#
+# data source, { constraints ... value }
+# action on failure of constraint
+# information about why it failed
+# # stop action if constraint is satisfied at a later time.
+# kind of like asynchronous constraint solving.
+# or stored procedures.
--- /dev/null
+#!/usr/bin/python
+
+from monitor import database
+import os
+import time
+from unified_model import *
+
+# Reference timestamps (seconds since the epoch), fixed once at start-up;
+# the scenario below uses them to back-date records.
+today = time.time()
+four_days_ago = today - 60*60*24*4
+eight_days_ago = today - 60*60*24*8
+
+def reset_time(hostname, new_time):
+ # update act_all entry
+ act_all = database.dbLoad("act_all")
+ act_all[hostname][0]['time'] = new_time
+ database.dbDump("act_all", act_all)
+ # update message timer.
+ m = PersistMessage(hostname, "d", "e", True, db='monitor_persistmessages')
+ m.actiontracker.time = new_time
+ m.save()
+
+def get_record(hostname):
+ act_all = database.dbLoad("act_all")
+ rec = act_all[hostname][0]
+ return rec
+
+
+def bring_node_down(hostname):
+ fb = database.dbLoad("findbad")
+ fb['nodes'][hostname]['values'].update({'category' : 'ERROR',
+ 'kernel' : '',
+ 'state' : 'DOWN', 'ssh' : 'NOSSH'})
+ database.dbDump("findbad", fb)
+ # update message timer.
+ m = PersistMessage(hostname, "d", "e", True, db='monitor_persistmessages')
+ m.actiontracker.time = time.time() - 60*60*24*4
+ m.save()
+
+def bring_node_up(hostname):
+ fb = database.dbLoad("findbad")
+ fb['nodes'][hostname]['values'].update({'category' : 'ALPHA',
+ 'kernel' : 'a b 2.6.22.19-vs2.3.0.34.24.planetlab',
+ 'state' : 'BOOT', 'ssh' : 'SSH'})
+ database.dbDump("findbad", fb)
+ # update message timer.
+ m = PersistMessage(hostname, "d", "e", True, db='monitor_persistmessages')
+ m.actiontracker.time = time.time() - 60*60*24*4
+ m.save()
+
+# Scenario: mark two test nodes as down, run grouprins.py repeatedly while
+# back-dating the stored timers between runs, then mark the nodes as up
+# again and run it once more.
+#bring_node_down('fakenode.cs.princeton.edu')
+
+# NOTE(review): node_end_record is not defined in this file; presumably it
+# comes from `from unified_model import *` - confirm.
+node_end_record('fakenode.cs.princeton.edu')
+node_end_record('eggplant.cs.princeton.edu')
+bring_node_down('eggplant.cs.princeton.edu')
+bring_node_down('fakenode.cs.princeton.edu')
+os.system("./nodebad.py --increment --site monitorsite")
+os.system("./sitebad.py --increment --site monitorsite")
+
+# week one
+# initial
+os.system("./grouprins.py --force --reboot --mail=1 \
+ --nodeselect 'hostname=(eggplant|fakenode).cs.princeton.edu&&state=DOWN' \
+ --stopselect 'state=BOOT&&kernel=2.6.22.19-vs2.3.0.34.24.planetlab'")
+# back-date the timers by four days before the next run
+reset_time('eggplant.cs.princeton.edu', four_days_ago)
+reset_time('fakenode.cs.princeton.edu', four_days_ago)
+# second
+os.system("./grouprins.py --force --reboot --mail=1 \
+ --nodeselect 'hostname=(eggplant|fakenode).cs.princeton.edu&&state=DOWN' \
+ --stopselect 'state=BOOT&&kernel=2.6.22.19-vs2.3.0.34.24.planetlab'")
+# back-date the timers by eight days before the next run
+reset_time('eggplant.cs.princeton.edu', eight_days_ago)
+reset_time('fakenode.cs.princeton.edu', eight_days_ago)
+# week two
+# transition.
+os.system("./grouprins.py --force --reboot --mail=1 \
+ --nodeselect 'hostname=(eggplant|fakenode).cs.princeton.edu&&state=DOWN' \
+ --stopselect 'state=BOOT&&kernel=2.6.22.19-vs2.3.0.34.24.planetlab'")
+reset_time('eggplant.cs.princeton.edu', four_days_ago)
+reset_time('fakenode.cs.princeton.edu', four_days_ago)
+# second for week two
+os.system("./grouprins.py --force --reboot --mail=1 \
+ --nodeselect 'hostname=(eggplant|fakenode).cs.princeton.edu&&state=DOWN' \
+ --stopselect 'state=BOOT&&kernel=2.6.22.19-vs2.3.0.34.24.planetlab'")
+reset_time('eggplant.cs.princeton.edu', eight_days_ago)
+reset_time('fakenode.cs.princeton.edu', eight_days_ago)
+# week three
+ # transition
+os.system("./grouprins.py --force --reboot --mail=1 \
+ --nodeselect 'hostname=(eggplant|fakenode).cs.princeton.edu&&state=DOWN' \
+ --stopselect 'state=BOOT&&kernel=2.6.22.19-vs2.3.0.34.24.planetlab'")
+
+# node is up.
+bring_node_up("eggplant.cs.princeton.edu")
+bring_node_up("fakenode.cs.princeton.edu")
+os.system("./nodebad.py --increment --site monitorsite")
+os.system("./sitebad.py --increment --site monitorsite")
+
+# final run selects only eggplant, now in state BOOT
+os.system("./grouprins.py --force --reboot --mail=1 --nodeselect 'hostname=eggplant.cs.princeton.edu&&state=BOOT' --stopselect 'state=BOOT&&kernel=2.6.22.19-vs2.3.0.34.24.planetlab'")
+
--- /dev/null
+#!/usr/bin/python
+
+from monitor.database import FindbadNodeRecord, FindbadPCURecord
+from template import *
+from nodequery import *
+from monitor import util
+import sys
+
+
+# Check every node from the latest findbad records against the dc7800
+# template and print whether its PCU and node constraints hold.
+# NOTE(review): dc7800list is not referenced again in the visible part of
+# this script.
+dc7800list = util.file.getListFromFile("dc7800.txt")
+
+# get node info
+# if membership satisfied
+# get pcu info
+# verify pcu constraint
+# verify node constraint
+
+fbquery = FindbadNodeRecord.get_all_latest()
+for noderec in fbquery:
+ fbinfo = noderec.to_dict()
+ # only nodes that satisfy the template's membership constraint are checked
+ member = verifyType(dc7800['membership'], fbinfo)
+ if not member: continue
+
+ # look up the record of the node's first PCU, if pcu_in() reports one
+ if pcu_in(fbinfo):
+ fbpcuinfo = FindbadPCURecord.get_latest_by(plc_pcuid=fbinfo['plc_node_stats']['pcu_ids'][0]).to_dict()
+ else:
+ fbpcuinfo = None
+ fbinfo['pcuinfo'] = fbpcuinfo
+
+ # NOTE(review): fbpcuinfo may be None here; verifyType must tolerate that - confirm.
+ pcuok = verifyType(dc7800['pcu']['constraint'], fbpcuinfo)
+ nodeok = verifyType(dc7800['node']['constraint'], fbinfo)
+ # NOTE(review): `hostname` is never assigned in this script; unless one of
+ # the star imports provides it, this print raises NameError - confirm.
+ print "pcuok : ", pcuok, " nodeok: ", nodeok , " ", hostname
+ # NOTE(review): this unconditional continue makes the rest of the loop
+ # body (sys.exit and the reporting below) unreachable.
+ continue
+ sys.exit(1)
+
+ # placeholder branches: one per pcuok/nodeok combination, no action yet
+ if not pcuok and not nodeok:
+ # donation_down_one
+ pass
+ elif not pcuok and nodeok:
+ # donation_nopcu_one
+ pass
+ elif pcuok and not nodeok:
+ # reboot
+ pass
+ elif pcuok and nodeok:
+ # noop
+ pass
+
+ if pcuok:
+ print "PCU-OK ",
+ else:
+ print "PCU-BAD",
+ if nodeok:
+ print "NODE-OK ",
+ else:
+ print "NODE-BAD",
+ print " for %-45s" % hostname
+
+
+
#%define zabbix_piddir %{_tmppath}
#%define zabbix_logdir %{_tmppath}
-%define zabbix_piddir /var/run
-%define zabbix_logdir /var/log
+%define zabbix_piddir /var/tmp
+%define zabbix_logdir /var/tmp
%description
The ZABBIX server is a network monitor
%{zabbix_initdir}/zabbix_agentd > $TMP_FILE
cat $TMP_FILE > %{zabbix_initdir}/zabbix_agentd
# TODO: copy to /etc/init.d/
-cp %{zabbix_initdir}/zabbix_agentd /etc/init.d
+cp %{zabbix_initdir}/zabbix_agentd %{_initrddir}
rm -f $TMP_FILE
# configure ZABBIX server daemon
TMP_FILE=`mktemp $TMPDIR/zbxtmpXXXXXX`
-# SETUP DBHost, DBName, DBUser, DBPassword
-#SERVER=`grep PLC_MONITOR_HOST /etc/planetlab/plc_config | tr "'" ' ' | awk '{print $2}'`
-
sed -e "s#AlertScriptsPath=/home/zabbix/bin/#AlertScriptsPath=%{zabbix_bindir}/#g" \
-e "s#PidFile=/var/tmp/zabbix_server.pid#PidFile=%{zabbix_piddir}/zabbix_server.pid#g" \
-e "s#LogFile=/tmp/zabbix_server.log#LogFile=%{zabbix_logdir}/zabbix_server.log#g" \
%{zabbix_confdir}/zabbix_server.conf > $TMP_FILE
cat $TMP_FILE > %{zabbix_confdir}/zabbix_server.conf
-mkdir -p /etc/zabbix
-cp %{zabbix_confdir}/zabbix_server.conf /etc/zabbix/
+mkdir -p %{_sysconfdir}/zabbix
+cp %{zabbix_confdir}/zabbix_server.conf %{_sysconfdir}/zabbix/
sed -e "s#BASEDIR=/opt/zabbix#BASEDIR=%{_prefix}#g" \
-e "s#PIDFILE=/var/tmp/zabbix_server.pid#PIDFILE=%{zabbix_piddir}/zabbix_server.pid#g" \
%{zabbix_initdir}/zabbix_server > $TMP_FILE
cat $TMP_FILE > %{zabbix_initdir}/zabbix_server
-cp %{zabbix_initdir}/zabbix_server /etc/init.d
+cp %{zabbix_initdir}/zabbix_server %{_initrddir}
rm -f $TMP_FILE
chkconfig zabbix_server on
+%post gui
+# Setup the necessary values in /etc/php.ini
+# NOTE: Zabbix requires max_execution_time to be 300 seconds
+# NOTE: Zabbix requires a default date.timezone
+
+# also edit /var/www/html/zabbix/conf/zabbix.conf.php
+# touch /var/www/html/zabbix/conf/zabbix.conf.php
+# chmod 644 /var/www/html/zabbix/conf/zabbix.conf.php
+#
+
+# Rewrite php.ini through a scratch file: set a default timezone and raise
+# max_execution_time from 30 to 300.
+TMP_FILE=`mktemp $TMPDIR/zbxtmpXXXXXX`
+sed -e "s#;date.timezone =#date.timezone = 'UTC'#g" \
+ -e "s#max_execution_time = 30 #max_execution_time = 300 #g" \
+ %{_sysconfdir}/php.ini > $TMP_FILE
+cat $TMP_FILE > %{_sysconfdir}/php.ini
+# remove the scratch file, as the other scriptlets in this spec do
+rm -f $TMP_FILE
+
+
%postun
rm -f %{zabbix_piddir}/zabbix_server.pid
rm -f %{zabbix_logdir}/zabbix_server.log