From: Stephen Soltesz Date: Thu, 13 Nov 2008 21:40:23 +0000 (+0000) Subject: Added init scripts for monitor-server and -client. X-Git-Tag: Monitor-2.0-0~63 X-Git-Url: http://git.onelab.eu/?p=monitor.git;a=commitdiff_plain;h=03f339edc8521e36a12bfc1cdfd1b1a6ea3ab2d8 Added init scripts for monitor-server and -client. added tests directory. updated zabbix.spec --- diff --git a/mailmonitor.py b/mailmonitor.py index c9c1750..3257d63 100644 --- a/mailmonitor.py +++ b/mailmonitor.py @@ -30,72 +30,11 @@ def reboot(hostname): if len(l_nodes) == 0: raise Exception("Host removed via blacklist: %s" % hostname) - #ad_dbTickets = database.if_cached_else_refresh(True, False, "ad_dbTickets", lambda : []) - #if ad_dbTickets == None: - # raise Exception("Could not find cached dbTickets") - - #print "starting new thing" mon = MonitorMergeDiagnoseSendEscellate(hostname, True) mon.run() - #print "merge" - #merge = Merge( [node['hostname'] for node in l_nodes]) - #record_list = merge.run() - ##print "rt" - #rt = RT(record_list, ad_dbTickets, l_ticket_blacklist) - #record_list = rt.run() - ##print "diagnose" - #diag = Diagnose(record_list) - #diagnose_out = diag.run() - #print diagnose_out - #print "action" - #action = Action(diagnose_out) - #action.run() - return True -#def reboot2(hostname): -# l_nodes = api.GetNodes(hostname) -# if len(l_nodes) == 0: -# raise Exception("No such host: %s" % hostname) -# -# l_blacklist = database.if_cached_else(1, "l_blacklist", lambda : []) -# l_ticket_blacklist = database.if_cached_else(1,"l_ticket_blacklist",lambda : []) -# -# l_nodes = filter(lambda x : not x['hostname'] in l_blacklist, l_nodes) -# if len(l_nodes) == 0: -# raise Exception("Host removed via blacklist: %s" % hostname) -# -# ad_dbTickets = database.if_cached_else_refresh(True, False, "ad_dbTickets", lambda : None) -# if ad_dbTickets == None: -# raise Exception("Could not find cached dbTickets") -# -# -# args = {} -# args['hostname'] = "%s" % hostname -# args['hostname_list'] = "%s" % hostname -# args['loginbase'] = plc.siteId(hostname) -# -# m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname, -# mailtxt.newalphacd_one[1] % args, True, db='bootcd_persistmessages') -# -# #print "merge" -# merge = Merge( [node['hostname'] for node in l_nodes]) -# record_list = merge.run() -# #print "rt" -# rt = RT(record_list, ad_dbTickets, l_ticket_blacklist) -# record_list = rt.run() -# #print "diagnose" -# diag = Diagnose(record_list) -# diagnose_out = diag.run() -# #print diagnose_out -# #print "action" -# action = Action(diagnose_out) -# action.run() -# -# return True - - def main(): for host in sys.argv[1:]: reboot(host) diff --git a/monitor.init b/monitor-client.init similarity index 100% rename from monitor.init rename to monitor-client.init diff --git a/monitor-server.init b/monitor-server.init new file mode 100644 index 0000000..451b65b --- /dev/null +++ b/monitor-server.init @@ -0,0 +1,161 @@ +#!/bin/bash +# +# priority: 850 +# +# Manage settings for the Zabbix installtion and +# other monitor-related things +# +# Stephen Soltesz +# Copyright (C) 2008 The Trustees of Princeton University +# +# $Id$ +# + +# Source function library and configuration +. /etc/plc.d/functions +. /etc/planetlab/plc_config +local_config=/etc/planetlab/configs/site.xml + +# Be verbose +set -x + +# Default locations +PGDATA=/var/lib/pgsql/data +postgresql_conf=$PGDATA/postgresql.conf +pghba_conf=$PGDATA/pg_hba.conf + +# Export so that we do not have to specify -p to psql invocations +export PGPORT=$PLC_DB_PORT + + +# TODO: +## setup database +### import schema & data +### tweak values +### add zabbix to pg_hba.conf +## setup zabbix.conf.php +## +DB_USER="zabbixuser" +DB_NAME="zabbix" + +function check_user_and_db() +{ + # confirm user is present or create it + user_present=$( psql -U postgres -c "select * from pg_user;" -d template1 | grep $ZABBIX_DB_NAME ) + if [ -z $user_present ] ; then + createuser --no-superuser --no-createdb --no-createrole --login --unencrypted --echo $ZABBIX_DB_NAME -U postgres + fi + + # confirm database is present or create it + db_present=$( psql -U postgres -c "select * from pg_database;" -d template1 | grep $ZABBIX_DB_NAME ) + if [ -z $db_present ] ; then + createdb --owner=$ZABBIX_DB_NAME $ZABBIX_DB_NAME -U postgres + fi + + # Create/update the unprivileged database user and password + if [ -z "$PLC_MONITOR_DBPASSWORD" ] ; then + PLC_MONITOR_DBPASSWORD=$(uuidgen) + plc-config --category=plc_monitor --variable=dbpassword --value="$PLC_MONITOR_DBPASSWORD" --save=$local_config $local_config + psql -d template1 -U postgres -c "ALTER USER $ZABBIX_DB_NAME WITH PASSWORD '$PLC_MONITOR_DBPASSWORD';" + service plc reload + fi +} + +function if_present_load () +{ + file=$1 + if [ -f $file ] ; then + psql -d $ZABBIX_DB_NAME -U $ZABBIX_DB_USER < $file + fi +} + +function check_schema_and_data() +{ + schema_present=$( psql -U $ZABBIX_DB_USER $ZABBIX_DB_NAME -c "\d;" < /dev/null | grep hosts ) + if [ -z $schema_present ] ; then + if_present_load "/usr/local/zabbix/misc/create/schema/postgresql.sql" + if_present_load "/usr/local/zabbix/misc/create/data/data.sql" + if_present_load "/usr/local/zabbix/misc/create/data/images_pgsql.sql" + ## TODO: update ZABBIX Server entry, "update hosts set status=0, host='MyPLC Server' where hostid=10017" + fi +} + + +case "$1" in + start) + if [ "$PLC_MONITOR_ENABLED" != "1" ] ; then + exit 0 + fi + MESSAGE=$"Bootstrap Monitoring" + dialog "$MESSAGE" + + check_user_and_db + check_schema_and_data + + mkdir -p /var/lib/pgsql/data/pg_hba.conf.d + ZABCONF=/var/lib/pgsql/data/pg_hba.conf.d/zabbix.conf + if [ ! -f $ZABCONF ] ; then + echo "host $ZABBIX_DB_NAME $ZABBIX_DB_USER 127.0.0.1/32 password" > $ZAB + echo "host $ZABBIX_DB_NAME $ZABBIX_DB_USER $PLC_MONITOR_IP/32 password" >> $ZAB + fi + + # UPDATE /etc/zabbix/*.conf + ZABBIXCFG=/etc/zabbix + TMP_FILE=`mktemp /tmp/zbxtmpXXXXXX` + # TODO: How to know if I need to restart the services? + + if [ -f ${ZABBIXCFG}/zabbix_server.conf ] ; then + sed -e "s/#DBHost=.*/DBHost=$PLC_MONITOR_HOST/g" \ + -e "s#DBName=.*#DBName=$ZABBIX_DB_NAME#g" \ + -e "s#DBUser=.*#DBUser=$ZABBIX_DB_USER#g" \ + -e "s#DBPassword=.*#$PLC_MONITOR_DBPASSWORD#g" \ + ${ZABBIXCFG}/zabbix_server.conf > $TMP_FILE + cat $TMP_FILE > ${ZABBIXCFG}/zabbix_server.conf + fi + if [ -f ${ZABBIXCFG}/zabbix_agentd.conf ] ; then + HOST=`hostname` + sed -e "s#Server=.*#Server=$PLC_MONITOR_HOST#g" \ + -e "s#Hostname=.*#Hostname=$HOST#g" \ + ${ZABBIXCFG}/zabbix_agentd.conf > $TMP_FILE + cat $TMP_FILE > ${ZABBIXCFG}/zabbix_agentd.conf + fi + service zabbix_server start + service zabbix_agentd start + + # SETUP zabbix gui configuration + ZABBIX_WEB_CFG=/var/www/html/zabbix/conf/zabbix.conf.php + if [ ! -f $ZABBIX_WEB_CFG ] ; then + touch $ZABBIX_WEB_CFG + cat < $ZABBIX_WEB_CFG + +EOF + chmod 644 $ZABBIX_WEB_CFG + fi + + result "$MESSAGE" + ;; + + stop) + MESSAGE=$"Stopping Monitor" + dialog "$MESSAGE" + + service zabbix_server stop + service zabbix_agentd stop + # TODO: is there anything to stop? + result "$MESSAGE" + ;; +esac + +exit $ERRORS diff --git a/bwlimit.py b/tests/bwlimit.py similarity index 100% rename from bwlimit.py rename to tests/bwlimit.py diff --git a/tests/template.py b/tests/template.py new file mode 100644 index 0000000..2e9150b --- /dev/null +++ b/tests/template.py @@ -0,0 +1,77 @@ +#!/usr/bin/python + + +import emailTxt +class Type: + def __init__(self, value): + self.value = value + def name(self): + return self.__class__.__name__ + +class Is(Type): pass +class Match(Type): pass +class ListMatch(Type): pass +class FilledIn(Type): pass +class PortOpen(Type): pass +class NodesUp(Type): pass + +# a failed constraint leads to a message-escelation process. +# so, define a constraint, which defines the set of nodes it operates on, the +# message to send if it fails, and maybe a thank you message when it's +# satisfied (if previously failed.). +standardnode = { + 'membership' : [ { 'plcnode/nodegroups' : Match('.*') } ], + 'site' : [ { 'nodes' : NodesUp(2), } ], + 'node' : { 'constraint' : + [ { 'state' : Match('BOOT'), + 'kernel' : Match('2.6.22.19-vs2.3.0.34'), } ], + 'failed_message' : [ emailTxt.mailtxt.newdown ], + 'resolved_message' : [ emailTxt.mailtxt.newthankyou ], + }, + 'pcu' : { + 'constraint' : [ { 'hostname' : FilledIn(True), + 'password' : FilledIn(True), }, + { 'ip' : FilledIn(True), + 'password' : FilledIn(True), }, + ], + 'failed_message' : [ emailTxt.mailtxt.pcudown ], + 'resolved_message' : [ emailTxt.mailtxt.pcuthankyou ], + }, +} + +dc7800 = { + # if membership constraint it true, then apply the other constraints. + 'membership' : [ { 'plcnode/nodegroups' : ListMatch('DC7800Deployment'), } ], + + 'pcu' : { 'constraint' : [ { 'hostname' : FilledIn(True), + 'ip' : FilledIn(True), + 'password' : FilledIn(True), + 'model' : Match('AMT'), + 'username' : Match('admin'), + 'portstatus': PortOpen(16992), + 'reboot' : Is(0), + }, + { 'hostname' : FilledIn(True), + 'ip' : FilledIn(True), + 'password' : FilledIn(True), + 'reboot' : Is(0), + #'valid' : Is(True), + },], + 'failed_message' : [ emailTxt.mailtxt.donation_nopcu], + 'resolved_message' : [ emailTxt.mailtxt.pcuthankyou ], + }, + 'node' : { 'constraint' : + [ { 'state' : Match('BOOT'), + 'kernel' : Match('2.6.22.19-vs2.3.0.34'), } ], + 'failed_message' : [ emailTxt.mailtxt.donation_down], + 'resolved_message' : [ emailTxt.mailtxt.newthankyou ], + }, +} + +# +# data source, { constraints ... value } +# action on failure of constraint +# information about why it failed +# # stop action if constraint is satisfied at a later time. +# kind of like asynchronous constraint solving. +# or stored procedures. diff --git a/tests/test.py b/tests/test.py new file mode 100644 index 0000000..3a83545 --- /dev/null +++ b/tests/test.py @@ -0,0 +1,98 @@ +#!/usr/bin/python + +from monitor import database +import os +import time +from unified_model import * + +today = time.time() +four_days_ago = today - 60*60*24*4 +eight_days_ago = today - 60*60*24*8 + +def reset_time(hostname, new_time): + # update act_all entry + act_all = database.dbLoad("act_all") + act_all[hostname][0]['time'] = new_time + database.dbDump("act_all", act_all) + # update message timer. + m = PersistMessage(hostname, "d", "e", True, db='monitor_persistmessages') + m.actiontracker.time = new_time + m.save() + +def get_record(hostname): + act_all = database.dbLoad("act_all") + rec = act_all[hostname][0] + return rec + + +def bring_node_down(hostname): + fb = database.dbLoad("findbad") + fb['nodes'][hostname]['values'].update({'category' : 'ERROR', + 'kernel' : '', + 'state' : 'DOWN', 'ssh' : 'NOSSH'}) + database.dbDump("findbad", fb) + # update message timer. + m = PersistMessage(hostname, "d", "e", True, db='monitor_persistmessages') + m.actiontracker.time = time.time() - 60*60*24*4 + m.save() + +def bring_node_up(hostname): + fb = database.dbLoad("findbad") + fb['nodes'][hostname]['values'].update({'category' : 'ALPHA', + 'kernel' : 'a b 2.6.22.19-vs2.3.0.34.24.planetlab', + 'state' : 'BOOT', 'ssh' : 'SSH'}) + database.dbDump("findbad", fb) + # update message timer. + m = PersistMessage(hostname, "d", "e", True, db='monitor_persistmessages') + m.actiontracker.time = time.time() - 60*60*24*4 + m.save() + +#bring_node_down('fakenode.cs.princeton.edu') + +node_end_record('fakenode.cs.princeton.edu') +node_end_record('eggplant.cs.princeton.edu') +bring_node_down('eggplant.cs.princeton.edu') +bring_node_down('fakenode.cs.princeton.edu') +os.system("./nodebad.py --increment --site monitorsite") +os.system("./sitebad.py --increment --site monitorsite") + +# week one +# initial +os.system("./grouprins.py --force --reboot --mail=1 \ + --nodeselect 'hostname=(eggplant|fakenode).cs.princeton.edu&&state=DOWN' \ + --stopselect 'state=BOOT&&kernel=2.6.22.19-vs2.3.0.34.24.planetlab'") +reset_time('eggplant.cs.princeton.edu', four_days_ago) +reset_time('fakenode.cs.princeton.edu', four_days_ago) +# second +os.system("./grouprins.py --force --reboot --mail=1 \ + --nodeselect 'hostname=(eggplant|fakenode).cs.princeton.edu&&state=DOWN' \ + --stopselect 'state=BOOT&&kernel=2.6.22.19-vs2.3.0.34.24.planetlab'") +reset_time('eggplant.cs.princeton.edu', eight_days_ago) +reset_time('fakenode.cs.princeton.edu', eight_days_ago) +# week two +# transition. +os.system("./grouprins.py --force --reboot --mail=1 \ + --nodeselect 'hostname=(eggplant|fakenode).cs.princeton.edu&&state=DOWN' \ + --stopselect 'state=BOOT&&kernel=2.6.22.19-vs2.3.0.34.24.planetlab'") +reset_time('eggplant.cs.princeton.edu', four_days_ago) +reset_time('fakenode.cs.princeton.edu', four_days_ago) +# second for week two +os.system("./grouprins.py --force --reboot --mail=1 \ + --nodeselect 'hostname=(eggplant|fakenode).cs.princeton.edu&&state=DOWN' \ + --stopselect 'state=BOOT&&kernel=2.6.22.19-vs2.3.0.34.24.planetlab'") +reset_time('eggplant.cs.princeton.edu', eight_days_ago) +reset_time('fakenode.cs.princeton.edu', eight_days_ago) +# week three + # transition +os.system("./grouprins.py --force --reboot --mail=1 \ + --nodeselect 'hostname=(eggplant|fakenode).cs.princeton.edu&&state=DOWN' \ + --stopselect 'state=BOOT&&kernel=2.6.22.19-vs2.3.0.34.24.planetlab'") + +# node is up. +bring_node_up("eggplant.cs.princeton.edu") +bring_node_up("fakenode.cs.princeton.edu") +os.system("./nodebad.py --increment --site monitorsite") +os.system("./sitebad.py --increment --site monitorsite") + +os.system("./grouprins.py --force --reboot --mail=1 --nodeselect 'hostname=eggplant.cs.princeton.edu&&state=BOOT' --stopselect 'state=BOOT&&kernel=2.6.22.19-vs2.3.0.34.24.planetlab'") + diff --git a/tests/testtemplate.py b/tests/testtemplate.py new file mode 100755 index 0000000..1d45da3 --- /dev/null +++ b/tests/testtemplate.py @@ -0,0 +1,60 @@ +#!/usr/bin/python + +from monitor.database import FindbadNodeRecord, FindbadPCURecord +from template import * +from nodequery import * +from monitor import util +import sys + + +dc7800list = util.file.getListFromFile("dc7800.txt") + +# get node info +# if membership satisfied +# get pcu info +# verify pcu constraint +# verify node constraint + +fbquery = FindbadNodeRecord.get_all_latest() +for noderec in fbquery: + fbinfo = noderec.to_dict() + member = verifyType(dc7800['membership'], fbinfo) + if not member: continue + + if pcu_in(fbinfo): + fbpcuinfo = FindbadPCURecord.get_latest_by(plc_pcuid=fbinfo['plc_node_stats']['pcu_ids'][0]).to_dict() + else: + fbpcuinfo = None + fbinfo['pcuinfo'] = fbpcuinfo + + pcuok = verifyType(dc7800['pcu']['constraint'], fbpcuinfo) + nodeok = verifyType(dc7800['node']['constraint'], fbinfo) + print "pcuok : ", pcuok, " nodeok: ", nodeok , " ", hostname + continue + sys.exit(1) + + if not pcuok and not nodeok: + # donation_down_one + pass + elif not pcuok and nodeok: + # donation_nopcu_one + pass + elif pcuok and not nodeok: + # reboot + pass + elif pcuok and nodeok: + # noop + pass + + if pcuok: + print "PCU-OK ", + else: + print "PCU-BAD", + if nodeok: + print "NODE-OK ", + else: + print "NODE-BAD", + print " for %-45s" % hostname + + + diff --git a/zabbix.spec b/zabbix.spec index 8d7aec5..665f3f2 100644 --- a/zabbix.spec +++ b/zabbix.spec @@ -28,8 +28,8 @@ Buildroot: %{_tmppath}/%{name}-%{version}-%{release}-buildroot #%define zabbix_piddir %{_tmppath} #%define zabbix_logdir %{_tmppath} -%define zabbix_piddir /var/run -%define zabbix_logdir /var/log +%define zabbix_piddir /var/tmp +%define zabbix_logdir /var/tmp %description The ZABBIX server is a network monitor @@ -171,7 +171,7 @@ sed -e "s#BASEDIR=/opt/zabbix#BASEDIR=%{_prefix}#g" \ %{zabbix_initdir}/zabbix_agentd > $TMP_FILE cat $TMP_FILE > %{zabbix_initdir}/zabbix_agentd # TODO: copy to /etc/init.d/ -cp %{zabbix_initdir}/zabbix_agentd /etc/init.d +cp %{zabbix_initdir}/zabbix_agentd %{_initrddir} rm -f $TMP_FILE @@ -192,27 +192,41 @@ fi # configure ZABBIX server daemon TMP_FILE=`mktemp $TMPDIR/zbxtmpXXXXXX` -# SETUP DBHost, DBName, DBUser, DBPassword -#SERVER=`grep PLC_MONITOR_HOST /etc/planetlab/plc_config | tr "'" ' ' | awk '{print $2}'` - sed -e "s#AlertScriptsPath=/home/zabbix/bin/#AlertScriptsPath=%{zabbix_bindir}/#g" \ -e "s#PidFile=/var/tmp/zabbix_server.pid#PidFile=%{zabbix_piddir}/zabbix_server.pid#g" \ -e "s#LogFile=/tmp/zabbix_server.log#LogFile=%{zabbix_logdir}/zabbix_server.log#g" \ %{zabbix_confdir}/zabbix_server.conf > $TMP_FILE cat $TMP_FILE > %{zabbix_confdir}/zabbix_server.conf -mkdir -p /etc/zabbix -cp %{zabbix_confdir}/zabbix_server.conf /etc/zabbix/ +mkdir -p %{_sysconfdir}/zabbix +cp %{zabbix_confdir}/zabbix_server.conf %{_sysconfdir}/zabbix/ sed -e "s#BASEDIR=/opt/zabbix#BASEDIR=%{_prefix}#g" \ -e "s#PIDFILE=/var/tmp/zabbix_server.pid#PIDFILE=%{zabbix_piddir}/zabbix_server.pid#g" \ %{zabbix_initdir}/zabbix_server > $TMP_FILE cat $TMP_FILE > %{zabbix_initdir}/zabbix_server -cp %{zabbix_initdir}/zabbix_server /etc/init.d +cp %{zabbix_initdir}/zabbix_server %{_initrddir} rm -f $TMP_FILE chkconfig zabbix_server on +%post gui +# Setup the necessary values in /etc/php.ini +# NOTE: Zabbix requires max_execution_time to be 300 seconds +# NOTE: Zabbix requires a default date.timezone + +# also edit /var/www/html/zabbix/conf/zabbix.conf.php +# touch /var/www/html/zabbix/conf/zabbix.conf.php +# chmod 644 /var/www/html/zabbix/conf/zabbix.conf.php +# + +TMP_FILE=`mktemp $TMPDIR/zbxtmpXXXXXX` +sed -e "s#;date.timezone =#date.timezone = 'UTC'#g" \ + -e "s#max_execution_time = 30 #max_execution_time = 300 #g" \ + %{_sysconfdir}/php.ini > $TMP_FILE +cat $TMP_FILE > %{_sysconfdir}/php.ini + + %postun rm -f %{zabbix_piddir}/zabbix_server.pid rm -f %{zabbix_logdir}/zabbix_server.log