MONITOR_SCRIPT_ROOT=/usr/share/monitor-server
MONITOR_DATA_ROOT=/var/lib/monitor-server
MONITOR_ARCHIVE_ROOT=/var/lib/monitor-server/archive-pdb
+MONITOR_IP=
+MONITOR_HOSTNAME=
email=
+# all messages will appear to be from this address
+from_email=monitor@planet-lab.org
+
+# a separate address for support messages
+support_email=support@planet-lab.org
+
+# mailing list copied on all out-going messages
+cc_email=monitor-list@lists.planet-lab.org
+
+[monitordatabase]
+monitor_dburi=postgres://user:passwd@localhost:5432/infovacuum
+zabbix_dburi=postgres://zabbixuser:<...>@localhost:5432/zabbix
+
+cachetime=60
[commandline]
+echo=False
debug=0
mail=1
bcc=0
-#import pkg_resources
-#pkg_resources.require("SQLAlchemy>=0.4.9")
import sqlalchemy
import elixir
import monitor.config as config
-#elixir.metadata.bind = sqlalchemy.create_engine(config.databaseuri, echo=False)
-#elixir.session = sqlalchemy.orm.scoped_session(sqlalchemy.orm.sessionmaker(autoflush=True,autocommit=True))
-#infovacuum_db = sqlalchemy.MetaData(bind=sqlalchemy.create_engine(config.monitor_dburi, echo=False))
-#infovacuum_session = sqlalchemy.orm.scoped_session(sqlalchemy.orm.sessionmaker(autoflush=True,autocommit=True))
+mon_metadata = sqlalchemy.MetaData()
+mon_metadata.bind = sqlalchemy.create_engine(config.monitor_dburi, echo=config.echo)
+mon_session = sqlalchemy.orm.scoped_session(sqlalchemy.orm.sessionmaker(autoflush=False,autocommit=True))
-zabbix_engine = sqlalchemy.create_engine(config.zabbix_dburi, echo=config.echo)
-metadata = sqlalchemy.MetaData()
-metadata.bind = zabbix_engine
-session = sqlalchemy.orm.scoped_session(sqlalchemy.orm.sessionmaker(autoflush=False,autocommit=True))
-elixir.session, elixir.metadata = session, metadata
+zab_metadata = sqlalchemy.MetaData()
+zab_metadata.bind = sqlalchemy.create_engine(config.zabbix_dburi, echo=config.echo)
+zab_session = sqlalchemy.orm.scoped_session(sqlalchemy.orm.sessionmaker(autoflush=False,autocommit=True))
#from monitor.database.infovacuum.model import *
-from monitor.database.zabbixapi.model import *
+#from monitor.database.zabbixapi.model import *
--- /dev/null
+#__connection__ = hub = PackageHub('infovacuum')
+try:
+ import pkg_resources
+ pkg_resources.require("SQLAlchemy>=0.4.9")
+ pkg_resources.require("Elixir>=0.4.0")
+ ## NOTE!!!!!!
+ # with this line enabled, other models cannot import this file.
+ # it results in the wrong metadata value being loaded, I think.
+ #from turbogears.database import metadata, mapper
+
+ #import pkg_resources
+ #pkg_resources.require("SQLObject>=0.7.1")
+ #from turbogears.database import PackageHub, AutoConnectHub
+ #from turbogears import config
+ #uri = config.get("sqlobject.dburi")
+ #from sqlobject import connectionForURI,sqlhub
+ #sqlhub.processConnection = connectionForURI(uri)
+
+except:
+ # NOTE: this try, will allow external modules to import the model without
+ # requring the turbogears garbage.
+ import traceback
+ print traceback.print_exc()
+ pass
+
+from elixir import Entity, Field, OneToMany, ManyToOne, ManyToMany
+from elixir import options_defaults, using_options, setup_all
+from elixir import String, Unicode, Integer, DateTime
+options_defaults['autosetup'] = False
+
+from monitor.database.dborm import mon_metadata, mon_session
+__metadata__ = mon_metadata
+__session__ = mon_session
+
+# Return the first entity matching **kwargs; if none exists, create one and
+# apply the extra field values in if_new_set to the newly created instance
+# only (existing records are returned untouched).
+# NOTE(review): mutable default argument if_new_set={} is shared across calls;
+# safe only as long as callers never mutate it.
+# assumes Elixir's Entity.set(**kw) assigns attribute values -- TODO confirm.
+def findby_or_create(cls, if_new_set={}, **kwargs):
+ result = cls.get_by(**kwargs)
+ if not result:
+ result = cls(**kwargs)
+ result.set(**if_new_set)
+ return result
+# Monkey-patch the helper onto every Elixir Entity as a classmethod.
+Entity.findby_or_create = classmethod(findby_or_create)
+
+from monitor.database.infovacuum.model.actionrecord import *
+from monitor.database.infovacuum.model.findbadrecord import *
+from monitor.database.infovacuum.model.historyrecord import *
+setup_all()
--- /dev/null
+from elixir import Entity, Field, OneToMany, ManyToOne, ManyToMany
+from elixir import options_defaults, using_options, setup_all, has_one
+from elixir import String, Integer, DateTime, PickleType, Boolean
+from datetime import datetime,timedelta
+import elixir
+import traceback
+
+from monitor.database.dborm import mon_metadata, mon_session
+__metadata__ = mon_metadata
+__session__ = mon_session
+
+class IssueType(Entity):
+ # Catalog of issue categories, linked many-to-many to IssueRecord.
+ shortname = Field(String, default=None)
+ description = Field(String, default=None)
+ issue_record = ManyToMany('IssueRecord')
+
+class IssueRecord(Entity):
+ # One tracked problem against a host and/or site, with its lifecycle dates,
+ # RT ticket linkage, and the escalation state of the notification series.
+ date_created = Field(DateTime,default=datetime.now)
+ date_last_updated = Field(DateTime,default=datetime.now)
+ date_action_taken = Field(DateTime,default=datetime.now)
+
+ hostname = Field(String,default=None)
+ loginbase = Field(String)
+
+ # RT ticket id and the pickled ticket data, if a ticket was opened.
+ ticket_id = Field(Integer, default=0)
+ rt = Field(PickleType, default=None)
+
+ # open, paused, closed
+ status = Field(String, default="open")
+
+ take_action = Field(Boolean, default=False)
+ send_email = Field(Boolean, default=True)
+
+ # Which message template series applies, how far along it we are, and the
+ # penalty currently applied.
+ message_series = Field(String, default="nodedown")
+ message_index = Field(Integer, default=0)
+ penalty_level = Field(Integer, default=0)
+
+ issue_type = ManyToMany('IssueType')
+ actions = OneToMany('ActionRecord', order_by='-date_created')
+
+
+class ActionRecord(Entity):
+ # A single automated action (email, penalty, ...) taken for an IssueRecord,
+ # together with the findbad snapshots that justified it.
+ @classmethod
+ def get_latest_by(cls, **kwargs):
+ # Most recent matching action, newest id first.
+ # TODO: need to sort on 'round' since actions will not be globally sync'd.
+ return cls.query.filter_by(**kwargs).order_by(ActionRecord.id.desc()).first()
+
+# ACCOUNTING
+ date_created = Field(DateTime,default=datetime.now)
+ hostname = Field(String,default=None)
+ loginbase = Field(String)
+
+ issue = ManyToOne('IssueRecord')
+ # NOTE: this is the parent relation to fb records. first create the
+ # action record, then append to this value all of the findbad records we
+ # want to have in our set.
+ # Model:
+ # - create action record
+ # - find fbnode records
+ # - append fbnode records to action record
+ # OR
+ # - find fbnode records
+ # - create action record with fbnodes as argument
+ findbad_records = OneToMany('FindbadNodeRecord', order_by='-date_checked')
+
+ # NOTE: can I move 'message_index, escellation_level, and penalty_level'
+ # into the same value? Maybe not penalty level, since there are only two;
+ # and, there may be additional message and escalation levels.
+ send_email_to = Field(PickleType, default=None)
+ action_description = Field(PickleType, default=None)
+ message_arguments = Field(PickleType, default=None)
+
+ # NOTE: not sure this needs to be in the db.
+ escellation_level = Field(Integer, default=0)
+ stage = Field(String, default=None)
--- /dev/null
+from elixir import Entity, Field, OneToMany, ManyToOne, ManyToMany
+from elixir import options_defaults, using_options, setup_all, belongs_to
+from elixir import String, Integer as Int, DateTime, PickleType, Boolean
+from datetime import datetime,timedelta
+import elixir
+import traceback
+
+from monitor.database.dborm import mon_metadata, mon_session
+__metadata__ = mon_metadata
+__session__ = mon_session
+
+
+class FindbadNodeRecordSync(Entity):
+ # Last completed scan round per hostname; the sentinel row with
+ # hostname="global" records the latest globally completed round.
+ hostname = Field(String(250),primary_key=True) #,alternateMethodName='by_hostname')
+ round = Field(Int,default=0)
+
+class FindbadPCURecordSync(Entity):
+ # Last completed scan round per PCU id.
+ plc_pcuid = Field(Int,primary_key=True) #,alternateMethodName='by_pcuid')
+ round = Field(Int,default=0)
+
+class FindbadNodeRecord(Entity):
+ @classmethod
+ def get_all_latest(cls):
+ fbsync = FindbadNodeRecordSync.get_by(hostname="global")
+ return cls.query.filter_by(round=fbsync.round)
+
+ @classmethod
+ def get_latest_by(cls, **kwargs):
+ fbsync = FindbadNodeRecordSync.get_by(hostname="global")
+ kwargs['round'] = fbsync.round
+ return cls.query.filter_by(**kwargs)
+
+ @classmethod
+ def get_latest_n_by(cls, n=3, **kwargs):
+ fbsync = FindbadNodeRecordSync.get_by(hostname="global")
+ kwargs['round'] = fbsync.round
+ ret = []
+ for i in range(0,n):
+ kwargs['round'] = kwargs['round'] - i
+ f = cls.query.filter_by(**kwargs).first()
+ if f:
+ ret.append(f)
+ return ret
+
+# ACCOUNTING
+ date_checked = Field(DateTime,default=datetime.now)
+ round = Field(Int,default=0)
+ hostname = Field(String,default=None)
+ loginbase = Field(String)
+
+# INTERNAL
+ kernel_version = Field(String,default=None)
+ bootcd_version = Field(String,default=None)
+ nm_status = Field(String,default=None)
+ fs_status = Field(String,default=None)
+ dns_status = Field(String,default=None)
+ princeton_comon_dir = Field(Boolean,default=False)
+ princeton_comon_running = Field(Boolean,default=False)
+ princeton_comon_procs = Field(Int,default=None)
+
+# EXTERNAL
+ plc_node_stats = Field(PickleType,default=None)
+ plc_site_stats = Field(PickleType,default=None)
+ plc_pcuid = Field(Int,default=None)
+ comon_stats = Field(PickleType,default=None)
+ port_status = Field(PickleType,default=None)
+ ssh_portused = Field(Int,default=22)
+ ssh_status = Field(Boolean,default=False)
+ ssh_error = Field(String,default=None) # set if ssh_access == False
+ ping_status = Field(Boolean,default=False)
+
+# INFERRED
+ observed_category = Field(String,default=None)
+ observed_status = Field(String,default=None)
+
+ # NOTE: this is the child relation
+ action = ManyToOne('ActionRecord', required=False)
+
+class FindbadPCURecord(Entity):
+ # Snapshot of a PCU's observed state for one monitoring round.
+ @classmethod
+ def get_all_latest(cls):
+ # NOTE(review): cls.get_by(hostname=...) looks wrong -- FindbadPCURecord
+ # defines no hostname field; this was probably meant to query
+ # FindbadPCURecordSync (cf. FindbadNodeRecord.get_all_latest). Confirm.
+ fbsync = cls.get_by(hostname="global")
+ return cls.query.filter_by(round=fbsync.round)
+
+ @classmethod
+ def get_latest_by(cls, **kwargs):
+ # NOTE(review): same suspect cls.get_by(hostname=...) lookup as in
+ # get_all_latest above.
+ fbsync = cls.get_by(hostname="global")
+ kwargs['round'] = fbsync.round
+ return cls.query.filter_by(**kwargs)
+# ACCOUNTING
+ date_checked = Field(DateTime)
+ round = Field(Int,default=0)
+ plc_pcuid = Field(Int) #alternateID=True,alternateMethodName='by_pcuid')
+
+# EXTERNAL
+ plc_pcu_stats = Field(PickleType,default=None)
+ dns_status = Field(String)
+ port_status = Field(PickleType)
+ entry_complete = Field(String)
+
+# INTERNAL
+# INFERRED
+ reboot_trial_status = Field(String)
--- /dev/null
+from elixir import Entity, Field, OneToMany, ManyToOne, ManyToMany
+from elixir import options_defaults, using_options, setup_all
+from elixir import String, Integer as Int, DateTime, Boolean
+from datetime import datetime,timedelta
+
+from monitor.database.dborm import mon_metadata, mon_session
+__metadata__ = mon_metadata
+__session__ = mon_session
+
+# your data model
+class HistoryNodeRecord(Entity):
+ # Long-lived per-host status history: one row per hostname, updated as the
+ # host is re-checked or changes state.
+ hostname = Field(String(250),primary_key=True)
+ last_checked = Field(DateTime,default=datetime.now)
+ last_changed = Field(DateTime,default=datetime.now)
+ status = Field(String,default="unknown")
+
+ @classmethod
+ def by_hostname(cls, hostname):
+ # Convenience lookup; returns the record or None.
+ return cls.query.filter_by(hostname=hostname).first()
+
+class HistoryPCURecord(Entity):
+ plc_pcuid = Field(Int,primary_key=True)
+
+ last_checked = Field(DateTime,default=datetime.now)
+ last_changed = Field(DateTime,default=datetime.now)
+ status = Field(String,default="unknown")
+
+ last_valid = Field(DateTime,default=None)
+ valid = Field(String,default="unknown")
+
+ @classmethod
+ def by_pcuid(cls, pcuid):
+ return cls.query.filter_by(pcuid=pcuid).first()
+
+class HistorySiteRecord(Entity):
+ # Long-lived per-site status history: one row per loginbase with aggregate
+ # node/slice counts.
+ loginbase = Field(String(250),primary_key=True)
+
+ last_checked = Field(DateTime,default=datetime.now)
+ last_changed = Field(DateTime,default=datetime.now)
+
+ nodes_total = Field(Int,default=0)
+ nodes_up = Field(Int,default=0)
+ slices_used = Field(Int,default=0)
+
+ status = Field(String,default="unknown")
+
+ @classmethod
+ def by_loginbase(cls, loginbase):
+ # Convenience lookup; returns the record or None.
+ return cls.query.filter_by(loginbase=loginbase).first()
+
--- /dev/null
+from actionrecord import *
+from findbadrecord import *
+from historyrecord import *
If the machine has booted successfully, you may check directly by logging in with your site_admin account, and running:
- sudo /usr/sbin/vps ax
+ sudo /usr/sbin/vps ax
If you have a BootCD older than 3.0, you will need to create a new BootImage on CD or USB. You can find instructions for this at the Technical Contact's Guide:
- https://www.planet-lab.org/doc/guides/bootcdsetup
+ https://www.planet-lab.org/doc/guides/bootcdsetup
If after following these directions, you are still experiencing problems, then you can acknowledge this notice by visiting, and letting us know what the problem is at mailto:%(support_email)s
- http://%(hostname)s/zabbix/acknow.php?eventid={EVENT.ID}
+ http://%(hostname)s/zabbix/acknow.php?eventid={EVENT.ID}
http://%(hostname)s/zabbix/tr_events.php?triggerid={TRIGGER.ID}&eventid={EVENT.ID}
Thank you for your help,
- -- PlanetLab Central (%(support_email)s)
- """
+ -- PlanetLab Central (%(support_email)s)
+"""
nodedown_four_subject="Server {HOSTNAME} is unreachable: Waiting Forever"
nodedown_four="""
Hello,
You can acknowledge this notice by visiting the link below or by letting us know what the problem is by replying to this message.
- http://%(hostname)s/zabbix/acknow.php?eventid={EVENT.ID}
+ http://%(hostname)s/zabbix/acknow.php?eventid={EVENT.ID}
http://%(hostname)s/zabbix/tr_events.php?triggerid={TRIGGER.ID}&eventid={EVENT.ID}
Thank you for your help,
- -- PlanetLab Central (%(support_email)s)
+ -- PlanetLab Central (%(support_email)s)
"""
thankyou_nodeup = """
While monitoring your site, we noticed that the following nodes *improved* their states:
# import the basic Elixir classes and functions for declaring the data model
# (see http://elixir.ematia.de/trac/wiki/TutorialDivingIn)
from elixir import EntityMeta, Entity, Field, OneToMany, ManyToOne, ManyToMany
-from elixir import options_defaults, using_options, setup_all, metadata, entities
+from elixir import options_defaults, using_options, setup_all, entities
# import some datatypes for table columns from Elixir
# (see http://www.sqlalchemy.org/docs/04/types.html for more)
from elixir import String, Unicode, Integer, DateTime
import defines
+from monitor.database.dborm import zab_metadata, zab_session
-#from elixir import metadata
-#from monitor.database.dborm import zabbix_db, zabbix_session
-#__metadata__ = zabbix_db
-#__session__ = zabbix_session
+__metadata__ = zab_metadata
+__session__ = zab_session
# TODO:
# - declare association between Media and MediaType so that look ups can
# currently since the rights table is merely treated as an intermediate
# table for the m2m between usrgrp and groups.
-rights = Table('rights', metadata, autoload=True)
-hostsgroups = Table('hosts_groups', metadata, autoload=True)
+rights = Table('rights', __metadata__, autoload=True)
+hostsgroups = Table('hosts_groups', __metadata__, autoload=True)
# m2m table between hosts and groups below
foreign_keys=lambda: [Media.userid],
ondelete='cascade')
-users_groups = Table('users_groups', metadata, autoload=True)
+users_groups = Table('users_groups', __metadata__, autoload=True)
class User(ZabbixEntity): # parent of media
using_options(
import md5
from monitor import config
-from monitor.database.dborm import *
+from monitor.database.dborm import zab_session as session
+from monitor.database.zabbixapi.model import *
from monitor.database.zabbixapi.emailZabbix import *
from monitor.database.zabbixapi import defines
zabbixserver = Host.get_by(host="ZABBIX Server")
if zabbixserver:
print "UPDATING Primary Zabbix server entry"
- zabbixserver.host="MyPLC Server"
+ zabbixserver.host=config.MONITOR_HOSTNAME
zabbixserver.ip=config.MONITOR_IP
zabbixserver.dns=config.MONITOR_HOSTNAME
zabbixserver.useip=1
def setup_site(loginbase, techemail, piemail, iplist):
- # TODO: Initially adding this info is ok. what about updates to users,
- # additional hosts, removed users from plc,
# TODO: send a message when host is discovered.
+
# TODO: update 'discovered' hosts with dns name.
# TODO: remove old nodes that are no longer in the plcdb.
+ # TODO: remove old users that are no longer in the plcdb.
+ # TODO: consider creating two user groups for Tech & PI emails
BI_WEEKLY_ESC_PERIOD = int(60*60*24)
BI_WEEKLY_ESC_PERIOD = int(60) # testing...
# User Group
- site_user_group = UsrGrp.find_or_create(name="%s_usergroup" % loginbase)
- for user in set(techemail + piemail):
+ site_user_group = UsrGrp.find_or_create(name=USERGROUP_NAME % loginbase)
+ for user in set(techemail + piemail + [config.cc_email]):
+ if not user: continue
# USER
u = User.find_or_create(alias=user, type=1,
set_if_new={'passwd' : md5.md5(user).hexdigest()},
+ # exec_if_new avoids creating a Media object that
+ # will not actually be used, if the user already exists
exec_if_new=lambda obj: \
obj.media_list.append( Media(mediatypeid=1, sendto=user)))
# HOST GROUP
plc_host_group = HostGroup.find_or_create(name="MyPLC Hosts")
- site_host_group = HostGroup.find_or_create(name="%s_hostgroup" % loginbase)
+ site_host_group = HostGroup.find_or_create(name=HOSTGROUP_NAME % loginbase)
plctemplate = Host.get_by(host="Template_Linux_PLHost")
escalation_action_name = ESCALATION_ACTION_NAME % loginbase
discovery_action_name = DISCOVERY_ACTION_NAME % loginbase
esc_step_from=10, esc_step_to=10,
esc_period=0,
shortdata="",
- longdata="zabbixserver:/usr/share/monitor-server/checkslices.py {HOSTNAME} disablesite",
+ longdata="%s:/usr/share/monitor-server/checkslices.py {HOSTNAME} disablesite" % config.MONITOR_HOSTNAME,
operationcondition_list=[ OperationConditionNotAck() ]),
ActionOperation(operationtype=defines.OPERATION_TYPE_MESSAGE,
shortdata=mailtxt.nodedown_two_subject,
esc_step_from=17, esc_step_to=17,
esc_period=0,
shortdata="",
- longdata="zabbixserver:/usr/share/monitor-server/checkslices.py {HOSTNAME} disableslices",
+ longdata="%s:/usr/share/monitor-server/checkslices.py {HOSTNAME} disableslices" % config.MONITOR_HOSTNAME,
# TODO: send notice to users of slices
operationcondition_list=[ OperationConditionNotAck() ]),
ActionOperation(operationtype=defines.OPERATION_TYPE_MESSAGE,
esc_step_from=21, esc_step_to=0,
esc_period=int(BI_WEEKLY_ESC_PERIOD*3.5),
shortdata="",
- longdata="zabbixserver:/usr/share/monitor-server/checkslices.py {HOSTNAME} forever",
+ longdata="%s:/usr/share/monitor-server/checkslices.py {HOSTNAME} forever" % config.MONITOR_HOSTNAME,
operationcondition_list=[ OperationConditionNotAck() ]),
ActionOperation(operationtype=defines.OPERATION_TYPE_MESSAGE,
shortdata=mailtxt.nodedown_four_subject,
#sites = api.GetSites({'peer_id' : None}, ['login_base'])
for loginbase in ['princeton', 'princetondsl', 'monitorsite']:
add_loginbase(loginbase)
+ session.flush()
+ # TODO: for any site that is in the db, call zabbixsite.delete_site()
- session.flush()
-
-## Scripts : includes external scripts to:
-# - reboot.py
-# - nmap
-
-## UserGroups
-# define technical contact, principal investigator groups
-# define a Group for every site
-
-## Users
-# define a User for every user with admin/tech/pi roles
-# get passwords from a combination of site&name, perhaps?
-# I'm guessing we could use the grpid or userid as part of the passwd,
-# so that it varies in general, and can be guessed from the templates
-# add user to groups
-
-## Discovery Rules and Actions
-# define a discovery rule for every site's hosts.
-# define discovery action for online hosts.
-
-## Messages & Escalations
-# define actions and escellations for trigger sources:
-# - unreachable host,
-
-## HostGroups
-# add host group for every site
-# add host group for global network (PLC name)
## Hosts & Templates
# no need to define hosts?