fi
echo $$ > $MONITOR_PID
+# Ensure an ssh-agent holding the monitor keys is reachable from this shell.
+# The output of ssh-agent (shell commands that export the agent's connection
+# settings) is kept in agent.sh so that later runs can reuse the same agent.
+AGENT_ENV="${MONITOR_SCRIPT_ROOT}/agent.sh"
+if [ -f "$AGENT_ENV" ] ; then
+ source "$AGENT_ENV"
+fi
+# Probe the agent itself instead of grepping the process list: a matching
+# process may belong to someone else, and agent.sh may be missing or stale.
+# ssh-add -l exits 0 when identities are loaded, 1 when the agent holds
+# none, and 2 when no agent can be contacted.
+AGENT_STATUS=0
+ssh-add -l > /dev/null 2>&1 || AGENT_STATUS=$?
+if [ "$AGENT_STATUS" -eq 2 ] ; then
+ echo "starting ssh agent"
+ # no usable agent: start one and record its settings for later runs.
+ ssh-agent > "$AGENT_ENV"
+ source "$AGENT_ENV"
+ # a freshly started agent has no identities yet.
+ AGENT_STATUS=1
+fi
+if [ "$AGENT_STATUS" -ne 0 ] ; then
+ # the agent is reachable but the keys still have to be loaded.
+ ssh-add /etc/planetlab/debug_ssh_key.rsa
+ ssh-add /etc/planetlab/root_ssh_key.rsa
+fi
+
+
echo "Performing Findbad Nodes"
#########################
# 1. FINDBAD NODES
('hardwarerequirefail' , 'Hardware requirements not met'),
('mkfsfail' , 'while running: Running mkfs.ext2 -q -m 0 -j /dev/planetlab/vservers failed'),
('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
+ ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
('chrootfail' , 'Running chroot /tmp/mnt/sysimg'),
('modulefail' , 'Unable to get list of system modules'),
('writeerror' , 'write error: No space left on device'),
"bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
"bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
"bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
+ "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
+ "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
"bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
]:
sequences.update({n : "restart_bootmanager_rins"})
If after following these directions, and either logging in with your site_admin account or seeing the CoMon report of your machine, there is no need to respond to this message. However, if there are any console messages relating to the node's failure, please report them to PlanetLab support (support@planet-lab.org) so we can help resolve the issue. Including this message in your reply will help us coordinate our records with the actions you've taken.
+Finally, you can track the current status of your machines using this Google Gadget:
+
+ http://fusion.google.com/add?source=atgs&moduleurl=http://monitor.planet-lab.org/monitor/sitemonitor.xml
+
Thank you for your help,
-- PlanetLab Central (support@planet-lab.org)
""")
If after following these directions, and either logging in with your site_admin account or seeing the CoMon report of your machine, there is no need to respond to this message. However, if there are any console messages relating to the node's failure, please report them to PlanetLab support (support@planet-lab.org) so we can help resolve the issue. Including this message in your reply will help us coordinate our records with the actions you've taken.
+Finally, you can track the current status of your machines using this Google Gadget:
+
+ http://fusion.google.com/add?source=atgs&moduleurl=http://monitor.planet-lab.org/monitor/sitemonitor.xml
+
After another week, we will disable all slices currently running on PlanetLab. Because this action will directly affect all users of these slices, these users will also be notified at that time.
Thank you for your help,
https://www.planet-lab.org/doc/guides/bootcdsetup
+Finally, you can track the current status of your machines using this Google Gadget:
+
+ http://fusion.google.com/add?source=atgs&moduleurl=http://monitor.planet-lab.org/monitor/sitemonitor.xml
+
If after following these directions, and either logging in with your site_admin account or seeing the CoMon report of your machine, there is no need to respond to this message. However, if there are any console messages relating to the node's failure, please report them to PlanetLab support (support@planet-lab.org) so we can help resolve the issue. Including this message in your reply will help us coordinate our records with the actions you've taken.
Thank you for your help,
oval = values['kernel']
if "2.6.17" in oval or "2.6.2" in oval:
values['ssh'] = 'SSH'
- values['category'] = 'ALPHA'
+ values['category'] = 'PROD'
if "bm.log" in values['bmlog']:
values['state'] = 'DEBUG'
else:
values['state'] = 'BOOT'
elif "2.6.12" in oval or "2.6.10" in oval:
values['ssh'] = 'SSH'
- values['category'] = 'PROD'
+ values['category'] = 'OLDPROD'
if "bm.log" in values['bmlog']:
values['state'] = 'DEBUG'
else:
logger = logging.getLogger("monitor")
MTA="localhost"
-FROM="monitor@planet-lab.org"
+FROM=config.email
def reformat_for_rt(text):
lines = text.split("\n")
# NOTE: AdminCc: (in PLC's RT configuration) gets an email sent.
# This is not the case (surprisingly) for Cc:
input_text = "Subject: %s\n"
- input_text += "Requestor: monitor@planet-lab.org\n"
+ input_text += "Requestor: %s\n"% FROM
input_text += "id: ticket/new\n"
input_text += "Queue: Monitor\n"
for recipient in to:
if config.bcc and not config.debug:
writer.addheader("Bcc", config.email)
- writer.addheader("Reply-To", 'monitor@planet-lab.org')
+ writer.addheader("Reply-To", FROM)
writer.addheader("MIME-Version", "1.0")
#
# "soltesz@cs.utk.edu")
email("Re: [PL #21323] TEST 7",
mailtxt.newbootcd_one[1] % {'hostname_list':"hostname list..."},
- ['monitor@planet-lab.org'])
+ [FROM])
#print "ticketid: %d" % id
#id = plc.siteId(["alice.cs.princeton.edu"])
#print id
def run(self, cmd, timeout=COMMAND_TIMEOUT*2):
- #print "CMD.run(%s)" % cmd
+ print "CMD.run(%s)" % cmd
s = Sopen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True)
self.s = s
(f_in, f_out, f_err) = (s.stdin, s.stdout, s.stderr)
MONITOR_DATA_ROOT=/var/lib/monitor-server
MONITOR_ARCHIVE_ROOT=/usr/share/monitor-server/archive-pdb
+email=monitor@another-lab.org
+
[commandline]
debug=0
mail=1
bcc=0
-email=
run=False
checkopt=False
squeeze=1
# $Id$
#
-%define url $URL: svn+ssh://svn.planet-lab.org/svn/Monitor/trunk/Monitor-server.spec $
+%define url $URL: svn+ssh://svn.planet-lab.org/svn/Monitor/trunk/monitor-server.spec $
%define name monitor-server
%define version 1.0
Requires: coreutils
Requires: openssh-clients
Requires: perl-libwww-perl
+Requires: perl-IO-Socket-SSL
Requires: MySQL-python
Requires: rt3 == 3.4.1
Requires: nmap
rm -rf $RPM_BUILD_ROOT
mkdir -p $RPM_BUILD_ROOT/usr/share/%{name}
+mkdir -p $RPM_BUILD_ROOT/data/var/lib/%{name}
+mkdir -p $RPM_BUILD_ROOT/data/var/lib/%{name}/archive-pdb
mkdir -p $RPM_BUILD_ROOT/var/lib/%{name}
mkdir -p $RPM_BUILD_ROOT/var/lib/%{name}/archive-pdb
mkdir -p $RPM_BUILD_ROOT/var/www/cgi-bin/monitor/
glob_str = "%s*.%s.pkl" % (d.strftime("%Y-%m-%d"), str)
os.chdir(path)
#print glob_str
- file = glob.glob(glob_str)[0]
+ #file = glob.glob(glob_str)[0]
+ files = glob.glob(glob_str)
#print "loading %s" % file
os.chdir("..")
- return file[:-4]
+ files_chng = [ file[:-4] for file in files ]
+ return files_chng
#fb = archive.load(file[:-4])
verbose = 1
while True:
- file = get_filefromglob(d, "production.findbad")
- #file = "%s.production.findbad" % d.strftime("%Y-%m-%d")
try:
- fb = archive.load(file)
- if config.node in fb['nodes']:
- fb_nodeinfo = fb['nodes'][config.node]['values']
- fb_print_nodeinfo(fb_nodeinfo, verbose, d.strftime("%Y-%m-%d"))
+ for file in get_filefromglob(d, "production.findbad"):
+ #file = get_filefromglob(d, "production.findbad")
+ #file = "%s.production.findbad" % d.strftime("%Y-%m-%d")
+ fb = archive.load(file)
+ if config.node in fb['nodes']:
+ fb_nodeinfo = fb['nodes'][config.node]['values']
+ fb_print_nodeinfo(fb_nodeinfo, verbose, d.strftime("%Y-%m-%d"))
+
+ del fb
+ verbose = 0
- del fb
- verbose = 0
except KeyboardInterrupt:
sys.exit(1)
except:
apc = APCBerlin(values, verbose, ['22', '23'])
rb_ret = apc.reboot(values[nodename], dryrun)
- elif values['pcu_id'] in [1173,1240]:
+ elif values['pcu_id'] in [1173,1240,47]:
apc = APCFolsom(values, verbose, ['22', '23'])
rb_ret = apc.reboot(values[nodename], dryrun)
import traceback
api = plc.getAuthAPI()
-loginbase = sys.argv[1] # "princeton"
try:
- site = api.GetSites(loginbase)[0]
+ # Just try the first site returned by the call
+ site = api.GetSites()[0]
site_nodes = api.GetNodes(site['node_ids'])
site_people = api.GetPersons(site['person_ids'])
for node in site_nodes:
vals['reboot'] = vals['reboot'].replace(" ", "_")
if 'nodename' in vals:
- url = "<a href='https://www.planet-lab.org/db/nodes/index.php?nodepattern=%s'>%s</a>" % (vals['nodename'], vals['nodename'])
+ url = "<a href='https://%s/db/nodes/index.php?nodepattern=%s'>%s</a>" % (config.MONITOR_HOSTNAME, vals['nodename'], vals['nodename'])
vals['nodename'] = url
try: