tweaks to improve the automated, rpm installation of monitor-server.
author Stephen Soltesz <soltesz@cs.princeton.edu>
Tue, 14 Oct 2008 17:40:19 +0000 (17:40 +0000)
committer Stephen Soltesz <soltesz@cs.princeton.edu>
Tue, 14 Oct 2008 17:40:19 +0000 (17:40 +0000)
12 files changed:
automate-default.sh
bootman.py
emailTxt.py
findbad.py
mailer.py
moncommands.py
monitor-default.conf
monitor-server.spec
nodehistory.py
reboot.py
testapi.py
www/printbadnodes.py

index 73dc110..1aaeb59 100755 (executable)
@@ -29,6 +29,19 @@ if [ -f $MONITOR_PID ] ; then
 fi
 echo $$ > $MONITOR_PID
 
+AGENT=`ps ax | grep ssh-agent | grep -v grep`
+if [ -z "$AGENT" ] ; then
+        echo "starting ssh agent"
+        # if no agent is running, set it up.
+        ssh-agent > ${MONITOR_SCRIPT_ROOT}/agent.sh
+        source ${MONITOR_SCRIPT_ROOT}/agent.sh
+        ssh-add /etc/planetlab/debug_ssh_key.rsa
+        ssh-add /etc/planetlab/root_ssh_key.rsa
+fi
+#TODO: should add a call to ssh-add -l to check if the keys are loaded or not.
+source ${MONITOR_SCRIPT_ROOT}/agent.sh
+
+
 echo "Performing Findbad Nodes"
 #########################
 # 1. FINDBAD NODES 
index ff2a6d5..82ee201 100755 (executable)
@@ -514,6 +514,7 @@ def reboot(hostname, config=None, forced_action=None):
                        ('hardwarerequirefail' , 'Hardware requirements not met'),
                        ('mkfsfail'         , 'while running: Running mkfs.ext2 -q  -m 0 -j /dev/planetlab/vservers failed'),
                        ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
+                       ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
                        ('chrootfail'   , 'Running chroot /tmp/mnt/sysimg'),
                        ('modulefail'   , 'Unable to get list of system modules'),
                        ('writeerror'   , 'write error: No space left on device'),
@@ -583,6 +584,8 @@ def reboot(hostname, config=None, forced_action=None):
                        "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
                        "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
                        "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
+                       "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
+                       "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
                        "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
                        ]:
                sequences.update({n : "restart_bootmanager_rins"})
index f764a41..d1bccaa 100644 (file)
@@ -36,6 +36,10 @@ If you have a BootCD older than 3.0, you will need to create a new BootImage on
 
 If after following these directions, and either logging in with your site_admin account or seeing the CoMon report of your machine, there is no need to respond to this message.  However, if there are any console messages relating to the node's failure, please report them to PlanetLab support (support@planet-lab.org) so we can help resolve the issue.   Including this message in your reply will help us coordinate our records with the actions you've taken.
 
+Finally, you can track the current status of your machines using this Google Gadget:
+
+    http://fusion.google.com/add?source=atgs&moduleurl=http://monitor.planet-lab.org/monitor/sitemonitor.xml
+
 Thank you for your help,
   -- PlanetLab Central (support@planet-lab.org)
 """)
@@ -65,6 +69,10 @@ If you have a BootCD older than 3.0, you will need to create a new Boot CD and c
 
 If after following these directions, and either logging in with your site_admin account or seeing the CoMon report of your machine, there is no need to respond to this message.  However, if there are any console messages relating to the node's failure, please report them to PlanetLab support (support@planet-lab.org) so we can help resolve the issue.   Including this message in your reply will help us coordinate our records with the actions you've taken.
 
+Finally, you can track the current status of your machines using this Google Gadget:
+
+    http://fusion.google.com/add?source=atgs&moduleurl=http://monitor.planet-lab.org/monitor/sitemonitor.xml
+
 After another week, we will disable all slices currently running on PlanetLab.  Because this action will directly affect all users of these slices, these users will also be notified at that time.
 
 Thank you for your help,
@@ -92,6 +100,10 @@ If you have a BootCD older than 3.0, you will need to create a new Boot CD and c
 
     https://www.planet-lab.org/doc/guides/bootcdsetup
 
+Finally, you can track the current status of your machines using this Google Gadget:
+
+    http://fusion.google.com/add?source=atgs&moduleurl=http://monitor.planet-lab.org/monitor/sitemonitor.xml
+
 If after following these directions, and either logging in with your site_admin account or seeing the CoMon report of your machine, there is no need to respond to this message.  However, if there are any console messages relating to the node's failure, please report them to PlanetLab support (support@planet-lab.org) so we can help resolve the issue.   Including this message in your reply will help us coordinate our records with the actions you've taken.
 
 Thank you for your help,
index e156856..2aabe01 100755 (executable)
@@ -93,14 +93,14 @@ EOF                 """)
        oval = values['kernel']
        if "2.6.17" in oval or "2.6.2" in oval:
                values['ssh'] = 'SSH'
-               values['category'] = 'ALPHA'
+               values['category'] = 'PROD'
                if "bm.log" in values['bmlog']:
                        values['state'] = 'DEBUG'
                else:
                        values['state'] = 'BOOT'
        elif "2.6.12" in oval or "2.6.10" in oval:
                values['ssh'] = 'SSH'
-               values['category'] = 'PROD'
+               values['category'] = 'OLDPROD'
                if "bm.log" in values['bmlog']:
                        values['state'] = 'DEBUG'
                else:
index 46fdcae..d80d5d7 100755 (executable)
--- a/mailer.py
+++ b/mailer.py
@@ -16,7 +16,7 @@ import time
 logger = logging.getLogger("monitor")
 
 MTA="localhost"
-FROM="monitor@planet-lab.org"
+FROM=config.email
 
 def reformat_for_rt(text):
        lines = text.split("\n")
@@ -216,7 +216,7 @@ def emailViaRT_NoTicket(subject, text, to):
        # NOTE: AdminCc: (in PLC's RT configuration) gets an email sent.
        # This is not the case (surprisingly) for Cc:
        input_text  = "Subject: %s\n"
-       input_text += "Requestor: monitor@planet-lab.org\n"
+       input_text += "Requestor: %s\n"% FROM
        input_text += "id: ticket/new\n"
        input_text += "Queue: Monitor\n"
        for recipient in to:
@@ -286,7 +286,7 @@ def email(subject, text, to):
        if config.bcc and not config.debug:
                writer.addheader("Bcc", config.email)
 
-       writer.addheader("Reply-To", 'monitor@planet-lab.org')
+       writer.addheader("Reply-To", FROM)
                
        writer.addheader("MIME-Version", "1.0")
        #
@@ -357,7 +357,7 @@ if __name__=="__main__":
        #         "soltesz@cs.utk.edu")
        email("Re: [PL #21323] TEST 7", 
                           mailtxt.newbootcd_one[1] % {'hostname_list':"hostname list..."},
-                          ['monitor@planet-lab.org'])
+                          [FROM])
        #print "ticketid: %d" % id
        #id = plc.siteId(["alice.cs.princeton.edu"])
        #print id
index 869cc96..bda2389 100644 (file)
@@ -46,7 +46,7 @@ class CMD:
 
        def run(self, cmd, timeout=COMMAND_TIMEOUT*2):
 
-               #print "CMD.run(%s)" % cmd
+               print "CMD.run(%s)" % cmd
                s = Sopen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True)
                self.s = s
                (f_in, f_out, f_err) = (s.stdin, s.stdout, s.stderr)
index bf01c52..74a0c18 100644 (file)
@@ -22,11 +22,12 @@ MONITOR_SCRIPT_ROOT=/usr/share/monitor-server
 MONITOR_DATA_ROOT=/var/lib/monitor-server
 MONITOR_ARCHIVE_ROOT=/usr/share/monitor-server/archive-pdb
 
+email=monitor@another-lab.org
+
 [commandline]
 debug=0
 mail=1
 bcc=0
-email=
 run=False
 checkopt=False
 squeeze=1
index d7fa61e..dbb5f28 100644 (file)
@@ -2,7 +2,7 @@
 # $Id$
 # 
 
-%define url $URL: svn+ssh://svn.planet-lab.org/svn/Monitor/trunk/Monitor-server.spec $
+%define url $URL: svn+ssh://svn.planet-lab.org/svn/Monitor/trunk/monitor-server.spec $
 
 %define name monitor-server
 %define version 1.0
@@ -30,6 +30,7 @@ Requires: curl
 Requires: coreutils
 Requires: openssh-clients
 Requires: perl-libwww-perl
+Requires: perl-IO-Socket-SSL 
 Requires: MySQL-python
 Requires: rt3 == 3.4.1
 Requires: nmap
@@ -55,6 +56,8 @@ cd ..
 
 rm -rf $RPM_BUILD_ROOT
 mkdir -p $RPM_BUILD_ROOT/usr/share/%{name}
+mkdir -p $RPM_BUILD_ROOT/data/var/lib/%{name}
+mkdir -p $RPM_BUILD_ROOT/data/var/lib/%{name}/archive-pdb
 mkdir -p $RPM_BUILD_ROOT/var/lib/%{name}
 mkdir -p $RPM_BUILD_ROOT/var/lib/%{name}/archive-pdb
 mkdir -p $RPM_BUILD_ROOT/var/www/cgi-bin/monitor/
index a7f030b..e554e0a 100755 (executable)
@@ -23,10 +23,12 @@ def get_filefromglob(d, str):
        glob_str = "%s*.%s.pkl" % (d.strftime("%Y-%m-%d"), str)
        os.chdir(path)
        #print glob_str
-       file = glob.glob(glob_str)[0]
+       #file = glob.glob(glob_str)[0]
+       files = glob.glob(glob_str)
        #print "loading %s" % file
        os.chdir("..")
-       return file[:-4]
+       files_chng = [ file[:-4] for file in files ]
+       return files_chng
        #fb = archive.load(file[:-4])
 
 
@@ -106,17 +108,19 @@ def main():
        verbose = 1
 
        while True:
-               file = get_filefromglob(d, "production.findbad")
-               #file = "%s.production.findbad" % d.strftime("%Y-%m-%d")
                
                try:
-                       fb = archive.load(file)
-                       if config.node in fb['nodes']:
-                               fb_nodeinfo  = fb['nodes'][config.node]['values']
-                               fb_print_nodeinfo(fb_nodeinfo, verbose, d.strftime("%Y-%m-%d"))
+                       for file in get_filefromglob(d, "production.findbad"):
+                               #file = get_filefromglob(d, "production.findbad")
+                               #file = "%s.production.findbad" % d.strftime("%Y-%m-%d")
+                               fb = archive.load(file)
+                               if config.node in fb['nodes']:
+                                       fb_nodeinfo  = fb['nodes'][config.node]['values']
+                                       fb_print_nodeinfo(fb_nodeinfo, verbose, d.strftime("%Y-%m-%d"))
+
+                               del fb
+                               verbose = 0
 
-                       del fb
-                       verbose = 0
                except KeyboardInterrupt:
                        sys.exit(1)
                except:
index e876a76..503ca63 100755 (executable)
--- a/reboot.py
+++ b/reboot.py
@@ -1260,7 +1260,7 @@ def reboot_test(nodename, values, continue_probe, verbose, dryrun):
                                apc = APCBerlin(values, verbose, ['22', '23'])
                                rb_ret = apc.reboot(values[nodename], dryrun)
 
-                       elif values['pcu_id'] in [1173,1240]:
+                       elif values['pcu_id'] in [1173,1240,47]:
                                apc = APCFolsom(values, verbose, ['22', '23'])
                                rb_ret = apc.reboot(values[nodename], dryrun)
 
index 5e7daa8..4876fe6 100755 (executable)
@@ -5,10 +5,10 @@ import sys
 import traceback
 
 api = plc.getAuthAPI()
-loginbase = sys.argv[1] # "princeton"
 
 try:
-       site = api.GetSites(loginbase)[0]
+       # Just try the first site returned by the call
+       site = api.GetSites()[0]
        site_nodes = api.GetNodes(site['node_ids'])
        site_people = api.GetPersons(site['person_ids'])
        for node in site_nodes:
index 3bfc7bd..9b5692c 100755 (executable)
@@ -428,7 +428,7 @@ def main(sitefilter, catfilter, statefilter, comonfilter, nodeonlyfilter):
                        vals['reboot'] = vals['reboot'].replace(" ", "_")
 
                if 'nodename' in vals:
-                       url = "<a href='https://www.planet-lab.org/db/nodes/index.php?nodepattern=%s'>%s</a>" % (vals['nodename'], vals['nodename'])
+                       url = "<a href='https://%s/db/nodes/index.php?nodepattern=%s'>%s</a>" % (config.MONITOR_HOSTNAME, vals['nodename'], vals['nodename'])
                        vals['nodename'] = url
 
                try: