Merge improvements from the 1.0 branch:
[monitor.git] / bootman.py
index faf77a2..22201cb 100755 (executable)
@@ -2,29 +2,29 @@
 
 # Attempt to reboot a node in debug state.
 
-import plc
+from monitor import const
+from monitor.database.info.model import *
+from monitor.wrapper import plc
 api = plc.getAuthAPI()
 
 import sys
 import os
-import const
 
 from getsshkeys import SSHKnownHosts
 
 import subprocess
 import time
-import database
-import moncommands
+from monitor.util import command as moncommands
 from sets import Set
 
-import ssh.pxssh as pxssh
-import ssh.fdpexpect as fdpexpect
-import ssh.pexpect as pexpect
-from unified_model import *
-from emailTxt import mailtxt
+from pcucontrol.transports.ssh import pxssh as pxssh
+from pcucontrol.transports.ssh import fdpexpect as fdpexpect
+from pcucontrol.transports.ssh import pexpect as pexpect
+from monitor.model import *
+from monitor.wrapper.emailTxt import mailtxt
 from nodeconfig import network_config_to_str
 import traceback
-import monitorconfig
+from monitor import config
 
 import signal
 class Sopen(subprocess.Popen):
@@ -36,13 +36,6 @@ from Rpyc import SocketConnection, Async
 from Rpyc.Utils import *
 fb = None
 
-def get_fbnode(node):
-       global fb
-       if fb is None:
-               fb = database.dbLoad("findbad")
-       fbnode = fb['nodes'][node]['values']
-       return fbnode
-
 class NodeConnection:
        def __init__(self, connection, node, config):
                self.node = node
@@ -207,7 +200,7 @@ class PlanetLabSession:
                args['port'] = self.port
                args['user'] = 'root'
                args['hostname'] = self.node
-               args['monitordir'] = monitorconfig.MONITOR_SCRIPT_ROOT
+               args['monitordir'] = config.MONITOR_SCRIPT_ROOT
                ssh_port = 22
 
                if self.nosetup:
@@ -314,7 +307,7 @@ def reboot(hostname, config=None, forced_action=None):
 
        # NOTE: Nothing works if the bootcd is REALLY old.
        #       So, this is the first step.
-       fbnode = get_fbnode(hostname)
+       fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
        if fbnode['category'] == "OLDBOOTCD":
                print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
                args = {}
@@ -324,7 +317,8 @@ def reboot(hostname, config=None, forced_action=None):
                                                        mailtxt.newbootcd_one[1] % args, True, db='bootcd_persistmessages')
 
                loginbase = plc.siteId(hostname)
-               m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
+               emails = plc.getTechEmails(loginbase)
+               m.send(emails) 
 
                print "\tDisabling %s due to out-of-date BOOTCD" % hostname
                api.UpdateNode(hostname, {'boot_state' : 'disable'})
@@ -455,7 +449,8 @@ def reboot(hostname, config=None, forced_action=None):
                                                                                 mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
 
                        loginbase = plc.siteId(hostname)
-                       m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
+                       emails = plc.getTechEmails(loginbase)
+                       m.send(emails) 
                        conn.set_nodestate('disable')
                        return False
 
@@ -505,6 +500,7 @@ def reboot(hostname, config=None, forced_action=None):
                        ('nodehostname' , 'Configured node hostname does not resolve'),
                        ('implementerror', 'Implementation Error'),
                        ('readonlyfs'   , '[Errno 30] Read-only file system'),
+                       ('baddisk'      , "IOError: [Errno 13] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"),
                        ('noinstall'    , 'notinstalled'),
                        ('bziperror'    , 'bzip2: Data integrity error when decompressing.'),
                        ('noblockdev'   , "No block devices detected."),
@@ -514,6 +510,7 @@ def reboot(hostname, config=None, forced_action=None):
                        ('hardwarerequirefail' , 'Hardware requirements not met'),
                        ('mkfsfail'         , 'while running: Running mkfs.ext2 -q  -m 0 -j /dev/planetlab/vservers failed'),
                        ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
+                       ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
                        ('chrootfail'   , 'Running chroot /tmp/mnt/sysimg'),
                        ('modulefail'   , 'Unable to get list of system modules'),
                        ('writeerror'   , 'write error: No space left on device'),
@@ -583,7 +580,13 @@ def reboot(hostname, config=None, forced_action=None):
                        "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
                        "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
                        "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
+                       "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done",
+                       "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
+                       "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
                        "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
+                       # actual solution appears to involve removing the bad files, and
+                       # continually trying to boot the node.
+                       "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
                        ]:
                sequences.update({n : "restart_bootmanager_rins"})
 
@@ -612,6 +615,7 @@ def reboot(hostname, config=None, forced_action=None):
                         "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
                         "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
                         "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
+                        "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
                         ]:
                sequences.update({n: "restart_node_boot"})
 
@@ -668,7 +672,7 @@ def reboot(hostname, config=None, forced_action=None):
                m = PersistMessage(hostname, mailtxt.unknownsequence[0] % args,
                                                                         mailtxt.unknownsequence[1] % args, False, db='unknown_persistmessages')
                m.reset()
-               m.send(['monitor-list@lists.planet-lab.org'])
+               m.send([config.cc_email]) 
 
                conn.restart_bootmanager('boot')
 
@@ -706,7 +710,7 @@ def reboot(hostname, config=None, forced_action=None):
                        m = PersistMessage(hostname, "Suspicous error from BootManager on %s" % args,
                                                                                 mailtxt.unknownsequence[1] % args, False, db='suspect_persistmessages')
                        m.reset()
-                       m.send(['monitor-list@lists.planet-lab.org'])
+                       m.send([config.cc_email]) 
 
                        conn.restart_bootmanager('boot')
 
@@ -717,7 +721,8 @@ def reboot(hostname, config=None, forced_action=None):
                        m = PersistMessage(hostname,  mailtxt.plnode_cfg[0] % args,  mailtxt.plnode_cfg[1] % args, 
                                                                True, db='nodeid_persistmessages')
                        loginbase = plc.siteId(hostname)
-                       m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
+                       emails = plc.getTechEmails(loginbase)
+                       m.send(emails) 
                        conn.dump_plconf_file()
                        conn.set_nodestate('disable')
 
@@ -729,7 +734,8 @@ def reboot(hostname, config=None, forced_action=None):
                        m = PersistMessage(hostname,  mailtxt.plnode_network[0] % args,  mailtxt.plnode_cfg[1] % args, 
                                                                True, db='nodenet_persistmessages')
                        loginbase = plc.siteId(hostname)
-                       m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
+                       emails = plc.getTechEmails(loginbase)
+                       m.send(emails) 
                        conn.dump_plconf_file()
                        conn.set_nodestate('disable')
 
@@ -744,7 +750,8 @@ def reboot(hostname, config=None, forced_action=None):
                                                                mailtxt.newalphacd_one[1] % args, True, db='bootcd_persistmessages')
 
                        loginbase = plc.siteId(hostname)
-                       m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
+                       emails = plc.getTechEmails(loginbase)
+                       m.send(emails) 
 
                        print "\tDisabling %s due to out-of-date BOOTCD" % hostname
                        conn.set_nodestate('disable')
@@ -762,7 +769,8 @@ def reboot(hostname, config=None, forced_action=None):
                                                                                 mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
 
                        loginbase = plc.siteId(hostname)
-                       m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
+                       emails = plc.getTechEmails(loginbase)
+                       m.send(emails) 
                        conn.set_nodestate('disable')
 
                elif sequences[s] == "update_hardware_email":
@@ -774,7 +782,8 @@ def reboot(hostname, config=None, forced_action=None):
                                                                                 mailtxt.minimalhardware[1] % args, True, db='minhardware_persistmessages')
 
                        loginbase = plc.siteId(hostname)
-                       m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
+                       emails = plc.getTechEmails(loginbase)
+                       m.send(emails) 
                        conn.set_nodestate('disable')
 
                elif sequences[s] == "bad_dns_email":
@@ -797,7 +806,8 @@ def reboot(hostname, config=None, forced_action=None):
                                                                                 mailtxt.baddns[1] % args, True, db='baddns_persistmessages')
 
                        loginbase = plc.siteId(hostname)
-                       m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
+                       emails = plc.getTechEmails(loginbase)
+                       m.send(emails) 
                        conn.set_nodestate('disable')
 
        if flag_set:
@@ -810,7 +820,7 @@ def reboot(hostname, config=None, forced_action=None):
 # MAIN -------------------------------------------------------------------
 
 def main():
-       import parser as parsermodule
+       from monitor import parser as parsermodule
        parser = parsermodule.getParser()
 
        parser.set_defaults(child=False, collect=False, nosetup=False, verbose=False,