3 # Attempt to reboot a node in debug state.
5 from monitor import const
6 from monitor.database.info.model import *
7 from monitor.wrapper import plc
13 from getsshkeys import SSHKnownHosts
17 from pcucontrol.util import command as moncommands
20 from pcucontrol.transports.ssh import pxssh as pxssh
21 from pcucontrol.transports.ssh import fdpexpect as fdpexpect
22 from pcucontrol.transports.ssh import pexpect as pexpect
23 from monitor.model import *
24 from monitor.wrapper.emailTxt import mailtxt
25 from nodeconfig import network_config_to_str
27 from monitor import config
# subprocess.Popen subclass adding a kill() helper (Python 2 era, when
# Popen had no kill() of its own).
30 class Sopen(subprocess.Popen):
# Send a signal (SIGTERM by default) to the child process.
# NOTE(review): the parameter name shadows the imported `signal` module.
31 def kill(self, signal = signal.SIGTERM):
32 os.kill(self.pid, signal)
34 #from Rpyc import SocketConnection, Async
35 from Rpyc import SocketConnection, Async
36 from Rpyc.Utils import *
# Constructor of the node-connection class (the enclosing class header is
# not visible in this chunk). Body lines are missing from this extraction
# (original numbering jumps 40 -> 45); presumably it stores the Rpyc
# connection, hostname and config on self — TODO confirm against full source.
40 def __init__(self, connection, node, config):
# Classify the node's current boot state by probing the remote filesystem
# through the Rpyc connection: /tmp/source exists on a node running the
# BootManager; /vservers exists on a fully-booted production node.
# NOTE(review): the return statements for each branch (orig lines 47, 49+)
# are missing from this extraction — the returned labels cannot be
# confirmed from here.
45 def get_boot_state(self):
46 if self.c.modules.os.path.exists('/tmp/source'):
48 elif self.c.modules.os.path.exists('/vservers'):
# Interior of the dmesg-fetching method (the def line, orig ~53, is missing
# from this extraction): dump the remote kernel ring buffer to a file,
# download it locally, then reopen it — presumably returned to the caller.
54 self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
55 download(self.c, "/var/log/dmesg.bm.log", "log/dmesg.%s.log" % self.node)
56 log = open("log/dmesg.%s.log" % self.node, 'r')
# Download the BootManager log (gzipped) from the node, decompress it
# locally, and open it. NOTE(review): the return statement (orig line 63+)
# is missing from this extraction; presumably returns the open file object.
59 def get_bootmanager_log(self):
60 download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node)
61 os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
62 log = open("log/bm.%s.log" % self.node, 'r')
# Run the remote BootManager's InitializeBootManager + ReadNodeConfiguration
# steps and print the resulting bm.VARS, to show the node's on-disk PLC
# configuration. NOTE(review): several lines (try/except headers, returns)
# are missing from this extraction, and `c` below is used where other
# methods use `self.c` — cannot confirm which is correct from this chunk.
65 def dump_plconf_file(self):
67 self.c.modules.sys.path.append("/tmp/source/")
68 self.c.modules.os.chdir('/tmp/source')
70 log = c.modules.BootManager.log('/tmp/new.log')
71 bm = c.modules.BootManager.BootManager(log,'boot')
73 BootManagerException = c.modules.Exceptions.BootManagerException
74 InitializeBootManager = c.modules.BootManager.InitializeBootManager
75 ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
78 InitializeBootManager.Run(bm.VARS, bm.LOG)
79 try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
# Reached when reading the node configuration raised (except clause missing
# from this extraction).
83 print " Possibly, unable to find valid configuration file"
85 if bm_continue and self.config and not self.config.quiet:
86 for key in bm.VARS.keys():
87 print key, " == ", bm.VARS[key]
89 if self.config and not self.config.quiet: print " Unable to read Node Configuration"
# Compare the node's locally-configured NODE_KEY with the key PLC has on
# record; if they differ, push the node's key to PLC via api.UpdateNode.
# Presumably returns True when the keys end up in sync (return lines are
# missing from this extraction — TODO confirm).
92 def compare_and_repair_nodekeys(self):
94 self.c.modules.sys.path.append("/tmp/source/")
95 self.c.modules.os.chdir('/tmp/source')
97 log = c.modules.BootManager.log('/tmp/new.log')
98 bm = c.modules.BootManager.BootManager(log,'boot')
100 BootManagerException = c.modules.Exceptions.BootManagerException
101 InitializeBootManager = c.modules.BootManager.InitializeBootManager
102 ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
# Fetch PLC's view of this node for the key comparison below.
105 plcnode = api.GetNodes({'hostname': self.node}, None)[0]
107 InitializeBootManager.Run(bm.VARS, bm.LOG)
108 try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
113 print " Possibly, unable to find valid configuration file"
116 print " NODE: %s" % bm.VARS['NODE_KEY']
117 print " PLC : %s" % plcnode['key']
119 if bm.VARS['NODE_KEY'] == plcnode['key']:
# Keys differ: overwrite PLC's copy with the node's key.
122 if api.UpdateNode(self.node, {'key': bm.VARS['NODE_KEY']}):
123 print " Successfully updated NODE_KEY with PLC"
128 #for key in bm.VARS.keys():
129 # print key, " == ", bm.VARS[key]
131 print " Unable to retrieve NODE_KEY"
# True iff the BootManager's /tmp/BM_RUNNING marker file exists on the node
# (see restart_bootmanager, which creates/removes it). The return lines
# are missing from this extraction.
133 def bootmanager_running(self):
134 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
# Set the node's boot_state at PLC ('boot' by default); returns whatever
# api.UpdateNode returns.
139 def set_nodestate(self, state='boot'):
140 return api.UpdateNode(self.node, {'boot_state' : state})
# Set the node's boot_state at PLC, then reboot the machine: first attempt
# is gentle (kill slice processes, scheduled `shutdown -r`), and a
# persistent 'gentlekill' flag (1-day window) ensures a repeat call within
# that window escalates to a forced sysrq s/u/b reboot. Several lines
# (else branch header, returns) are missing from this extraction.
142 def restart_node(self, state='boot'):
143 api.UpdateNode(self.node, {'boot_state' : state})
145 pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags')
146 if not pflags.getRecentFlag('gentlekill'):
147 print " Killing all slice processes... : %s" % self.node
148 cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
149 self.c.modules.os.system(cmd_slicekill)
150 cmd = """ shutdown -r +1 & """
151 print " Restarting %s : %s" % ( self.node, cmd)
152 self.c.modules.os.system(cmd)
# Remember that the gentle path was tried, so the next attempt escalates.
154 pflags.setRecentFlag('gentlekill')
# Hard reboot via magic sysrq: sync, remount read-only, reboot.
157 print " Restarting with sysrq 'sub' %s" % self.node
158 cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """
159 self.c.modules.os.system(cmd)
# Re-run BootManager.py on the node with the given forced state, unless it
# is already running (marked by /tmp/BM_RUNNING, which the command creates
# and removes around the run). NOTE(review): the tail of the cmd string
# (orig line 173) is missing from this extraction, so the concatenation
# below is visibly incomplete here.
163 def restart_bootmanager(self, forceState):
165 self.c.modules.os.chdir('/tmp/source')
166 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
167 print " BootManager is already running: try again soon..."
169 print " Starting 'BootManager.py %s' on %s " % (forceState, self.node)
170 cmd = "( touch /tmp/BM_RUNNING ; " + \
171 " python ./BootManager.py %s &> server.log < /dev/null ; " + \
172 " rm -f /tmp/BM_RUNNING " + \
174 cmd = cmd % forceState
175 self.c.modules.os.system(cmd)
# Manages an ssh tunnel + Rpyc server session to a PlanetLab node.
181 class PlanetLabSession:
# Class-wide local port counter, randomized at import time so concurrent
# runs are unlikely to collide; incremented per session in setup_host().
182 globalport = 22000 + int(random.random()*1000)
# NOTE(review): most of __init__'s body (orig 186-190) is missing from
# this extraction; presumably it stores node and calls setup_host().
184 def __init__(self, node, nosetup, verbose):
185 self.verbose = verbose
188 self.nosetup = nosetup
# Return a NodeConnection wrapping an Rpyc socket through the local end of
# the ssh tunnel established by setup_host().
192 def get_connection(self, config):
193 return NodeConnection(SocketConnection("localhost", self.port), self.node, config)
# Prepare the node for Rpyc access: rsync the Rpyc library over, (re)start
# the remote forking server, then open a local ssh port-forward to it.
# Many lines (timeout assignment, heredoc terminator, success branch) are
# missing from this extraction; hedged notes below mark the gaps.
195 def setup_host(self):
# Claim a unique local port for this session's tunnel.
196 self.port = PlanetLabSession.globalport
197 PlanetLabSession.globalport = PlanetLabSession.globalport + 1
200 args['port'] = self.port
201 args['user'] = 'root'
202 args['hostname'] = self.node
203 args['monitordir'] = config.MONITOR_SCRIPT_ROOT
207 print "Skipping setup"
210 # COPY Rpyc files to host
211 cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
212 if self.verbose: print cmd
215 localos = moncommands.CMD()
217 ret = localos.system(cmd, timeout)
# On ssh host-key failure, refresh known_hosts for this node and retry once.
220 print "\tUNKNOWN SSH KEY FOR %s; making an exception" % self.node
221 #print "MAKE EXPLICIT EXCEPTION FOR %s" % self.node
222 k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k
223 ret = localos.system(cmd, timeout)
226 print "\tFAILED TWICE"
228 raise Exception("Failed twice trying to login with updated ssh host key")
231 # KILL any already running servers.
232 ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port)
233 (ov,ev) = ssh.run_noexcept2("""<<\EOF
235 echo "kill server" >> out.log
236 ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ;
237 echo "export" >> out.log
238 export PYTHONPATH=$HOME ;
239 echo "start server" >> out.log
240 python Rpyc/Servers/forking_server.py &> server.log &
241 echo "done" >> out.log
243 #cmd = """ssh %(user)s@%(hostname)s """ + \
244 # """'ps ax | grep Rpyc | grep -v grep | awk "{print \$1}" | xargs kill 2> /dev/null' """
246 #if self.verbose: print cmd
248 #print localos.system(cmd,timeout)
250 ## START a new rpyc server.
251 #cmd = """ssh -n %(user)s@%(hostname)s "export PYTHONPATH=\$HOME; """ + \
252 # """python Rpyc/Servers/forking_server.py &> server.log < /dev/null &" """
254 #if self.verbose: print cmd
255 #print localos.system(cmd,timeout)
259 # This was tricky to make synchronous. The combination of ssh-clients-4.7p1,
260 # and the following options seems to work well.
# Forward local self.port to the remote Rpyc server on 18812; LocalCommand
# prints "READY" once the forward is established, which is read below.
261 cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \
262 """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \
263 """-o ConnectTimeout=120 """ + \
264 """-n -N -L %(port)s:localhost:18812 """ + \
265 """%(user)s@%(hostname)s"""
267 if self.verbose: print cmd
268 self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
269 # TODO: the read() here may block indefinitely. Need a better
270 # approach therefore, that includes a timeout.
271 #ret = self.command.stdout.read(5)
272 ret = moncommands.read_t(self.command.stdout, 5)
276 # NOTE: There is still a slight race for machines that are slow...
# t1/t2 timing assignments are missing from this extraction; timeout is
# presumably twice the measured setup duration.
277 self.timeout = 2*(t2-t1)
278 print "Sleeping for %s sec" % self.timeout
279 time.sleep(self.timeout)
# "READY" never arrived: distinguish a dead tunnel from a silent one.
282 if self.command.returncode is not None:
283 print "Failed to establish tunnel!"
284 raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode))
286 raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'")
# Fragment of the session-teardown method (its def line, orig ~289, is
# missing from this extraction); presumably kills self.command to close
# the ssh tunnel — TODO confirm against full source.
290 if self.verbose: print "Killing SSH session %s" % self.port
# Extract the pattern strings (second element) from a list of
# (id, pattern) step tuples, for passing to pexpect's expect(). The
# ret_list initialization and return lines are missing from this extraction.
294 def steps_to_list(steps):
296 for (id,label) in steps:
297 ret_list.append(label)
# Map a pexpect match index back to the step id (first tuple element);
# indices beyond the steps list (e.g. the appended EOF pattern) fall
# through to a branch missing from this extraction (presumably "done").
300 def index_to_id(steps,index):
301 if index < len(steps):
302 return steps[index][0]
# Attempt to recover a node stuck in debug state: check bootcd age, open a
# session, scan dmesg and bm.log, classify the failure sequence, and take
# the matching repair action. Heavily lossy in this extraction — many
# lines (args initialization, m.send calls, returns) are missing.
306 def reboot(hostname, config=None, forced_action=None):
308 # NOTE: Nothing works if the bootcd is REALLY old.
309 # So, this is the first step.
310 fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
# Out-of-date bootcd: mail the site techs and disable the node at PLC.
311 if fbnode['category'] == "OLDBOOTCD":
312 print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
314 args['hostname_list'] = " %s" % hostname
316 m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname,
317 mailtxt.newbootcd_one[1] % args, True, db='bootcd_persistmessages')
319 loginbase = plc.siteId(hostname)
320 emails = plc.getTechEmails(loginbase)
323 print "\tDisabling %s due to out-of-date BOOTCD" % hostname
324 api.UpdateNode(hostname, {'boot_state' : 'disable'})
# --- reboot(): session setup (interior of reboot; try/except headers are
# missing from this extraction) ---
328 print "Creating session for %s" % node
329 # update known_hosts file (in case the node has rebooted since last run)
330 if config and not config.quiet: print "...updating known_hosts ssh-rsa key for %s" % node
332 k = SSHKnownHosts(); k.update(node); k.write(); del k
334 from monitor.common import email_exception
336 print traceback.print_exc()
# Two session constructions: one with fixed args, one honoring config —
# the branching between them is in lines missing from this extraction.
341 session = PlanetLabSession(node, False, True)
343 session = PlanetLabSession(node, config.nosetup, config.verbose)
345 msg = "ERROR setting up session for %s" % hostname
347 print traceback.print_exc()
348 from monitor.common import email_exception
354 conn = session.get_connection(config)
356 # NOTE: sometimes the wait in setup_host() is not long enough.
357 # So, here we try to wait a little longer before giving up entirely.
359 time.sleep(session.timeout*4)
360 conn = session.get_connection(config)
362 print traceback.print_exc()
363 from monitor.common import email_exception
# Forced reboot requested by the caller: reinstall ('rins') immediately.
367 if forced_action == "reboot":
368 conn.restart_node('rins')
# --- reboot(): boot-state check and dmesg hardware-error scan (interior
# of reboot; the `steps = [` opener and the expect loop around line 435
# are missing from this extraction) ---
371 boot_state = conn.get_boot_state()
372 if boot_state == "boot":
373 print "...Boot state of %s already completed : skipping..." % node
375 elif boot_state == "unknown":
376 print "...Unknown bootstate for %s : skipping..."% node
381 if conn.bootmanager_running():
382 print "...BootManager is currently running. Skipping host %s" % node
387 # conn.restart_bootmanager(config.force)
390 # Read persistent flags, tagged on one week intervals.
391 pflags = PersistFlags(hostname, 3*60*60*24, db='debug_persistflags')
394 if config and not config.quiet: print "...downloading dmesg from %s" % node
395 dmesg = conn.get_dmesg()
396 child = fdpexpect.fdspawn(dmesg)
# (id, regex) pairs matched against dmesg output to detect disk/hardware
# faults; sample matching log lines appear in the comments below.
401 ('scsierror' , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
402 ('ioerror' , 'end_request: I/O error, dev sd\w+, sector \d+'),
403 ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION byte \w+ = \w+'),
405 ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
407 ('hdaseekerror', 'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'),
408 ('hdacorrecterror', 'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'),
410 ('atareadyerror' , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
411 ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),
413 ('sdXerror' , 'sd\w: Current: sense key: Medium Error'),
414 ('ext3error' , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),
416 ('floppytimeout','floppy0: floppy timeout called'),
417 ('floppyerror', 'end_request: I/O error, dev fd\w+, sector \d+'),
419 # hda: dma_intr: status=0x51 { DriveReady SeekComplete Error }
420 # hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263
422 # floppy0: floppy timeout called
423 # end_request: I/O error, dev fd0, sector 0
425 # Buffer I/O error on device dm-2, logical block 8888896
426 # ata1: status=0x51 { DriveReady SeekComplete Error }
427 # ata1: error=0x40 { UncorrectableError }
428 # SCSI error : <0 0 0 0> return code = 0x8000002
429 # sda: Current: sense key: Medium Error
430 # Additional sense: Unrecovered read error - auto reallocate failed
432 # SCSI error : <0 2 0 0> return code = 0x40001
433 # end_request: I/O error, dev sda, sector 572489600
435 id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
442 if config and not config.quiet: print "\tSET: ", s
# Drive errors found: a lone floppy error is tolerated; anything else
# triggers a "bad disk" mail to the site techs and disables the node.
445 print "...Potential drive errors on %s" % node
446 if len(s) == 2 and 'floppyerror' in s:
447 print "...Should investigate. Continuing with node."
449 print "...Should investigate. Skipping node."
450 # TODO: send message related to these errors.
452 args['hostname'] = hostname
453 args['log'] = conn.get_dmesg().read()
455 m = PersistMessage(hostname, mailtxt.baddisk[0] % args,
456 mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
458 loginbase = plc.siteId(hostname)
459 emails = plc.getTechEmails(loginbase)
461 conn.set_nodestate('disable')
# --- reboot(): bm.log scan (interior of reboot). The (id, pattern) table
# below maps BootManager log lines to short step ids; the expect loop that
# accumulates matched ids into `sequence` is partially missing from this
# extraction (numbering gaps around lines 533-540). ---
464 print "...Downloading bm.log from %s" % node
465 log = conn.get_bootmanager_log()
466 child = fdpexpect.fdspawn(log)
# --collect mode: logs downloaded, nothing else to do.
469 if config.collect: return True
475 if config and not config.quiet: print "...Scanning bm.log for errors"
481 ('bminit' , 'Initializing the BootManager.'),
482 ('cfg' , 'Reading node configuration file.'),
483 ('auth' , 'Authenticating node with PLC.'),
484 ('getplc' , 'Retrieving details of node from PLC.'),
485 ('update' , 'Updating node boot state at PLC.'),
486 ('hardware' , 'Checking if hardware requirements met.'),
487 ('installinit' , 'Install: Initializing.'),
488 ('installdisk' , 'Install: partitioning disks.'),
489 ('installbootfs', 'Install: bootstrapfs tarball.'),
490 ('installcfg' , 'Install: Writing configuration files.'),
491 ('installstop' , 'Install: Shutting down installer.'),
492 ('update2' , 'Updating node boot state at PLC.'),
493 ('installinit2' , 'Install: Initializing.'),
494 ('validate' , 'Validating node installation.'),
495 ('rebuildinitrd', 'Rebuilding initrd'),
496 ('netcfg' , 'Install: Writing Network Configuration files.'),
497 ('update3' , 'Updating node configuration.'),
498 ('disk' , 'Checking for unused disks to add to LVM.'),
499 ('update4' , 'Sending hardware configuration to PLC.'),
500 ('debug' , 'Starting debug mode'),
501 ('bmexceptmount', 'BootManagerException during mount'),
502 ('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'),
503 ('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
504 ('exception' , 'Exception'),
505 ('nocfg' , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
506 ('protoerror' , 'XML RPC protocol error'),
507 ('nodehostname' , 'Configured node hostname does not resolve'),
508 ('implementerror', 'Implementation Error'),
509 ('readonlyfs' , '[Errno 30] Read-only file system'),
510 ('baddisk' , "IOError: [Errno 13] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"),
511 ('noinstall' , 'notinstalled'),
512 ('bziperror' , 'bzip2: Data integrity error when decompressing.'),
513 ('noblockdev' , "No block devices detected."),
514 ('dnserror' , 'Name or service not known'),
515 ('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
516 ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
517 ('hardwarerequirefail' , 'Hardware requirements not met'),
518 ('mkfsfail' , 'while running: Running mkfs.ext2 -q -m 0 -j /dev/planetlab/vservers failed'),
519 ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
520 ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
521 ('chrootfail' , 'Running chroot /tmp/mnt/sysimg'),
522 ('modulefail' , 'Unable to get list of system modules'),
523 ('writeerror' , 'write error: No space left on device'),
524 ('nospace' , "No space left on device"),
525 ('nonode' , 'Failed to authenticate call: No such node'),
526 ('authfail' , 'Failed to authenticate call: Call could not be authenticated'),
527 ('bootcheckfail' , 'BootCheckAuthentication'),
528 ('bootupdatefail' , 'BootUpdateNode'),
# NOTE(review): `list` shadows the builtin; kept as-is (doc-only pass).
530 list = steps_to_list(steps)
531 index = child.expect( list + [ pexpect.EOF ])
532 id = index_to_id(steps,index)
535 if id == "exception":
536 if config and not config.quiet: print "...Found An Exception!!!"
537 elif index == len(list):
# The matched step ids joined with '-' form the sequence key used below.
541 s = "-".join(sequence)
542 print " FOUND SEQUENCE: ", s
# --- reboot(): sequence classification tables (interior of reboot).
# Known bm.log step sequences are mapped to an action name; the list
# closers (`]:` lines) fall in numbering gaps of this extraction. ---
544 # NOTE: We get or set the flag based on the current sequence identifier.
545 # By using the sequence identifier, we guarantee that there will be no
546 # frequent loops. I'm guessing there is a better way to track loops,
548 #if not config.force and pflags.getRecentFlag(s):
549 # pflags.setRecentFlag(s)
551 # print "... flag is set or it has already run recently. Skipping %s" % node
557 # restart_bootmanager_boot
558 for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
559 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
560 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",
562 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done",
564 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
565 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
566 "bminit-cfg-auth-getplc-update-debug-done",
567 "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
568 "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
569 "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
570 "bminit-cfg-auth-protoerror-exception-update-debug-done",
571 "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
572 "bminit-cfg-auth-getplc-implementerror-update-debug-done",
574 sequences.update({n : "restart_bootmanager_boot"})
576 # conn.restart_bootmanager('rins')
577 for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
578 "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
579 "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
580 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
581 "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
582 "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done",
583 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
584 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
585 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
586 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
587 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
588 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
589 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
590 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done",
591 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
592 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
593 "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
594 # actual solution appears to involve removing the bad files, and
595 # continually trying to boot the node.
596 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
598 sequences.update({n : "restart_bootmanager_rins"})
# Node key mismatch at PLC: repair keys, then retry install.
601 sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"})
603 # conn.restart_node('rins')
604 for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
605 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
606 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
607 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done",
608 "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
609 "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
610 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done",
611 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done",
612 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
613 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
614 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
615 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
617 sequences.update({n : "restart_node_rins"})
620 for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done",
621 "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
622 "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
623 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
624 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
625 "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
627 sequences.update({n: "restart_node_boot"})
629 # update_node_config_email
630 for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
631 "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
632 "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
634 sequences.update({n : "update_node_config_email"})
636 for n in [ "bminit-cfg-exception-nodehostname-update-debug-done",
637 "bminit-cfg-update-exception-nodehostname-update-debug-done",
639 sequences.update({n : "nodenetwork_email"})
641 # update_bootcd_email
642 for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
643 "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
644 "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
645 "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
646 "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
648 sequences.update({n : "update_bootcd_email"})
650 for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
652 sequences.update({n: "suspect_error_email"})
654 # update_hardware_email
655 sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
656 sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
658 # broken_hardware_email
659 sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})
# bad_dns_email sequences (the `for n in [` opener around orig 661-662 is
# missing from this extraction).
663 "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
664 "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
666 sequences.update( { n : "bad_dns_email"})
# --- reboot(): dispatch on the classified sequence (interior of reboot).
# Unknown sequences are mailed to config.cc_email and the BootManager is
# simply restarted; known sequences trigger the mapped repair/notify
# action. Several m.send(emails) lines and return statements fall in
# numbering gaps of this extraction. ---
671 if s not in sequences:
672 print " HOST %s" % hostname
673 print " UNKNOWN SEQUENCE: %s" % s
676 args['hostname'] = hostname
678 args['bmlog'] = conn.get_bootmanager_log().read()
679 m = PersistMessage(hostname, mailtxt.unknownsequence[0] % args,
680 mailtxt.unknownsequence[1] % args, False, db='unknown_persistmessages')
682 m.send([config.cc_email])
684 conn.restart_bootmanager('boot')
686 # NOTE: Do not set the pflags value for this sequence if it's unknown.
687 # This way, we can check it again after we've fixed it.
692 if sequences[s] == "restart_bootmanager_boot":
693 if config and not config.quiet: print "...Restarting BootManager.py on %s "% node
694 conn.restart_bootmanager('boot')
695 elif sequences[s] == "restart_bootmanager_rins":
696 if config and not config.quiet: print "...Restarting BootManager.py on %s "% node
697 conn.restart_bootmanager('rins')
698 elif sequences[s] == "restart_node_rins":
699 conn.restart_node('rins')
700 elif sequences[s] == "restart_node_boot":
701 conn.restart_node('boot')
702 elif sequences[s] == "repair_node_keys":
703 if conn.compare_and_repair_nodekeys():
704 # the keys either are in sync or were forced in sync.
705 # so try to reboot the node again.
706 conn.restart_bootmanager('rins')
709 # there was some failure to synchronize the keys.
710 print "...Unable to repair node keys on %s" % node
712 elif sequences[s] == "suspect_error_email":
714 args['hostname'] = hostname
716 args['bmlog'] = conn.get_bootmanager_log().read()
717 m = PersistMessage(hostname, "Suspicous error from BootManager on %s" % args,
718 mailtxt.unknownsequence[1] % args, False, db='suspect_persistmessages')
720 m.send([config.cc_email])
722 conn.restart_bootmanager('boot')
724 elif sequences[s] == "update_node_config_email":
725 print "...Sending message to UPDATE NODE CONFIG"
727 args['hostname'] = hostname
728 m = PersistMessage(hostname, mailtxt.plnode_cfg[0] % args, mailtxt.plnode_cfg[1] % args,
729 True, db='nodeid_persistmessages')
730 loginbase = plc.siteId(hostname)
731 emails = plc.getTechEmails(loginbase)
733 conn.dump_plconf_file()
734 conn.set_nodestate('disable')
736 elif sequences[s] == "nodenetwork_email":
737 print "...Sending message to LOOK AT NODE NETWORK"
739 args['hostname'] = hostname
740 args['bmlog'] = conn.get_bootmanager_log().read()
741 m = PersistMessage(hostname, mailtxt.plnode_cfg[0] % args, mailtxt.plnode_cfg[1] % args,
742 True, db='nodenet_persistmessages')
743 loginbase = plc.siteId(hostname)
744 emails = plc.getTechEmails(loginbase)
746 conn.dump_plconf_file()
747 conn.set_nodestate('disable')
749 elif sequences[s] == "update_bootcd_email":
750 print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
753 args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
754 args['hostname_list'] = "%s" % hostname
756 m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname,
757 mailtxt.newalphacd_one[1] % args, True, db='bootcd_persistmessages')
759 loginbase = plc.siteId(hostname)
760 emails = plc.getTechEmails(loginbase)
763 print "\tDisabling %s due to out-of-date BOOTCD" % hostname
764 conn.set_nodestate('disable')
766 elif sequences[s] == "broken_hardware_email":
767 # MAKE An ACTION record that this host has failed hardware. May
768 # require either an exception "/minhw" or other manual intervention.
769 # Definitely need to send out some more EMAIL.
770 print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
771 # TODO: email notice of broken hardware
773 args['hostname'] = hostname
774 args['log'] = conn.get_dmesg().read()
775 m = PersistMessage(hostname, mailtxt.baddisk[0] % args,
776 mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
778 loginbase = plc.siteId(hostname)
779 emails = plc.getTechEmails(loginbase)
781 conn.set_nodestate('disable')
783 elif sequences[s] == "update_hardware_email":
784 print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
786 args['hostname'] = hostname
787 args['bmlog'] = conn.get_bootmanager_log().read()
788 m = PersistMessage(hostname, mailtxt.minimalhardware[0] % args,
789 mailtxt.minimalhardware[1] % args, True, db='minhardware_persistmessages')
791 loginbase = plc.siteId(hostname)
792 emails = plc.getTechEmails(loginbase)
794 conn.set_nodestate('disable')
796 elif sequences[s] == "bad_dns_email":
797 print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
# Look up the node's network record at PLC to include its config in the
# mail; the try/except wrapping (orig 798-808) is partially missing here.
800 node = api.GetNodes(hostname)[0]
801 net = api.GetNodeNetworks(node['nodenetwork_ids'])[0]
803 from monitor.common import email_exception
805 print traceback.print_exc()
806 # TODO: api error. skip email, b/c all info is not available,
807 # flag_set will not be recorded.
809 nodenet_str = network_config_to_str(net)
811 args['hostname'] = hostname
812 args['network_config'] = nodenet_str
813 args['nodenetwork_id'] = net['nodenetwork_id']
814 m = PersistMessage(hostname, mailtxt.baddns[0] % args,
815 mailtxt.baddns[1] % args, True, db='baddns_persistmessages')
817 loginbase = plc.siteId(hostname)
818 emails = plc.getTechEmails(loginbase)
820 conn.set_nodestate('disable')
# Record that this known sequence was handled, to damp repeat runs.
823 pflags.setRecentFlag(s)
829 # MAIN -------------------------------------------------------------------
# Command-line entry: build the option parser, read the node list from
# config, and (in the __main__ guard, whose body lies beyond this chunk)
# presumably run reboot() over the selected nodes.
832 from monitor import parser as parsermodule
833 parser = parsermodule.getParser()
835 parser.set_defaults(child=False, collect=False, nosetup=False, verbose=False,
836 force=None, quiet=False)
837 parser.add_option("", "--child", dest="child", action="store_true",
838 help="This is the child mode of this process.")
839 parser.add_option("", "--force", dest="force", metavar="boot_state",
840 help="Force a boot state passed to BootManager.py.")
841 parser.add_option("", "--quiet", dest="quiet", action="store_true",
842 help="Extra quiet output messages.")
843 parser.add_option("", "--verbose", dest="verbose", action="store_true",
844 help="Extra debug output messages.")
845 parser.add_option("", "--nonet", dest="nonet", action="store_true",
846 help="Do not setup the network, use existing log files to re-run a test pass.")
847 parser.add_option("", "--collect", dest="collect", action="store_true",
848 help="No action, just collect dmesg, and bm.log")
849 parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
850 help="Do not perform the orginary setup phase.")
852 parser = parsermodule.getParser(['nodesets', 'defaults'], parser)
853 config = parsermodule.parse_args(parser)
# Either a node-list file or a single --node (branching lines missing
# from this extraction).
856 nodes = config.getListFromFile(config.nodelist)
858 nodes = [ config.node ]
866 if __name__ == "__main__":