3 # Attempt to reboot a node in debug state.
5 from monitor import const
6 from monitor.database.info.model import *
7 from monitor.wrapper import plc
13 from getsshkeys import SSHKnownHosts
17 from monitor.util import command as moncommands
20 from pcucontrol.transports.ssh import pxssh as pxssh
21 from pcucontrol.transports.ssh import fdpexpect as fdpexpect
22 from pcucontrol.transports.ssh import pexpect as pexpect
23 from monitor.model import *
24 from monitor.wrapper.emailTxt import mailtxt
25 from nodeconfig import network_config_to_str
27 from monitor import config
class Sopen(subprocess.Popen):
    """subprocess.Popen variant whose kill() can send an arbitrary signal.

    The stock Popen.kill() (Python >= 2.6) always sends SIGKILL; callers here
    (e.g. the SSH-tunnel teardown) want a configurable, gentler default.
    """
    # NOTE: the parameter name shadows the `signal` module inside this method.
    # The default value is evaluated once at class-definition time (while the
    # module is still visible), so the shadowing is harmless as written — but
    # the module itself cannot be referenced from within the method body.
    def kill(self, signal = signal.SIGTERM):
        os.kill(self.pid, signal)
34 #from Rpyc import SocketConnection, Async
35 from Rpyc import SocketConnection, Async
36 from Rpyc.Utils import *
40 def __init__(self, connection, node, config):
45 def get_boot_state(self):
46 if self.c.modules.os.path.exists('/tmp/source'):
48 elif self.c.modules.os.path.exists('/vservers'):
54 self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
55 download(self.c, "/var/log/dmesg.bm.log", "log/dmesg.%s.log" % self.node)
56 log = open("log/dmesg.%s.log" % self.node, 'r')
59 def get_bootmanager_log(self):
60 download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node)
61 os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
62 log = open("log/bm.%s.log" % self.node, 'r')
65 def dump_plconf_file(self):
67 self.c.modules.sys.path.append("/tmp/source/")
68 self.c.modules.os.chdir('/tmp/source')
70 log = c.modules.BootManager.log('/tmp/new.log')
71 bm = c.modules.BootManager.BootManager(log,'boot')
73 BootManagerException = c.modules.Exceptions.BootManagerException
74 InitializeBootManager = c.modules.BootManager.InitializeBootManager
75 ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
78 InitializeBootManager.Run(bm.VARS, bm.LOG)
79 try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
83 print " Possibly, unable to find valid configuration file"
85 if bm_continue and self.config and not self.config.quiet:
86 for key in bm.VARS.keys():
87 print key, " == ", bm.VARS[key]
89 if self.config and not self.config.quiet: print " Unable to read Node Configuration"
92 def compare_and_repair_nodekeys(self):
94 self.c.modules.sys.path.append("/tmp/source/")
95 self.c.modules.os.chdir('/tmp/source')
97 log = c.modules.BootManager.log('/tmp/new.log')
98 bm = c.modules.BootManager.BootManager(log,'boot')
100 BootManagerException = c.modules.Exceptions.BootManagerException
101 InitializeBootManager = c.modules.BootManager.InitializeBootManager
102 ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
105 plcnode = api.GetNodes({'hostname': self.node}, None)[0]
107 InitializeBootManager.Run(bm.VARS, bm.LOG)
108 try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
113 print " Possibly, unable to find valid configuration file"
116 print " NODE: %s" % bm.VARS['NODE_KEY']
117 print " PLC : %s" % plcnode['key']
119 if bm.VARS['NODE_KEY'] == plcnode['key']:
122 if api.UpdateNode(self.node, {'key': bm.VARS['NODE_KEY']}):
123 print " Successfully updated NODE_KEY with PLC"
128 #for key in bm.VARS.keys():
129 # print key, " == ", bm.VARS[key]
131 print " Unable to retrieve NODE_KEY"
133 def bootmanager_running(self):
134 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
def set_nodestate(self, state='boot'):
    """Set this node's PLC boot_state to `state` (default 'boot').

    Returns whatever api.UpdateNode() returns (truthy on success).
    """
    new_fields = {'boot_state': state}
    return api.UpdateNode(self.node, new_fields)
142 def restart_node(self, state='boot'):
143 api.UpdateNode(self.node, {'boot_state' : state})
145 pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags')
146 if not pflags.getRecentFlag('gentlekill'):
147 print " Killing all slice processes... : %s" % self.node
148 cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
149 self.c.modules.os.system(cmd_slicekill)
150 cmd = """ shutdown -r +1 & """
151 print " Restarting %s : %s" % ( self.node, cmd)
152 self.c.modules.os.system(cmd)
154 pflags.setRecentFlag('gentlekill')
157 print " Restarting with sysrq 'sub' %s" % self.node
158 cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """
159 self.c.modules.os.system(cmd)
163 def restart_bootmanager(self, forceState):
165 self.c.modules.os.chdir('/tmp/source')
166 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
167 print " BootManager is already running: try again soon..."
169 print " Starting 'BootManager.py %s' on %s " % (forceState, self.node)
170 cmd = "( touch /tmp/BM_RUNNING ; " + \
171 " python ./BootManager.py %s &> server.log < /dev/null ; " + \
172 " rm -f /tmp/BM_RUNNING " + \
174 cmd = cmd % forceState
175 self.c.modules.os.system(cmd)
181 class PlanetLabSession:
182 globalport = 22000 + int(random.random()*1000)
184 def __init__(self, node, nosetup, verbose):
185 self.verbose = verbose
188 self.nosetup = nosetup
def get_connection(self, config):
    """Build a NodeConnection over the local Rpyc tunnel for this session.

    The tunnel endpoint is localhost:self.port, established by setup_host().
    """
    rpyc_socket = SocketConnection("localhost", self.port)
    return NodeConnection(rpyc_socket, self.node, config)
195 def setup_host(self):
196 self.port = PlanetLabSession.globalport
197 PlanetLabSession.globalport = PlanetLabSession.globalport + 1
200 args['port'] = self.port
201 args['user'] = 'root'
202 args['hostname'] = self.node
203 args['monitordir'] = config.MONITOR_SCRIPT_ROOT
207 print "Skipping setup"
210 # COPY Rpyc files to host
211 cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
212 if self.verbose: print cmd
215 localos = moncommands.CMD()
217 ret = localos.system(cmd, timeout)
220 print "\tUNKNOWN SSH KEY FOR %s; making an exception" % self.node
221 #print "MAKE EXPLICIT EXCEPTION FOR %s" % self.node
222 k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k
223 ret = localos.system(cmd, timeout)
226 print "\tFAILED TWICE"
228 raise Exception("Failed twice trying to login with updated ssh host key")
231 # KILL any already running servers.
232 ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port)
233 (ov,ev) = ssh.run_noexcept2("""<<\EOF
235 echo "kill server" >> out.log
236 ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ;
237 echo "export" >> out.log
238 export PYTHONPATH=$HOME ;
239 echo "start server" >> out.log
240 python Rpyc/Servers/forking_server.py &> server.log &
241 echo "done" >> out.log
243 #cmd = """ssh %(user)s@%(hostname)s """ + \
244 # """'ps ax | grep Rpyc | grep -v grep | awk "{print \$1}" | xargs kill 2> /dev/null' """
246 #if self.verbose: print cmd
248 #print localos.system(cmd,timeout)
250 ## START a new rpyc server.
251 #cmd = """ssh -n %(user)s@%(hostname)s "export PYTHONPATH=\$HOME; """ + \
252 # """python Rpyc/Servers/forking_server.py &> server.log < /dev/null &" """
254 #if self.verbose: print cmd
255 #print localos.system(cmd,timeout)
259 # This was tricky to make synchronous. The combination of ssh-clients-4.7p1,
260 # and the following options seems to work well.
261 cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \
262 """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \
263 """-o ConnectTimeout=120 """ + \
264 """-n -N -L %(port)s:localhost:18812 """ + \
265 """%(user)s@%(hostname)s"""
267 if self.verbose: print cmd
268 self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
269 # TODO: the read() here may block indefinitely. Need a better
270 # approach therefore, that includes a timeout.
271 #ret = self.command.stdout.read(5)
272 ret = moncommands.read_t(self.command.stdout, 5)
276 # NOTE: There is still a slight race for machines that are slow...
277 self.timeout = 2*(t2-t1)
278 print "Sleeping for %s sec" % self.timeout
279 time.sleep(self.timeout)
282 if self.command.returncode is not None:
283 print "Failed to establish tunnel!"
284 raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode))
286 raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'")
290 if self.verbose: print "Killing SSH session %s" % self.port
def steps_to_list(steps):
    """Return the pattern labels from a list of (id, label) step pairs.

    The result is fed to pexpect's expect() as the pattern list; the matching
    index is later mapped back to the step id via index_to_id().
    """
    # Comprehension replaces the manual append loop; order is preserved.
    return [label for (_id, label) in steps]
def index_to_id(steps, index):
    """Map a pexpect match index back to the corresponding step id.

    `steps` is a list of (id, pattern) pairs; expect() is called with
    steps_to_list(steps) + [pexpect.EOF], so an index past the end of
    `steps` means EOF was reached.
    """
    if index < len(steps):
        return steps[index][0]
    # presumably the EOF entry — every recorded sequence ends in "-done",
    # so EOF is reported as the terminal "done" step.
    return "done"
306 def reboot(hostname, config=None, forced_action=None):
308 # NOTE: Nothing works if the bootcd is REALLY old.
309 # So, this is the first step.
310 fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
311 if fbnode['category'] == "OLDBOOTCD":
312 print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
314 args['hostname_list'] = " %s" % hostname
316 m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname,
317 mailtxt.newbootcd_one[1] % args, True, db='bootcd_persistmessages')
319 loginbase = plc.siteId(hostname)
320 m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
322 print "\tDisabling %s due to out-of-date BOOTCD" % hostname
323 api.UpdateNode(hostname, {'boot_state' : 'disable'})
327 print "Creating session for %s" % node
328 # update known_hosts file (in case the node has rebooted since last run)
329 if config and not config.quiet: print "...updating known_hosts ssh-rsa key for %s" % node
331 k = SSHKnownHosts(); k.update(node); k.write(); del k
333 print traceback.print_exc()
338 session = PlanetLabSession(node, False, True)
340 session = PlanetLabSession(node, config.nosetup, config.verbose)
342 print "ERROR setting up session for %s" % hostname
343 print traceback.print_exc()
348 conn = session.get_connection(config)
350 # NOTE: sometimes the wait in setup_host() is not long enough.
351 # So, here we try to wait a little longer before giving up entirely.
353 time.sleep(session.timeout*4)
354 conn = session.get_connection(config)
356 print traceback.print_exc()
359 if forced_action == "reboot":
360 conn.restart_node('rins')
363 boot_state = conn.get_boot_state()
364 if boot_state == "boot":
365 print "...Boot state of %s already completed : skipping..." % node
367 elif boot_state == "unknown":
368 print "...Unknown bootstate for %s : skipping..."% node
373 if conn.bootmanager_running():
374 print "...BootManager is currently running. Skipping host %s" % node
379 # conn.restart_bootmanager(config.force)
382 # Read persistent flags, tagged on one week intervals.
383 pflags = PersistFlags(hostname, 3*60*60*24, db='debug_persistflags')
386 if config and not config.quiet: print "...downloading dmesg from %s" % node
387 dmesg = conn.get_dmesg()
388 child = fdpexpect.fdspawn(dmesg)
393 ('scsierror' , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
394 ('ioerror' , 'end_request: I/O error, dev sd\w+, sector \d+'),
395 ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION byte \w+ = \w+'),
397 ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
399 ('hdaseekerror', 'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'),
400 ('hdacorrecterror', 'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'),
402 ('atareadyerror' , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
403 ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),
405 ('sdXerror' , 'sd\w: Current: sense key: Medium Error'),
406 ('ext3error' , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),
408 ('floppytimeout','floppy0: floppy timeout called'),
409 ('floppyerror', 'end_request: I/O error, dev fd\w+, sector \d+'),
411 # hda: dma_intr: status=0x51 { DriveReady SeekComplete Error }
412 # hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263
414 # floppy0: floppy timeout called
415 # end_request: I/O error, dev fd0, sector 0
417 # Buffer I/O error on device dm-2, logical block 8888896
418 # ata1: status=0x51 { DriveReady SeekComplete Error }
419 # ata1: error=0x40 { UncorrectableError }
420 # SCSI error : <0 0 0 0> return code = 0x8000002
421 # sda: Current: sense key: Medium Error
422 # Additional sense: Unrecovered read error - auto reallocate failed
424 # SCSI error : <0 2 0 0> return code = 0x40001
425 # end_request: I/O error, dev sda, sector 572489600
427 id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
434 if config and not config.quiet: print "\tSET: ", s
437 print "...Potential drive errors on %s" % node
438 if len(s) == 2 and 'floppyerror' in s:
439 print "...Should investigate. Continuing with node."
441 print "...Should investigate. Skipping node."
442 # TODO: send message related to these errors.
444 args['hostname'] = hostname
445 args['log'] = conn.get_dmesg().read()
447 m = PersistMessage(hostname, mailtxt.baddisk[0] % args,
448 mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
450 loginbase = plc.siteId(hostname)
451 m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
452 conn.set_nodestate('disable')
455 print "...Downloading bm.log from %s" % node
456 log = conn.get_bootmanager_log()
457 child = fdpexpect.fdspawn(log)
460 if config.collect: return True
466 if config and not config.quiet: print "...Scanning bm.log for errors"
472 ('bminit' , 'Initializing the BootManager.'),
473 ('cfg' , 'Reading node configuration file.'),
474 ('auth' , 'Authenticating node with PLC.'),
475 ('getplc' , 'Retrieving details of node from PLC.'),
476 ('update' , 'Updating node boot state at PLC.'),
477 ('hardware' , 'Checking if hardware requirements met.'),
478 ('installinit' , 'Install: Initializing.'),
479 ('installdisk' , 'Install: partitioning disks.'),
480 ('installbootfs', 'Install: bootstrapfs tarball.'),
481 ('installcfg' , 'Install: Writing configuration files.'),
482 ('installstop' , 'Install: Shutting down installer.'),
483 ('update2' , 'Updating node boot state at PLC.'),
484 ('installinit2' , 'Install: Initializing.'),
485 ('validate' , 'Validating node installation.'),
486 ('rebuildinitrd', 'Rebuilding initrd'),
487 ('netcfg' , 'Install: Writing Network Configuration files.'),
488 ('update3' , 'Updating node configuration.'),
489 ('disk' , 'Checking for unused disks to add to LVM.'),
490 ('update4' , 'Sending hardware configuration to PLC.'),
491 ('debug' , 'Starting debug mode'),
492 ('bmexceptmount', 'BootManagerException during mount'),
493 ('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'),
494 ('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
495 ('exception' , 'Exception'),
496 ('nocfg' , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
497 ('protoerror' , 'XML RPC protocol error'),
498 ('nodehostname' , 'Configured node hostname does not resolve'),
499 ('implementerror', 'Implementation Error'),
500 ('readonlyfs' , '[Errno 30] Read-only file system'),
501 ('baddisk' , "IOError: [Errno 13] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"),
502 ('noinstall' , 'notinstalled'),
503 ('bziperror' , 'bzip2: Data integrity error when decompressing.'),
504 ('noblockdev' , "No block devices detected."),
505 ('dnserror' , 'Name or service not known'),
506 ('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
507 ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
508 ('hardwarerequirefail' , 'Hardware requirements not met'),
509 ('mkfsfail' , 'while running: Running mkfs.ext2 -q -m 0 -j /dev/planetlab/vservers failed'),
510 ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
511 ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
512 ('chrootfail' , 'Running chroot /tmp/mnt/sysimg'),
513 ('modulefail' , 'Unable to get list of system modules'),
514 ('writeerror' , 'write error: No space left on device'),
515 ('nospace' , "No space left on device"),
516 ('nonode' , 'Failed to authenticate call: No such node'),
517 ('authfail' , 'Failed to authenticate call: Call could not be authenticated'),
518 ('bootcheckfail' , 'BootCheckAuthentication'),
519 ('bootupdatefail' , 'BootUpdateNode'),
521 list = steps_to_list(steps)
522 index = child.expect( list + [ pexpect.EOF ])
523 id = index_to_id(steps,index)
526 if id == "exception":
527 if config and not config.quiet: print "...Found An Exception!!!"
528 elif index == len(list):
532 s = "-".join(sequence)
533 print " FOUND SEQUENCE: ", s
535 # NOTE: We get or set the flag based on the current sequence identifier.
536 # By using the sequence identifier, we guarantee that there will be no
537 # frequent loops. I'm guessing there is a better way to track loops,
539 #if not config.force and pflags.getRecentFlag(s):
540 # pflags.setRecentFlag(s)
542 # print "... flag is set or it has already run recently. Skipping %s" % node
548 # restart_bootmanager_boot
549 for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
550 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
551 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",
553 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done",
555 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
556 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
557 "bminit-cfg-auth-getplc-update-debug-done",
558 "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
559 "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
560 "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
561 "bminit-cfg-auth-protoerror-exception-update-debug-done",
562 "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
563 "bminit-cfg-auth-getplc-implementerror-update-debug-done",
565 sequences.update({n : "restart_bootmanager_boot"})
567 # conn.restart_bootmanager('rins')
568 for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
569 "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
570 "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
571 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
572 "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
573 "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done",
574 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
575 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
576 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
577 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
578 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
579 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
580 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
581 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done",
582 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
583 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
584 "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
585 # actual solution appears to involve removing the bad files, and
586 # continually trying to boot the node.
587 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
589 sequences.update({n : "restart_bootmanager_rins"})
592 sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"})
594 # conn.restart_node('rins')
595 for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
596 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
597 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
598 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done",
599 "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
600 "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
601 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done",
602 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done",
603 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
604 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
605 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
606 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
608 sequences.update({n : "restart_node_rins"})
611 for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done",
612 "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
613 "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
614 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
615 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
617 sequences.update({n: "restart_node_boot"})
619 # update_node_config_email
620 for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
621 "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
622 "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
624 sequences.update({n : "update_node_config_email"})
626 for n in [ "bminit-cfg-exception-nodehostname-update-debug-done",
627 "bminit-cfg-update-exception-nodehostname-update-debug-done",
629 sequences.update({n : "nodenetwork_email"})
631 # update_bootcd_email
632 for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
633 "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
634 "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
635 "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
636 "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
638 sequences.update({n : "update_bootcd_email"})
640 for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
642 sequences.update({n: "suspect_error_email"})
644 # update_hardware_email
645 sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
646 sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
648 # broken_hardware_email
649 sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})
653 "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
654 "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
656 sequences.update( { n : "bad_dns_email"})
661 if s not in sequences:
662 print " HOST %s" % hostname
663 print " UNKNOWN SEQUENCE: %s" % s
666 args['hostname'] = hostname
668 args['bmlog'] = conn.get_bootmanager_log().read()
669 m = PersistMessage(hostname, mailtxt.unknownsequence[0] % args,
670 mailtxt.unknownsequence[1] % args, False, db='unknown_persistmessages')
672 m.send(['monitor-list@lists.planet-lab.org'])
674 conn.restart_bootmanager('boot')
676 # NOTE: Do not set the pflags value for this sequence if it's unknown.
677 # This way, we can check it again after we've fixed it.
682 if sequences[s] == "restart_bootmanager_boot":
683 if config and not config.quiet: print "...Restarting BootManager.py on %s "% node
684 conn.restart_bootmanager('boot')
685 elif sequences[s] == "restart_bootmanager_rins":
686 if config and not config.quiet: print "...Restarting BootManager.py on %s "% node
687 conn.restart_bootmanager('rins')
688 elif sequences[s] == "restart_node_rins":
689 conn.restart_node('rins')
690 elif sequences[s] == "restart_node_boot":
691 conn.restart_node('boot')
692 elif sequences[s] == "repair_node_keys":
693 if conn.compare_and_repair_nodekeys():
694 # the keys either are in sync or were forced in sync.
695 # so try to reboot the node again.
696 conn.restart_bootmanager('rins')
699 # there was some failure to synchronize the keys.
700 print "...Unable to repair node keys on %s" % node
702 elif sequences[s] == "suspect_error_email":
704 args['hostname'] = hostname
706 args['bmlog'] = conn.get_bootmanager_log().read()
707 m = PersistMessage(hostname, "Suspicous error from BootManager on %s" % args,
708 mailtxt.unknownsequence[1] % args, False, db='suspect_persistmessages')
710 m.send(['monitor-list@lists.planet-lab.org'])
712 conn.restart_bootmanager('boot')
714 elif sequences[s] == "update_node_config_email":
715 print "...Sending message to UPDATE NODE CONFIG"
717 args['hostname'] = hostname
718 m = PersistMessage(hostname, mailtxt.plnode_cfg[0] % args, mailtxt.plnode_cfg[1] % args,
719 True, db='nodeid_persistmessages')
720 loginbase = plc.siteId(hostname)
721 m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
722 conn.dump_plconf_file()
723 conn.set_nodestate('disable')
725 elif sequences[s] == "nodenetwork_email":
726 print "...Sending message to LOOK AT NODE NETWORK"
728 args['hostname'] = hostname
729 args['bmlog'] = conn.get_bootmanager_log().read()
730 m = PersistMessage(hostname, mailtxt.plnode_network[0] % args, mailtxt.plnode_cfg[1] % args,
731 True, db='nodenet_persistmessages')
732 loginbase = plc.siteId(hostname)
733 m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
734 conn.dump_plconf_file()
735 conn.set_nodestate('disable')
737 elif sequences[s] == "update_bootcd_email":
738 print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
741 args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
742 args['hostname_list'] = "%s" % hostname
744 m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname,
745 mailtxt.newalphacd_one[1] % args, True, db='bootcd_persistmessages')
747 loginbase = plc.siteId(hostname)
748 m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
750 print "\tDisabling %s due to out-of-date BOOTCD" % hostname
751 conn.set_nodestate('disable')
753 elif sequences[s] == "broken_hardware_email":
754 # MAKE An ACTION record that this host has failed hardware. May
755 # require either an exception "/minhw" or other manual intervention.
756 # Definitely need to send out some more EMAIL.
757 print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
758 # TODO: email notice of broken hardware
760 args['hostname'] = hostname
761 args['log'] = conn.get_dmesg().read()
762 m = PersistMessage(hostname, mailtxt.baddisk[0] % args,
763 mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
765 loginbase = plc.siteId(hostname)
766 m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
767 conn.set_nodestate('disable')
769 elif sequences[s] == "update_hardware_email":
770 print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
772 args['hostname'] = hostname
773 args['bmlog'] = conn.get_bootmanager_log().read()
774 m = PersistMessage(hostname, mailtxt.minimalhardware[0] % args,
775 mailtxt.minimalhardware[1] % args, True, db='minhardware_persistmessages')
777 loginbase = plc.siteId(hostname)
778 m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
779 conn.set_nodestate('disable')
781 elif sequences[s] == "bad_dns_email":
782 print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
785 node = api.GetNodes(hostname)[0]
786 net = api.GetNodeNetworks(node['nodenetwork_ids'])[0]
788 print traceback.print_exc()
789 # TODO: api error. skip email, b/c all info is not available,
790 # flag_set will not be recorded.
792 nodenet_str = network_config_to_str(net)
794 args['hostname'] = hostname
795 args['network_config'] = nodenet_str
796 args['nodenetwork_id'] = net['nodenetwork_id']
797 m = PersistMessage(hostname, mailtxt.baddns[0] % args,
798 mailtxt.baddns[1] % args, True, db='baddns_persistmessages')
800 loginbase = plc.siteId(hostname)
801 m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
802 conn.set_nodestate('disable')
805 pflags.setRecentFlag(s)
811 # MAIN -------------------------------------------------------------------
814 from monitor import parser as parsermodule
815 parser = parsermodule.getParser()
817 parser.set_defaults(child=False, collect=False, nosetup=False, verbose=False,
818 force=None, quiet=False)
819 parser.add_option("", "--child", dest="child", action="store_true",
820 help="This is the child mode of this process.")
821 parser.add_option("", "--force", dest="force", metavar="boot_state",
822 help="Force a boot state passed to BootManager.py.")
823 parser.add_option("", "--quiet", dest="quiet", action="store_true",
824 help="Extra quiet output messages.")
825 parser.add_option("", "--verbose", dest="verbose", action="store_true",
826 help="Extra debug output messages.")
827 parser.add_option("", "--nonet", dest="nonet", action="store_true",
828 help="Do not setup the network, use existing log files to re-run a test pass.")
829 parser.add_option("", "--collect", dest="collect", action="store_true",
830 help="No action, just collect dmesg, and bm.log")
831 parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
832 help="Do not perform the orginary setup phase.")
834 parser = parsermodule.getParser(['nodesets', 'defaults'], parser)
835 config = parsermodule.parse_args(parser)
838 nodes = config.getListFromFile(config.nodelist)
840 nodes = [ config.node ]
848 if __name__ == "__main__":