3 # Attempt to reboot a node in debug state.
5 from monitor import const
6 from monitor.database.info.model import *
7 from monitor.wrapper import plc
13 from getsshkeys import SSHKnownHosts
17 from monitor.util import command as moncommands
20 from pcucontrol.transports.ssh import pxssh as pxssh
21 from pcucontrol.transports.ssh import fdpexpect as fdpexpect
22 from pcucontrol.transports.ssh import pexpect as pexpect
23 from monitor.model import *
24 from monitor.wrapper.emailTxt import mailtxt
25 from nodeconfig import network_config_to_str
27 from monitor import config
class Sopen(subprocess.Popen):
    """subprocess.Popen variant that can deliver a signal to its child.

    Provides a kill() method (sends SIGTERM by default) via os.kill on
    the child's pid.
    """

    def kill(self, signal = signal.SIGTERM):
        """Send *signal* (default: SIGTERM) to the spawned process."""
        # The parameter name deliberately stays 'signal' for keyword-arg
        # callers, even though it shadows the `signal` module here.
        target_pid = self.pid
        os.kill(target_pid, signal)
34 #from Rpyc import SocketConnection, Async
35 from Rpyc import SocketConnection, Async
36 from Rpyc.Utils import *
# NodeConnection: wraps a live Rpyc connection to a node together with the
# node's hostname and the run configuration.  (The enclosing class header and
# this constructor's body are elided from this excerpt.)
40 def __init__(self, connection, node, config):
# Classify the node's current boot state by probing its filesystem over Rpyc:
# /tmp/source present => BootManager/debug environment; /vservers present =>
# fully booted node.  (The per-branch return values are elided here.)
45 def get_boot_state(self):
46 if self.c.modules.os.path.exists('/tmp/source'):
48 elif self.c.modules.os.path.exists('/vservers'):
# Fragment of get_dmesg() (its def line is elided from this excerpt):
# dump the node's kernel ring buffer to a file remotely, copy it back into
# the local log/ directory, and open it for reading.
54 self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
55 download(self.c, "/var/log/dmesg.bm.log", "log/dmesg.%s.log" % self.node)
56 log = open("log/dmesg.%s.log" % self.node, 'r')
# Fetch the (gzipped) BootManager log from the node, decompress it locally,
# and open the plain-text copy.  (The return statement is elided here.)
59 def get_bootmanager_log(self):
60 download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node)
# Decompression is done with a local shell zcat rather than the gzip module.
61 os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
62 log = open("log/bm.%s.log" % self.node, 'r')
# Run the BootManager initialization + node-configuration-read steps on the
# node (via Rpyc) and print the resulting bm.VARS configuration dictionary.
65 def dump_plconf_file(self):
67 self.c.modules.sys.path.append("/tmp/source/")
68 self.c.modules.os.chdir('/tmp/source')
# NOTE(review): `c` below appears unbound in this excerpt — presumably a
# `c = self.c` binding exists in the elided lines; verify against full source.
70 log = c.modules.BootManager.log('/tmp/new.log')
71 bm = c.modules.BootManager.BootManager(log,'boot')
73 BootManagerException = c.modules.Exceptions.BootManagerException
74 InitializeBootManager = c.modules.BootManager.InitializeBootManager
75 ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
78 InitializeBootManager.Run(bm.VARS, bm.LOG)
# ReadNodeConfiguration may raise BootManagerException; the except clause and
# the assignment of `bm_continue` are elided from this excerpt.
79 try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
83 print " Possibly, unable to find valid configuration file"
85 if bm_continue and self.config and not self.config.quiet:
86 for key in bm.VARS.keys():
87 print key, " == ", bm.VARS[key]
89 if self.config and not self.config.quiet: print " Unable to read Node Configuration"
# Compare the node's on-disk NODE_KEY (read via BootManager on the node) with
# the key PLC has on record; if they differ, push the node's key to PLC with
# api.UpdateNode.  Returns truth of "keys in sync" (returns elided here).
92 def compare_and_repair_nodekeys(self):
94 self.c.modules.sys.path.append("/tmp/source/")
95 self.c.modules.os.chdir('/tmp/source')
# NOTE(review): as in dump_plconf_file, `c` appears unbound in this excerpt —
# presumably `c = self.c` in the elided lines; verify.
97 log = c.modules.BootManager.log('/tmp/new.log')
98 bm = c.modules.BootManager.BootManager(log,'boot')
100 BootManagerException = c.modules.Exceptions.BootManagerException
101 InitializeBootManager = c.modules.BootManager.InitializeBootManager
102 ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
# PLC's view of this node, fetched by hostname.
105 plcnode = api.GetNodes({'hostname': self.node}, None)[0]
107 InitializeBootManager.Run(bm.VARS, bm.LOG)
108 try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
113 print " Possibly, unable to find valid configuration file"
116 print " NODE: %s" % bm.VARS['NODE_KEY']
117 print " PLC : %s" % plcnode['key']
119 if bm.VARS['NODE_KEY'] == plcnode['key']:
122 if api.UpdateNode(self.node, {'key': bm.VARS['NODE_KEY']}):
123 print " Successfully updated NODE_KEY with PLC"
128 #for key in bm.VARS.keys():
129 # print key, " == ", bm.VARS[key]
131 print " Unable to retrieve NODE_KEY"
# True iff the node reports an in-progress BootManager run, signalled by the
# /tmp/BM_RUNNING marker file (created/removed by restart_bootmanager).
# (The return statements are elided from this excerpt.)
133 def bootmanager_running(self):
134 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
def set_nodestate(self, state='boot'):
    """Record *state* as this node's boot_state at PLC.

    Returns whatever api.UpdateNode returns (truthy on success).
    """
    fields = {'boot_state': state}
    return api.UpdateNode(self.node, fields)
# Reboot the node, escalating over repeated attempts: first try a gentle
# reboot (kill slice processes, then `shutdown -r`); if a gentle kill was
# already attempted within the last day (tracked via PersistFlags), force a
# reboot through the kernel's sysrq trigger (sync, unmount, boot).
142 def restart_node(self, state='boot'):
143 api.UpdateNode(self.node, {'boot_state' : state})
# One-day flag window: remembers whether 'gentlekill' was tried recently.
145 pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags')
146 if not pflags.getRecentFlag('gentlekill'):
147 print " Killing all slice processes... : %s" % self.node
# vkill -s 9 every vserver context found under /proc/virtual.
148 cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
149 self.c.modules.os.system(cmd_slicekill)
150 cmd = """ shutdown -r +1 & """
151 print " Restarting %s : %s" % ( self.node, cmd)
152 self.c.modules.os.system(cmd)
154 pflags.setRecentFlag('gentlekill')
# Escalation path: 's'ync, 'u'nmount, re'b'oot via /proc/sysrq-trigger.
157 print " Restarting with sysrq 'sub' %s" % self.node
158 cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """
159 self.c.modules.os.system(cmd)
# Re-run BootManager.py on the node with the given forced state (e.g. 'boot'
# or 'rins'), unless a run is already in progress.  The /tmp/BM_RUNNING
# marker brackets the run so bootmanager_running() can detect it.
163 def restart_bootmanager(self, forceState):
165 self.c.modules.os.chdir('/tmp/source')
166 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
167 print " BootManager is already running: try again soon..."
169 print " Starting 'BootManager.py %s' on %s " % (forceState, self.node)
# The closing piece of this shell command string is elided from this excerpt.
170 cmd = "( touch /tmp/BM_RUNNING ; " + \
171 " python ./BootManager.py %s &> server.log < /dev/null ; " + \
172 " rm -f /tmp/BM_RUNNING " + \
174 cmd = cmd % forceState
175 self.c.modules.os.system(cmd)
# PlanetLabSession: sets up an ssh tunnel to a node and runs an Rpyc server
# on it, so NodeConnection can drive the node remotely.
181 class PlanetLabSession:
# Base local port for ssh tunnels; randomized per process and incremented
# once per session in setup_host() to avoid collisions between runs.
182 globalport = 22000 + int(random.random()*1000)
184 def __init__(self, node, nosetup, verbose):
185 self.verbose = verbose
# When nosetup is true, setup_host() skips the rsync/server-start phase.
188 self.nosetup = nosetup
def get_connection(self, config):
    """Connect to the Rpyc server through the local tunnel endpoint.

    Returns a NodeConnection wrapping the socket, the node's hostname,
    and the run configuration.
    """
    sock = SocketConnection("localhost", self.port)
    return NodeConnection(sock, self.node, config)
# Prepare the node for Rpyc access: copy the Rpyc package over rsync
# (retrying once after refreshing the ssh host key), (re)start the Rpyc
# forking server on the node, then open a local ssh port-forward to it.
# Many intermediate lines (arg dict creation, timeout setup, timing with
# t1/t2, the READY check) are elided from this excerpt.
195 def setup_host(self):
# Claim a unique local port for this session's tunnel.
196 self.port = PlanetLabSession.globalport
197 PlanetLabSession.globalport = PlanetLabSession.globalport + 1
200 args['port'] = self.port
201 args['user'] = 'root'
202 args['hostname'] = self.node
203 args['monitordir'] = config.MONITOR_SCRIPT_ROOT
207 print "Skipping setup"
210 # COPY Rpyc files to host
211 cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
212 if self.verbose: print cmd
215 localos = moncommands.CMD()
217 ret = localos.system(cmd, timeout)
# First failure is assumed to be a stale known_hosts entry: refresh the key
# directly from the node and retry once; give up after the second failure.
220 print "\tUNKNOWN SSH KEY FOR %s; making an exception" % self.node
221 #print "MAKE EXPLICIT EXCEPTION FOR %s" % self.node
222 k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k
223 ret = localos.system(cmd, timeout)
226 print "\tFAILED TWICE"
228 raise Exception("Failed twice trying to login with updated ssh host key")
231 # KILL any already running servers.
232 ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port)
# Heredoc run on the node: kill stale Rpyc servers, then start a fresh
# forking server in the background (logging each step to out.log).
233 (ov,ev) = ssh.run_noexcept2("""<<\EOF
235 echo "kill server" >> out.log
236 ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ;
237 echo "export" >> out.log
238 export PYTHONPATH=$HOME ;
239 echo "start server" >> out.log
240 python Rpyc/Servers/forking_server.py &> server.log &
241 echo "done" >> out.log
243 #cmd = """ssh %(user)s@%(hostname)s """ + \
244 # """'ps ax | grep Rpyc | grep -v grep | awk "{print \$1}" | xargs kill 2> /dev/null' """
246 #if self.verbose: print cmd
248 #print localos.system(cmd,timeout)
250 ## START a new rpyc server.
251 #cmd = """ssh -n %(user)s@%(hostname)s "export PYTHONPATH=\$HOME; """ + \
252 # """python Rpyc/Servers/forking_server.py &> server.log < /dev/null &" """
254 #if self.verbose: print cmd
255 #print localos.system(cmd,timeout)
259 # This was tricky to make synchronous. The combination of ssh-clients-4.7p1,
260 # and the following options seems to work well.
# LocalCommand prints READY on stdout once the forward is up; -N keeps the
# ssh process alive as the tunnel; ExitOnForwardFailure makes failures fatal.
261 cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \
262 """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \
263 """-o ConnectTimeout=120 """ + \
264 """-n -N -L %(port)s:localhost:18812 """ + \
265 """%(user)s@%(hostname)s"""
267 if self.verbose: print cmd
268 self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
269 # TODO: the read() here may block indefinitely. Need a better
270 # approach therefore, that includes a timeout.
271 #ret = self.command.stdout.read(5)
# Wait (with timeout) for the READY marker from LocalCommand.
272 ret = moncommands.read_t(self.command.stdout, 5)
276 # NOTE: There is still a slight race for machines that are slow...
# Timing variables t1/t2 are assigned in elided lines around the connect.
277 self.timeout = 2*(t2-t1)
278 print "Sleeping for %s sec" % self.timeout
279 time.sleep(self.timeout)
# A returncode means the ssh tunnel process has already exited => failure.
282 if self.command.returncode is not None:
283 print "Failed to establish tunnel!"
284 raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode))
286 raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'")
# Fragment of the session teardown method (its def line is elided here).
290 if self.verbose: print "Killing SSH session %s" % self.port
def steps_to_list(steps):
    """Return the pattern/label half of each (id, pattern) step pair.

    `steps` is the ordered list of (id, regex) tuples handed to
    pexpect.expect(); order is preserved so that a match index maps back
    to the same position (see index_to_id).

    The excerpt's original loop-and-append was truncated (missing the
    accumulator initialization and return) and shadowed the builtin `id`;
    a comprehension expresses the same transform directly.
    """
    return [label for (_step_id, label) in steps]
def index_to_id(steps, index):
    """Map a pexpect.expect() match index back to its step id.

    Callers append pexpect.EOF after steps_to_list(steps), so a match
    index equal to len(steps) means end-of-log; that (and any other
    out-of-range index) is reported as "done" — the terminator used by
    every entry in the sequence table below ("...-debug-done").  The
    fallback branch was elided from the excerpt and is restored here.
    """
    if index < len(steps):
        return steps[index][0]
    # EOF / out of range: the scan reached the end of the log.
    return "done"
# reboot(): main entry point for recovering a node stuck in debug state.
# Phases: (1) refuse nodes whose BootCD is too old (email owner, disable);
# (2) open a PlanetLabSession/NodeConnection to the node; (3) scan dmesg for
# hardware errors; (4) scan bm.log, reducing it to a dash-joined "sequence"
# of step ids; (5) dispatch on a sequence -> action table (restart
# BootManager, repair node keys, or notify owners by email).
# NOTE(review): this excerpt elides many original lines; try/except bodies,
# returns, and several assignments (args, sequence, s) are incomplete here.
306 def reboot(hostname, config=None, forced_action=None):
308 # NOTE: Nothing works if the bootcd is REALLY old.
309 # So, this is the first step.
310 fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
311 if fbnode['category'] == "OLDBOOTCD":
312 print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
314 args['hostname_list'] = " %s" % hostname
316 m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname,
317 mailtxt.newbootcd_one[1] % args, True, db='bootcd_persistmessages')
319 loginbase = plc.siteId(hostname)
320 emails = plc.getTechEmails(loginbase)
323 print "\tDisabling %s due to out-of-date BOOTCD" % hostname
324 api.UpdateNode(hostname, {'boot_state' : 'disable'})
# Phase 2: session setup (node alias for hostname assigned in elided lines).
328 print "Creating session for %s" % node
329 # update known_hosts file (in case the node has rebooted since last run)
330 if config and not config.quiet: print "...updating known_hosts ssh-rsa key for %s" % node
332 k = SSHKnownHosts(); k.update(node); k.write(); del k
334 print traceback.print_exc()
339 session = PlanetLabSession(node, False, True)
341 session = PlanetLabSession(node, config.nosetup, config.verbose)
343 print "ERROR setting up session for %s" % hostname
344 print traceback.print_exc()
349 conn = session.get_connection(config)
351 # NOTE: sometimes the wait in setup_host() is not long enough.
352 # So, here we try to wait a little longer before giving up entirely.
354 time.sleep(session.timeout*4)
355 conn = session.get_connection(config)
357 print traceback.print_exc()
360 if forced_action == "reboot":
361 conn.restart_node('rins')
# Nodes already booted or in an unknown state are skipped outright.
364 boot_state = conn.get_boot_state()
365 if boot_state == "boot":
366 print "...Boot state of %s already completed : skipping..." % node
368 elif boot_state == "unknown":
369 print "...Unknown bootstate for %s : skipping..."% node
374 if conn.bootmanager_running():
375 print "...BootManager is currently running. Skipping host %s" % node
380 # conn.restart_bootmanager(config.force)
383 # Read persistent flags, tagged on one week intervals.
# (Despite the comment above, the window below is three days.)
384 pflags = PersistFlags(hostname, 3*60*60*24, db='debug_persistflags')
# Phase 3: scan dmesg for hardware-error signatures (steps list assignment
# header is elided; these tuples are (id, regex) pairs for pexpect).
387 if config and not config.quiet: print "...downloading dmesg from %s" % node
388 dmesg = conn.get_dmesg()
389 child = fdpexpect.fdspawn(dmesg)
394 ('scsierror' , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
395 ('ioerror' , 'end_request: I/O error, dev sd\w+, sector \d+'),
396 ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION byte \w+ = \w+'),
398 ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
400 ('hdaseekerror', 'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'),
401 ('hdacorrecterror', 'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'),
403 ('atareadyerror' , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
404 ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),
406 ('sdXerror' , 'sd\w: Current: sense key: Medium Error'),
407 ('ext3error' , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),
409 ('floppytimeout','floppy0: floppy timeout called'),
410 ('floppyerror', 'end_request: I/O error, dev fd\w+, sector \d+'),
# Sample dmesg lines the patterns above are meant to match:
412 # hda: dma_intr: status=0x51 { DriveReady SeekComplete Error }
413 # hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263
415 # floppy0: floppy timeout called
416 # end_request: I/O error, dev fd0, sector 0
418 # Buffer I/O error on device dm-2, logical block 8888896
419 # ata1: status=0x51 { DriveReady SeekComplete Error }
420 # ata1: error=0x40 { UncorrectableError }
421 # SCSI error : <0 0 0 0> return code = 0x8000002
422 # sda: Current: sense key: Medium Error
423 # Additional sense: Unrecovered read error - auto reallocate failed
425 # SCSI error : <0 2 0 0> return code = 0x40001
426 # end_request: I/O error, dev sda, sector 572489600
# Match loop body (the surrounding loop and the set `s` of matched ids are
# assembled in elided lines).
428 id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
435 if config and not config.quiet: print "\tSET: ", s
438 print "...Potential drive errors on %s" % node
# A lone floppy error (plus 'done') is tolerated; anything else disables the
# node and mails the site's tech contacts the dmesg output.
439 if len(s) == 2 and 'floppyerror' in s:
440 print "...Should investigate. Continuing with node."
442 print "...Should investigate. Skipping node."
443 # TODO: send message related to these errors.
445 args['hostname'] = hostname
446 args['log'] = conn.get_dmesg().read()
448 m = PersistMessage(hostname, mailtxt.baddisk[0] % args,
449 mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
451 loginbase = plc.siteId(hostname)
452 emails = plc.getTechEmails(loginbase)
454 conn.set_nodestate('disable')
# Phase 4: scan bm.log; with --collect we stop after downloading the logs.
457 print "...Downloading bm.log from %s" % node
458 log = conn.get_bootmanager_log()
459 child = fdpexpect.fdspawn(log)
462 if config.collect: return True
468 if config and not config.quiet: print "...Scanning bm.log for errors"
# (id, literal-message) step table for bm.log; the ids below are the tokens
# joined into the "sequence" string that the action table keys on.
474 ('bminit' , 'Initializing the BootManager.'),
475 ('cfg' , 'Reading node configuration file.'),
476 ('auth' , 'Authenticating node with PLC.'),
477 ('getplc' , 'Retrieving details of node from PLC.'),
478 ('update' , 'Updating node boot state at PLC.'),
479 ('hardware' , 'Checking if hardware requirements met.'),
480 ('installinit' , 'Install: Initializing.'),
481 ('installdisk' , 'Install: partitioning disks.'),
482 ('installbootfs', 'Install: bootstrapfs tarball.'),
483 ('installcfg' , 'Install: Writing configuration files.'),
484 ('installstop' , 'Install: Shutting down installer.'),
485 ('update2' , 'Updating node boot state at PLC.'),
486 ('installinit2' , 'Install: Initializing.'),
487 ('validate' , 'Validating node installation.'),
488 ('rebuildinitrd', 'Rebuilding initrd'),
489 ('netcfg' , 'Install: Writing Network Configuration files.'),
490 ('update3' , 'Updating node configuration.'),
491 ('disk' , 'Checking for unused disks to add to LVM.'),
492 ('update4' , 'Sending hardware configuration to PLC.'),
493 ('debug' , 'Starting debug mode'),
494 ('bmexceptmount', 'BootManagerException during mount'),
495 ('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'),
496 ('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
497 ('exception' , 'Exception'),
498 ('nocfg' , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
499 ('protoerror' , 'XML RPC protocol error'),
500 ('nodehostname' , 'Configured node hostname does not resolve'),
501 ('implementerror', 'Implementation Error'),
502 ('readonlyfs' , '[Errno 30] Read-only file system'),
503 ('baddisk' , "IOError: [Errno 13] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"),
504 ('noinstall' , 'notinstalled'),
505 ('bziperror' , 'bzip2: Data integrity error when decompressing.'),
506 ('noblockdev' , "No block devices detected."),
507 ('dnserror' , 'Name or service not known'),
508 ('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
509 ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
510 ('hardwarerequirefail' , 'Hardware requirements not met'),
511 ('mkfsfail' , 'while running: Running mkfs.ext2 -q -m 0 -j /dev/planetlab/vservers failed'),
512 ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
513 ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
514 ('chrootfail' , 'Running chroot /tmp/mnt/sysimg'),
515 ('modulefail' , 'Unable to get list of system modules'),
516 ('writeerror' , 'write error: No space left on device'),
517 ('nospace' , "No space left on device"),
518 ('nonode' , 'Failed to authenticate call: No such node'),
519 ('authfail' , 'Failed to authenticate call: Call could not be authenticated'),
520 ('bootcheckfail' , 'BootCheckAuthentication'),
521 ('bootupdatefail' , 'BootUpdateNode'),
# NOTE(review): `list` below shadows the builtin; the match loop around these
# lines (building `sequence`) is elided from this excerpt.
523 list = steps_to_list(steps)
524 index = child.expect( list + [ pexpect.EOF ])
525 id = index_to_id(steps,index)
528 if id == "exception":
529 if config and not config.quiet: print "...Found An Exception!!!"
530 elif index == len(list):
# The sequence identifier: step ids joined with '-', e.g.
# "bminit-cfg-auth-getplc-update-debug-done".
534 s = "-".join(sequence)
535 print " FOUND SEQUENCE: ", s
537 # NOTE: We get or set the flag based on the current sequence identifier.
538 # By using the sequence identifier, we guarantee that there will be no
539 # frequent loops. I'm guessing there is a better way to track loops,
541 #if not config.force and pflags.getRecentFlag(s):
542 # pflags.setRecentFlag(s)
544 # print "... flag is set or it has already run recently. Skipping %s" % node
# Phase 5: the known-sequence -> action table.
550 # restart_bootmanager_boot
551 for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
552 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
553 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",
555 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done",
557 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
558 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
559 "bminit-cfg-auth-getplc-update-debug-done",
560 "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
561 "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
562 "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
563 "bminit-cfg-auth-protoerror-exception-update-debug-done",
564 "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
565 "bminit-cfg-auth-getplc-implementerror-update-debug-done",
567 sequences.update({n : "restart_bootmanager_boot"})
# Sequences that warrant a reinstall ('rins') via BootManager.
569 # conn.restart_bootmanager('rins')
570 for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
571 "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
572 "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
573 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
574 "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
575 "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done",
576 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
577 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
578 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
579 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
580 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
581 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
582 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
583 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done",
584 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
585 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
586 "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
587 # actual solution appears to involve removing the bad files, and
588 # continually trying to boot the node.
589 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
591 sequences.update({n : "restart_bootmanager_rins"})
# Key mismatch between node and PLC: attempt repair.
594 sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"})
# Sequences that need a full node restart into reinstall.
596 # conn.restart_node('rins')
597 for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
598 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
599 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
600 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done",
601 "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
602 "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
603 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done",
604 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done",
605 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
606 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
607 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
608 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
610 sequences.update({n : "restart_node_rins"})
# Sequences that need a plain restart back into 'boot'.
613 for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done",
614 "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
615 "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
616 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
617 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
618 "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
620 sequences.update({n: "restart_node_boot"})
622 # update_node_config_email
623 for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
624 "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
625 "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
627 sequences.update({n : "update_node_config_email"})
# Hostname-resolution problems: mail about the node's network config.
629 for n in [ "bminit-cfg-exception-nodehostname-update-debug-done",
630 "bminit-cfg-update-exception-nodehostname-update-debug-done",
632 sequences.update({n : "nodenetwork_email"})
634 # update_bootcd_email
635 for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
636 "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
637 "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
638 "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
639 "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
641 sequences.update({n : "update_bootcd_email"})
643 for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
645 sequences.update({n: "suspect_error_email"})
647 # update_hardware_email
648 sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
649 sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
651 # broken_hardware_email
652 sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})
# bad_dns_email (the for-header line is elided from this excerpt).
656 "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
657 "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
659 sequences.update( { n : "bad_dns_email"})
# Unknown sequences: mail the log to the monitor admin and retry 'boot'.
664 if s not in sequences:
665 print " HOST %s" % hostname
666 print " UNKNOWN SEQUENCE: %s" % s
669 args['hostname'] = hostname
671 args['bmlog'] = conn.get_bootmanager_log().read()
672 m = PersistMessage(hostname, mailtxt.unknownsequence[0] % args,
673 mailtxt.unknownsequence[1] % args, False, db='unknown_persistmessages')
675 m.send([config.cc_email])
677 conn.restart_bootmanager('boot')
679 # NOTE: Do not set the pflags value for this sequence if it's unknown.
680 # This way, we can check it again after we've fixed it.
# Dispatch over the action assigned to this sequence.
685 if sequences[s] == "restart_bootmanager_boot":
686 if config and not config.quiet: print "...Restarting BootManager.py on %s "% node
687 conn.restart_bootmanager('boot')
688 elif sequences[s] == "restart_bootmanager_rins":
689 if config and not config.quiet: print "...Restarting BootManager.py on %s "% node
690 conn.restart_bootmanager('rins')
691 elif sequences[s] == "restart_node_rins":
692 conn.restart_node('rins')
693 elif sequences[s] == "restart_node_boot":
694 conn.restart_node('boot')
695 elif sequences[s] == "repair_node_keys":
696 if conn.compare_and_repair_nodekeys():
697 # the keys either are in sync or were forced in sync.
698 # so try to reboot the node again.
699 conn.restart_bootmanager('rins')
702 # there was some failure to synchronize the keys.
703 print "...Unable to repair node keys on %s" % node
705 elif sequences[s] == "suspect_error_email":
707 args['hostname'] = hostname
709 args['bmlog'] = conn.get_bootmanager_log().read()
# NOTE(review): "Suspicous" below is misspelled in a user-visible subject
# line; left as-is here since it is runtime output, not a comment.
710 m = PersistMessage(hostname, "Suspicous error from BootManager on %s" % args,
711 mailtxt.unknownsequence[1] % args, False, db='suspect_persistmessages')
713 m.send([config.cc_email])
715 conn.restart_bootmanager('boot')
717 elif sequences[s] == "update_node_config_email":
718 print "...Sending message to UPDATE NODE CONFIG"
720 args['hostname'] = hostname
721 m = PersistMessage(hostname, mailtxt.plnode_cfg[0] % args, mailtxt.plnode_cfg[1] % args,
722 True, db='nodeid_persistmessages')
723 loginbase = plc.siteId(hostname)
724 emails = plc.getTechEmails(loginbase)
726 conn.dump_plconf_file()
727 conn.set_nodestate('disable')
729 elif sequences[s] == "nodenetwork_email":
730 print "...Sending message to LOOK AT NODE NETWORK"
732 args['hostname'] = hostname
733 args['bmlog'] = conn.get_bootmanager_log().read()
# NOTE(review): body template is plnode_cfg[1] while the subject is
# plnode_network[0] — possibly intentional reuse; verify.
734 m = PersistMessage(hostname, mailtxt.plnode_network[0] % args, mailtxt.plnode_cfg[1] % args,
735 True, db='nodenet_persistmessages')
736 loginbase = plc.siteId(hostname)
737 emails = plc.getTechEmails(loginbase)
739 conn.dump_plconf_file()
740 conn.set_nodestate('disable')
742 elif sequences[s] == "update_bootcd_email":
743 print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
746 args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
747 args['hostname_list'] = "%s" % hostname
749 m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname,
750 mailtxt.newalphacd_one[1] % args, True, db='bootcd_persistmessages')
752 loginbase = plc.siteId(hostname)
753 emails = plc.getTechEmails(loginbase)
756 print "\tDisabling %s due to out-of-date BOOTCD" % hostname
757 conn.set_nodestate('disable')
759 elif sequences[s] == "broken_hardware_email":
760 # MAKE An ACTION record that this host has failed hardware. May
761 # require either an exception "/minhw" or other manual intervention.
762 # Definitely need to send out some more EMAIL.
763 print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
764 # TODO: email notice of broken hardware
766 args['hostname'] = hostname
767 args['log'] = conn.get_dmesg().read()
768 m = PersistMessage(hostname, mailtxt.baddisk[0] % args,
769 mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
771 loginbase = plc.siteId(hostname)
772 emails = plc.getTechEmails(loginbase)
774 conn.set_nodestate('disable')
776 elif sequences[s] == "update_hardware_email":
777 print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
779 args['hostname'] = hostname
780 args['bmlog'] = conn.get_bootmanager_log().read()
781 m = PersistMessage(hostname, mailtxt.minimalhardware[0] % args,
782 mailtxt.minimalhardware[1] % args, True, db='minhardware_persistmessages')
784 loginbase = plc.siteId(hostname)
785 emails = plc.getTechEmails(loginbase)
787 conn.set_nodestate('disable')
789 elif sequences[s] == "bad_dns_email":
790 print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
# NOTE(review): `node` is rebound from hostname-string to a PLC node record
# here; the surrounding try block header is elided.
793 node = api.GetNodes(hostname)[0]
794 net = api.GetNodeNetworks(node['nodenetwork_ids'])[0]
796 print traceback.print_exc()
797 # TODO: api error. skip email, b/c all info is not available,
798 # flag_set will not be recorded.
800 nodenet_str = network_config_to_str(net)
802 args['hostname'] = hostname
803 args['network_config'] = nodenet_str
804 args['nodenetwork_id'] = net['nodenetwork_id']
805 m = PersistMessage(hostname, mailtxt.baddns[0] % args,
806 mailtxt.baddns[1] % args, True, db='baddns_persistmessages')
808 loginbase = plc.siteId(hostname)
809 emails = plc.getTechEmails(loginbase)
811 conn.set_nodestate('disable')
# Remember that this known sequence was handled, to avoid tight retry loops.
814 pflags.setRecentFlag(s)
# Script entry: build the optparse-based command line (child/force/quiet/
# verbose/nonet/collect/nosetup), merge in the shared 'nodesets'/'defaults'
# option groups, and resolve the node list either from a file or from a
# single --node argument.  (Several lines, including the main-guard body,
# are elided from this excerpt.)
820 # MAIN -------------------------------------------------------------------
823 from monitor import parser as parsermodule
824 parser = parsermodule.getParser()
826 parser.set_defaults(child=False, collect=False, nosetup=False, verbose=False,
827 force=None, quiet=False)
828 parser.add_option("", "--child", dest="child", action="store_true",
829 help="This is the child mode of this process.")
830 parser.add_option("", "--force", dest="force", metavar="boot_state",
831 help="Force a boot state passed to BootManager.py.")
832 parser.add_option("", "--quiet", dest="quiet", action="store_true",
833 help="Extra quiet output messages.")
834 parser.add_option("", "--verbose", dest="verbose", action="store_true",
835 help="Extra debug output messages.")
836 parser.add_option("", "--nonet", dest="nonet", action="store_true",
837 help="Do not setup the network, use existing log files to re-run a test pass.")
838 parser.add_option("", "--collect", dest="collect", action="store_true",
839 help="No action, just collect dmesg, and bm.log")
# NOTE(review): "orginary" below is a typo ("ordinary") in user-visible help
# text; left as-is here since help strings are runtime output.
840 parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
841 help="Do not perform the orginary setup phase.")
843 parser = parsermodule.getParser(['nodesets', 'defaults'], parser)
844 config = parsermodule.parse_args(parser)
847 nodes = config.getListFromFile(config.nodelist)
849 nodes = [ config.node ]
857 if __name__ == "__main__":