3 # Attempt to reboot a node in debug state.
5 from monitor import const
6 from monitor.database.info.model import *
7 from monitor.wrapper import plc
13 from getsshkeys import SSHKnownHosts
17 from pcucontrol.util import command as moncommands
20 from pcucontrol.transports.ssh import pxssh as pxssh
21 from pcucontrol.transports.ssh import fdpexpect as fdpexpect
22 from pcucontrol.transports.ssh import pexpect as pexpect
23 from monitor.model import *
24 from monitor.wrapper.emailTxt import mailtxt
25 from nodeconfig import network_config_to_str
27 from monitor import config
# subprocess.Popen subclass that adds a kill() method (older Python 2
# Popen objects have none): sends a signal (default SIGTERM) to the child.
# NOTE(review): the 'signal' parameter shadows the signal module inside the
# method body, but the default is bound at def time, so behavior is correct.
30 class Sopen(subprocess.Popen):
31 	def kill(self, signal = signal.SIGTERM):
32 		os.kill(self.pid, signal)
34 #from Rpyc import SocketConnection, Async
35 from Rpyc import SocketConnection, Async
36 from Rpyc.Utils import *
# --- NodeConnection methods (the enclosing class statement is not visible in
# --- this view, and many original lines are elided; bodies are fragments). ---
# Store the Rpyc connection, node hostname and config -- body elided here.
40 	def __init__(self, connection, node, config):
# Classify the node's boot state by probing remote filesystem paths through
# the Rpyc-proxied os module: /tmp/source presumably means the BootManager /
# debug environment, /vservers an installed production node.  The return
# statements for each branch are elided from this view -- TODO confirm values.
45 	def get_boot_state(self):
46 		if self.c.modules.os.path.exists('/tmp/source'):
48 		elif self.c.modules.os.path.exists('/vservers'):
# dmesg capture (its def line is elided -- presumably get_dmesg): dump the
# remote kernel ring buffer to a file, rsync/download it locally, then reopen
# the local copy for reading; the return of `log` is elided from this view.
54 		self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
55 		download(self.c, "/var/log/dmesg.bm.log", "log/dmesg.%s.log" % self.node)
56 		log = open("log/dmesg.%s.log" % self.node, 'r')
# Fetch the BootManager log: download the gzipped /tmp/bm.log from the node,
# decompress it with a local `zcat` shell-out, and reopen the plain-text copy.
# The final `return log` is elided from this view -- presumably present.
59 	def get_bootmanager_log(self):
60 		download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node)
61 		os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
62 		log = open("log/bm.%s.log" % self.node, 'r')
# Drive the remote BootManager far enough to read the node's configuration
# file, then print every VARS key/value for inspection.
# NOTE(review): lines 70-75 use a bare `c` rather than `self.c` -- unless a
# module-global `c` exists (not visible here), this is a NameError bug; the
# sibling methods use `self.c` consistently.  Verify against the full file.
65 	def dump_plconf_file(self):
67 		self.c.modules.sys.path.append("/tmp/source/")
68 		self.c.modules.os.chdir('/tmp/source')
70 		log = c.modules.BootManager.log('/tmp/new.log')
71 		bm = c.modules.BootManager.BootManager(log,'boot')
73 		BootManagerException = c.modules.Exceptions.BootManagerException
74 		InitializeBootManager = c.modules.BootManager.InitializeBootManager
75 		ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
# Run the two BootManager steps; the except clause for the try below is
# elided from this view (presumably catches BootManagerException).
78 		InitializeBootManager.Run(bm.VARS, bm.LOG)
79 		try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
83 			print " Possibly, unable to find valid configuration file"
# On success, dump the parsed node configuration variables.
86 		for key in bm.VARS.keys():
87 			print key, " == ", bm.VARS[key]
89 			print " Unable to read Node Configuration"
# Compare the NODE_KEY stored on the node (via BootManager's config reader)
# against the key PLC has for it; if they differ, push the node's key up to
# PLC with UpdateNode.  Returns truthiness is elided from this view -- the
# caller treats a true result as "keys in sync or forced in sync".
# NOTE(review): same bare `c` vs `self.c` concern as dump_plconf_file above.
92 	def compare_and_repair_nodekeys(self):
94 		self.c.modules.sys.path.append("/tmp/source/")
95 		self.c.modules.os.chdir('/tmp/source')
97 		log = c.modules.BootManager.log('/tmp/new.log')
98 		bm = c.modules.BootManager.BootManager(log,'boot')
100 		BootManagerException = c.modules.Exceptions.BootManagerException
101 		InitializeBootManager = c.modules.BootManager.InitializeBootManager
102 		ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
# PLC's view of this node (GetNodes by hostname, first match).
105 		plcnode = api.GetNodes({'hostname': self.node}, None)[0]
107 		InitializeBootManager.Run(bm.VARS, bm.LOG)
108 		try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
113 			print " Possibly, unable to find valid configuration file"
# Show both sides before deciding whether a repair is needed.
116 		print "   NODE: %s" % bm.VARS['NODE_KEY']
117 		print "   PLC : %s" % plcnode['key']
119 		if bm.VARS['NODE_KEY'] == plcnode['key']:
# Keys differ: overwrite PLC's copy with the node's NODE_KEY.
122 			if api.UpdateNode(self.node, {'key': bm.VARS['NODE_KEY']}):
123 				print " Successfully updated NODE_KEY with PLC"
128 		#for key in bm.VARS.keys():
129 		#	print key, " == ", bm.VARS[key]
131 			print " Unable to retrieve NODE_KEY"
# True iff the BootManager lock/sentinel file exists on the node; the
# return statements for both branches are elided from this view.
133 	def bootmanager_running(self):
134 		if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
def set_nodestate(self, state='boot'):
	"""Set this node's boot_state at PLC (default 'boot').

	Returns whatever api.UpdateNode() returns (truthy on success).
	"""
	update_fields = {'boot_state': state}
	return api.UpdateNode(self.node, update_fields)
# Reboot the node into `state` using a two-phase escalation tracked by a
# one-day persist-flag: first attempt a "gentle" kill (vkill all slice
# processes, then a scheduled `shutdown -r`), and on a repeat visit within
# the flag window fall back to a raw sysrq s/u/b (sync, unmount, reboot).
142 	def restart_node(self, state='boot'):
143 		api.UpdateNode(self.node, {'boot_state' : state})
145 		pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags')
146 		if not pflags.getRecentFlag('gentlekill'):
147 			print "   Killing all slice processes... : %s" % self.node
148 			cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
149 			self.c.modules.os.system(cmd_slicekill)
150 			cmd = """ shutdown -r +1 & """
151 			print "   Restarting %s : %s" % ( self.node, cmd)
152 			self.c.modules.os.system(cmd)
# Remember that the gentle path was tried; the `else:` introducing the
# sysrq fallback below is elided from this view.
154 			pflags.setRecentFlag('gentlekill')
157 			print "   Restarting with sysrq 'sub' %s" % self.node
158 			cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """
159 			self.c.modules.os.system(cmd)
# Re-run BootManager.py on the node with the given forced state, guarded by
# the /tmp/BM_RUNNING sentinel so two instances never overlap.  The shell
# command creates the sentinel, runs BootManager, then removes the sentinel.
# NOTE(review): original line 173 (the tail of the cmd string, presumably
# the closing `") &"` of the backgrounded subshell) is elided from this view.
163 	def restart_bootmanager(self, forceState):
165 		self.c.modules.os.chdir('/tmp/source')
166 		if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
167 			print "   BootManager is already running: try again soon..."
169 			print "   Starting 'BootManager.py %s' on %s " % (forceState, self.node)
170 			cmd = "( touch /tmp/BM_RUNNING ;  " + \
171 			      "  python ./BootManager.py %s &> server.log < /dev/null ; " + \
172 				  "  rm -f /tmp/BM_RUNNING " + \
174 			cmd = cmd % forceState
175 			self.c.modules.os.system(cmd)
# Manages an ssh tunnel + remote Rpyc server session to one node.
# globalport: class-wide counter for local tunnel ports, randomized into
# 22000-22999 at import time so concurrent runs rarely collide.
181 class PlanetLabSession:
182 	globalport = 22000 + int(random.random()*1000)
# Record options; the lines storing `node` and invoking setup are elided
# from this view (self.node is clearly set somewhere -- used throughout).
184 	def __init__(self, node, nosetup, verbose):
185 		self.verbose = verbose
188 		self.nosetup = nosetup
def get_connection(self, config):
	"""Build a NodeConnection over the local end of the Rpyc ssh tunnel."""
	transport = SocketConnection("localhost", self.port)
	return NodeConnection(transport, self.node, config)
# Prepare the node for Rpyc access: pick a unique local port, rsync the Rpyc
# package to the node (retrying once after refreshing its ssh host key),
# (re)start the remote forking Rpyc server, then open a synchronous ssh
# port-forward localhost:port -> node:18812 and wait for its READY echo.
195 	def setup_host(self):
196 		self.port = PlanetLabSession.globalport
197 		PlanetLabSession.globalport = PlanetLabSession.globalport + 1
# args dict feeds the %-templates below; its initialization line is elided.
200 		args['port'] = self.port
201 		args['user'] = 'root'
202 		args['hostname'] = self.node
203 		args['monitordir'] = config.MONITOR_SCRIPT_ROOT
# nosetup mode: skip the copy/server phase (the guarding `if` is elided).
207 			print "Skipping setup"
210 		# COPY Rpyc files to host
211 		cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
212 		if self.verbose: print cmd
215 		localos = moncommands.CMD()
217 		ret = localos.system(cmd, timeout)
# First rsync failed (check elided): assume a stale known_hosts entry,
# refresh the key directly from the node, and retry exactly once.
220 			print "\tUNKNOWN SSH KEY FOR %s; making an exception" % self.node
221 			#print "MAKE EXPLICIT EXCEPTION FOR %s" % self.node
222 			k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k
223 			ret = localos.system(cmd, timeout)
226 				print "\tFAILED TWICE"
228 				raise Exception("Failed twice trying to login with updated ssh host key")
231 		# KILL any already running servers.
232 		ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port)
233 		(ov,ev) = ssh.run_noexcept2("""<<\EOF
235 			echo "kill server" >> out.log
236 			ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ;
237 			echo "export" >> out.log
238 			export PYTHONPATH=$HOME ;
239 			echo "start server" >> out.log
240 			python Rpyc/Servers/forking_server.py &> server.log &
241 			echo "done" >> out.log
# Old non-heredoc implementation kept for reference (commented out).
243 		#cmd = """ssh %(user)s@%(hostname)s """ + \
244 		#	 """'ps ax | grep Rpyc | grep -v grep | awk "{print \$1}" | xargs kill 2> /dev/null' """
246 		#if self.verbose: print cmd
248 		#print localos.system(cmd,timeout)
250 		## START a new rpyc server.
251 		#cmd = """ssh -n %(user)s@%(hostname)s "export PYTHONPATH=\$HOME; """ + \
252 		#	 """python Rpyc/Servers/forking_server.py &> server.log < /dev/null &" """
254 		#if self.verbose: print cmd
255 		#print localos.system(cmd,timeout)
259 		# This was tricky to make synchronous.  The combination of ssh-clients-4.7p1, 
260 		# and the following options seems to work well.
# LocalCommand='echo READY' prints once the forward is up; ExitOnForwardFailure
# makes a failed -L forward fatal instead of silently degrading.
261 		cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \
262 			  """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \
263 			  """-o ConnectTimeout=120 """ + \
264 			  """-n -N -L %(port)s:localhost:18812 """ + \
265 			  """%(user)s@%(hostname)s"""
267 		if self.verbose: print cmd
268 		self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
269 		# TODO: the read() here may block indefinitely.  Need a better
270 		# approach therefore, that includes a timeout.
271 		#ret = self.command.stdout.read(5)
272 		ret = moncommands.read_t(self.command.stdout, 5)
# Scale the settle delay off the measured connect time (t1/t2 timing lines
# are elided from this view).
276 		# NOTE: There is still a slight race for machines that are slow...
277 		self.timeout = 2*(t2-t1)
278 		print "Sleeping for %s sec" % self.timeout
279 		time.sleep(self.timeout)
# If the tunnel process already exited, it failed; otherwise it is running
# but never said READY -- either way, raise.
282 		if self.command.returncode is not None:
283 			print "Failed to establish tunnel!"
284 			raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode))
286 		raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'")
# Teardown fragment (its def line, presumably __del__ or kill, is elided).
290 		if self.verbose: print "Killing SSH session %s" % self.port
# Project a list of (id, label) step pairs to just the labels, preserving
# order -- used to hand pexpect the pattern list while keeping ids for
# index_to_id().  The ret_list initialization and return are elided here.
294 def steps_to_list(steps):
296 	for (id,label) in steps:
297 		ret_list.append(label)
# Inverse companion to steps_to_list: map a pexpect match index back to the
# step id.  The out-of-range branch (index == len(steps), i.e. EOF matched)
# is elided from this view -- presumably returns a sentinel like "done".
300 def index_to_id(steps,index):
301 	if index < len(steps):
302 		return steps[index][0]
# Main entry: diagnose a node stuck in debug state and take a recovery
# action (restart BootManager, reboot, repair keys, or notify owners).
306 def reboot(hostname, config=None, forced_action=None):
308 	# NOTE: Nothing works if the bootcd is REALLY old.
309 	#       So, this is the first step.
# Latest findbad record for the node drives the up-front BootCD check.
310 	fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
312 	if fbnode['observed_category'] == "OLDBOOTCD":
313 		print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
315 		args['hostname_list'] = "    %s" % hostname
# Persistent message avoids re-mailing the same site every run.
317 		m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname,
318 							mailtxt.newbootcd_one[1] % args, True, db='bootcd_persistmessages')
320 		loginbase = plc.siteId(hostname)
321 		emails = plc.getTechEmails(loginbase)
# NOTE(review): 'disable' as a boot_state value -- confirm against the PLC
# API's accepted states (elsewhere this file also uses 'disable').
324 		print "\tDisabling %s due to out-of-date BOOTCD" % hostname
325 		api.UpdateNode(hostname, {'boot_state' : 'disable'})
# Session setup: refresh the ssh host key, create a PlanetLabSession (debug
# path passes False/True explicitly; normal path uses config flags), then
# obtain the Rpyc connection, retrying once with a longer wait.
329 	print "Creating session for %s" % node
330 	# update known_hosts file (in case the node has rebooted since last run)
331 	if config and not config.quiet: print "...updating known_hosts ssh-rsa key for %s" % node
333 		k = SSHKnownHosts(); k.update(node); k.write(); del k
# Exception handler fragment (the `except:` line itself is elided).
335 		from monitor.common import email_exception
337 		print traceback.print_exc()
342 		session = PlanetLabSession(node, False, True)
344 		session = PlanetLabSession(node, config.nosetup, config.verbose)
346 		msg = "ERROR setting up session for %s" % hostname
348 		print traceback.print_exc()
349 		from monitor.common import email_exception
355 		conn = session.get_connection(config)
# NOTE: sometimes the wait in setup_host() is not long enough.  
357 		# NOTE: sometimes the wait in setup_host() is not long enough.  
358 		# So, here we try to wait a little longer before giving up entirely.
360 			time.sleep(session.timeout*4)
361 			conn = session.get_connection(config)
363 			print traceback.print_exc()
364 			from monitor.common import email_exception
# Early exits: honor an explicit forced reboot, skip nodes that already
# completed boot or report an unknown state, and never interfere with a
# BootManager run already in progress.
368 	if forced_action == "reboot":
369 		conn.restart_node('rins')
372 	boot_state = conn.get_boot_state()
373 	if boot_state == "boot":
374 		print "...Boot state of %s already completed : skipping..." % node
376 	elif boot_state == "unknown":
377 		print "...Unknown bootstate for %s : skipping..."% node
382 	if conn.bootmanager_running():
383 		print "...BootManager is currently running.  Skipping host %s" % node
388 	#	conn.restart_bootmanager(config.force)
# Phase 1: scan dmesg for hardware failure signatures.  Each (id, regex)
# pair below matches a known disk/ata/scsi/floppy kernel error; matches are
# collected (loop body largely elided) and, if they indicate real drive
# trouble beyond a lone floppy error, the owners are mailed and the node
# is disabled.
391 	# Read persistent flags, tagged on one week intervals.
392 	pflags = PersistFlags(hostname, 3*60*60*24, db='debug_persistflags')
395 	if config and not config.quiet: print "...downloading dmesg from %s" % node
396 	dmesg = conn.get_dmesg()
397 	child = fdpexpect.fdspawn(dmesg)
# Pattern table: ids are short tags, labels are pexpect regexes.
402 		('scsierror'  , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
403 		('ioerror'    , 'end_request: I/O error, dev sd\w+, sector \d+'),
404 		('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION byte \w+ = \w+'),
406 		('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
408 		('hdaseekerror', 'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'),
409 		('hdacorrecterror', 'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'),
411 		('atareadyerror'   , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
412 		('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),
414 		('sdXerror'   , 'sd\w: Current: sense key: Medium Error'),
415 		('ext3error'  , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),
417 		('floppytimeout','floppy0: floppy timeout called'),
418 		('floppyerror',  'end_request: I/O error, dev fd\w+, sector \d+'),
420 		# hda: dma_intr: status=0x51 { DriveReady SeekComplete Error }
421 		# hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263
423 		# floppy0: floppy timeout called
424 		# end_request: I/O error, dev fd0, sector 0
426 		# Buffer I/O error on device dm-2, logical block 8888896
427 		# ata1: status=0x51 { DriveReady SeekComplete Error }
428 		# ata1: error=0x40 { UncorrectableError }
429 		# SCSI error : <0 0 0 0> return code = 0x8000002
430 		# sda: Current: sense key: Medium Error
431 		# Additional sense: Unrecovered read error - auto reallocate failed
433 		# SCSI error : <0 2 0 0> return code = 0x40001
434 		# end_request: I/O error, dev sda, sector 572489600
# Repeated-expect loop (surrounding while and set-accumulation elided):
# match the next signature or EOF; matched ids accumulate in `s`.
436 		id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
443 	if config and not config.quiet: print "\tSET: ", s
# A lone floppy error is tolerated; anything more disables the node and
# mails the tech contacts a baddisk report with the dmesg attached.
446 		print "...Potential drive errors on %s" % node
447 		if len(s) == 2 and 'floppyerror' in s:
448 			print "...Should investigate.  Continuing with node."
450 			print "...Should investigate.  Skipping node."
451 			# TODO: send message related to these errors.
453 			args['hostname'] = hostname
454 			args['log'] = conn.get_dmesg().read()
456 			m = PersistMessage(hostname, mailtxt.baddisk[0] % args,
457 								 mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
459 			loginbase = plc.siteId(hostname)
460 			emails = plc.getTechEmails(loginbase)
462 			conn.set_nodestate('disable')
# Phase 2: parse bm.log into a sequence of step ids.  The (id, label) table
# maps BootManager progress/exception messages to short tags; repeatedly
# expecting against the label list (loop elided) yields the ordered tag
# sequence that phase 3 matches against known failure signatures.
465 	print "...Downloading bm.log from %s" % node
466 	log = conn.get_bootmanager_log()
467 	child = fdpexpect.fdspawn(log)
# --collect mode: stop after grabbing dmesg and bm.log, take no action.
470 	if config.collect: return True
476 	if config and not config.quiet: print "...Scanning bm.log for errors"
482 		('bminit'		, 'Initializing the BootManager.'),
483 		('cfg'			, 'Reading node configuration file.'),
484 		('auth'			, 'Authenticating node with PLC.'),
485 		('getplc'		, 'Retrieving details of node from PLC.'),
486 		('update'		, 'Updating node boot state at PLC.'),
487 		('hardware'		, 'Checking if hardware requirements met.'),
488 		('installinit'	, 'Install: Initializing.'),
489 		('installdisk'	, 'Install: partitioning disks.'),
490 		('installbootfs', 'Install: bootstrapfs tarball.'),
491 		('installcfg'	, 'Install: Writing configuration files.'),
492 		('installstop'	, 'Install: Shutting down installer.'),
493 		('update2'		, 'Updating node boot state at PLC.'),
494 		('installinit2'	, 'Install: Initializing.'),
495 		('validate'		, 'Validating node installation.'),
496 		('rebuildinitrd', 'Rebuilding initrd'),
497 		('netcfg'		, 'Install: Writing Network Configuration files.'),
498 		('update3'		, 'Updating node configuration.'),
499 		('disk'			, 'Checking for unused disks to add to LVM.'),
500 		('update4'		, 'Sending hardware configuration to PLC.'),
501 		('debug'		, 'Starting debug mode'),
502 		('bmexceptmount', 'BootManagerException during mount'),
503 		('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'),
504 		('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
505 		('exception'	, 'Exception'),
506 		('nocfg'        , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
507 		('protoerror'	, 'XML RPC protocol error'),
508 		('nodehostname'	, 'Configured node hostname does not resolve'),
509 		('implementerror', 'Implementation Error'),
510 		('readonlyfs'	, '[Errno 30] Read-only file system'),
511 		('baddisk'      , "IOError: [Errno 13] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"),
512 		('noinstall'	, 'notinstalled'),
513 		('bziperror'	, 'bzip2: Data integrity error when decompressing.'),
514 		('noblockdev'	, "No block devices detected."),
515 		('dnserror'     , 'Name or service not known'),
516 		('downloadfail'	, 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
517 		('disktoosmall'	, 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
518 		('hardwarerequirefail' , 'Hardware requirements not met'),
519 		('mkfsfail'         , 'while running: Running mkfs.ext2 -q -m 0 -j /dev/planetlab/vservers failed'),
520 		('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
521 		('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
522 		('chrootfail'	, 'Running chroot /tmp/mnt/sysimg'),
523 		('modulefail'	, 'Unable to get list of system modules'),
524 		('writeerror'	, 'write error: No space left on device'),
525 		('nospace'	, "No space left on device"),
526 		('nonode'	, 'Failed to authenticate call: No such node'),
527 		('authfail'	, 'Failed to authenticate call: Call could not be authenticated'),
528 		('bootcheckfail'     , 'BootCheckAuthentication'),
529 		('bootupdatefail'   , 'BootUpdateNode'),
# Expect loop body (the enclosing loop and `sequence.append` are elided).
# NOTE(review): `list` shadows the builtin here -- works, but worth renaming
# in a future cleanup.
531 		list = steps_to_list(steps)
532 		index = child.expect( list + [ pexpect.EOF ])
533 		id = index_to_id(steps,index)
536 		if id == "exception":
537 			if config and not config.quiet: print "...Found An Exception!!!"
538 		elif index == len(list):
536 if id == "exception":
537 if config and not config.quiet: print "...Found An Exception!!!"
538 elif index == len(list):
542 s = "-".join(sequence)
543 print " FOUND SEQUENCE: ", s
545 # NOTE: We get or set the flag based on the current sequence identifier.
546 # By using the sequence identifier, we guarantee that there will be no
547 # frequent loops. I'm guessing there is a better way to track loops,
549 #if not config.force and pflags.getRecentFlag(s):
550 # pflags.setRecentFlag(s)
552 # print "... flag is set or it has already run recently. Skipping %s" % node
# Phase 3 lookup table: map each known bm.log signature to a recovery
# action name.  Groups: restart BootManager in 'boot', restart in 'rins'
# (reinstall), repair node keys, hard restart in rins/boot, and the various
# owner-notification emails (node config, network, bootcd, suspect error,
# hardware too small / broken, bad DNS).
558 	# restart_bootmanager_boot
559 	for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
560 			"bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
561 			"bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",
563 			"bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done",
565 			"bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
566 			"bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
567 			"bminit-cfg-auth-getplc-update-debug-done",
568 			"bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
569 			"bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
570 			"bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
571 			"bminit-cfg-auth-protoerror-exception-update-debug-done",
572 			"bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
573 			"bminit-cfg-auth-getplc-implementerror-update-debug-done",
575 		sequences.update({n : "restart_bootmanager_boot"})
577 	#	conn.restart_bootmanager('rins')
578 	for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
579 			"bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
580 			"bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
581 			"bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
582 			"bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
583 			"bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done",
584 			"bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
585 			"bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
586 			"bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
587 			"bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
588 			"bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
589 			"bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
590 			"bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
591 			"bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done",
592 			"bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
593 			"bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
594 			"bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
595 			# actual solution appears to involve removing the bad files, and
596 			# continually trying to boot the node.
597 			"bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
598 			"bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
600 		sequences.update({n : "restart_bootmanager_rins"})
603 	sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"})
605 	#	conn.restart_node('rins')
606 	for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
607 			"bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
608 			"bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
609 			"bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done",
610 			"bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
611 			"bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
612 			"bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done",
613 			"bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done",
614 			"bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
615 			"bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
616 			"bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
617 			"bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
619 		sequences.update({n : "restart_node_rins"})
622 	for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done",
623 			 "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
624 			 "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
625 			 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
626 			 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
627 			 "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
629 		sequences.update({n: "restart_node_boot"})
631 	# update_node_config_email
632 	for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
633 			  "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
634 			  "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
636 		sequences.update({n : "update_node_config_email"})
638 	for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", 
639 			   "bminit-cfg-update-exception-nodehostname-update-debug-done", 
641 		sequences.update({n : "nodenetwork_email"})
643 	# update_bootcd_email
644 	for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
645 			  "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
646 			  "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
647 			  "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
648 			  "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
650 		sequences.update({n : "update_bootcd_email"})
652 	for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
654 		sequences.update({n: "suspect_error_email"})
656 	# update_hardware_email
657 	sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
658 	sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
660 	# broken_hardware_email
661 	sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})
# bad_dns_email group (its `for n in [` opener is elided from this view).
665 			"bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
666 			"bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
668 		sequences.update( { n : "bad_dns_email"})
# Unknown signature: report it, mail the bm.log to the monitor operators
# (config.cc_email), restart BootManager in plain 'boot' mode, and -- per
# the note -- deliberately leave the persist-flag unset so the signature is
# re-examined on the next pass.
673 	if s not in sequences:
674 		print "   HOST %s" % hostname
675 		print "   UNKNOWN SEQUENCE: %s" % s
678 		args['hostname'] = hostname
680 		args['bmlog'] = conn.get_bootmanager_log().read()
681 		m = PersistMessage(hostname, mailtxt.unknownsequence[0] % args,
682 							mailtxt.unknownsequence[1] % args, False, db='unknown_persistmessages')
684 		m.send([config.cc_email])
686 		conn.restart_bootmanager('boot')
688 		# NOTE: Do not set the pflags value for this sequence if it's unknown.
689 		# This way, we can check it again after we've fixed it.
# Known signature: dispatch on the mapped action name.  Restart actions
# re-run BootManager or hard-reboot into 'boot'/'rins'; repair_node_keys
# syncs NODE_KEY with PLC first and only reinstalls if that succeeded.
694 	if   sequences[s] == "restart_bootmanager_boot":
695 		if config and not config.quiet: print "...Restarting BootManager.py on %s "% node
696 		conn.restart_bootmanager('boot')
697 	elif sequences[s] == "restart_bootmanager_rins":
698 		if config and not config.quiet: print "...Restarting BootManager.py on %s "% node
699 		conn.restart_bootmanager('rins')
700 	elif sequences[s] == "restart_node_rins":
701 		conn.restart_node('rins')
702 	elif sequences[s] == "restart_node_boot":
703 		conn.restart_node('boot')
704 	elif sequences[s] == "repair_node_keys":
705 		if conn.compare_and_repair_nodekeys():
706 			# the keys either are in sync or were forced in sync.
707 			# so try to reboot the node again.
708 			conn.restart_bootmanager('rins')
711 			# there was some failure to synchronize the keys.
712 			print "...Unable to repair node keys on %s" % node
# NOTE(review): two defects in the subject line below -- "Suspicous" is a
# typo for "Suspicious", and `% args` interpolates the whole args dict where
# `% hostname` was almost certainly intended (cf. the pattern on line 674).
714 	elif sequences[s] == "suspect_error_email":
716 		args['hostname'] = hostname
718 		args['bmlog'] = conn.get_bootmanager_log().read()
719 		m = PersistMessage(hostname, "Suspicous error from BootManager on %s" % args,
720 							mailtxt.unknownsequence[1] % args, False, db='suspect_persistmessages')
722 		m.send([config.cc_email])
724 		conn.restart_bootmanager('boot')
# Owner-notification actions: node config, node network, and out-of-date
# BootCD.  Each builds a persistent message, mails the site's tech contacts,
# dumps the node's plnode config for the record, and disables the node.
726 	elif sequences[s] == "update_node_config_email":
727 		print "...Sending message to UPDATE NODE CONFIG"
729 		args['hostname'] = hostname
730 		m = PersistMessage(hostname,  mailtxt.plnode_cfg[0] % args,  mailtxt.plnode_cfg[1] % args, 
731 							True, db='nodeid_persistmessages')
732 		loginbase = plc.siteId(hostname)
733 		emails = plc.getTechEmails(loginbase)
735 		conn.dump_plconf_file()
736 		conn.set_nodestate('disable')
# NOTE(review): nodenetwork_email reuses the plnode_cfg mail template --
# possibly intentional (same fix instructions), worth confirming.
738 	elif sequences[s] == "nodenetwork_email":
739 		print "...Sending message to LOOK AT NODE NETWORK"
741 		args['hostname'] = hostname
742 		args['bmlog'] = conn.get_bootmanager_log().read()
743 		m = PersistMessage(hostname,  mailtxt.plnode_cfg[0] % args,  mailtxt.plnode_cfg[1] % args, 
744 							True, db='nodenet_persistmessages')
745 		loginbase = plc.siteId(hostname)
746 		emails = plc.getTechEmails(loginbase)
748 		conn.dump_plconf_file()
749 		conn.set_nodestate('disable')
751 	elif sequences[s] == "update_bootcd_email":
752 		print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
# getconf generates per-node boot images to attach/link for the user.
755 		args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
756 		args['hostname_list'] = "%s" % hostname
758 		m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname,
759 							mailtxt.newalphacd_one[1] % args, True, db='bootcd_persistmessages')
761 		loginbase = plc.siteId(hostname)
762 		emails = plc.getTechEmails(loginbase)
765 		print "\tDisabling %s due to out-of-date BOOTCD" % hostname
766 		conn.set_nodestate('disable')
# Hardware- and DNS-related notifications: broken hardware (dmesg attached),
# minimal-hardware failure (bm.log attached), and bad DNS (node network
# config attached).  All end by disabling the node; finally the persist
# flag for this signature is set so the same action isn't repeated soon.
768 	elif sequences[s] == "broken_hardware_email":
769 		# MAKE An ACTION record that this host has failed hardware.  May
770 		# require either an exception "/minhw" or other manual intervention.
771 		# Definitely need to send out some more EMAIL.
772 		print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
773 		# TODO: email notice of broken hardware
775 		args['hostname'] = hostname
776 		args['log'] = conn.get_dmesg().read()
777 		m = PersistMessage(hostname, mailtxt.baddisk[0] % args,
778 							 mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
780 		loginbase = plc.siteId(hostname)
781 		emails = plc.getTechEmails(loginbase)
783 		conn.set_nodestate('disable')
785 	elif sequences[s] == "update_hardware_email":
786 		print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
788 		args['hostname'] = hostname
789 		args['bmlog'] = conn.get_bootmanager_log().read()
790 		m = PersistMessage(hostname, mailtxt.minimalhardware[0] % args,
791 							 mailtxt.minimalhardware[1] % args, True, db='minhardware_persistmessages')
793 		loginbase = plc.siteId(hostname)
794 		emails = plc.getTechEmails(loginbase)
796 		conn.set_nodestate('disable')
798 	elif sequences[s] == "bad_dns_email":
799 		print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
# Look up the node's network record from PLC; on API failure the email is
# skipped entirely (per the comment below) -- handler line itself elided.
802 			node = api.GetNodes(hostname)[0]
803 			net = api.GetNodeNetworks(node['nodenetwork_ids'])[0]
805 			from monitor.common import email_exception
807 			print traceback.print_exc()
808 			# TODO: api error. skip email, b/c all info is not available,
809 			# flag_set will not be recorded.
811 		nodenet_str = network_config_to_str(net)
813 		args['hostname'] = hostname
814 		args['network_config'] = nodenet_str
815 		args['nodenetwork_id'] = net['nodenetwork_id']
816 		m = PersistMessage(hostname, mailtxt.baddns[0] % args,
817 							 mailtxt.baddns[1] % args, True, db='baddns_persistmessages')
819 		loginbase = plc.siteId(hostname)
820 		emails = plc.getTechEmails(loginbase)
822 		conn.set_nodestate('disable')
# Record that this signature was handled (suppresses rapid repeat action).
825 	pflags.setRecentFlag(s)
831 # MAIN -------------------------------------------------------------------
# Command-line front end: build the option parser, layer on the shared
# 'nodesets'/'defaults' option groups, and resolve the node list either
# from --nodelist file or a single --node argument.
834 	from monitor import parser as parsermodule
835 	parser = parsermodule.getParser()
837 	parser.set_defaults(child=False, collect=False, nosetup=False, verbose=False, 
838 						force=None, quiet=False)
839 	parser.add_option("", "--child", dest="child", action="store_true", 
840 						help="This is the child mode of this process.")
841 	parser.add_option("", "--force", dest="force", metavar="boot_state",
842 						help="Force a boot state passed to BootManager.py.")
843 	parser.add_option("", "--quiet", dest="quiet", action="store_true", 
844 						help="Extra quiet output messages.")
845 	parser.add_option("", "--verbose", dest="verbose", action="store_true", 
846 						help="Extra debug output messages.")
847 	parser.add_option("", "--nonet", dest="nonet", action="store_true", 
848 						help="Do not setup the network, use existing log files to re-run a test pass.")
849 	parser.add_option("", "--collect", dest="collect", action="store_true", 
850 						help="No action, just collect dmesg, and bm.log")
851 	parser.add_option("", "--nosetup", dest="nosetup", action="store_true", 
852 						help="Do not perform the orginary setup phase.")
854 	parser = parsermodule.getParser(['nodesets', 'defaults'], parser)
855 	config = parsermodule.parse_args(parser)
# Node selection: a list file if given, else the single --node value
# (the `if config.nodelist:` / `elif` guards are elided from this view).
858 		nodes = config.getListFromFile(config.nodelist)
860 		nodes = [ config.node ]
# Script entry guard; the body (presumably calling the main routine above)
# is elided from this view.
868 if __name__ == "__main__":