3 # Attempt to reboot a node in debug state.
14 from monitor.util.sshknownhosts import SSHKnownHosts
15 from monitor.Rpyc import SocketConnection, Async
16 from monitor.Rpyc.Utils import *
18 from monitor import getconf
19 from monitor import config
20 from monitor import const
21 from monitor.model import *
22 from monitor.common import email_exception, found_within
23 from monitor.database.info.model import *
24 from monitor.database.info.interface import *
25 from monitor.wrapper import plc
26 from monitor.wrapper import plccache
27 from monitor.wrapper.emailTxt import mailtxt
28 from monitor.nodeconfig import network_config_to_str
30 from pcucontrol.util import command as moncommands
31 from pcucontrol.util.command import Sopen
32 from pcucontrol.transports.ssh import pxssh as pxssh
33 from pcucontrol.transports.ssh import fdpexpect as fdpexpect
34 from pcucontrol.transports.ssh import pexpect as pexpect
# Module-level PLC API handle obtained from plc.getAuthAPI(); code in this
# module calls UpdateNode, GetNodes and GetInterfaces on it.
36 api = plc.getAuthAPI()
def bootmanager_log_name(hostname):
    """Return the relative path used to archive a node's BootManager log.

    The result has the form ``history/<YYYY-mm-dd-HH:MM>-bm.<hostname>.log``,
    where the timestamp is the local time at the moment of the call.
    """
    stamp = time.strftime("%Y-%m-%d-%H:%M")
    return os.path.join('history', "%s-bm.%s.log" % (stamp, hostname))
# Build an ActionRecord for `hostname` whose log_path is `short_log_path`.
# NOTE(review): listing lines 46, 49-51, 53, 55-57 and 59-62 are absent from
# this excerpt, so the error handling around the lookups, the remaining
# ActionRecord arguments (including how `logtype` is used) and what happens to
# `act` afterwards cannot be documented from here.
45 def bootmanager_log_action(hostname, short_log_path, logtype="bm.log"):
# Latest findbad record for the node, then the login_base of the node's site.
47 node = FindbadNodeRecord.get_latest_by(hostname=hostname)
48 loginbase = PlcSite.query.get(node.plc_node_stats['site_id']).plc_site_stats['login_base']
# Formatted traceback text (presumably captured when the lookups above raise
# -- TODO confirm against the missing lines).
52 err = traceback.format_exc()
54 act = ActionRecord(loginbase=loginbase,
58 log_path=short_log_path,
class ExceptionDoubleSSHError(Exception):
    """Raised when logging in to a node over ssh fails twice, even after its host key was updated."""
# Constructor taking a connection object, a node name and a config object.
# NOTE(review): only the debug print is visible; listing lines 68-71
# (presumably the assignments of self.c / self.node used by the other methods
# -- TODO confirm) are absent from this excerpt.
66 def __init__(self, connection, node, config):
67 print "init nodeconnection"
# Classify the node's current state by probing paths on the remote host
# through self.c.modules.
# NOTE(review): the value returned by each branch is on listing lines that are
# absent here (74, 76, 78-82 and those after 83); restore_basic() compares the
# result against "debug".
72 def get_boot_state(self):
73 print "get_boot_state(self)"
# /tmp/source is tested first, then /vservers, both with the remote os.path.
75 if self.c.modules.os.path.exists('/tmp/source'):
77 elif self.c.modules.os.path.exists('/vservers'):
# Debug output: the remote interpreter's sys.path.
83 print self.c.modules.sys.path
# NOTE(review): the `def` line for this body is among the listing lines absent
# before 91; restore_basic() calls conn.get_dmesg(), which is presumably this
# method -- TODO confirm. The return statement (after listing line 95) is also
# not visible.
# Steps: write the remote kernel log to /var/log/dmesg.bm.log, download it to a
# timestamped file under <MONITOR_BOOTMANAGER_LOG>/history, copy that file to
# <MONITOR_BOOTMANAGER_LOG>/dmesg.<node>.log and open the copy for reading.
91 t_stamp = time.strftime("%Y-%m-%d-%H:%M")
92 self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
93 download(self.c, "/var/log/dmesg.bm.log", "%s/history/%s-dmesg.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node))
94 os.system("cp %s/history/%s-dmesg.%s.log %s/dmesg.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node, config.MONITOR_BOOTMANAGER_LOG, self.node))
95 log = open("%s/dmesg.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, self.node), 'r')
# Fetch the node's BootManager log and open a local copy of it.
# NOTE(review): the return statement (listing line 105) is not visible here;
# restore_basic() calls .read() and .seek(0) on the result, so the open file
# object is presumably returned -- TODO confirm.
98 def get_bootmanager_log(self):
# Relative, timestamped archive name under history/ for this node.
99 bm_name = bootmanager_log_name(self.node)
100 download(self.c, "/tmp/bm.log", "%s/%s" % (config.MONITOR_BOOTMANAGER_LOG, bm_name))
101 #email_exception(self.node, "collected BM log for %s" % self.node)
# Record that the log was collected, pointing at the archived file.
102 bootmanager_log_action(self.node, bm_name, "collected_bm.log")
# Keep an un-timestamped copy as bm.<node>.log and open that copy for reading.
103 os.system("cp %s/%s %s/bm.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, bm_name, config.MONITOR_BOOTMANAGER_LOG, self.node))
104 log = open("%s/bm.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, self.node), 'r')
# Run the BootManager initialization and node-configuration steps on the
# remote node and print every entry of the resulting bm.VARS.
# NOTE(review): listing lines 108, 111, 114, 118-119, 122-124, 126-127, 130
# and 132 are absent from this excerpt; they presumably hold the definition of
# the bare name `c` used below and the except/else structure around the prints
# -- TODO confirm.
107 def dump_plconf_file(self):
# Make /tmp/source importable and the working directory on the remote side.
109 self.c.modules.sys.path.append("/tmp/source/")
110 self.c.modules.os.chdir('/tmp/source')
# Remote BootManager instance created with a log at /tmp/new.log and the
# argument 'boot'.
112 log = c.modules.BootManager.log('/tmp/new.log')
113 bm = c.modules.BootManager.BootManager(log,'boot')
# Local shortcuts to the remote exception class and step modules.
115 BootManagerException = c.modules.Exceptions.BootManagerException
116 InitializeBootManager = c.modules.BootManager.InitializeBootManager
117 ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
120 InitializeBootManager.Run(bm.VARS, bm.LOG)
121 try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
125 print " Possibly, unable to find valid configuration file"
# Dump all variables held in bm.VARS.
128 for key in bm.VARS.keys():
129 print key, " == ", bm.VARS[key]
131 print " Unable to read Node Configuration"
# Run a forced fsck on /dev/planetlab/root and /dev/planetlab/vservers on the
# node, then run BootManager.py with the boot state reported by
# get_nodestate(). /tmp/BM_RUNNING serves as a marker: it is checked before
# starting, touched at the start of the shell command and removed at its end.
# NOTE(review): listing lines 134, 137, 143, 150 and 154-155 are absent from
# this excerpt, including the end of the shell command string and whatever
# separates the "already running" message from the fsck path.
133 def fsck_repair_node(self):
135 self.c.modules.sys.path.append("/tmp/source/")
136 self.c.modules.os.chdir('/tmp/source')
138 # TODO: set boot state to node's actual boot state.
139 # could be 'boot' or 'safeboot'
140 self.c.modules.os.chdir('/tmp/source')
141 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
142 print "Running MANUAL FSCK already... try again soon."
144 print "Running MANUAL fsck on %s" % self.node
145 cmd = "( touch /tmp/BM_RUNNING ; " + \
146 " fsck -v -f -y /dev/planetlab/root &> out.fsck ; " + \
147 " fsck -v -f -y /dev/planetlab/vservers >> out.fsck 2>&1 ; " + \
148 " python ./BootManager.py %s &> server.log < /dev/null ; " + \
149 " rm -f /tmp/BM_RUNNING " + \
# The single %s in the command is filled with the node's boot state.
151 cmd = cmd % self.get_nodestate()
152 self.c.modules.os.system(cmd)
153 #self.restart_bootmanager('boot')
# Read NODE_KEY from the node's configuration (via the remote BootManager
# steps), compare it with the key stored for the node in plccache, and call
# api.UpdateNode with the node's key.
# NOTE(review): many listing lines are absent from this excerpt (157, 160,
# 163, 167-168, 170, 173-176, 178-179, 182, 184-185, 188-191, 194, 196), so
# the branch structure, the return values and the definition of the bare name
# `c` are not visible. restore_basic() only tests the result for truthiness.
156 def compare_and_repair_nodekeys(self):
158 self.c.modules.sys.path.append("/tmp/source/")
159 self.c.modules.os.chdir('/tmp/source')
161 log = c.modules.BootManager.log('/tmp/new.log')
162 bm = c.modules.BootManager.BootManager(log,'boot')
# Local shortcuts to the remote exception class and step modules.
164 BootManagerException = c.modules.Exceptions.BootManagerException
165 InitializeBootManager = c.modules.BootManager.InitializeBootManager
166 ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
# PLC's cached record for this node; its 'key' field is compared below.
169 plcnode = plccache.GetNodeByName(self.node)
171 InitializeBootManager.Run(bm.VARS, bm.LOG)
172 try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
177 print " Possibly, unable to find valid configuration file"
180 print " NODE: %s" % bm.VARS['NODE_KEY']
181 print " PLC : %s" % plcnode['key']
183 if bm.VARS['NODE_KEY'] == plcnode['key']:
# Push the key found on the node to PLC (presumably only when the keys differ
# -- the `else:` line is not visible; TODO confirm).
186 if api.UpdateNode(self.node, {'key': bm.VARS['NODE_KEY']}):
187 print " Successfully updated NODE_KEY with PLC"
192 #for key in bm.VARS.keys():
193 # print key, " == ", bm.VARS[key]
195 print " Unable to retrieve NODE_KEY"
# Test for the /tmp/BM_RUNNING marker file on the node.
# NOTE(review): the return statements (listing lines 199-202) are absent from
# this excerpt; restore_basic() uses the result as a boolean.
197 def bootmanager_running(self):
198 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
def set_nodestate(self, state='boot'):
    """Set this node's boot_state at PLC to `state` and return the api.UpdateNode result."""
    boot_state_fields = {'boot_state': state}
    return api.UpdateNode(self.node, boot_state_fields)
# Return the node's boot_state: first from PLC through api.GetNodes, otherwise
# from the plc_node_stats of the latest FindbadNodeRecord.
# NOTE(review): the try/except lines (listing 207 and 209) are absent from this
# excerpt; the fallback presumably runs when the API call raises -- TODO
# confirm which exception types are caught.
206 def get_nodestate(self):
208 return api.GetNodes(self.node, ['boot_state'])[0]['boot_state']
210 traceback.print_exc()
211 # NOTE: use last cached value from plc
212 fbnode = FindbadNodeRecord.get_latest_by(hostname=self.node).to_dict()
213 return fbnode['plc_node_stats']['boot_state']
# Set the node's boot_state at PLC to `state`, then reboot the node.
# Two reboot paths are visible: when the 'gentlekill' flag has not been set
# recently, slice processes are killed and `shutdown -r +1` is issued, after
# which the flag is set; the other path triggers sysrq 's', 'u', 'b' after a
# 5 second sleep.
# NOTE(review): listing lines 218, 227, 229-230 and 234-236 are absent from
# this excerpt, so the exact branch boundaries (the `else:`) and any return
# value are not visible.
216 def restart_node(self, state='boot'):
217 api.UpdateNode(self.node, {'boot_state' : state})
# Persistent flags stored in 'restart_persistflags'; the second argument is
# 1*60*60*24 (presumably a one-day window in seconds -- TODO confirm).
219 pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags')
220 if not pflags.getRecentFlag('gentlekill'):
221 print " Killing all slice processes... : %s" % self.node
222 cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
223 self.c.modules.os.system(cmd_slicekill)
224 cmd = """ shutdown -r +1 & """
225 print " Restarting %s : %s" % ( self.node, cmd)
226 self.c.modules.os.system(cmd)
228 pflags.setRecentFlag('gentlekill')
231 print " Restarting with sysrq 'sub' %s" % self.node
232 cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """
233 self.c.modules.os.system(cmd)
# Start `BootManager.py <forceState>` from /tmp/source on the node.
# /tmp/BM_RUNNING is checked first; the shell command touches it before
# running BootManager.py and removes it afterwards.
# NOTE(review): listing lines 238, 242, 247 and those after 249 are absent
# from this excerpt, including the end of the command string and what
# separates the "already running" message from the start path.
237 def restart_bootmanager(self, forceState):
239 self.c.modules.os.chdir('/tmp/source')
240 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
241 print " BootManager is already running: try again soon..."
243 print " Starting 'BootManager.py %s' on %s " % (forceState, self.node)
244 cmd = "( touch /tmp/BM_RUNNING ; " + \
245 " python ./BootManager.py %s &> server.log < /dev/null ; " + \
246 " rm -f /tmp/BM_RUNNING " + \
# The single %s in the command is filled with the requested state.
248 cmd = cmd % forceState
249 self.c.modules.os.system(cmd)
# Session for one node: copies the Rpyc files to it, starts an Rpyc server
# there and opens an ssh tunnel from a local port to the node's port 18812.
# NOTE(review): many listing lines inside this class are absent from this
# excerpt (try/except/else lines, some `def` headers, variable
# initializations such as `args`, `timeout`, `ssh_port`, `t1`, `t2`).
254 class PlanetLabSession:
# Class-wide counter for the local tunnel port; starts at a random value in
# 22000-22999 and is incremented by setup_host() for every session.
255 globalport = 22000 + int(random.random()*1000)
# NOTE(review): listing lines 259-260 and 262-264 of the constructor are
# absent; only the verbose/nosetup assignments are visible.
257 def __init__(self, node, nosetup, verbose):
258 self.verbose = verbose
261 self.nosetup = nosetup
# Build a NodeConnection over a SocketConnection to localhost:self.port.
# A second attempt is visible further down; the try/except and return lines
# are not.
265 def get_connection(self, config):
267 print "SocketConnection(localhost, %s" % self.port
268 sc = SocketConnection("localhost", self.port)
269 print "NodeConnection(%s, %s)" % (sc, self.node)
270 conn = NodeConnection(sc, self.node, config)
272 # NOTE: try twice since this can sometimes fail the first time. If
273 # it fails again, let it go.
274 conn = NodeConnection(SocketConnection("localhost", self.port), self.node, config)
# Prepare the node and the tunnel: reserve a local port, rsync the Rpyc files,
# restart the remote Rpyc server and open the ssh port forward.
277 def setup_host(self):
278 self.port = PlanetLabSession.globalport
279 PlanetLabSession.globalport = PlanetLabSession.globalport + 1
# Values substituted into the rsync/ssh command templates below.
282 args['port'] = self.port
283 args['user'] = 'root'
284 args['hostname'] = self.node
285 args['monitordir'] = config.MONITOR_SCRIPT_ROOT
289 print "Skipping setup"
292 # COPY Rpyc files to host
293 #cmd = "rsync -vvv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
294 cmd = """rsync -vvv -az -e "ssh -o BatchMode=yes" %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc""" % args
295 if self.verbose: print cmd
299 localos = moncommands.CMD()
301 ret = localos.system(cmd, timeout)
# Retry path: update the node's known_hosts entry directly, then run the same
# rsync command once more (the condition guarding this path is not visible).
304 print "\tUNKNOWN SSH KEY FOR %s; making an exception" % self.node
305 #print "MAKE EXPLICIT EXCEPTION FOR %s" % self.node
306 k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k
307 print "trying: ", cmd
308 print [ "%s=%s" % (a, os.environ[a]) for a in filter(lambda x: 'SSH' in x, os.environ.keys()) ]
309 ret = localos.system(cmd, timeout)
312 print "\tFAILED TWICE"
313 #email_exception("%s rsync failed twice" % self.node)
314 raise ExceptionDoubleSSHError("Failed twice trying to login with updated ssh host key")
# The shell script passed to ssh below kills processes matching port 18812 or
# "Rpyc", exports PYTHONPATH=$HOME and starts Rpyc/Servers/forking_server.py
# in the background, logging progress to out.log.
317 # KILL any already running servers.
318 ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port)
319 (ov,ev) = ssh.run_noexcept2("""<<\EOF
321 echo "kill server" >> out.log
322 netstat -ap | grep python | grep 18812 | awk '{print $7}' | awk -F / '{print $1}' | xargs kill
323 ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ;
324 echo "export" >> out.log
325 export PYTHONPATH=$HOME ;
326 echo "start server" >> out.log
327 python Rpyc/Servers/forking_server.py &> server.log &
328 echo "done" >> out.log
330 print "setup rpyc server over ssh"
# Tunnel command: forwards local %(port)s to localhost:18812 on the node and
# echoes "READY" through ssh's LocalCommand option.
334 # This was tricky to make synchronous. The combination of ssh-clients-4.7p1,
335 # and the following options seems to work well.
336 cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \
337 """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \
338 """-o ConnectTimeout=120 """ + \
339 """-n -N -L %(port)s:localhost:18812 """ + \
340 """%(user)s@%(hostname)s"""
342 if self.verbose: print cmd
344 self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
345 # TODO: the read() here may block indefinitely. Need a better
346 # approach therefore, that includes a timeout.
347 #ret = self.command.stdout.read(5)
# Read up to 5 bytes from the tunnel's stdout (the 5 letters of "READY").
348 ret = moncommands.read_t(self.command.stdout, 5)
# Wait twice the measured t2-t1 interval; DebugInterface.getConnection() later
# reuses self.timeout for its own retry delay.
352 # NOTE: There is still a slight race for machines that are slow...
353 self.timeout = 2*(t2-t1)
354 print "Sleeping for %s sec" % self.timeout
355 time.sleep(self.timeout)
# Failure paths: the ssh process already exited, or it is still running but
# did not report READY.
358 if self.command.returncode is not None:
359 print "Failed to establish tunnel!"
360 raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode))
362 raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'")
# NOTE(review): the `def` line for the following teardown code (listing line
# 365) and the statement that actually ends the ssh process are absent.
366 if self.verbose: print "Killing SSH session %s" % self.port
367 print "Killing SSH session %s" % self.port
def steps_to_list(steps, index=1):
    """Return a list holding element `index` of every entry in `steps`.

    `steps` is a sequence of indexable entries, such as the (id, pattern)
    tuples built by the step tables in this module; the default index of 1
    selects the pattern of each pair.

    A list comprehension is used instead of map() so that the result is a
    real list on every Python version: callers concatenate it with
    `+ [pexpect.EOF]`, which fails on a lazy map object.
    """
    return [step[index] for step in steps]
# Translate an index (as returned by child.expect() in the callers) into the
# id stored as the first element of steps[index].
# NOTE(review): the branch for index >= len(steps) (listing lines 377-378) is
# absent from this excerpt; callers compare the id against "done", which is
# presumably what is returned for the extra pexpect.EOF pattern -- TODO
# confirm.
374 def index_to_id(steps,index):
375 if index < len(steps):
376 return steps[index][0]
# Helper bound to one hostname: opens a NodeConnection to the node and
# supplies the pattern tables used to scan its dmesg output and BootManager
# log.
# NOTE(review): many listing lines inside this class are absent from this
# excerpt (try/except/else lines, loop headers, list openers and return
# statements); comments below describe only what the visible lines show.
380 class DebugInterface:
381 def __init__(self, hostname):
382 self.hostname = hostname
# Create a PlanetLabSession for self.hostname and obtain a NodeConnection from
# it, with a second attempt after a longer sleep. restore_basic() treats a
# bool result as a failed connection; the return statements themselves are
# not visible here.
385 def getConnection(self):
386 print "Creating session for %s" % self.hostname
387 # update known_hosts file (in case the node has rebooted since last run)
389 k = SSHKnownHosts(); k.update(self.hostname); k.write(); del k
392 print traceback.print_exc()
395 msg = "ERROR setting up session for %s" % self.hostname
# Two ways of building the session are visible: fixed (nosetup=False,
# verbose=True) and taken from config; the condition choosing between them is
# on an absent line.
398 self.session = PlanetLabSession(self.hostname, False, True)
400 self.session = PlanetLabSession(self.hostname, config.nosetup, config.verbose)
401 except ExceptionDoubleSSHError, e:
405 traceback.print_exc()
409 print "Getting connection: 1st try"
411 conn = self.session.get_connection(config)
413 # NOTE: sometimes the wait in setup_host() is not long enough.
414 # So, here we try to wait a little longer before giving up entirely.
416 print "Getting connection: 2nd try"
417 time.sleep(self.session.timeout*5)
418 conn = self.session.get_connection(config)
420 # failed twice... no need to report this really, it's just in a
422 print "Getting connection: failed"
423 email_exception(self.hostname, "failed twice to get connection")
426 traceback.print_exc()
427 email_exception(self.hostname)
429 print "Getting connection: ok"
430 #print "trying to use conn before returning it."
431 #print conn.c.modules.sys.path
432 #print conn.c.modules.os.path.exists('/tmp/source')
435 #print "conn: %s" % conn
# Build a mapping from sequence string (s.sequence) to action name (s.action)
# out of all BootmanSequenceRecord rows. The loop header, the dict creation
# and the return are on absent lines.
438 def getSequences(self):
440 # NOTE: The DB is now the authoritative record for all BM sequences.
441 # An admin can introduce new patterns and actions without touching code.
444 bms = BootmanSequenceRecord.query.all()
446 sequences[s.sequence] = s.action
# (id, regex) pairs for disk, filesystem and floppy error messages;
# restore_basic() runs them against the node's dmesg output. Sample messages
# are kept in the trailing comments.
450 def getDiskSteps(self):
452 ('scsierror' , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
453 ('ioerror' , 'end_request: I/O error, dev sd\w+, sector \d+'),
454 ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION'),
456 ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
458 ('hdaseekerror', 'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'),
459 ('hdacorrecterror', 'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'),
461 ('atareadyerror' , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
462 ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),
464 ('sdXerror' , 'sd\w: Current: sense key: Medium Error'),
465 ('ext3error' , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),
467 ('floppytimeout','floppy0: floppy timeout called'),
468 ('floppyerror', 'end_request: I/O error, dev fd\w+, sector \d+'),
470 # hda: dma_intr: status=0x51 { DriveReady SeekComplete Error }
471 # hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263
473 # floppy0: floppy timeout called
474 # end_request: I/O error, dev fd0, sector 0
476 # Buffer I/O error on device dm-2, logical block 8888896
477 # ata1: status=0x51 { DriveReady SeekComplete Error }
478 # ata1: error=0x40 { UncorrectableError }
479 # SCSI error : <0 0 0 0> return code = 0x8000002
480 # sda: Current: sense key: Medium Error
481 # Additional sense: Unrecovered read error - auto reallocate failed
483 # SCSI error : <0 2 0 0> return code = 0x40001
484 # end_request: I/O error, dev sda, sector 572489600
# Match the step patterns plus pexpect.EOF against `child` and translate the
# matched index into a step id. The surrounding loop and the return are on
# absent lines.
488 def getDiskSequence(self, steps, child):
491 id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
# (id, pattern) pairs for messages looked for in the BootManager log.
# Some patterns occur under more than one id (e.g. 'update'/'update2',
# 'installinit'/'installinit2').
498 def getBootManagerStepPatterns(self):
500 ('bminit' , 'Initializing the BootManager.'),
501 ('cfg' , 'Reading node configuration file.'),
502 ('auth' , 'Authenticating node with PLC.'),
503 ('getplc' , 'Retrieving details of node from PLC.'),
504 ('update' , 'Updating node boot state at PLC.'),
505 ('hardware' , 'Checking if hardware requirements met.'),
506 ('installinit' , 'Install: Initializing.'),
507 ('installdisk' , 'Install: partitioning disks.'),
508 ('installbootfs', 'Install: bootstrapfs tarball.'),
509 ('installcfg' , 'Install: Writing configuration files.'),
510 ('installstop' , 'Install: Shutting down installer.'),
511 ('update2' , 'Updating node boot state at PLC.'),
512 ('installinit2' , 'Install: Initializing.'),
513 ('validate' , 'Validating node installation.'),
514 ('rebuildinitrd', 'Rebuilding initrd'),
515 ('netcfg' , 'Install: Writing Network Configuration files.'),
516 ('update3' , 'Updating node configuration.'),
517 ('disk' , 'Checking for unused disks to add to LVM.'),
518 ('update4' , 'Sending hardware configuration to PLC.'),
519 ('debug' , 'Starting debug mode'),
520 ('bmexceptmount', 'BootManagerException during mount'),
521 ('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'),
522 ('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
523 ('exception' , 'Exception'),
524 ('nocfg' , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
525 ('protoerror2' , '500 Internal Server Error'),
526 ('protoerror' , 'XML RPC protocol error'),
527 ('nodehostname' , 'Configured node hostname does not resolve'),
528 ('implementerror', 'Implementation Error'),
529 ('fsckabort' , 'is mounted. e2fsck: Cannot continue, aborting'),
530 ('fsckfail' , 'Running e2fsck -v -p /dev/planetlab/root failed'),
531 ('fsckfail2' , 'Running e2fsck -v -p /dev/planetlab/vservers failed'),
532 ('readonlyfs' , '\[Errno 30\] Read-only file system'),
533 ('baddisk' , "IOError: \[Errno 13\] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"),
534 ('noinstall' , 'notinstalled'),
535 ('bziperror' , 'bzip2: Data integrity error when decompressing.'),
536 ('noblockdev' , "No block devices detected."),
537 ('missingkernel', "missingkernel"),
538 ('dnserror' , 'Name or service not known'),
539 ('noparseconfig', "Found configuration file plnode.txt on floppy, but was unable to parse it"),
540 ('noconfig' , "Unable to find and read a node configuration file"),
541 ('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
542 ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
543 ('hardwarerequirefail' , 'Hardware requirements not met'),
544 ('mkfsfail' , 'while running: Running mkfs.ext2 -q -m 0 -j /dev/planetlab/vservers failed'),
545 ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
546 ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
547 ('chrootfail' , 'Running chroot /tmp/mnt/sysimg'),
548 ('modulefail' , 'Unable to get list of system modules'),
549 ('writeerror' , 'write error: No space left on device'),
550 ('nospace' , "No space left on device"),
551 ('nonode' , 'Failed to authenticate call: No such node'),
552 ('authfail' , 'Failed to authenticate call: Call could not be authenticated'),
553 ('authfail2' , 'Authentication Failed'),
554 ('bootcheckfail' , 'BootCheckAuthentication'),
555 ('bootupdatefail' , 'BootUpdateNode'),
# Match the step patterns plus pexpect.EOF against `child` and translate each
# matched index into a step id; the ids "exception" and "done" get special
# handling. The loop header, the handling bodies and the return are on absent
# lines.
559 def getBootManagerSequenceFromLog(self, steps, child):
563 index = child.expect( steps_to_list(steps) + [ pexpect.EOF ])
564 id = index_to_id(steps,index)
567 if id == "exception":
568 print "...Found An Exception!!!"
569 elif id == "done": #index == len(steps_to_list(steps)):
# Entry point that forwards all of its arguments to restore_basic().
# NOTE(review): listing lines 577-579 are absent from this excerpt, so what is
# done with `ret` (presumably it is returned -- TODO confirm) is not visible.
575 def restore(sitehist, hostname, config=None, forced_action=None):
576 ret = restore_basic(sitehist, hostname, config, forced_action)
# Try to bring a node that is stuck in debug state back into service.
# Visible flow: handle an out-of-date boot image, connect to the node, skip it
# unless it is in "debug" state with no BootManager running, scan dmesg for
# disk errors, download and scan bm.log, then look the resulting step sequence
# up in the sequence table and carry out the associated action.
# Visible return values: "connect_failed", "skipped", "skipping_baddisk",
# "collect", and finally bootman_action (initialised to "unknown").
# NOTE(review): many listing lines are absent from this excerpt (try/except/
# else lines, `args = {}` initialisations, several returns and assignments),
# and `forced_action` is not referenced on any visible line.
580 def restore_basic(sitehist, hostname, config=None, forced_action=None):
582 # NOTE: Nothing works if the bootcd is REALLY old.
583 # So, this is the first step.
585 bootman_action = "unknown"
587 fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
588 recent_actions = sitehist.getRecentActions(hostname=hostname)
# Out-of-date boot image: notify the owner unless a notice was sent recently
# (the 3.5 passed to found_within is presumably a number of days -- TODO
# confirm), then disable the node at PLC.
590 if fbnode['observed_category'] == "OLDBOOTCD":
591 print "\t...Notify owner to update BootImage!!!"
593 if not found_within(recent_actions, 'newbootcd_notice', 3.5):
594 sitehist.sendMessage('newbootcd_notice', hostname=hostname)
596 print "\tDisabling %s due to out-of-date BootImage" % hostname
597 api.UpdateNode(hostname, {'boot_state' : 'disabled'})
599 # NOTE: nothing else is possible.
# Connect to the node; a bool result from getConnection() means failure.
602 debugnode = DebugInterface(hostname)
603 conn = debugnode.getConnection()
604 if type(conn) == type(False): return "connect_failed"
606 boot_state = conn.get_boot_state()
607 if boot_state != "debug":
608 print "... %s in %s state: skipping..." % (hostname , boot_state)
609 return "skipped" #boot_state == "boot"
611 if conn.bootmanager_running():
612 print "...BootManager is currently running. Skipping host %s" %hostname
613 return "skipped" # True
615 # Read persistent flags, tagged on one week intervals.
# Scan the node's dmesg output for the disk error patterns.
617 if config and not config.quiet: print "...downloading dmesg from %s" %hostname
618 dmesg = conn.get_dmesg()
619 child = fdpexpect.fdspawn(dmesg)
621 steps = debugnode.getDiskSteps()
622 sequence = debugnode.getDiskSequence(steps, child)
625 if config and not config.quiet: print "\tSET: ", s
# `s` is assigned on an absent line; the checks below treat a two-element `s`
# containing 'floppyerror' as safe to continue with.
628 print "...Potential drive errors on %s" % hostname
629 if len(s) == 2 and 'floppyerror' in s:
630 print "...Should investigate. Continuing with node."
632 print "...Should investigate. Skipping node."
633 # TODO: send message related to these errors.
635 if not found_within(recent_actions, 'baddisk_notice', 7):
636 print "baddisk_notice not found recently"
638 log=conn.get_dmesg().read()
639 sitehist.sendMessage('baddisk_notice', hostname=hostname, log=log)
640 return "skipping_baddisk"
642 # NOTE: "" does not add a new action record
# Download bm.log; keep its text for notices and rewind the file so that
# fdspawn can scan it from the start.
646 print "...Downloading bm.log from %s" %hostname
647 log = conn.get_bootmanager_log()
648 bm_log_data = log.read() # get data
649 log.seek(0) # reset fd pointer for fdspawn
650 child = fdpexpect.fdspawn(log)
# With config.collect set, stop after the log has been collected.
652 if hasattr(config, 'collect') and config.collect: return "collect"
654 if config and not config.quiet: print "...Scanning bm.log for errors"
658 steps = debugnode.getBootManagerStepPatterns()
659 sequence = debugnode.getBootManagerSequenceFromLog(steps, child)
# The step ids joined with "-" form the key looked up in the sequence table.
661 s = "-".join(sequence)
662 print " FOUND SEQUENCE: ", s
664 # NOTE: We get or set the flag based on the current sequence identifier.
665 # By using the sequence identifier, we guarantee that there will be no
666 # frequent loops. I'm guessing there is a better way to track loops,
669 sequences = debugnode.getSequences()
# Unknown sequence: send an 'unknownsequence_notice' carrying the log text and
# restart BootManager in 'boot' state.
672 if s not in sequences:
673 print " HOST %s" % hostname
674 print " UNKNOWN SEQUENCE: %s" % s
677 args['hostname'] = hostname
679 args['bmlog'] = bm_log_data
680 args['viart'] = False
681 args['saveact'] = True
682 args['ccemail'] = True
684 sitehist.sendMessage('unknownsequence_notice', **args)
686 conn.restart_bootmanager('boot')
688 bootman_action = "restart_bootmanager"
690 # NOTE: Do not set the pflags value for this sequence if it's unknown.
691 # This way, we can check it again after we've fixed it.
# Known sequence: the action name stored for it selects one of the branches
# below.
695 bootman_action = sequences[s]
697 if sequences[s] == "restart_bootmanager_boot":
698 print "...Restarting BootManager.py on %s "%hostname
699 conn.restart_bootmanager('boot')
700 elif sequences[s] == "restart_bootmanager_rins":
701 print "...Restarting BootManager.py on %s "%hostname
702 conn.restart_bootmanager('reinstall')
703 elif sequences[s] == "restart_node_rins":
704 conn.restart_node('reinstall')
705 elif sequences[s] == "restart_node_boot":
706 conn.restart_node('boot')
707 elif sequences[s] == "fsck_repair":
708 conn.fsck_repair_node()
709 elif sequences[s] == "repair_node_keys":
710 if conn.compare_and_repair_nodekeys():
711 # the keys either are in sync or were forced in sync.
712 # so try to start BM again.
713 conn.restart_bootmanager(conn.get_nodestate())
715 # there was some failure to synchronize the keys.
716 print "...Unable to repair node keys on %s" %hostname
717 if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
719 args['hostname'] = hostname
720 sitehist.sendMessage('nodeconfig_notice', **args)
721 conn.dump_plconf_file()
723 # NOTE: do not add a new action record
726 elif sequences[s] == "unknownsequence_notice":
728 args['hostname'] = hostname
730 args['bmlog'] = bm_log_data
731 args['viart'] = False
732 args['saveact'] = True
733 args['ccemail'] = True
735 sitehist.sendMessage('unknownsequence_notice', **args)
736 conn.restart_bootmanager('boot')
738 elif sequences[s] == "nodeconfig_notice":
740 if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
742 args['hostname'] = hostname
743 sitehist.sendMessage('nodeconfig_notice', **args)
744 conn.dump_plconf_file()
746 # NOTE: do not add a new action record
# "nodenetwork_email" checks for and sends the 'nodeconfig_notice' message,
# here with the BootManager log text attached.
749 elif sequences[s] == "nodenetwork_email":
751 if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
753 args['hostname'] = hostname
754 args['bmlog'] = bm_log_data
755 sitehist.sendMessage('nodeconfig_notice', **args)
756 conn.dump_plconf_file()
758 # NOTE: do not add a new action record
761 elif sequences[s] == "noblockdevice_notice":
763 if not found_within(recent_actions, 'noblockdevice_notice', 3.5):
765 #args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
766 args['hostname'] = hostname
768 sitehist.sendMessage('noblockdevice_notice', **args)
770 # NOTE: do not add a new action record
773 elif sequences[s] == "baddisk_notice":
774 # MAKE An ACTION record that this host has failed hardware. May
775 # require either an exception "/minhw" or other manual intervention.
776 # Definitely need to send out some more EMAIL.
777 # TODO: email notice of broken hardware
778 if not found_within(recent_actions, 'baddisk_notice', 7):
779 print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
781 args['hostname'] = hostname
782 args['log'] = conn.get_dmesg().read()
784 sitehist.sendMessage('baddisk_notice', **args)
785 #conn.set_nodestate('disabled')
787 # NOTE: do not add a new action record
790 elif sequences[s] == "minimalhardware_notice":
791 if not found_within(recent_actions, 'minimalhardware_notice', 7):
792 print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
794 args['hostname'] = hostname
795 args['bmlog'] = bm_log_data
796 sitehist.sendMessage('minimalhardware_notice', **args)
798 # NOTE: do not add a new action record
801 elif sequences[s] == "baddns_notice":
802 if not found_within(recent_actions, 'baddns_notice', 1):
803 print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
# The DNS notice includes the node's first interface as reported by PLC.
806 node = plccache.GetNodeByName(hostname)
807 net = api.GetInterfaces(node['interface_ids'])[0]
810 print traceback.print_exc()
811 # TODO: api error. skip email, b/c all info is not available,
812 # flag_set will not be recorded.
814 nodenet_str = network_config_to_str(net)
816 args['hostname'] = hostname
817 args['network_config'] = nodenet_str
818 args['interface_id'] = net['interface_id']
820 sitehist.sendMessage('baddns_notice', **args)
822 # NOTE: do not add a new action record
825 return bootman_action
828 if __name__ == "__main__":
829 print "ERROR: Can not execute module as a command! Please use commands/%s.py" % os.path.splitext(__file__)[0]