3 # Attempt to reboot a node in debug state.
14 from monitor.util.sshknownhosts import SSHKnownHosts
15 from monitor.Rpyc import SocketConnection, Async
16 from monitor.Rpyc.Utils import *
18 from monitor import getconf
19 from monitor import config
20 from monitor import const
21 from monitor.model import *
22 from monitor.common import email_exception, found_within
23 from monitor.database.info.model import *
24 from monitor.database.info.interface import *
25 from monitor.wrapper import plc
26 from monitor.wrapper import plccache
27 from monitor.wrapper.emailTxt import mailtxt
28 from monitor.nodeconfig import network_config_to_str
30 from pcucontrol.util import command as moncommands
31 from pcucontrol.util.command import Sopen
32 from pcucontrol.transports.ssh import pxssh as pxssh
33 from pcucontrol.transports.ssh import fdpexpect as fdpexpect
34 from pcucontrol.transports.ssh import pexpect as pexpect
# Module-level PLC API handle obtained from plc.getAuthAPI() at import time.
# The node helpers in this file use it for UpdateNode / GetNodes /
# GetInterfaces calls.
36 api = plc.getAuthAPI()
def bootmanager_log_name(hostname):
    """Return the relative path under which a node's BootManager log is archived.

    The result has the form ``history/<YYYY-MM-DD-HH:MM>-bm.<hostname>.log``.
    The timestamp is taken from the local clock at call time, so two calls in
    different minutes yield different names.  The path is relative; callers
    prepend their own log root directory.
    """
    stamp = time.strftime("%Y-%m-%d-%H:%M")
    return os.path.join('history', "%s-bm.%s.log" % (stamp, hostname))
# Record that a BootManager log was collected for `hostname` by building an
# ActionRecord that points at `short_log_path`.
# NOTE(review): this view of the function has gaps (the lines around the
# lookup, the remaining ActionRecord arguments and any use of `logtype` are
# not visible); the comments describe only the visible statements.
45 def bootmanager_log_action(hostname, short_log_path, logtype="bm.log"):
# Resolve the site's login_base: take the latest FindbadNodeRecord for the
# host, then the PlcSite row for the site_id stored in its plc_node_stats.
47 node = FindbadNodeRecord.get_latest_by(hostname=hostname)
48 loginbase = PlcSite.query.get(node.plc_node_stats['site_id']).plc_site_stats['login_base']
# Formatted traceback text; presumably captured on a failure path -- confirm.
52 err = traceback.format_exc()
# Build the action record for the site, carrying the short log path.
54 act = ActionRecord(loginbase=loginbase,
58 log_path=short_log_path,
class ExceptionDoubleSSHError(Exception):
    """Raised when logging in to a node over ssh fails a second time.

    ``PlanetLabSession.setup_host`` raises it with the message "Failed twice
    trying to login with updated ssh host key"; ``DebugInterface.getConnection``
    has an ``except`` clause for it.
    """
# Constructor of the node-connection wrapper; elsewhere in this file it is
# built as NodeConnection(<SocketConnection>, <hostname>, <config>).
# NOTE(review): the class header and the attribute assignments are not visible
# in this view.  The sibling methods read self.c and self.node, which are
# presumably set here from `connection` and `node` -- confirm in the full file.
66 def __init__(self, connection, node, config):
67 print "init nodeconnection"
# Classify the node's current environment by probing marker paths through the
# connection's `os` module proxy (self.c.modules.os).
# NOTE(review): the value returned for each branch is not visible in this
# view; restore_basic() compares the result against "debug".
72 def get_boot_state(self):
73 print "get_boot_state(self)"
# First marker checked: /tmp/source exists.
75 if self.c.modules.os.path.exists('/tmp/source'):
# Second marker checked: /vservers exists.
77 elif self.c.modules.os.path.exists('/vservers'):
# Diagnostic dump of sys.path as seen through the connection.
83 print self.c.modules.sys.path
# NOTE(review): the `def` line for this body is not visible in this view;
# restore_basic() calls conn.get_dmesg(), which is presumably this method.
# Visible steps: write `dmesg` output to /var/log/dmesg.bm.log via the
# connection, download it to a timestamped file under
# <MONITOR_BOOTMANAGER_LOG>/history/, copy that file to the stable name
# dmesg.<node>.log, and open the copy for reading.
91 t_stamp = time.strftime("%Y-%m-%d-%H:%M")
92 self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
93 download(self.c, "/var/log/dmesg.bm.log", "%s/history/%s-dmesg.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node))
# Local copy (runs on the monitor host, not through the connection).
94 os.system("cp %s/history/%s-dmesg.%s.log %s/dmesg.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node, config.MONITOR_BOOTMANAGER_LOG, self.node))
95 log = open("%s/dmesg.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, self.node), 'r')
# Fetch the node's BootManager log (/tmp/bm.log) and open a local copy.
# Steps: download into <MONITOR_BOOTMANAGER_LOG>/<history name from
# bootmanager_log_name>, record a "collected_bm.log" action for it, copy it to
# the stable name bm.<node>.log, and open that copy for reading.
# NOTE(review): the return statement is not visible in this view;
# restore_basic() uses the result as a file object (read/seek).
98 def get_bootmanager_log(self):
99 bm_name = bootmanager_log_name(self.node)
100 download(self.c, "/tmp/bm.log", "%s/%s" % (config.MONITOR_BOOTMANAGER_LOG, bm_name))
101 #email_exception(self.node, "collected BM log for %s" % self.node)
102 bootmanager_log_action(self.node, bm_name, "collected_bm.log")
# Local copy on the monitor host.
103 os.system("cp %s/%s %s/bm.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, bm_name, config.MONITOR_BOOTMANAGER_LOG, self.node))
104 log = open("%s/bm.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, self.node), 'r')
# Print the configuration variables that the node's BootManager reads.
# NOTE(review): this view has gaps -- the binding of `c` (presumably
# c = self.c), the except/else lines and any return value are not visible.
107 def dump_plconf_file(self):
# Make the BootManager sources under /tmp/source importable and current.
109 self.c.modules.sys.path.append("/tmp/source/")
110 self.c.modules.os.chdir('/tmp/source')
# Instantiate BootManager through the connection, logging to /tmp/new.log,
# with 'boot' as its state argument.
112 log = c.modules.BootManager.log('/tmp/new.log')
113 bm = c.modules.BootManager.BootManager(log,'boot')
# Local aliases for the BootManager exception type and the two steps run below.
115 BootManagerException = c.modules.Exceptions.BootManagerException
116 InitializeBootManager = c.modules.BootManager.InitializeBootManager
117 ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
# Run the initialization step, then try to read the node configuration.
120 InitializeBootManager.Run(bm.VARS, bm.LOG)
121 try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
125 print " Possibly, unable to find valid configuration file"
# Dump every variable BootManager collected.
128 for key in bm.VARS.keys():
129 print key, " == ", bm.VARS[key]
131 print " Unable to read Node Configuration"
# Run a manual fsck of the node's root and vservers volumes, then start
# BootManager again, unless the /tmp/BM_RUNNING marker shows a run in progress.
# NOTE(review): the else/return lines and the end of the shell command string
# are not visible in this view.
133 def fsck_repair_node(self):
135 self.c.modules.sys.path.append("/tmp/source/")
136 self.c.modules.os.chdir('/tmp/source')
138 # TODO: set boot state to node's actually boot state.
139 # could be 'boot' or 'safeboot'
140 self.c.modules.os.chdir('/tmp/source')
# /tmp/BM_RUNNING is the marker touched and removed by the command below.
141 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
142 print "Running MANUAL FSCK already... try again soon."
144 print "Running MANUAL fsck on %s" % self.node
# Shell pipeline: create the marker, fsck both volumes (output to out.fsck),
# run BootManager.py with the state substituted for %s, then remove the marker.
# The %s is filled with the node's boot state from get_nodestate().
145 cmd = "( touch /tmp/BM_RUNNING ; " + \
146 " fsck -v -f -y /dev/planetlab/root &> out.fsck ; " + \
147 " fsck -v -f -y /dev/planetlab/vservers >> out.fsck 2>&1 ; " + \
148 " python ./BootManager.py %s &> server.log < /dev/null ; " + \
149 " rm -f /tmp/BM_RUNNING " + \
151 cmd = cmd % self.get_nodestate()
152 self.c.modules.os.system(cmd)
153 #self.restart_bootmanager('boot')
# Compare the NODE_KEY read from the node's configuration with the key PLC
# holds for the node, and push the node's key to PLC via api.UpdateNode.
# NOTE(review): this view has gaps -- the binding of `c` (presumably
# c = self.c), the except/else lines and the return values are not visible.
# restore_basic() treats a truthy result as "keys are (now) in sync".
156 def compare_and_repair_nodekeys(self):
158 self.c.modules.sys.path.append("/tmp/source/")
159 self.c.modules.os.chdir('/tmp/source')
# Instantiate BootManager through the connection, logging to /tmp/new.log.
161 log = c.modules.BootManager.log('/tmp/new.log')
162 bm = c.modules.BootManager.BootManager(log,'boot')
164 BootManagerException = c.modules.Exceptions.BootManagerException
165 InitializeBootManager = c.modules.BootManager.InitializeBootManager
166 ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
# PLC-side view of the node, taken from the plccache wrapper.
169 plcnode = plccache.GetNodeByName(self.node)
171 InitializeBootManager.Run(bm.VARS, bm.LOG)
172 try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
177 print " Possibly, unable to find valid configuration file"
# Show both keys side by side before comparing them.
180 print " NODE: %s" % bm.VARS['NODE_KEY']
181 print " PLC : %s" % plcnode['key']
183 if bm.VARS['NODE_KEY'] == plcnode['key']:
# Overwrite PLC's key with the key found on the node.
186 if api.UpdateNode(self.node, {'key': bm.VARS['NODE_KEY']}):
187 print " Successfully updated NODE_KEY with PLC"
192 #for key in bm.VARS.keys():
193 # print key, " == ", bm.VARS[key]
195 print " Unable to retrieve NODE_KEY"
# Check for the /tmp/BM_RUNNING marker file through the connection; the shell
# commands built in fsck_repair_node and restart_bootmanager touch it before
# starting BootManager.py and remove it afterwards.
# NOTE(review): the return statements are not visible in this view.
197 def bootmanager_running(self):
198 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
# Set this node's 'boot_state' at PLC to `state` (default 'boot') and return
# whatever api.UpdateNode returns.
203 def set_nodestate(self, state='boot'):
204 return api.UpdateNode(self.node, {'boot_state' : state})
# Return the node's 'boot_state'.
# NOTE(review): the try/except lines are not visible in this view.  The
# statements after traceback.print_exc() presumably form the fallback taken
# when the live PLC query fails -- confirm.
206 def get_nodestate(self):
# Live query: first record returned by api.GetNodes for this node.
208 return api.GetNodes(self.node, ['boot_state'])[0]['boot_state']
210 traceback.print_exc()
211 # NOTE: use last cached value from plc
# Cached value: boot_state stored in the latest FindbadNodeRecord.
212 fbnode = FindbadNodeRecord.get_latest_by(hostname=self.node).to_dict()
213 return fbnode['plc_node_stats']['boot_state']
# Set the node's boot_state at PLC to `state` and reboot the node.
# NOTE(review): the else/return lines are not visible in this view.  The
# sysrq block at the end is presumably the branch taken when the 'gentlekill'
# flag was already set recently -- confirm.
216 def restart_node(self, state='boot'):
217 api.UpdateNode(self.node, {'boot_state' : state})
# Per-node persistent flags; 1*60*60*24 is presumably a one-day window in
# seconds -- confirm against PersistFlags.
219 pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags')
220 if not pflags.getRecentFlag('gentlekill'):
221 print " Killing all slice processes... : %s" % self.node
# Send signal 9 via vkill to every context id listed under /proc/virtual.
222 cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
223 self.c.modules.os.system(cmd_slicekill)
# Schedule a regular reboot one minute from now, in the background.
224 cmd = """ shutdown -r +1 & """
225 print " Restarting %s : %s" % ( self.node, cmd)
226 self.c.modules.os.system(cmd)
# Remember that the gentle path has been used.
228 pflags.setRecentFlag('gentlekill')
# Forced path: after 5 seconds write 's', 'u', 'b' to /proc/sysrq-trigger.
231 print " Restarting with sysrq 'sub' %s" % self.node
232 cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """
233 self.c.modules.os.system(cmd)
# Start BootManager.py on the node with `forceState` as its argument, unless
# the /tmp/BM_RUNNING marker shows that a run is already in progress.
# NOTE(review): the else/return lines and the end of the shell command string
# are not visible in this view.
237 def restart_bootmanager(self, forceState):
239 self.c.modules.os.chdir('/tmp/source')
240 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
241 print " BootManager is already running: try again soon..."
243 print " Starting 'BootManager.py %s' on %s " % (forceState, self.node)
# Shell pipeline: create the marker, run BootManager.py (output to server.log,
# stdin from /dev/null), then remove the marker.  %s is filled with forceState.
244 cmd = "( touch /tmp/BM_RUNNING ; " + \
245 " python ./BootManager.py %s &> server.log < /dev/null ; " + \
246 " rm -f /tmp/BM_RUNNING " + \
248 cmd = cmd % forceState
249 self.c.modules.os.system(cmd)
# Session with one node: copies the Rpyc files to the node, starts an Rpyc
# server there and opens an SSH tunnel from a local port to the node's port
# 18812, over which NodeConnection objects are created.
# NOTE(review): this view of the class has many gaps (try/except/else/return
# lines, some assignments and at least one `def` line are not visible); the
# comments describe only the visible statements.
254 class PlanetLabSession:
# Class-level counter for local tunnel ports.  It starts at a random value in
# 22000..22999; setup_host() takes the current value and increments it.
255 globalport = 22000 + int(random.random()*1000)
# Store the session options.  The assignment of self.node and any call to
# setup_host() are not visible here -- presumably on the missing lines.
257 def __init__(self, node, nosetup, verbose):
258 self.verbose = verbose
261 self.nosetup = nosetup
# Build a NodeConnection over a SocketConnection to localhost:<self.port>,
# i.e. the local end of the tunnel opened by setup_host().
265 def get_connection(self, config):
267 print "SocketConnection(localhost, %s" % self.port
268 sc = SocketConnection("localhost", self.port)
269 print "NodeConnection(%s, %s)" % (sc, self.node)
270 conn = NodeConnection(sc, self.node, config)
272 # NOTE: try twice since this can sometimes fail the first time. If
273 # it fails again, let it go.
274 conn = NodeConnection(SocketConnection("localhost", self.port), self.node, config)
# Prepare the node and the tunnel.  Visible flow:
#   1. reserve a local port from the class-level counter;
#   2. rsync monitor/Rpyc/ to the node (BatchMode ssh); after a failure the
#      known_hosts entry is updated and the rsync retried; a second failure
#      raises ExceptionDoubleSSHError;
#   3. over ssh, kill any running Rpyc server and start
#      Rpyc/Servers/forking_server.py in the background;
#   4. start `ssh -N -L <port>:localhost:18812` with a LocalCommand that echoes
#      "READY", read up to 5 bytes of its output, then sleep for self.timeout;
#   5. raise if the tunnel did not come up.
# The conditions guarding these steps are on lines not visible in this view.
277 def setup_host(self):
278 self.port = PlanetLabSession.globalport
279 PlanetLabSession.globalport = PlanetLabSession.globalport + 1
# Values substituted into the command templates below.
282 args['port'] = self.port
283 args['user'] = 'root'
284 args['hostname'] = self.node
285 args['monitordir'] = config.MONITOR_SCRIPT_ROOT
289 print "Skipping setup"
292 # COPY Rpyc files to host
293 #cmd = "rsync -vvv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
294 cmd = """rsync -vvv -az -e "ssh -o BatchMode=yes" %(monitordir)s/monitor/Rpyc/ %(user)s@%(hostname)s:Rpyc""" % args
295 if self.verbose: print cmd
# Run the rsync locally; `timeout` is defined on a line not visible here.
299 localos = moncommands.CMD()
301 ret = localos.system(cmd, timeout)
304 print "\tUNKNOWN SSH KEY FOR %s; making an exception" % self.node
305 #print "MAKE EXPLICIT EXCEPTION FOR %s" % self.node
# Refresh the known_hosts entry for this node, then retry the same rsync.
306 k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k
307 print "trying: ", cmd
# Diagnostic: list every environment variable whose name contains 'SSH'.
308 print [ "%s=%s" % (a, os.environ[a]) for a in filter(lambda x: 'SSH' in x, os.environ.keys()) ]
309 ret = localos.system(cmd, timeout)
312 print "\tFAILED TWICE"
313 #email_exception("%s rsync failed twice" % self.node)
314 raise ExceptionDoubleSSHError("Failed twice trying to login with updated ssh host key")
317 # KILL any already running servers.
318 ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port)
319 (ov,ev) = ssh.run_noexcept2("""<<\EOF
321 echo "kill server" >> out.log
322 netstat -ap | grep python | grep 18812 | awk '{print $7}' | awk -F / '{print $1}' | xargs kill
323 ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ;
324 echo "export" >> out.log
325 export PYTHONPATH=$HOME ;
326 echo "start server" >> out.log
327 python Rpyc/Servers/forking_server.py &> server.log &
328 echo "done" >> out.log
330 print "setup rpyc server over ssh"
334 # This was tricky to make synchronous. The combination of ssh-clients-4.7p1,
335 # and the following options seems to work well.
336 cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \
337 """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \
338 """-o ConnectTimeout=120 """ + \
339 """-n -N -L %(port)s:localhost:18812 """ + \
340 """%(user)s@%(hostname)s"""
342 if self.verbose: print cmd
# Launch the tunnel through a shell and capture its stdout.
344 self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
345 # TODO: the read() here may block indefinitely. Need a better
346 # approach therefore, that includes a timeout.
347 #ret = self.command.stdout.read(5)
# read_t is presumably a read with a timeout -- confirm in pcucontrol.util.command.
348 ret = moncommands.read_t(self.command.stdout, 5)
352 # NOTE: There is still a slight race for machines that are slow...
# t1/t2 are set on lines not visible here; presumably timestamps taken around
# the tunnel start-up, so the sleep is twice the observed start-up time.
353 self.timeout = 2*(t2-t1)
354 print "Sleeping for %s sec" % self.timeout
355 time.sleep(self.timeout)
# A non-None returncode is reported as a failed tunnel.
358 if self.command.returncode is not None:
359 print "Failed to establish tunnel!"
360 raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode))
362 raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'")
# NOTE(review): the `def` line for the next two statements is not visible in
# this view; judging by the messages they belong to the teardown of the SSH
# session -- confirm in the full file.
366 if self.verbose: print "Killing SSH session %s" % self.port
367 print "Killing SSH session %s" % self.port
def steps_to_list(steps, index=1):
    """Return one field of every record in a step table, as a list.

    ``steps`` is a sequence of indexable records such as the ``(id, pattern)``
    tuples used by ``DebugInterface``; ``index`` selects the field to extract
    (default 1, the second field).  A real list is always returned, so the
    result can be concatenated with another list, as the callers in this file
    do with ``steps_to_list(steps) + [pexpect.EOF]``.
    """
    return [record[index] for record in steps]
# Map a match index (as returned by an expect() call over the step patterns)
# back to the id stored in field 0 of the matching step record.
# NOTE(review): the branch taken when `index` is not below len(steps) is not
# visible in this view.  The callers append pexpect.EOF after the step
# patterns and one of them tests the result against "done", so that branch
# presumably returns "done" -- confirm in the full file.
374 def index_to_id(steps,index):
375 if index < len(steps):
376 return steps[index][0]
# Helper bound to one hostname: opens a session/connection to the node and
# supplies the pattern tables used to turn dmesg and bm.log output into a
# sequence of step ids.
# NOTE(review): this view of the class has many gaps (try/except/else/return
# lines, list brackets and loop headers are not visible); the comments
# describe only the visible statements.
380 class DebugInterface:
381 def __init__(self, hostname):
382 self.hostname = hostname
# Create a PlanetLabSession for the host and obtain a connection from it,
# trying a second time after a longer sleep.  restore_basic() treats a boolean
# result as "connection failed"; the return lines are not visible here.
385 def getConnection(self):
386 print "Creating session for %s" % self.hostname
387 # update known_hosts file (in case the node has rebooted since last run)
389 k = SSHKnownHosts(); k.update(self.hostname); k.write(); del k
392 print traceback.print_exc()
395 msg = "ERROR setting up session for %s" % self.hostname
# Two ways of building the session: fixed (nosetup=False, verbose=True) or
# taken from the global config; the selecting condition is not visible.
398 self.session = PlanetLabSession(self.hostname, False, True)
400 self.session = PlanetLabSession(self.hostname, config.nosetup, config.verbose)
401 except ExceptionDoubleSSHError, e:
405 traceback.print_exc()
409 print "Getting connection: 1st try"
411 conn = self.session.get_connection(config)
413 # NOTE: sometimes the wait in setup_host() is not long enough.
414 # So, here we try to wait a little longer before giving up entirely.
416 print "Getting connection: 2nd try"
# Wait five times the session's measured timeout before the second attempt.
417 time.sleep(self.session.timeout*5)
418 conn = self.session.get_connection(config)
420 # failed twice... no need to report this really, it's just in a
422 print "Getting connection: failed"
423 email_exception(self.hostname, "failed twice to get connection")
426 traceback.print_exc()
427 email_exception(self.hostname)
429 print "Getting connection: ok"
430 #print "trying to use conn before returning it."
431 #print conn.c.modules.sys.path
432 #print conn.c.modules.os.path.exists('/tmp/source')
435 #print "conn: %s" % conn
# Build the mapping from a sequence string to its action out of all
# BootmanSequenceRecord rows.  The initialisation of `sequences`, the loop
# header and the return are not visible here.
438 def getSequences(self):
440 # NOTE: The DB is now the autoritative record for all BM sequences.
441 # An admin can introduce new patterns and actions without touching code.
444 bms = BootmanSequenceRecord.query.all()
446 sequences[s.sequence] = s.action
# Table of (id, pattern) pairs for disk and I/O error messages; restore_basic()
# passes it to getDiskSequence() together with the node's dmesg output.  The
# list brackets and the return are not visible here.
450 def getDiskSteps(self):
452 ('scsierror2' , 'sd \d:\d:\d:\d: ioctl_internal_command return code = \d+'),
453 ('scsierror' , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
454 ('ioerror' , 'end_request: I/O error, dev sd\w+, sector \d+'),
455 ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION'),
457 ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
459 ('hdaseekerror', 'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'),
460 ('hdacorrecterror', 'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'),
462 ('atareadyerror' , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
463 ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),
465 ('sdXerror' , 'sd\w: Current: sense key: Medium Error'),
466 ('ext3error' , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),
468 ('floppytimeout','floppy0: floppy timeout called'),
469 ('floppyerror', 'end_request: I/O error, dev fd\w+, sector \d+'),
471 # hda: dma_intr: status=0x51 { DriveReady SeekComplete Error }
472 # hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263
474 # floppy0: floppy timeout called
475 # end_request: I/O error, dev fd0, sector 0
477 # Buffer I/O error on device dm-2, logical block 8888896
478 # ata1: status=0x51 { DriveReady SeekComplete Error }
479 # ata1: error=0x40 { UncorrectableError }
480 # SCSI error : <0 0 0 0> return code = 0x8000002
481 # sda: Current: sense key: Medium Error
482 # Additional sense: Unrecovered read error - auto reallocate failed
484 # SCSI error : <0 2 0 0> return code = 0x40001
485 # end_request: I/O error, dev sda, sector 572489600
# Match the disk patterns (plus pexpect.EOF) against `child` and translate the
# matched index into a step id.  The surrounding loop and the return are not
# visible here.
489 def getDiskSequence(self, steps, child):
492 id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
# Table of (id, pattern) pairs for the messages searched for in a BootManager
# log: progress steps first, then known failure messages.  The list brackets
# and the return are not visible here.
499 def getBootManagerStepPatterns(self):
501 ('bminit' , 'Initializing the BootManager.'),
502 ('cfg' , 'Reading node configuration file.'),
503 ('auth' , 'Authenticating node with PLC.'),
504 ('getplc' , 'Retrieving details of node from PLC.'),
505 ('update' , 'Updating node boot state at PLC.'),
506 ('hardware' , 'Checking if hardware requirements met.'),
507 ('installinit' , 'Install: Initializing.'),
508 ('installdisk' , 'Install: partitioning disks.'),
509 ('installbootfs', 'Install: bootstrapfs tarball.'),
510 ('installcfg' , 'Install: Writing configuration files.'),
511 ('installstop' , 'Install: Shutting down installer.'),
512 ('update2' , 'Updating node boot state at PLC.'),
513 ('installinit2' , 'Install: Initializing.'),
514 ('validate' , 'Validating node installation.'),
515 ('rebuildinitrd', 'Rebuilding initrd'),
516 ('netcfg' , 'Install: Writing Network Configuration files.'),
517 ('update3' , 'Updating node configuration.'),
518 ('disk' , 'Checking for unused disks to add to LVM.'),
519 ('update4' , 'Sending hardware configuration to PLC.'),
520 ('debug' , 'Starting debug mode'),
521 ('bmexceptmount', 'BootManagerException during mount'),
522 ('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'),
523 ('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
524 ('exception' , 'Exception'),
525 ('nocfg' , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
526 ('protoerror2' , '500 Internal Server Error'),
527 ('protoerror' , 'XML RPC protocol error'),
528 ('nodehostname' , 'Configured node hostname does not resolve'),
529 ('implementerror', 'Implementation Error'),
530 ('fsckabort' , 'is mounted. e2fsck: Cannot continue, aborting'),
531 ('fsckfail' , 'Running e2fsck -v -p /dev/planetlab/root failed'),
532 ('fsckfail2' , 'Running e2fsck -v -p /dev/planetlab/vservers failed'),
533 ('readonlyfs' , '\[Errno 30\] Read-only file system'),
534 ('baddisk' , "IOError: \[Errno 13\] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"),
535 ('noinstall' , 'notinstalled'),
536 ('bziperror' , 'bzip2: Data integrity error when decompressing.'),
537 ('noblockdev' , "No block devices detected."),
538 ('missingkernel', "missingkernel"),
539 ('dnserror' , 'Name or service not known'),
540 ('noparseconfig', "Found configuration file plnode.txt on floppy, but was unable to parse it"),
541 ('noconfig' , "Unable to find and read a node configuration file"),
542 ('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
543 ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
544 ('hardwarerequirefail' , 'Hardware requirements not met'),
545 ('mkfsfail' , 'while running: Running mkfs.ext2 -q -m 0 -j /dev/planetlab/vservers failed'),
546 ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
547 ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
548 ('chrootfail' , 'Running chroot /tmp/mnt/sysimg'),
549 ('modulefail' , 'Unable to get list of system modules'),
550 ('writeerror' , 'write error: No space left on device'),
551 ('nospace' , "No space left on device"),
552 ('nonode' , 'Failed to authenticate call: No such node'),
553 ('authfail' , 'Failed to authenticate call: Call could not be authenticated'),
554 ('authfail2' , 'Authentication Failed'),
555 ('bootcheckfail' , 'BootCheckAuthentication'),
556 ('bootupdatefail' , 'BootUpdateNode'),
# Match the BootManager patterns (plus pexpect.EOF) against `child` and
# translate each match into a step id; the ids "exception" and "done" get
# special handling.  The surrounding loop, the bodies of those branches and
# the return are not visible here.
560 def getBootManagerSequenceFromLog(self, steps, child):
564 index = child.expect( steps_to_list(steps) + [ pexpect.EOF ])
565 id = index_to_id(steps,index)
568 if id == "exception":
569 print "...Found An Exception!!!"
570 elif id == "done": #index == len(steps_to_list(steps)):
# Entry point that delegates to restore_basic() with the same arguments.
# NOTE(review): the lines following the call (including the return) are not
# visible in this view.
576 def restore(sitehist, hostname, config=None, forced_action=None):
577 ret = restore_basic(sitehist, hostname, config, forced_action)
# Examine a node that is expected to be in debug state and apply the repair
# action registered for the step sequence found in its BootManager log.
#
# Visible flow: handle an out-of-date boot image; connect to the node; skip it
# unless it is in "debug" state with no BootManager running; scan dmesg for
# disk errors; download bm.log and reduce it to a sequence string; look the
# string up in the sequence table and dispatch on the associated action.
#
# Visible return values: "connect_failed", "skipped", "skipping_baddisk",
# "collect", and finally `bootman_action`.
#
# NOTE(review): this view has many gaps (try/except/else lines, several
# assignments such as `args = {}` and `s`, and some returns are not visible),
# and no use of `forced_action` is visible.  The numeric arguments to
# found_within() are presumably day counts -- confirm in monitor.common.
581 def restore_basic(sitehist, hostname, config=None, forced_action=None):
583 # NOTE: Nothing works if the bootcd is REALLY old.
584 # So, this is the first step.
586 bootman_action = "unknown"
# Latest monitoring record for the node and the actions recently recorded for it.
588 fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
589 recent_actions = sitehist.getRecentActions(hostname=hostname)
591 if fbnode['observed_category'] == "OLDBOOTCD":
592 print "\t...Notify owner to update BootImage!!!"
# Send the notice only if none was sent within the last 3.5.
594 if not found_within(recent_actions, 'newbootcd_notice', 3.5):
595 sitehist.sendMessage('newbootcd_notice', hostname=hostname)
597 print "\tDisabling %s due to out-of-date BootImage" % hostname
598 api.UpdateNode(hostname, {'boot_state' : 'disabled'})
600 # NOTE: nothing else is possible.
603 debugnode = DebugInterface(hostname)
604 conn = debugnode.getConnection()
# getConnection() handing back a bool (rather than a connection) means failure.
605 if type(conn) == type(False): return "connect_failed"
607 boot_state = conn.get_boot_state()
608 if boot_state != "debug":
609 print "... %s in %s state: skipping..." % (hostname , boot_state)
610 return "skipped" #boot_state == "boot"
612 if conn.bootmanager_running():
613 print "...BootManager is currently running. Skipping host %s" %hostname
614 return "skipped" # True
616 # Read persistent flags, tagged on one week intervals.
618 if config and not config.quiet: print "...downloading dmesg from %s" %hostname
# Feed the downloaded dmesg file to fdpexpect and collect the disk-error ids.
619 dmesg = conn.get_dmesg()
620 child = fdpexpect.fdspawn(dmesg)
622 steps = debugnode.getDiskSteps()
623 sequence = debugnode.getDiskSequence(steps, child)
626 if config and not config.quiet: print "\tSET: ", s
629 print "...Potential drive errors on %s" % hostname
# Two entries, one of them 'floppyerror', is not treated as a bad disk.
630 if len(s) == 2 and 'floppyerror' in s:
631 print "...Should investigate. Continuing with node."
633 print "...Should investigate. Skipping node."
634 # TODO: send message related to these errors.
636 if not found_within(recent_actions, 'baddisk_notice', 7):
637 print "baddisk_notice not found recently"
# Attach a fresh copy of the dmesg output to the notice.
639 log=conn.get_dmesg().read()
640 sitehist.sendMessage('baddisk_notice', hostname=hostname, log=log)
641 return "skipping_baddisk"
643 # NOTE: "" does not add a new action record
647 print "...Downloading bm.log from %s" %hostname
648 log = conn.get_bootmanager_log()
649 bm_log_data = log.read() # get data
650 log.seek(0) # reset fd pointer for fdspawn
651 child = fdpexpect.fdspawn(log)
# Collect-only mode: stop once the log has been downloaded.
653 if hasattr(config, 'collect') and config.collect: return "collect"
655 if config and not config.quiet: print "...Scanning bm.log for errors"
659 steps = debugnode.getBootManagerStepPatterns()
660 sequence = debugnode.getBootManagerSequenceFromLog(steps, child)
# The sequence identifier is the matched step ids joined with '-'.
662 s = "-".join(sequence)
663 print " FOUND SEQUENCE: ", s
665 # NOTE: We get or set the flag based on the current sequence identifier.
666 # By using the sequence identifier, we guarantee that there will be no
667 # frequent loops. I'm guessing there is a better way to track loops,
670 sequences = debugnode.getSequences()
# Unknown sequence: report it with the full log and restart BootManager in
# 'boot' state.
673 if s not in sequences:
674 print " HOST %s" % hostname
675 print " UNKNOWN SEQUENCE: %s" % s
678 args['hostname'] = hostname
680 args['bmlog'] = bm_log_data
681 args['viart'] = False
682 args['saveact'] = True
683 args['ccemail'] = True
685 sitehist.sendMessage('unknownsequence_notice', **args)
687 conn.restart_bootmanager('boot')
689 bootman_action = "restart_bootmanager"
691 # NOTE: Do not set the pflags value for this sequence if it's unknown.
692 # This way, we can check it again after we've fixed it.
# Known sequence: the action name stored for it selects one branch below.
696 bootman_action = sequences[s]
698 if sequences[s] == "restart_bootmanager_boot":
699 print "...Restarting BootManager.py on %s "%hostname
700 conn.restart_bootmanager('boot')
701 elif sequences[s] == "restart_bootmanager_rins":
702 print "...Restarting BootManager.py on %s "%hostname
703 conn.restart_bootmanager('reinstall')
704 elif sequences[s] == "restart_node_rins":
705 conn.restart_node('reinstall')
706 elif sequences[s] == "restart_node_boot":
707 conn.restart_node('boot')
708 elif sequences[s] == "fsck_repair":
709 conn.fsck_repair_node()
710 elif sequences[s] == "repair_node_keys":
711 if conn.compare_and_repair_nodekeys():
712 # the keys either are in sync or were forced in sync.
713 # so try to start BM again.
714 conn.restart_bootmanager(conn.get_nodestate())
716 # there was some failure to synchronize the keys.
717 print "...Unable to repair node keys on %s" %hostname
718 if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
720 args['hostname'] = hostname
721 sitehist.sendMessage('nodeconfig_notice', **args)
722 conn.dump_plconf_file()
724 # NOTE: do not add a new action record
727 elif sequences[s] == "unknownsequence_notice":
729 args['hostname'] = hostname
731 args['bmlog'] = bm_log_data
732 args['viart'] = False
733 args['saveact'] = True
734 args['ccemail'] = True
736 sitehist.sendMessage('unknownsequence_notice', **args)
737 conn.restart_bootmanager('boot')
739 elif sequences[s] == "nodeconfig_notice":
741 if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
743 args['hostname'] = hostname
744 sitehist.sendMessage('nodeconfig_notice', **args)
745 conn.dump_plconf_file()
747 # NOTE: do not add a new action record
# Sends the same 'nodeconfig_notice' message, with the log attached.
750 elif sequences[s] == "nodenetwork_email":
752 if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
754 args['hostname'] = hostname
755 args['bmlog'] = bm_log_data
756 sitehist.sendMessage('nodeconfig_notice', **args)
757 conn.dump_plconf_file()
759 # NOTE: do not add a new action record
762 elif sequences[s] == "noblockdevice_notice":
764 if not found_within(recent_actions, 'noblockdevice_notice', 3.5):
766 #args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
767 args['hostname'] = hostname
769 sitehist.sendMessage('noblockdevice_notice', **args)
771 # NOTE: do not add a new action record
774 elif sequences[s] == "baddisk_notice":
775 # MAKE An ACTION record that this host has failed hardware. May
776 # require either an exception "/minhw" or other manual intervention.
777 # Definitely need to send out some more EMAIL.
778 # TODO: email notice of broken hardware
779 if not found_within(recent_actions, 'baddisk_notice', 7):
780 print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
782 args['hostname'] = hostname
783 args['log'] = conn.get_dmesg().read()
785 sitehist.sendMessage('baddisk_notice', **args)
786 #conn.set_nodestate('disabled')
788 # NOTE: do not add a new action record
791 elif sequences[s] == "minimalhardware_notice":
792 if not found_within(recent_actions, 'minimalhardware_notice', 7):
793 print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
795 args['hostname'] = hostname
796 args['bmlog'] = bm_log_data
797 sitehist.sendMessage('minimalhardware_notice', **args)
799 # NOTE: do not add a new action record
802 elif sequences[s] == "baddns_notice":
803 if not found_within(recent_actions, 'baddns_notice', 1):
804 print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
# Look up the node's first interface at PLC to describe its network
# configuration in the notice.
807 node = plccache.GetNodeByName(hostname)
808 net = api.GetInterfaces(node['interface_ids'])[0]
811 print traceback.print_exc()
812 # TODO: api error. skip email, b/c all info is not available,
813 # flag_set will not be recorded.
815 nodenet_str = network_config_to_str(net)
817 args['hostname'] = hostname
818 args['network_config'] = nodenet_str
819 args['interface_id'] = net['interface_id']
821 sitehist.sendMessage('baddns_notice', **args)
823 # NOTE: do not add a new action record
826 return bootman_action
if __name__ == "__main__":
    # This file is a library module and must not be run directly; the message
    # points at commands/<module name>.py instead.  basename() drops any
    # directory part of __file__, so the hint stays "commands/<name>.py" even
    # when the module is invoked through a path such as monitor/<name>.py.
    print("ERROR: Can not execute module as a command! Please use commands/%s.py"
          % os.path.splitext(os.path.basename(__file__))[0])