3 # Attempt to reboot a node in debug state.
14 from monitor.util.sshknownhosts import SSHKnownHosts
15 from monitor.Rpyc import SocketConnection, Async
16 from monitor.Rpyc.Utils import *
18 from monitor import getconf
19 from monitor import config
20 from monitor import const
21 from monitor.model import *
22 from monitor.common import email_exception, found_within
23 from monitor.database.info.model import *
24 from monitor.database.info.interface import *
25 from monitor.wrapper import plc
26 from monitor.wrapper import plccache
27 from monitor.wrapper.emailTxt import mailtxt
28 from monitor.nodeconfig import network_config_to_str
30 from pcucontrol.util import command as moncommands
31 from pcucontrol.util.command import Sopen
32 from pcucontrol.transports.ssh import pxssh as pxssh
33 from pcucontrol.transports.ssh import fdpexpect as fdpexpect
34 from pcucontrol.transports.ssh import pexpect as pexpect
# Module-level PLC API handle, created once when this module is imported.
# Code in this file calls it for UpdateNode, GetNodes and GetInterfaces.
36 api = plc.getAuthAPI()
def bootmanager_log_name(hostname):
    """Build the relative path used to archive a node's BootManager log.

    The file name is "<timestamp>-bm.<hostname>.log", where the timestamp is
    the current local time formatted as YYYY-mm-dd-HH:MM, and it is placed
    under the 'history' directory via os.path.join.
    """
    stamp = time.strftime("%Y-%m-%d-%H:%M")
    return os.path.join('history', "%s-bm.%s.log" % (stamp, hostname))
# Build an ActionRecord that associates a collected log (short_log_path) with
# the site owning `hostname`.
# NOTE(review): the original numbering jumps here (46, 49-51, 53, 55-57 and
# 59-62 are absent from this listing), so only the visible steps are described;
# how `logtype` and `err` are used cannot be seen from here.
45 def bootmanager_log_action(hostname, short_log_path, logtype="bm.log"):
# Latest FindbadNodeRecord for the host, then the login_base of the PlcSite
# whose id is stored in the node's plc_node_stats['site_id'].
47 node = FindbadNodeRecord.get_latest_by(hostname=hostname)
48 loginbase = PlcSite.query.get(node.plc_node_stats['site_id']).plc_site_stats['login_base']
# Formatted traceback text; presumably taken in an except branch for the
# lookup above -- TODO confirm against the full file.
52 err = traceback.format_exc()
54 act = ActionRecord(loginbase=loginbase,
58 log_path=short_log_path,
class ExceptionDoubleSSHError(Exception):
    """Raised when logging in over ssh fails twice, even after the host key was refreshed."""
# Constructor of the node-connection wrapper: receives the RPC connection, the
# node hostname and the run configuration.
# NOTE(review): the lines that store the arguments are not visible in this
# listing; the other methods read self.c and self.node, so presumably they are
# assigned here -- confirm against the full file.
66 def __init__(self, connection, node, config):
67 print "init nodeconnection"
# Classify the node's current state by probing the remote filesystem through
# the RPC connection: first for '/tmp/source', then for '/vservers'.
# NOTE(review): the statements executed in each branch (and the returned
# values) are missing from this listing; restore_basic() compares the result
# against the string "debug".
72 def get_boot_state(self):
73 print "get_boot_state(self)"
75 if self.c.modules.os.path.exists('/tmp/source'):
77 elif self.c.modules.os.path.exists('/vservers'):
# Diagnostic output of the remote interpreter's sys.path.
83 print self.c.modules.sys.path
# NOTE(review): the `def` line of this method is missing from this listing;
# callers use conn.get_dmesg() and read from the result, so this is presumably
# the body of get_dmesg() and it presumably returns `log` -- confirm.
#
# Steps visible here:
#  1. write the remote kernel ring buffer to /var/log/dmesg.bm.log on the node,
#  2. download it into <MONITOR_BOOTMANAGER_LOG>/history/ under a timestamped name,
#  3. copy that file to the un-timestamped dmesg.<node>.log,
#  4. open the copy for reading.
91 t_stamp = time.strftime("%Y-%m-%d-%H:%M")
92 self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
93 download(self.c, "/var/log/dmesg.bm.log", "%s/history/%s-dmesg.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node))
94 os.system("cp %s/history/%s-dmesg.%s.log %s/dmesg.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node, config.MONITOR_BOOTMANAGER_LOG, self.node))
95 log = open("%s/dmesg.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, self.node), 'r')
# Fetch the node's BootManager log (/tmp/bm.log) and open a local copy.
#
# The log is downloaded to <MONITOR_BOOTMANAGER_LOG>/<bootmanager_log_name()>,
# the collection is recorded with bootmanager_log_action(), and the file is
# copied to bm.<node>.log, which is then opened for reading.
# NOTE(review): the line after the open() is not visible in this listing;
# restore_basic() reads from the result, so `log` is presumably returned.
98 def get_bootmanager_log(self):
99 bm_name = bootmanager_log_name(self.node)
100 download(self.c, "/tmp/bm.log", "%s/%s" % (config.MONITOR_BOOTMANAGER_LOG, bm_name))
101 #email_exception(self.node, "collected BM log for %s" % self.node)
102 bootmanager_log_action(self.node, bm_name, "collected_bm.log")
103 os.system("cp %s/%s %s/bm.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, bm_name, config.MONITOR_BOOTMANAGER_LOG, self.node))
104 log = open("%s/bm.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, self.node), 'r')
# Run the BootManager's own initialisation and configuration-reading steps on
# the node (through the RPC connection) and print every resulting bm.VARS
# entry.
# NOTE(review): several lines are missing from this listing (108, 111, 114,
# 118-119, 122-124, 126-127, 130); in particular the bare name `c` used below
# is presumably bound on one of them (e.g. to self.c) -- confirm.
107 def dump_plconf_file(self):
# Make the BootManager sources under /tmp/source importable and current.
109 self.c.modules.sys.path.append("/tmp/source/")
110 self.c.modules.os.chdir('/tmp/source')
# Remote BootManager instance, logging to /tmp/new.log, created for 'boot'.
112 log = c.modules.BootManager.log('/tmp/new.log')
113 bm = c.modules.BootManager.BootManager(log,'boot')
115 BootManagerException = c.modules.Exceptions.BootManagerException
116 InitializeBootManager = c.modules.BootManager.InitializeBootManager
117 ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
120 InitializeBootManager.Run(bm.VARS, bm.LOG)
121 try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
125 print " Possibly, unable to find valid configuration file"
128 for key in bm.VARS.keys():
129 print key, " == ", bm.VARS[key]
131 print " Unable to read Node Configuration"
# Free space on the node by deleting part of the accumulated fprobe data in
# /tmp/mnt/sysimg/var/local/fprobe, if that directory exists on the node.
# NOTE(review): lines 135-136, 138 and 145 are missing from this listing; the
# final print is presumably the else-branch taken when the directory is absent.
133 def fprobe_repair_node(self):
134 # When fprobe data gets too much, it fills the root partition and
137 self.c.modules.sys.path.append("/tmp/source/")
139 # NOTE: assume that the root fs is already mounted...
140 if self.c.modules.os.path.exists('/tmp/mnt/sysimg/var/local/fprobe'):
141 print "CLEARING FPROBE DATA on %s" % self.node
142 self.c.modules.os.chdir('/tmp/mnt/sysimg/var/local/fprobe')
# `ls -lrt` lists entries oldest first; the awk filter emits an `rm` for an
# entry only while the number already emitted is below half the rows read so
# far, so roughly half of the listed files end up removed.
143 cmd = """ ls -lrt . | awk '{if (i<NR/2 && $9) {print "rm "$9;i=i+1;}}' | sh """
144 self.c.modules.os.system(cmd)
146 print "COULD NOT CLEAR FPROBE DATA on %s" % self.node
# Run a forced fsck of /dev/planetlab/root and /dev/planetlab/vservers on the
# node and then start BootManager.py with the node's current boot state (as
# returned by get_nodestate()).  /tmp/BM_RUNNING serves as a marker: if it
# already exists, the visible code only prints a "try again" message.
# NOTE(review): lines 149, 152, 158, 165 and 169 are missing from this
# listing, including the else: and the end of the shell command string.
148 def fsck_repair_node(self):
150 self.c.modules.sys.path.append("/tmp/source/")
151 self.c.modules.os.chdir('/tmp/source')
153 # TODO: set boot state to node's actual boot state.
154 # could be 'boot' or 'safeboot'
155 self.c.modules.os.chdir('/tmp/source')
156 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
157 print "Running MANUAL FSCK already... try again soon."
159 print "Running MANUAL fsck on %s" % self.node
# The shell command creates the marker, runs both fscks (output collected in
# out.fsck), runs BootManager.py with output in server.log, and removes the
# marker again.
160 cmd = "( touch /tmp/BM_RUNNING ; " + \
161 " fsck -v -f -y /dev/planetlab/root &> out.fsck ; " + \
162 " fsck -v -f -y /dev/planetlab/vservers >> out.fsck 2>&1 ; " + \
163 " python ./BootManager.py %s &> server.log < /dev/null ; " + \
164 " rm -f /tmp/BM_RUNNING " + \
166 cmd = cmd % self.get_nodestate()
167 self.c.modules.os.system(cmd)
168 #self.restart_bootmanager('boot')
# Compare the NODE_KEY read from the node's configuration (through the remote
# BootManager) with the key PLC holds for this node, and push the node's key
# to PLC with api.UpdateNode when the visible update branch is taken.
# restore_basic() treats a true result as "keys are in sync".
# NOTE(review): many lines are missing from this listing (172, 175, 178,
# 182-183, 185, 188-191, 193-194, 197, 199-200, 203-206, 209), including the
# return statements and the binding of the bare name `c`.
171 def compare_and_repair_nodekeys(self):
173 self.c.modules.sys.path.append("/tmp/source/")
174 self.c.modules.os.chdir('/tmp/source')
176 log = c.modules.BootManager.log('/tmp/new.log')
177 bm = c.modules.BootManager.BootManager(log,'boot')
179 BootManagerException = c.modules.Exceptions.BootManagerException
180 InitializeBootManager = c.modules.BootManager.InitializeBootManager
181 ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
# PLC's cached record of this node; its 'key' field is the reference value.
184 plcnode = plccache.GetNodeByName(self.node)
186 InitializeBootManager.Run(bm.VARS, bm.LOG)
187 try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
192 print " Possibly, unable to find valid configuration file"
195 print " NODE: %s" % bm.VARS['NODE_KEY']
196 print " PLC : %s" % plcnode['key']
198 if bm.VARS['NODE_KEY'] == plcnode['key']:
201 if api.UpdateNode(self.node, {'key': bm.VARS['NODE_KEY']}):
202 print " Successfully updated NODE_KEY with PLC"
207 #for key in bm.VARS.keys():
208 # print key, " == ", bm.VARS[key]
210 print " Unable to retrieve NODE_KEY"
# Report whether a BootManager run is in progress on the node, judged by the
# presence of the /tmp/BM_RUNNING marker file on the remote side.
# NOTE(review): the branch bodies (lines 214-216) are missing from this
# listing; restore_basic() uses the result as a boolean.
212 def bootmanager_running(self):
213 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
def set_nodestate(self, state='boot'):
    """Ask PLC to record `state` as this node's boot_state.

    Returns whatever api.UpdateNode returns for the update.
    """
    new_fields = {'boot_state': state}
    return api.UpdateNode(self.node, new_fields)
# Return the node's boot_state.
#
# The first return asks PLC directly (api.GetNodes).  The remaining lines
# print a traceback and return the boot_state stored in the latest
# FindbadNodeRecord for the node instead.
# NOTE(review): lines 222 and 224 are missing from this listing; they are
# presumably the try:/except: that make the second half a fallback -- confirm.
221 def get_nodestate(self):
223 return api.GetNodes(self.node, ['boot_state'])[0]['boot_state']
225 traceback.print_exc()
226 # NOTE: use last cached value from plc
227 fbnode = FindbadNodeRecord.get_latest_by(hostname=self.node).to_dict()
228 return fbnode['plc_node_stats']['boot_state']
# Set the node's boot_state at PLC to `state` and reboot the node.
#
# Two reboot paths are visible:
#  * when no recent 'gentlekill' flag exists: kill all slice processes with
#    vkill, schedule `shutdown -r +1` in the background, and set the flag;
#  * otherwise (the else: line itself is missing from this listing): after a
#    5 second sleep write 's', 'u' and 'b' to /proc/sysrq-trigger.
# NOTE(review): lines 233, 242, 244-245 and 249-250 are missing from this
# listing.
231 def restart_node(self, state='boot'):
232 api.UpdateNode(self.node, {'boot_state' : state})
# 1*60*60*24 is the number of seconds in a day; presumably the lifetime of
# the persisted flags -- TODO confirm against PersistFlags.
234 pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags')
235 if not pflags.getRecentFlag('gentlekill'):
236 print " Killing all slice processes... : %s" % self.node
237 cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
238 self.c.modules.os.system(cmd_slicekill)
239 cmd = """ shutdown -r +1 & """
240 print " Restarting %s : %s" % ( self.node, cmd)
241 self.c.modules.os.system(cmd)
243 pflags.setRecentFlag('gentlekill')
246 print " Restarting with sysrq 'sub' %s" % self.node
247 cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """
248 self.c.modules.os.system(cmd)
# Start `BootManager.py <forceState>` on the node from /tmp/source, unless the
# /tmp/BM_RUNNING marker shows that a run is already in progress.
# The shell command creates the marker, runs BootManager.py with its output
# redirected to server.log, and removes the marker afterwards.
# NOTE(review): lines 253, 257, 262 and 265-267 are missing from this listing,
# including the else: and the end of the command string.
252 def restart_bootmanager(self, forceState):
254 self.c.modules.os.chdir('/tmp/source')
255 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
256 print " BootManager is already running: try again soon..."
258 print " Starting 'BootManager.py %s' on %s " % (forceState, self.node)
259 cmd = "( touch /tmp/BM_RUNNING ; " + \
260 " python ./BootManager.py %s &> server.log < /dev/null ; " + \
261 " rm -f /tmp/BM_RUNNING " + \
263 cmd = cmd % forceState
264 self.c.modules.os.system(cmd)
# Session with one node: copies the Rpyc files to it, starts the remote server
# and keeps an ssh tunnel to it open (see setup_host / get_connection).
269 class PlanetLabSession:
# Class-wide source of local tunnel ports.  It starts at a random value in
# 22000..22999; setup_host() takes the current value and increments it.
270 globalport = 22000 + int(random.random()*1000)
# Remember the verbosity and nosetup settings for this session.
# NOTE(review): lines 274-275 and 277-278 are missing from this listing;
# other methods read self.node and self.port, so the node is presumably stored
# (and setup possibly triggered) on the missing lines -- confirm.
272 def __init__(self, node, nosetup, verbose):
273 self.verbose = verbose
276 self.nosetup = nosetup
# Create a NodeConnection over a SocketConnection to localhost:<self.port>
# (the local end of the ssh tunnel).  As the in-code note says, the creation
# is attempted a second time because the first attempt can fail.
# NOTE(review): lines 281, 286 and 290 are missing from this listing
# (presumably the try:/except: and the return) -- confirm.
280 def get_connection(self, config):
282 print "SocketConnection(localhost, %s" % self.port
283 sc = SocketConnection("localhost", self.port)
284 print "NodeConnection(%s, %s)" % (sc, self.node)
285 conn = NodeConnection(sc, self.node, config)
287 # NOTE: try twice since this can sometimes fail the first time. If
288 # it fails again, let it go.
289 conn = NodeConnection(SocketConnection("localhost", self.port), self.node, config)
# Prepare the node for an RPC session.  Steps visible in this listing:
#  1. reserve a local port from PlanetLabSession.globalport;
#  2. rsync the monitor/Rpyc directory to the node as root (BatchMode ssh);
#     on a failure attributed to an unknown ssh key, update known_hosts with
#     SSHKnownHosts.updateDirect() and retry once, raising
#     ExceptionDoubleSSHError after the second failure;
#  3. over ssh, kill any running Rpyc server on the node and start
#     Rpyc/Servers/forking_server.py;
#  4. open an ssh tunnel from the local port to port 18812 on the node, using
#     LocalCommand to print "READY", and read up to 5 bytes of that output;
#  5. sleep for twice the measured time (self.timeout), then raise an
#     Exception if the tunnel process exited or never reported "READY".
# NOTE(review): many lines are missing from this listing (e.g. the creation
# of `args`, `timeout`, `ssh_port`, `t1`/`t2`, the conditions around each
# branch, and the end of the here-document string), so the exact conditions
# under which each step runs cannot be confirmed from here.
292 def setup_host(self):
293 self.port = PlanetLabSession.globalport
294 PlanetLabSession.globalport = PlanetLabSession.globalport + 1
297 args['port'] = self.port
298 args['user'] = 'root'
299 args['hostname'] = self.node
300 args['monitordir'] = config.MONITOR_SCRIPT_ROOT
304 print "Skipping setup"
307 # COPY Rpyc files to host
308 #cmd = "rsync -vvv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
309 cmd = """rsync -vvv -az -e "ssh -o BatchMode=yes" %(monitordir)s/monitor/Rpyc/ %(user)s@%(hostname)s:Rpyc""" % args
310 if self.verbose: print cmd
314 localos = moncommands.CMD()
316 ret = localos.system(cmd, timeout)
319 print "\tUNKNOWN SSH KEY FOR %s; making an exception" % self.node
320 #print "MAKE EXPLICIT EXCEPTION FOR %s" % self.node
321 k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k
322 print "trying: ", cmd
323 print [ "%s=%s" % (a, os.environ[a]) for a in filter(lambda x: 'SSH' in x, os.environ.keys()) ]
324 ret = localos.system(cmd, timeout)
327 print "\tFAILED TWICE"
328 #email_exception("%s rsync failed twice" % self.node)
329 raise ExceptionDoubleSSHError("Failed twice trying to login with updated ssh host key")
332 # KILL any already running servers.
333 ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port)
334 (ov,ev) = ssh.run_noexcept2("""<<\EOF
336 echo "kill server" >> out.log
337 netstat -ap | grep python | grep 18812 | awk '{print $7}' | awk -F / '{print $1}' | xargs kill
338 ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ;
339 echo "export" >> out.log
340 export PYTHONPATH=$HOME ;
341 echo "start server" >> out.log
342 python Rpyc/Servers/forking_server.py &> server.log &
343 echo "done" >> out.log
345 print "setup rpyc server over ssh"
349 # This was tricky to make synchronous. The combination of ssh-clients-4.7p1,
350 # and the following options seems to work well.
351 cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \
352 """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \
353 """-o ConnectTimeout=120 """ + \
354 """-n -N -L %(port)s:localhost:18812 """ + \
355 """%(user)s@%(hostname)s"""
357 if self.verbose: print cmd
359 self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
360 # TODO: the read() here may block indefinitely. Need a better
361 # approach therefore, that includes a timeout.
362 #ret = self.command.stdout.read(5)
363 ret = moncommands.read_t(self.command.stdout, 5)
367 # NOTE: There is still a slight race for machines that are slow...
368 self.timeout = 2*(t2-t1)
369 print "Sleeping for %s sec" % self.timeout
370 time.sleep(self.timeout)
373 if self.command.returncode is not None:
374 print "Failed to establish tunnel!"
375 raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode))
377 raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'")
381 if self.verbose: print "Killing SSH session %s" % self.port
382 print "Killing SSH session %s" % self.port
def steps_to_list(steps, index=1):
    """Return one field of every step tuple, as a list.

    `steps` is a sequence of tuples such as (identifier, pattern).  With the
    default index=1 the patterns are returned; index=0 gives the identifiers.

    A real list is built (rather than relying on map(), which only yields a
    list on Python 2) because callers concatenate the result with another
    list before handing it to expect().
    """
    return [step[index] for step in steps]
# Translate a match index (as returned by expect() over the step patterns)
# into the identifier stored in the first field of the matching step tuple.
# NOTE(review): the handling of an index beyond the step list (lines 392-393)
# is missing from this listing; getBootManagerSequenceFromLog() compares the
# result against "done", so that is presumably what it yields -- confirm.
389 def index_to_id(steps,index):
390 if index < len(steps):
391 return steps[index][0]
# Helper bound to one hostname: opens a session/connection to the node and
# provides the pattern tables used to analyse its dmesg and BootManager logs.
395 class DebugInterface:
# NOTE(review): line 398 is missing from this listing.
396 def __init__(self, hostname):
397 self.hostname = hostname
# Create a PlanetLabSession for self.hostname and obtain a connection from it.
#
# Visible steps: refresh the known_hosts entry for the host, construct the
# session (with explicit False/True arguments in one branch and with
# config.nosetup/config.verbose in the other), then call
# session.get_connection() -- a second time after sleeping
# 5 * session.timeout if needed -- and report failures via email_exception().
# NOTE(review): the try:/except:/return lines are missing from this listing;
# restore_basic() treats a bool result as a failed connection, so the failure
# paths presumably return False -- confirm.
400 def getConnection(self):
401 print "Creating session for %s" % self.hostname
402 # update known_hosts file (in case the node has rebooted since last run)
404 k = SSHKnownHosts(); k.update(self.hostname); k.write(); del k
407 print traceback.print_exc()
410 msg = "ERROR setting up session for %s" % self.hostname
413 self.session = PlanetLabSession(self.hostname, False, True)
415 self.session = PlanetLabSession(self.hostname, config.nosetup, config.verbose)
416 except ExceptionDoubleSSHError, e:
420 traceback.print_exc()
424 print "Getting connection: 1st try"
426 conn = self.session.get_connection(config)
428 # NOTE: sometimes the wait in setup_host() is not long enough.
429 # So, here we try to wait a little longer before giving up entirely.
431 print "Getting connection: 2nd try"
432 time.sleep(self.session.timeout*5)
433 conn = self.session.get_connection(config)
435 # failed twice... no need to report this really, it's just in a
437 print "Getting connection: failed"
438 email_exception(self.hostname, "failed twice to get connection")
441 traceback.print_exc()
442 email_exception(self.hostname)
444 print "Getting connection: ok"
445 #print "trying to use conn before returning it."
446 #print conn.c.modules.sys.path
447 #print conn.c.modules.os.path.exists('/tmp/source')
450 #print "conn: %s" % conn
# Build the mapping from BootManager step sequence (string) to the action to
# take, from all BootmanSequenceRecord rows in the database.
# NOTE(review): the creation of `sequences`, the loop header over `bms` and
# the return are missing from this listing.
453 def getSequences(self):
455 # NOTE: The DB is now the authoritative record for all BM sequences.
456 # An admin can introduce new patterns and actions without touching code.
459 bms = BootmanSequenceRecord.query.all()
461 sequences[s.sequence] = s.action
# Table of (identifier, regular expression) pairs describing disk-related
# error messages to look for in a node's dmesg output.  The trailing comment
# lines are sample kernel messages of the kind the patterns target.
# NOTE(review): the list opening/closing and the return are missing from
# this listing.
465 def getDiskSteps(self):
467 ('scsierror2' , 'sd \d:\d:\d:\d: ioctl_internal_command return code = \d+'),
468 ('scsierror' , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
469 ('ioerror' , 'end_request: I/O error, dev sd\w+, sector \d+'),
470 ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION'),
472 ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
474 ('hdaseekerror', 'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'),
475 ('hdacorrecterror', 'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'),
477 ('atareadyerror' , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
478 ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),
480 ('sdXerror' , 'sd\w: Current: sense key: Medium Error'),
481 ('ext3error' , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),
483 ('floppytimeout','floppy0: floppy timeout called'),
484 ('floppyerror', 'end_request: I/O error, dev fd\w+, sector \d+'),
486 # hda: dma_intr: status=0x51 { DriveReady SeekComplete Error }
487 # hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263
489 # floppy0: floppy timeout called
490 # end_request: I/O error, dev fd0, sector 0
492 # Buffer I/O error on device dm-2, logical block 8888896
493 # ata1: status=0x51 { DriveReady SeekComplete Error }
494 # ata1: error=0x40 { UncorrectableError }
495 # SCSI error : <0 0 0 0> return code = 0x8000002
496 # sda: Current: sense key: Medium Error
497 # Additional sense: Unrecovered read error - auto reallocate failed
499 # SCSI error : <0 2 0 0> return code = 0x40001
500 # end_request: I/O error, dev sda, sector 572489600
# Scan `child` (a pexpect-style object over the dmesg text) for the disk
# patterns in `steps`.  Each expect() call matches against the step patterns
# plus pexpect.EOF, and index_to_id() turns the match index into a step id.
# NOTE(review): the loop and the accumulation/return of the ids are missing
# from this listing.
504 def getDiskSequence(self, steps, child):
507 id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
# Table of (identifier, regular expression) pairs for messages that can
# appear in a BootManager log: normal progress steps first, then known error
# and failure messages.  The identifiers are what restore_basic() joins with
# '-' to form the sequence string looked up in the database.
# NOTE(review): the list opening/closing and the return are missing from
# this listing.
514 def getBootManagerStepPatterns(self):
516 ('bminit' , 'Initializing the BootManager.'),
517 ('cfg' , 'Reading node configuration file.'),
518 ('auth' , 'Authenticating node with PLC.'),
519 ('getplc' , 'Retrieving details of node from PLC.'),
520 ('update' , 'Updating node boot state at PLC.'),
521 ('hardware' , 'Checking if hardware requirements met.'),
522 ('installinit' , 'Install: Initializing.'),
523 ('installdisk' , 'Install: partitioning disks.'),
524 ('installbootfs', 'Install: bootstrapfs tarball.'),
525 ('installcfg' , 'Install: Writing configuration files.'),
526 ('installstop' , 'Install: Shutting down installer.'),
527 ('update2' , 'Updating node boot state at PLC.'),
528 ('installinit2' , 'Install: Initializing.'),
529 ('validate' , 'Validating node installation.'),
530 ('rebuildinitrd', 'Rebuilding initrd'),
531 ('netcfg' , 'Install: Writing Network Configuration files.'),
532 ('update3' , 'Updating node configuration.'),
533 ('disk' , 'Checking for unused disks to add to LVM.'),
534 ('update4' , 'Sending hardware configuration to PLC.'),
535 ('debug' , 'Starting debug mode'),
536 ('bmexceptmount', 'BootManagerException during mount'),
537 ('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'),
538 ('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
539 ('exception' , 'Exception'),
540 ('nocfg' , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
541 ('protoerror2' , '500 Internal Server Error'),
542 ('protoerror' , 'XML RPC protocol error'),
543 ('nodehostname' , 'Configured node hostname does not resolve'),
544 ('implementerror', 'Implementation Error'),
545 ('fsckabort' , 'is mounted. e2fsck: Cannot continue, aborting'),
546 ('fsckfail' , 'Running e2fsck -v -p /dev/planetlab/root failed'),
547 ('fsckfail2' , 'Running e2fsck -v -p /dev/planetlab/vservers failed'),
548 ('readonlyfs' , '\[Errno 30\] Read-only file system'),
549 ('baddisk' , "IOError: \[Errno 13\] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"),
550 ('noinstall' , 'notinstalled'),
551 ('bziperror' , 'bzip2: Data integrity error when decompressing.'),
552 ('noblockdev' , "No block devices detected."),
553 ('missingkernel', "missingkernel"),
554 ('dnserror' , 'Name or service not known'),
555 ('noparseconfig', "Found configuration file plnode.txt on floppy, but was unable to parse it"),
556 ('noconfig' , "Unable to find and read a node configuration file"),
557 ('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
558 ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
559 ('hardwarerequirefail' , 'Hardware requirements not met'),
560 ('mkfsfail' , 'while running: Running mkfs.ext2 -q -m 0 -j /dev/planetlab/vservers failed'),
561 ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
562 ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
563 ('chrootfail' , 'Running chroot /tmp/mnt/sysimg'),
564 ('modulefail' , 'Unable to get list of system modules'),
565 ('writeerror' , 'write error: No space left on device'),
566 ('nospace' , "No space left on device"),
567 ('nonode' , 'Failed to authenticate call: No such node'),
568 ('authfail' , 'Failed to authenticate call: Call could not be authenticated'),
569 ('authfail2' , 'Authentication Failed'),
570 ('bootcheckfail' , 'BootCheckAuthentication'),
571 ('bootupdatefail' , 'BootUpdateNode'),
# Scan `child` (a pexpect-style object over the BootManager log) for the step
# patterns and turn each match into its step identifier via index_to_id().
# The identifiers "exception" and "done" get special treatment.
# NOTE(review): the loop, the collection of ids, the bodies of the "done"
# branch and the return are missing from this listing.
575 def getBootManagerSequenceFromLog(self, steps, child):
# pexpect.EOF is appended so that reaching the end of the log is reported as
# an index one past the last step pattern.
579 index = child.expect( steps_to_list(steps) + [ pexpect.EOF ])
580 id = index_to_id(steps,index)
583 if id == "exception":
584 print "...Found An Exception!!!"
585 elif id == "done": #index == len(steps_to_list(steps)):
# Public entry point: delegates to restore_basic() with the same arguments.
# NOTE(review): lines 593-594 are missing from this listing, so what is done
# with `ret` afterwards (presumably it is returned) cannot be seen here.
591 def restore(sitehist, hostname, config=None, forced_action=None):
592 ret = restore_basic(sitehist, hostname, config, forced_action)
# Try to bring a node that is stuck in debug state back into service, and
# return a short string naming the action taken (initially "unknown").
#
# Flow visible in this listing:
#  1. Nodes whose observed_category is "OLDBOOTCD" get a 'newbootcd_notice'
#     (unless one was sent recently) and are set to boot_state 'disabled'.
#  2. A connection is opened via DebugInterface; a bool result means failure
#     and yields "connect_failed".  Nodes not in "debug" state, or with a
#     BootManager already running, yield "skipped".
#  3. dmesg is downloaded and scanned for disk errors; apart from the
#     floppy-only case this leads to a 'baddisk_notice' and "skipping_baddisk".
#  4. bm.log is downloaded (config.collect stops here with "collect") and
#     scanned; the matched step ids joined with '-' form the sequence `s`.
#  5. `s` is looked up in the database-provided sequences.  Unknown sequences
#     trigger an 'unknownsequence_notice' and a BootManager restart; known
#     ones are dispatched on the stored action name.
# NOTE(review): many lines are missing from this listing (else:/try:/except:
# lines, creation of `args`, several returns and assignments to
# bootman_action), so the exact branch structure cannot be confirmed here.
# The numeric arguments to found_within() (3.5, 7, 1) are presumably days --
# TODO confirm.  `forced_action` is not used in any visible line.
596 def restore_basic(sitehist, hostname, config=None, forced_action=None):
598 # NOTE: Nothing works if the bootcd is REALLY old.
599 # So, this is the first step.
601 bootman_action = "unknown"
603 fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
604 recent_actions = sitehist.getRecentActions(hostname=hostname)
606 if fbnode['observed_category'] == "OLDBOOTCD":
607 print "\t...Notify owner to update BootImage!!!"
609 if not found_within(recent_actions, 'newbootcd_notice', 3.5):
610 sitehist.sendMessage('newbootcd_notice', hostname=hostname)
612 print "\tDisabling %s due to out-of-date BootImage" % hostname
613 api.UpdateNode(hostname, {'boot_state' : 'disabled'})
615 # NOTE: nothing else is possible.
618 debugnode = DebugInterface(hostname)
619 conn = debugnode.getConnection()
620 if type(conn) == type(False): return "connect_failed"
622 boot_state = conn.get_boot_state()
623 if boot_state != "debug":
624 print "... %s in %s state: skipping..." % (hostname , boot_state)
625 return "skipped" #boot_state == "boot"
627 if conn.bootmanager_running():
628 print "...BootManager is currently running. Skipping host %s" %hostname
629 return "skipped" # True
631 # Read persistent flags, tagged on one week intervals.
633 if config and not config.quiet: print "...downloading dmesg from %s" %hostname
634 dmesg = conn.get_dmesg()
635 child = fdpexpect.fdspawn(dmesg)
637 steps = debugnode.getDiskSteps()
638 sequence = debugnode.getDiskSequence(steps, child)
641 if config and not config.quiet: print "\tSET: ", s
644 print "...Potential drive errors on %s" % hostname
645 if len(s) == 2 and 'floppyerror' in s:
646 print "...Should investigate. Continuing with node."
648 print "...Should investigate. Skipping node."
649 # TODO: send message related to these errors.
651 if not found_within(recent_actions, 'baddisk_notice', 7):
652 print "baddisk_notice not found recently"
654 log=conn.get_dmesg().read()
655 sitehist.sendMessage('baddisk_notice', hostname=hostname, log=log)
656 return "skipping_baddisk"
658 # NOTE: "" does not add a new action record
662 print "...Downloading bm.log from %s" %hostname
663 log = conn.get_bootmanager_log()
664 bm_log_data = log.read() # get data
665 log.seek(0) # reset fd pointer for fdspawn
666 child = fdpexpect.fdspawn(log)
668 if hasattr(config, 'collect') and config.collect: return "collect"
670 if config and not config.quiet: print "...Scanning bm.log for errors"
674 steps = debugnode.getBootManagerStepPatterns()
675 sequence = debugnode.getBootManagerSequenceFromLog(steps, child)
677 s = "-".join(sequence)
678 print " FOUND SEQUENCE: ", s
680 # NOTE: We get or set the flag based on the current sequence identifier.
681 # By using the sequence identifier, we guarantee that there will be no
682 # frequent loops. I'm guessing there is a better way to track loops,
685 sequences = debugnode.getSequences()
688 if s not in sequences:
689 print " HOST %s" % hostname
690 print " UNKNOWN SEQUENCE: %s" % s
693 args['hostname'] = hostname
695 args['bmlog'] = bm_log_data
696 args['viart'] = False
697 args['saveact'] = True
698 args['ccemail'] = True
701 # NOTE: sequence is unknown and contains nospace, so try the
702 # fprobe repair trick first.
703 conn.fprobe_repair_node()
705 sitehist.sendMessage('unknownsequence_notice', **args)
706 conn.restart_bootmanager('boot')
707 bootman_action = "restart_bootmanager"
709 # NOTE: Do not set the pflags value for this sequence if it's unknown.
710 # This way, we can check it again after we've fixed it.
# Known sequence: the action name stored in the database selects the branch.
714 bootman_action = sequences[s]
716 if sequences[s] == "restart_bootmanager_boot":
717 print "...Restarting BootManager.py on %s "%hostname
718 conn.restart_bootmanager('boot')
719 elif sequences[s] == "restart_bootmanager_rins":
720 print "...Restarting BootManager.py on %s "%hostname
721 conn.restart_bootmanager('reinstall')
722 elif sequences[s] == "restart_node_rins":
723 conn.restart_node('reinstall')
724 elif sequences[s] == "restart_node_boot":
725 conn.restart_node('boot')
726 elif sequences[s] == "fsck_repair":
727 conn.fsck_repair_node()
728 elif sequences[s] == "repair_node_keys":
729 if conn.compare_and_repair_nodekeys():
730 # the keys either are in sync or were forced in sync.
731 # so try to start BM again.
732 conn.restart_bootmanager(conn.get_nodestate())
734 # there was some failure to synchronize the keys.
735 print "...Unable to repair node keys on %s" %hostname
736 if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
738 args['hostname'] = hostname
739 sitehist.sendMessage('nodeconfig_notice', **args)
740 conn.dump_plconf_file()
742 # NOTE: do not add a new action record
745 elif sequences[s] == "unknownsequence_notice":
747 args['hostname'] = hostname
749 args['bmlog'] = bm_log_data
750 args['viart'] = False
751 args['saveact'] = True
752 args['ccemail'] = True
754 sitehist.sendMessage('unknownsequence_notice', **args)
755 conn.restart_bootmanager('boot')
757 elif sequences[s] == "nodeconfig_notice":
759 if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
761 args['hostname'] = hostname
762 sitehist.sendMessage('nodeconfig_notice', **args)
763 conn.dump_plconf_file()
765 # NOTE: do not add a new action record
768 elif sequences[s] == "nodenetwork_email":
770 if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
772 args['hostname'] = hostname
773 args['bmlog'] = bm_log_data
774 sitehist.sendMessage('nodeconfig_notice', **args)
775 conn.dump_plconf_file()
777 # NOTE: do not add a new action record
780 elif sequences[s] == "noblockdevice_notice":
782 if not found_within(recent_actions, 'noblockdevice_notice', 3.5):
784 #args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
785 args['hostname'] = hostname
787 sitehist.sendMessage('noblockdevice_notice', **args)
789 # NOTE: do not add a new action record
792 elif sequences[s] == "baddisk_notice":
793 # MAKE An ACTION record that this host has failed hardware. May
794 # require either an exception "/minhw" or other manual intervention.
795 # Definitely need to send out some more EMAIL.
796 # TODO: email notice of broken hardware
797 if not found_within(recent_actions, 'baddisk_notice', 7):
798 print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
800 args['hostname'] = hostname
801 args['log'] = conn.get_dmesg().read()
803 sitehist.sendMessage('baddisk_notice', **args)
804 #conn.set_nodestate('disabled')
806 # NOTE: do not add a new action record
809 elif sequences[s] == "minimalhardware_notice":
810 if not found_within(recent_actions, 'minimalhardware_notice', 7):
811 print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
813 args['hostname'] = hostname
814 args['bmlog'] = bm_log_data
815 sitehist.sendMessage('minimalhardware_notice', **args)
817 # NOTE: do not add a new action record
820 elif sequences[s] == "baddns_notice":
821 if not found_within(recent_actions, 'baddns_notice', 1):
822 print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
# The node's first interface supplies the network details for the notice.
825 node = plccache.GetNodeByName(hostname)
826 net = api.GetInterfaces(node['interface_ids'])[0]
829 print traceback.print_exc()
830 # TODO: api error. skip email, b/c all info is not available,
831 # flag_set will not be recorded.
833 nodenet_str = network_config_to_str(net)
835 args['hostname'] = hostname
836 args['network_config'] = nodenet_str
837 args['interface_id'] = net['interface_id']
839 sitehist.sendMessage('baddns_notice', **args)
841 # NOTE: do not add a new action record
844 return bootman_action
if __name__ == "__main__":
    # This module is a library; point the user at the matching script under
    # commands/ instead.  The directory part of __file__ is stripped so the
    # hint reads commands/<module>.py no matter how the file was invoked.
    module_name = os.path.splitext(os.path.basename(__file__))[0]
    print("ERROR: Can not execute module as a command! Please use commands/%s.py" % module_name)