added actionlist_template to display action list consistently on different pages
[monitor.git] / monitor / bootman.py
1 #!/usr/bin/python
2
3 # Attempt to reboot a node in debug state.
4
5 import os
6 import sys
7 import time
8 import random
9 import signal
10 import traceback
11 import subprocess
12 from sets import Set
13
14 from monitor.getsshkeys import SSHKnownHosts
15 from monitor.Rpyc import SocketConnection, Async
16 from monitor.Rpyc.Utils import *
17
18 from monitor import getconf
19 from monitor import config
20 from monitor import const
21 from monitor.model import *
22 from monitor.common import email_exception, found_within
23 from monitor.database.info.model import *
24 from monitor.database.info.interface import *
25 from monitor.wrapper import plc
26 from monitor.wrapper import plccache
27 from monitor.wrapper.emailTxt import mailtxt
28 from monitor.nodeconfig import network_config_to_str
29
30 from pcucontrol.util import command as moncommands
31 from pcucontrol.util.command import Sopen
32 from pcucontrol.transports.ssh import pxssh as pxssh
33 from pcucontrol.transports.ssh import fdpexpect as fdpexpect
34 from pcucontrol.transports.ssh import pexpect as pexpect
35
# NOTE(review): no reader or writer of `fb` is visible in this part of the
# file; it is kept as a module-level placeholder.
fb = None

# Authenticated PLC API handle, created once at import time.  The
# NodeConnection methods use it to read and update node records
# (api.GetNodes / api.UpdateNode).
api = plc.getAuthAPI()
38
def bootmanager_log_name(hostname):
        """Return the relative path of a timestamped BootManager log for `hostname`.

        The result has the form ``history/<YYYY-MM-DD-HH:MM>-bm.<hostname>.log``,
        stamped with the current local time at minute resolution.
        """
        stamp = time.strftime("%Y-%m-%d-%H:%M")
        return os.path.join('history', "%s-bm.%s.log" % (stamp, hostname))
44
def bootmanager_log_action(hostname, short_log_path, logtype="bm.log"):
        """Record a 'log' ActionRecord pointing at a collected log file.

        hostname       -- node the log belongs to.
        short_log_path -- path stored in the record's log_path field.
        logtype        -- value stored in the record's action_type field.

        The site's login_base is looked up through the latest
        FindbadNodeRecord for the host.  If any part of that lookup fails the
        record is still written, with loginbase "unknown" and the traceback
        saved in error_string.
        """
        err = ""
        try:
                node = FindbadNodeRecord.get_latest_by(hostname=hostname)
                site = PlcSite.query.get(node.plc_node_stats['site_id'])
                loginbase = site.plc_site_stats['login_base']
        except:
                # Deliberately broad: a failed lookup must not prevent the
                # action from being recorded.
                loginbase = "unknown"
                err = traceback.format_exc()

        record = ActionRecord(loginbase=loginbase,
                              hostname=hostname,
                              action='log',
                              action_type=logtype,
                              log_path=short_log_path,
                              error_string=err)
        session.flush()
        session.clear()
62         
63
class ExceptionDoubleSSHError(Exception):
        """Raised when copying files to a node over ssh fails twice in a row,
        the second time after the node's known-hosts entry was refreshed."""
65
class NodeConnection:
        """Operations on a single node, performed through an Rpyc connection.

        connection -- established Rpyc connection; modules on the node are
                      reached through ``connection.modules``.
        node       -- hostname of the node (used for PLC calls, log file
                      names and messages).
        config     -- stored as ``self.config``; the methods in this class do
                      not read it (they use the module-level ``config``).
        """

        def __init__(self, connection, node, config):
                self.node = node
                self.c = connection
                self.config = config

        def get_boot_state(self):
                """Classify the node by probing marker paths on the node.

                Returns "debug" if /tmp/source exists, "boot" if /vservers
                exists, and "unknown" otherwise or if the probe itself fails.
                """
                try:
                        if self.c.modules.os.path.exists('/tmp/source'):
                                return "debug"
                        if self.c.modules.os.path.exists('/vservers'):
                                return "boot"
                        return "unknown"
                except EOFError:
                        traceback.print_exc()
                        print(self.c.modules.sys.path)
                except:
                        # Any other failure is reported by mail, but is not fatal.
                        email_exception()
                        traceback.print_exc()

                return "unknown"

        def get_dmesg(self):
                """Collect the node's dmesg output and return it as an open file.

                The output is downloaded to a timestamped file under
                ``<MONITOR_BOOTMANAGER_LOG>/history`` and copied to
                ``<MONITOR_BOOTMANAGER_LOG>/dmesg.<node>.log``; the latter is
                returned opened for reading (the caller closes it).
                """
                t_stamp = time.strftime("%Y-%m-%d-%H:%M")
                log_root = config.MONITOR_BOOTMANAGER_LOG
                archived = "%s/history/%s-dmesg.%s.log" % (log_root, t_stamp, self.node)
                latest = "%s/dmesg.%s.log" % (log_root, self.node)

                self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
                download(self.c, "/var/log/dmesg.bm.log", archived)
                os.system("cp %s %s" % (archived, latest))
                return open(latest, 'r')

        def get_bootmanager_log(self):
                """Collect the node's /tmp/bm.log and return it as an open file.

                The log is downloaded under the name produced by
                bootmanager_log_name(), recorded with bootmanager_log_action(),
                and copied to ``<MONITOR_BOOTMANAGER_LOG>/bm.<node>.log``; the
                latter is returned opened for reading (the caller closes it).
                """
                bm_name = bootmanager_log_name(self.node)
                log_root = config.MONITOR_BOOTMANAGER_LOG
                archived = "%s/%s" % (log_root, bm_name)
                latest = "%s/bm.%s.log" % (log_root, self.node)

                download(self.c, "/tmp/bm.log", archived)
                bootmanager_log_action(self.node, bm_name, "collected_bm.log")
                os.system("cp %s %s" % (archived, latest))
                return open(latest, 'r')

        def _read_node_configuration(self):
                """Run the node's BootManager init and config-read steps remotely.

                Returns ``(bm, error)``: ``bm`` is the remote BootManager object
                and ``error`` is None on success, or the exception raised by
                ReadNodeConfiguration.Run.
                """
                c = self.c
                c.modules.sys.path.append("/tmp/source/")
                c.modules.os.chdir('/tmp/source')

                log = c.modules.BootManager.log('/tmp/new.log')
                bm = c.modules.BootManager.BootManager(log, 'boot')

                # NOTE(review): this value is not used locally.  The lookup is
                # kept because it is a remote attribute access whose side
                # effects are not visible from here -- confirm before removing.
                BootManagerException = c.modules.Exceptions.BootManagerException
                InitializeBootManager = c.modules.BootManager.InitializeBootManager
                ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration

                InitializeBootManager.Run(bm.VARS, bm.LOG)
                try:
                        ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
                except Exception:
                        return bm, sys.exc_info()[1]
                return bm, None

        def dump_plconf_file(self):
                """Print every BootManager variable read from the node's configuration.

                Prints an error report instead when the configuration cannot be
                read.  Returns None.
                """
                bm, error = self._read_node_configuration()
                if error is not None:
                        print("   ERROR: %s" % (error,))
                        print("   Possibly, unable to find valid configuration file")
                        print("   Unable to read Node Configuration")
                        return

                for key in bm.VARS.keys():
                        print("%s  ==  %s" % (key, bm.VARS[key]))

        def fsck_repair_node(self):
                """Start a background fsck of the node's volumes, then BootManager.

                Does nothing (besides printing a notice) when /tmp/BM_RUNNING
                already exists on the node.  BootManager is started with the
                boot state returned by get_nodestate().
                """
                self.c.modules.sys.path.append("/tmp/source/")
                self.c.modules.os.chdir('/tmp/source')
                # TODO: restart
                # TODO: set boot state to node's actually boot state.
                # could be 'boot' or 'safeboot'
                if self.bootmanager_running():
                        print("Running MANUAL FSCK already... try again soon.")
                        return

                print("Running MANUAL fsck on %s" % self.node)
                # /tmp/BM_RUNNING acts as the "busy" marker for the whole
                # backgrounded sequence and is removed when it finishes.
                cmd = ("( touch /tmp/BM_RUNNING ;  "
                       "  fsck -v -f -y /dev/planetlab/root &> out.fsck ; "
                       "  fsck -v -f -y /dev/planetlab/vservers >> out.fsck 2>&1 ; "
                       "  python ./BootManager.py %s &> server.log < /dev/null ; "
                       "  rm -f /tmp/BM_RUNNING "
                       ") &")
                self.c.modules.os.system(cmd % self.get_nodestate())

        def compare_and_repair_nodekeys(self):
                """Make PLC's stored key for this node match the node's NODE_KEY.

                Returns True when the keys already match or PLC was updated
                successfully, and False when the update fails or the node's
                configuration cannot be read.
                """
                bm, error = self._read_node_configuration()
                if error is not None:
                        print("exception")
                        print(error)
                        print("   Possibly, unable to find valid configuration file")
                        print("   Unable to retrieve NODE_KEY")
                        # Explicit False (this path used to fall off the end
                        # and return None) so every path returns a boolean.
                        return False

                # The PLC record is only needed once the node's key is known.
                plcnode = plccache.GetNodeByName(self.node)
                node_key = bm.VARS['NODE_KEY']

                print("   NODE: %s" % node_key)
                print("   PLC : %s" % plcnode['key'])

                if node_key == plcnode['key']:
                        return True

                if api.UpdateNode(self.node, {'key': node_key}):
                        print("   Successfully updated NODE_KEY with PLC")
                        return True
                return False

        def bootmanager_running(self):
                """Return True if the /tmp/BM_RUNNING marker exists on the node."""
                return bool(self.c.modules.os.path.exists('/tmp/BM_RUNNING'))

        def set_nodestate(self, state='boot'):
                """Set the node's boot_state at PLC; returns api.UpdateNode's result."""
                return api.UpdateNode(self.node, {'boot_state' : state})

        def get_nodestate(self):
                """Return the node's boot_state from PLC.

                Falls back to the boot_state in the latest FindbadNodeRecord
                when the PLC call fails.
                """
                try:
                        return api.GetNodes(self.node, ['boot_state'])[0]['boot_state']
                except:
                        traceback.print_exc()
                        # NOTE: use last cached value from plc
                        fbnode = FindbadNodeRecord.get_latest_by(hostname=self.node).to_dict()
                        return fbnode['plc_node_stats']['boot_state']

        def restart_node(self, state='boot'):
                """Set the node's boot_state at PLC and reboot the node.

                If the 'gentlekill' persistent flag is not recently set, slice
                processes are killed, ``shutdown -r +1`` is issued and the flag
                is set.  Otherwise the node is rebooted through
                /proc/sysrq-trigger.
                """
                api.UpdateNode(self.node, {'boot_state' : state})

                # NOTE(review): 1*60*60*24 is presumably the flag lifetime in
                # seconds (one day) -- confirm against PersistFlags.
                pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags')
                if pflags.getRecentFlag('gentlekill'):
                        print("   Restarting with sysrq 'sub' %s" % self.node)
                        cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """
                        self.c.modules.os.system(cmd)
                        return

                print("   Killing all slice processes... : %s" %  self.node)
                cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
                self.c.modules.os.system(cmd_slicekill)
                cmd = """ shutdown -r +1 & """
                print("   Restarting %s : %s" % ( self.node, cmd))
                self.c.modules.os.system(cmd)

                pflags.setRecentFlag('gentlekill')
                pflags.save()

        def restart_bootmanager(self, forceState):
                """Start ``BootManager.py <forceState>`` in the background on the node.

                Does nothing (besides printing a notice) when /tmp/BM_RUNNING
                already exists on the node.
                """
                self.c.modules.os.chdir('/tmp/source')
                if self.bootmanager_running():
                        print("   BootManager is already running: try again soon...")
                        return

                print("   Starting 'BootManager.py %s' on %s " % (forceState, self.node))
                cmd = ("( touch /tmp/BM_RUNNING ;  "
                       "  python ./BootManager.py %s &> server.log < /dev/null ; "
                       "  rm -f /tmp/BM_RUNNING "
                       ") &")
                self.c.modules.os.system(cmd % forceState)
264
265
class PlanetLabSession:
        """An ssh tunnel to a node plus the Rpyc server running behind it.

        Creating an instance allocates a local port and, unless `nosetup` is
        true, copies the Rpyc files to the node, (re)starts the Rpyc server
        there and opens an ssh tunnel from the local port to port 18812 on the
        node.  The tunnel process is killed when the instance is deleted.

        Attributes:
          port    -- local port allocated for this session.
          command -- the ssh tunnel process, or None if none was started.
          timeout -- seconds slept after the tunnel reported READY; 0 when
                     setup was skipped or did not get that far.
        """
        # Next local port to hand out; starts at a random offset and grows by
        # one per session.
        globalport = 22000 + int(random.random()*1000)

        def __init__(self, node, nosetup, verbose):
                self.verbose = verbose
                self.node = node
                self.port = None
                self.nosetup = nosetup
                self.command = None
                # Always defined, so that code reading `timeout` (for example
                # to scale a retry delay) does not fail when setup is skipped.
                self.timeout = 0
                self.setup_host()

        def get_connection(self, config):
                """Return a NodeConnection over the tunnel's local port.

                The connection is attempted twice; an error from the second
                attempt propagates to the caller.
                """
                try:
                        conn = NodeConnection(SocketConnection("localhost", self.port), self.node, config)
                except:
                        # NOTE: try twice since this can sometimes fail the first time. If
                        #               it fails again, let it go.
                        conn = NodeConnection(SocketConnection("localhost", self.port), self.node, config)
                return conn

        def setup_host(self):
                """Allocate a local port and, unless nosetup, bring up server and tunnel.

                Raises ExceptionDoubleSSHError when the rsync to the node fails
                twice, and Exception when the tunnel does not report READY.
                """
                self.port = PlanetLabSession.globalport
                PlanetLabSession.globalport = PlanetLabSession.globalport + 1

                if self.nosetup:
                        print("Skipping setup")
                        return

                args = {
                        'port': self.port,
                        'user': 'root',
                        'hostname': self.node,
                        'monitordir': config.MONITOR_SCRIPT_ROOT,
                }
                ssh_port = 22

                # COPY Rpyc files to host
                cmd = """rsync -vvv -az -e "ssh -o BatchMode=yes" %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc""" % args
                # NOTE(review): the command is printed unconditionally as well,
                # so verbose mode shows it twice.
                if self.verbose: print(cmd)
                print(cmd)
                timeout = 120
                localos = moncommands.CMD()

                ret = localos.system(cmd, timeout)
                print(ret)
                if ret != 0:
                        # Assume a stale host key: refresh it and retry once.
                        print("\tUNKNOWN SSH KEY FOR %s; making an exception" % self.node)
                        k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k
                        print("trying:  %s" % cmd)
                        print([ "%s=%s" % (name, os.environ[name]) for name in os.environ.keys() if 'SSH' in name ])
                        ret = localos.system(cmd, timeout)
                        print(ret)
                        if ret != 0:
                                print("\tFAILED TWICE")
                                raise ExceptionDoubleSSHError("Failed twice trying to login with updated ssh host key")

                t1 = time.time()
                # KILL any already running servers.
                ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port)
                # Shell here-document passed to ssh; the backslash before EOF
                # is literal.
                server_script = "\n".join([
                        "<<\\EOF",
                        "            rm -f out.log",
                        "            echo \"kill server\" >> out.log",
                        "            ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ; ",
                        "            echo \"export\" >> out.log",
                        "            export PYTHONPATH=$HOME  ;",
                        "            echo \"start server\" >> out.log",
                        "            python Rpyc/Servers/forking_server.py &> server.log &",
                        "            echo \"done\" >> out.log",
                        "EOF",
                ])
                (ov, ev) = ssh.run_noexcept2(server_script)
                print("setup rpyc server over ssh")
                print(ssh.ret)

                # TODO: Add timeout
                # This was tricky to make synchronous.  The combination of ssh-clients-4.7p1,
                # and the following options seems to work well.
                cmd = ("""ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """
                       """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """
                       """-o ConnectTimeout=120 """
                       """-n -N -L %(port)s:localhost:18812 """
                       """%(user)s@%(hostname)s""")
                cmd = cmd % args
                if self.verbose: print(cmd)
                print(cmd)
                self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
                # read_t() is used instead of a plain stdout.read(5), which
                # may block indefinitely.
                ret = moncommands.read_t(self.command.stdout, 5)

                t2 = time.time()
                if 'READY' in ret:
                        # NOTE: There is still a slight race for machines that are slow...
                        self.timeout = 2*(t2-t1)
                        print("Sleeping for %s sec" % self.timeout)
                        time.sleep(self.timeout)
                        return

                if self.command.returncode is not None:
                        print("Failed to establish tunnel!")
                        raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode))

                raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'")

        def __del__(self):
                """Kill the ssh tunnel process, if one was started."""
                if self.command:
                        # NOTE(review): printed twice in verbose mode, as above.
                        if self.verbose: print("Killing SSH session %s" % self.port)
                        print("Killing SSH session %s" % self.port)
                        self.command.kill()
377
378         
def steps_to_list(steps, index=1):
        """Return a list holding element `index` of every entry in `steps`."""
        return [step[index] for step in steps]
381
def index_to_id(steps,index):
        """Return the first element of ``steps[index]``, or "done" when `index`
        is at or beyond the end of `steps`."""
        if index >= len(steps):
                return "done"
        return steps[index][0]
387
388 class DebugInterface:
389         def __init__(self, hostname):
390                 self.hostname = hostname
391                 self.session = None
392
393         def getConnection(self):
394                 print "Creating session for %s" % self.hostname
395                 # update known_hosts file (in case the node has rebooted since last run)
396                 try:
397                         k = SSHKnownHosts(); k.update(self.hostname); k.write(); del k
398                 except:
399                         email_exception()
400                         print traceback.print_exc()
401                         return False
402
403                 msg = "ERROR setting up session for %s" % self.hostname
404                 try:
405                         if config == None:
406                                 self.session = PlanetLabSession(self.hostname, False, True)
407                         else:
408                                 self.session = PlanetLabSession(self.hostname, config.nosetup, config.verbose)
409                 except ExceptionDoubleSSHError, e:
410                         print msg
411                         return False
412                 except Exception, e:
413                         traceback.print_exc()
414                         email_exception(msg)
415                         return False
416
417                 try:
418                         conn = self.session.get_connection(config)
419                 except EOFError:
420                         # NOTE: sometimes the wait in setup_host() is not long enough.  
421                         # So, here we try to wait a little longer before giving up entirely.
422                         try:
423                                 time.sleep(self.session.timeout*5)
424                                 conn = self.session.get_connection(config)
425                         except EOFError:
426                                 # failed twice... no need to report this really, it's just in a
427                                 # weird state...
428                                 return False
429                         except:
430                                 traceback.print_exc()
431                                 email_exception(self.hostname)
432                                 return False
433                 #print "trying to use conn before returning it."
434                 #print conn.c.modules.sys.path
435                 #print conn.c.modules.os.path.exists('/tmp/source')
436                 #time.sleep(1)
437
438                 #print "conn: %s" % conn
439                 return conn
440
441         def getSequences(self):
442
443                 # TODO: This can be replaced with a DB definition at a future time.
444                 #               This would make it possible for an admin to introduce new
445                 #               patterns without touching code.
446                 
447                 sequences = {}
448                 # restart_bootmanager_boot
449                 for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
450                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
451                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",
452
453                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done",
454
455                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
456                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
457                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-protoerror-debug-validate-done",
458                                 "bminit-cfg-auth-protoerror-exception-update-debug-validate-exception-done",
459                                 "bminit-cfg-auth-getplc-update-debug-done",
460                                 "bminit-cfg-auth-protoerror2-debug-done",
461                                 "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
462                                 "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
463                                 "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
464                                 "bminit-cfg-auth-protoerror-exception-update-debug-done",
465                                 "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
466                                 "bminit-cfg-auth-getplc-implementerror-update-debug-done",
467                                 "bminit-cfg-auth-authfail2-protoerror2-debug-done",
468                                 ]:
469                         sequences.update({n : "restart_bootmanager_boot"})
470
471                 #       conn.restart_bootmanager('reinstall')
472                 for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
473                                 "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
474                                 "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
475                                 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
476                                 "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
477                                 "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done",
478                                 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
479                                 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
480                                 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
481                                 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
482                                 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
483                                 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
484                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
485                                 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done",
486                                 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
487                                 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
488                                 "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
489                                 # actual solution appears to involve removing the bad files, and
490                                 # continually trying to boot the node.
491                                 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
492                                 "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
493                                 "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
494                                 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done",
495                                 "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-validate-done",
496                                 "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done",
497                                 "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-debug-validate-bmexceptvgscan-done",
498                                 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-debug-validate-bmexceptvgscan-done",
499                                 ]:
500                         sequences.update({n : "restart_bootmanager_rins"})
501
502                 # repair_node_keys
503                 for n in ["bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-validate-exception-done",
504                                         "bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done",
505                                         "bminit-cfg-auth-bootcheckfail-authfail-exception-update-debug-validate-exception-done",
506                                         "bminit-cfg-auth-bootcheckfail-authfail-exception-authfail-debug-validate-exception-done",
507                                         "bminit-cfg-auth-authfail-debug-done",
508                                         "bminit-cfg-auth-authfail2-authfail-debug-done",
509                                 ]:
510                         sequences.update({n: "repair_node_keys"})
511
512                 #   conn.restart_node('reinstall')
513                 for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
514                                 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
515                                 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
516                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done",
517                                 "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
518                                 "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
519                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done",
520                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done",
521                                 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
522                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
523                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
524                                 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
525                                 ]:
526                         sequences.update({n : "restart_node_rins"})
527
528                 #       restart_node_boot
529                 for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done",
530                                  "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
531                                  "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
532                                  "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
533                                  "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
534                                  "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
535                                  "bminit-cfg-auth-getplc-exception-update-bootupdatefail-debug-done",
536                                  ]:
537                         sequences.update({n: "restart_node_boot"})
538
539                 # fsck_repair
540                 for n in ["bminit-cfg-auth-getplc-update-installinit-validate-fsckabort-exception-fsckfail-bmexceptmount-exception-noinstall-update-debug-validate-fsckabort-exception-fsckfail-bmexceptmount-done",
541                                   "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-noinstall-update-debug-validate-exception-fsckfail-done",
542                                   "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail-exception-noinstall-update-debug-validate-exception-fsckfail-done",
543                                   "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail2-exception-noinstall-update-debug-validate-exception-fsckfail2-done",
544                                   "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail2-exception-debug-validate-done",
545                                   "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail2-exception-debug-validate-exception-fsckfail2-done",
546                                   "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail2-exception-debug-validate-exception-fsckfail-done",
547                                   "bminit-cfg-auth-getplc-update-installinit-validate-fsckabort-exception-fsckfail-exception-debug-validate-fsckabort-exception-fsckfail-done",
548                                   "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail2-exception-debug-validate-exception-fsckfail2-done",
549                                   "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-debug-validate-exception-fsckfail2-done",
550                                   "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-debug-validate-exception-fsckfail-done",
551                                   "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-debug-validate-done",
552                                   "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail-exception-debug-validate-exception-fsckfail-done",
553                                   "bminit-cfg-auth-getplc-update-debug-validate-exception-fsckfail-done",
554                                 ]:
555                         sequences.update({n : "fsck_repair"})
556
557                 # nodeconfig_notice
558                 for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
559                                   "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
560                                   "bminit-cfg-exception-update-bootupdatefail-nonode-debug-validate-exception-done",
561                                   "bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-validate-exception-done",
562                                   "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
563                                   "bminit-cfg-exception-noconfig-nonode-debug-validate-exception-done",
564                                   "bminit-cfg-exception-noconfig-update-debug-validate-exception-done",
565                                 ]:
566                         sequences.update({n : "nodeconfig_notice"})
567
568                 for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", 
569                                    "bminit-cfg-update-exception-nodehostname-update-debug-validate-exception-done",
570                                    "bminit-cfg-update-exception-nodehostname-update-debug-done", 
571                                    "bminit-cfg-exception-nodehostname-debug-validate-exception-done",
572                                 ]:
573                         sequences.update({n : "nodenetwork_email"})
574
575                 # noblockdevice_notice
576                 for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
577                                 "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-validate-bmexceptvgscan-done",
578                                 "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
579                                 "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
580                                 "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
581                                 "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-debug-validate-bmexceptvgscan-done",
582                                 "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-debug-validate-bmexceptvgscan-done",
583                                 ]:
584                         sequences.update({n : "noblockdevice_notice"})
585
586                 # update_bootcd_email
587                 for n in [ "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
588                                 ]:
589                         sequences.update({n : "update_bootcd_email"})
590
591                 for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
592                                 ]:
593                         sequences.update({n: "unknownsequence_notice"})
594
595                 # minimalhardware_notice
596                 sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "minimalhardware_notice"})
597                 sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "minimalhardware_notice"})
598
599                 # baddisk_notice
600                 sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "baddisk_notice"})
601
602                 # baddns_notice
603                 for n in [ 
604                  "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
605                         "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
606                         ]:
607                         sequences.update( { n : "baddns_notice"})
608
609                 return sequences
610
611         def getDiskSteps(self):
612                 steps = [
613                         ('scsierror'  , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
614                         ('ioerror'    , 'end_request: I/O error, dev sd\w+, sector \d+'),
615                         ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION'),
616
617                         ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
618
619                         ('hdaseekerror', 'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'),
620                         ('hdacorrecterror', 'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'),
621
622                         ('atareadyerror'   , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
623                         ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),
624
625                         ('sdXerror'   , 'sd\w: Current: sense key: Medium Error'),
626                         ('ext3error'   , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),
627
628                         ('floppytimeout','floppy0: floppy timeout called'),
629                         ('floppyerror',  'end_request: I/O error, dev fd\w+, sector \d+'),
630
631                         # hda: dma_intr: status=0x51 { DriveReady SeekComplete Error }
632                         # hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263
633
634                         # floppy0: floppy timeout called
635                         # end_request: I/O error, dev fd0, sector 0
636
637                         # Buffer I/O error on device dm-2, logical block 8888896
638                         # ata1: status=0x51 { DriveReady SeekComplete Error }
639                         # ata1: error=0x40 { UncorrectableError }
640                         # SCSI error : <0 0 0 0> return code = 0x8000002
641                         # sda: Current: sense key: Medium Error
642                         #       Additional sense: Unrecovered read error - auto reallocate failed
643
644                         # SCSI error : <0 2 0 0> return code = 0x40001
645                         # end_request: I/O error, dev sda, sector 572489600
646                 ]
647                 return steps
648
649         def getDiskSequence(self, steps, child):
650                 sequence = []
651                 while True:
652                         id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
653                         sequence.append(id)
654
655                         if id == "done":
656                                 break
657                 return sequence
658
659         def getBootManagerStepPatterns(self):
660                 steps = [
661                         ('bminit'               , 'Initializing the BootManager.'),
662                         ('cfg'                  , 'Reading node configuration file.'),
663                         ('auth'                 , 'Authenticating node with PLC.'),
664                         ('getplc'               , 'Retrieving details of node from PLC.'),
665                         ('update'               , 'Updating node boot state at PLC.'),
666                         ('hardware'             , 'Checking if hardware requirements met.'),
667                         ('installinit'  , 'Install: Initializing.'),
668                         ('installdisk'  , 'Install: partitioning disks.'),
669                         ('installbootfs', 'Install: bootstrapfs tarball.'),
670                         ('installcfg'   , 'Install: Writing configuration files.'),
671                         ('installstop'  , 'Install: Shutting down installer.'),
672                         ('update2'              , 'Updating node boot state at PLC.'),
673                         ('installinit2' , 'Install: Initializing.'),
674                         ('validate'             , 'Validating node installation.'),
675                         ('rebuildinitrd', 'Rebuilding initrd'),
676                         ('netcfg'               , 'Install: Writing Network Configuration files.'),
677                         ('update3'              , 'Updating node configuration.'),
678                         ('disk'                 , 'Checking for unused disks to add to LVM.'),
679                         ('update4'              , 'Sending hardware configuration to PLC.'),
680                         ('debug'                , 'Starting debug mode'),
681                         ('bmexceptmount', 'BootManagerException during mount'),
682                         ('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'),
683                         ('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
684                         ('exception'    , 'Exception'),
685                         ('nocfg'        , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
686                         ('protoerror2'  , '500 Internal Server Error'),
687                         ('protoerror'   , 'XML RPC protocol error'),
688                         ('nodehostname' , 'Configured node hostname does not resolve'),
689                         ('implementerror', 'Implementation Error'),
690                         ('fsckabort'    , 'is mounted.  e2fsck: Cannot continue, aborting'),
691                         ('fsckfail'             , 'Running e2fsck -v -p /dev/planetlab/root failed'),
692                         ('fsckfail2'    , 'Running e2fsck -v -p /dev/planetlab/vservers failed'),
693                         ('readonlyfs'   , '\[Errno 30\] Read-only file system'),
694                         ('baddisk'      , "IOError: \[Errno 13\] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"),
695                         ('noinstall'    , 'notinstalled'),
696                         ('bziperror'    , 'bzip2: Data integrity error when decompressing.'),
697                         ('noblockdev'   , "No block devices detected."),
698                         ('dnserror'     , 'Name or service not known'),
699                         ('noconfig'             , "Unable to find and read a node configuration file"),
700                         ('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
701                         ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
702                         ('hardwarerequirefail' , 'Hardware requirements not met'),
703                         ('mkfsfail'         , 'while running: Running mkfs.ext2 -q  -m 0 -j /dev/planetlab/vservers failed'),
704                         ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
705                         ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
706                         ('chrootfail'   , 'Running chroot /tmp/mnt/sysimg'),
707                         ('modulefail'   , 'Unable to get list of system modules'),
708                         ('writeerror'   , 'write error: No space left on device'),
709                         ('nospace'      , "No space left on device"),
710                         ('nonode'       , 'Failed to authenticate call: No such node'),
711                         ('authfail'     , 'Failed to authenticate call: Call could not be authenticated'),
712                         ('authfail2'    , 'Authentication Failed'),
713                         ('bootcheckfail'  , 'BootCheckAuthentication'),
714                         ('bootupdatefail' , 'BootUpdateNode'),
715                 ]
716                 return steps
717
718         def getBootManagerSequenceFromLog(self, steps, child):
719                 sequence = []
720                 while True:
721                         
722                         index = child.expect( steps_to_list(steps) + [ pexpect.EOF ])
723                         id = index_to_id(steps,index)
724                         sequence.append(id)
725
726                         if id == "exception":
727                                 print "...Found An Exception!!!"
728                         elif id == "done": #index == len(steps_to_list(steps)):
729                                 #print "Reached EOF"
730                                 break
731
732                 return sequence
733                 
def restore(sitehist, hostname, config=None, forced_action=None):
        """Run restore_basic() for `hostname`, then flush the module-level session.

        All arguments are forwarded unchanged to restore_basic(); its return
        value is handed back to the caller once session.flush() has run.
        """
        outcome = restore_basic(sitehist, hostname, config, forced_action)
        # NOTE(review): `session` presumably is the database session brought in
        # by the monitor.database star imports -- confirm.
        session.flush()
        return outcome
738
739 def restore_basic(sitehist, hostname, config=None, forced_action=None):
740
741         # NOTE: Nothing works if the bootcd is REALLY old.
742         #       So, this is the first step.
743
744         bootman_action = "unknown"
745
746         fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
747         recent_actions = sitehist.getRecentActions(hostname=hostname)
748
749         if fbnode['observed_category'] == "OLDBOOTCD":
750                 print "\t...Notify owner to update BootImage!!!"
751
752                 if not found_within(recent_actions, 'newbootcd_notice', 3.5):
753                         sitehist.sendMessage('newbootcd_notice', hostname=hostname)
754
755                         print "\tDisabling %s due to out-of-date BootImage" % hostname
756                         api.UpdateNode(hostname, {'boot_state' : 'disabled'})
757
758                 # NOTE: nothing else is possible.
759                 return "disabled"
760
761         debugnode = DebugInterface(hostname)
762         conn = debugnode.getConnection()
763         if type(conn) == type(False): return "connect_failed"
764
765         boot_state = conn.get_boot_state()
766         if boot_state != "debug":
767                 print "... %s in %s state: skipping..." % (hostname , boot_state)
768                 return "skipped" #boot_state == "boot"
769
770         if conn.bootmanager_running():
771                 print "...BootManager is currently running.  Skipping host %s" %hostname 
772                 return "skipped" # True
773
774         # Read persistent flags, tagged on one week intervals.
775
776         if config and not config.quiet: print "...downloading dmesg from %s" %hostname 
777         dmesg = conn.get_dmesg()
778         child = fdpexpect.fdspawn(dmesg)
779
780         steps = debugnode.getDiskSteps()
781         sequence = debugnode.getDiskSequence(steps, child)
782
783         s = Set(sequence)
784         if config and not config.quiet: print "\tSET: ", s
785
786         if len(s) > 1:
787                 print "...Potential drive errors on %s" % hostname 
788                 if len(s) == 2 and 'floppyerror' in s:
789                         print "...Should investigate.  Continuing with node."
790                 else:
791                         print "...Should investigate.  Skipping node."
792                         # TODO: send message related to these errors.
793
794                         if not found_within(recent_actions, 'baddisk_notice', 7):
795                                 print "baddisk_notice not found recently"
796
797                                 log=conn.get_dmesg().read()
798                                 sitehist.sendMessage('baddisk_notice', hostname=hostname, log=log)
799                                 #conn.set_nodestate('disabled')
800
801                         return "skipping_baddisk"
802
803         print "...Downloading bm.log from %s" %hostname 
804         log = conn.get_bootmanager_log()
805         child = fdpexpect.fdspawn(log)
806
807         if hasattr(config, 'collect') and config.collect: return "collect"
808
809         if config and not config.quiet: print "...Scanning bm.log for errors"
810
811         time.sleep(1)
812
813         steps = debugnode.getBootManagerStepPatterns()
814         sequence = debugnode.getBootManagerSequenceFromLog(steps, child)
815                 
816         s = "-".join(sequence)
817         print "   FOUND SEQUENCE: ", s
818
819         # NOTE: We get or set the flag based on the current sequence identifier.
820         #  By using the sequence identifier, we guarantee that there will be no
821         #  frequent loops.  I'm guessing there is a better way to track loops,
822         #  though.
823
824         sequences = debugnode.getSequences()
825         flag_set = True
826         
827         if s not in sequences:
828                 print "   HOST %s" % hostname
829                 print "   UNKNOWN SEQUENCE: %s" % s
830
831                 args = {}
832                 args['hostname'] = hostname
833                 args['sequence'] = s
834                 args['bmlog'] = conn.get_bootmanager_log().read()
835                 args['viart'] = False
836                 args['saveact'] = True
837                 args['ccemail'] = True
838
839                 sitehist.sendMessage('unknownsequence_notice', **args)
840
841                 conn.restart_bootmanager('boot')
842
843                 bootman_action = "restart_bootmanager"
844
845                 # NOTE: Do not set the pflags value for this sequence if it's unknown.
846                 # This way, we can check it again after we've fixed it.
847                 flag_set = False
848
849         else:
850                 bootman_action = sequences[s]
851
852                 if   sequences[s] == "restart_bootmanager_boot":
853                         print "...Restarting BootManager.py on %s "%hostname 
854                         conn.restart_bootmanager('boot')
855                 elif sequences[s] == "restart_bootmanager_rins":
856                         print "...Restarting BootManager.py on %s "%hostname 
857                         conn.restart_bootmanager('reinstall')
858                 elif sequences[s] == "restart_node_rins":
859                         conn.restart_node('reinstall')
860                 elif sequences[s] == "restart_node_boot":
861                         conn.restart_node('boot')
862                 elif sequences[s] == "fsck_repair":
863                         conn.fsck_repair_node()
864                 elif sequences[s] == "repair_node_keys":
865                         if conn.compare_and_repair_nodekeys():
866                                 # the keys either are in sync or were forced in sync.
867                                 # so try to start BM again.
868                                 conn.restart_bootmanager(conn.get_nodestate())
869                         else:
870                                 # there was some failure to synchronize the keys.
871                                 print "...Unable to repair node keys on %s" %hostname 
872                                 if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
873                                         args = {}
874                                         args['hostname'] = hostname
875                                         sitehist.sendMessage('nodeconfig_notice', **args)
876                                         conn.dump_plconf_file()
877                                 else:
878                                         # NOTE: do not add a new action record
879                                         return ""
880
881                 elif sequences[s] == "unknownsequence_notice":
882                         args = {}
883                         args['hostname'] = hostname
884                         args['sequence'] = s
885                         args['bmlog'] = conn.get_bootmanager_log().read()
886                         args['viart'] = False
887                         args['saveact'] = True
888                         args['ccemail'] = True
889
890                         sitehist.sendMessage('unknownsequence_notice', **args)
891                         conn.restart_bootmanager('boot')
892
893                 elif sequences[s] == "nodeconfig_notice":
894
895                         if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
896                                 args = {}
897                                 args['hostname'] = hostname
898                                 sitehist.sendMessage('nodeconfig_notice', **args)
899                                 conn.dump_plconf_file()
900                         else:
901                                 # NOTE: do not add a new action record
902                                 return ""
903
904                 elif sequences[s] == "nodenetwork_email":
905
906                         if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
907                                 args = {}
908                                 args['hostname'] = hostname
909                                 args['bmlog'] = conn.get_bootmanager_log().read()
910                                 sitehist.sendMessage('nodeconfig_notice', **args)
911                                 conn.dump_plconf_file()
912                         else:
913                                 # NOTE: do not add a new action record
914                                 return ""
915
916                 elif sequences[s] == "noblockdevice_notice":
917
918                         if not found_within(recent_actions, 'noblockdevice_notice', 3.5):
919                                 args = {}
920                                 #args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
921                                 args['hostname'] = hostname
922                         
923                                 sitehist.sendMessage('noblockdevice_notice', **args)
924                         else:
925                                 # NOTE: do not add a new action record
926                                 return ""
927
928                 elif sequences[s] == "baddisk_notice":
929                         # MAKE An ACTION record that this host has failed hardware.  May
930                         # require either an exception "/minhw" or other manual intervention.
931                         # Definitely need to send out some more EMAIL.
932                         # TODO: email notice of broken hardware
933                         if not found_within(recent_actions, 'baddisk_notice', 7):
934                                 print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
935                                 args = {}
936                                 args['hostname'] = hostname
937                                 args['log'] = conn.get_dmesg().read()
938
939                                 sitehist.sendMessage('baddisk_notice', **args)
940                                 #conn.set_nodestate('disabled')
941                         else:
942                                 # NOTE: do not add a new action record
943                                 return ""
944
945                 elif sequences[s] == "minimalhardware_notice":
946                         if not found_within(recent_actions, 'minimalhardware_notice', 7):
947                                 print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
948                                 args = {}
949                                 args['hostname'] = hostname
950                                 args['bmlog'] = conn.get_bootmanager_log().read()
951                                 sitehist.sendMessage('minimalhardware_notice', **args)
952                         else:
953                                 # NOTE: do not add a new action record
954                                 return ""
955
956                 elif sequences[s] == "baddns_notice":
957                         if not found_within(recent_actions, 'baddns_notice', 1):
958                                 print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
959                                 args = {}
960                                 try:
961                                         node = plccache.GetNodeByName(hostname)
962                                         net = api.GetInterfaces(node['interface_ids'])[0]
963                                 except:
964                                         email_exception()
965                                         print traceback.print_exc()
966                                         # TODO: api error. skip email, b/c all info is not available,
967                                         # flag_set will not be recorded.
968                                         return "exception"
969                                 nodenet_str = network_config_to_str(net)
970
971                                 args['hostname'] = hostname
972                                 args['network_config'] = nodenet_str
973                                 args['interface_id'] = net['interface_id']
974
975                                 sitehist.sendMessage('baddns_notice', **args)
976                         else:
977                                 # NOTE: do not add a new action record
978                                 return ""
979
980         return bootman_action
981         
982
983 # MAIN -------------------------------------------------------------------
984
def main():
        """Command-line entry point: run restore() on every requested node.

        Builds the option parser, then takes the node names either from the
        file named by `config.nodelist` or from the single `config.node`; if
        neither is given, prints the help text and exits with status 1.  For
        each node the site's SiteInterface is looked up by login base and
        restore() is called with the parsed options.
        """
        from monitor import parser as parsermodule
        parser = parsermodule.getParser()

        parser.set_defaults(child=False, collect=False, nosetup=False, verbose=False, 
                                                force=None, quiet=False)
        parser.add_option("", "--child", dest="child", action="store_true", 
                                                help="This is the child mode of this process.")
        parser.add_option("", "--force", dest="force", metavar="boot_state",
                                                help="Force a boot state passed to BootManager.py.")
        parser.add_option("", "--quiet", dest="quiet", action="store_true", 
                                                help="Extra quiet output messages.")
        parser.add_option("", "--verbose", dest="verbose", action="store_true", 
                                                help="Extra debug output messages.")
        parser.add_option("", "--nonet", dest="nonet", action="store_true", 
                                                help="Do not setup the network, use existing log files to re-run a test pass.")
        parser.add_option("", "--collect", dest="collect", action="store_true", 
                                                help="No action, just collect dmesg, and bm.log")
        parser.add_option("", "--nosetup", dest="nosetup", action="store_true", 
                                                help="Do not perform the orginary setup phase.")

        parser = parsermodule.getParser(['nodesets', 'defaults'], parser)
        config = parsermodule.parse_args(parser)

        if config.nodelist:
                nodes = config.getListFromFile(config.nodelist)
        elif config.node:
                nodes = [ config.node ]
        else:
                parser.print_help()
                sys.exit(1)

        for node in nodes:
                # get sitehist
                lb = plccache.plcdb_hn2lb[node]
                sitehist = SiteInterface.get_or_make(loginbase=lb)
                #reboot(node, config)
                # Pass the parsed options through.  With config=None the --quiet
                # and --collect flags were ignored, so a "--collect" run (which
                # is documented as taking no action) still performed recovery
                # actions on the node.
                restore(sitehist, node, config=config, forced_action=None)

if __name__ == "__main__":
        main()