NOTE: this module is reported as non-functional ("this doesn't work.") — status unverified.
[monitor.git] / monitor / bootman.py
1 #!/usr/bin/python
2
3 # Attempt to reboot a node in debug state.
4
5 import os
6 import sys
7 import time
8 import random
9 import signal
10 import traceback
11 import subprocess
12 from sets import Set
13
14 from monitor.getsshkeys import SSHKnownHosts
15 from monitor.Rpyc import SocketConnection, Async
16 from monitor.Rpyc.Utils import *
17
18 from monitor import getconf
19 from monitor import config
20 from monitor import const
21 from monitor.model import *
22 from monitor.common import email_exception, found_within
23 from monitor.database.info.model import *
24 from monitor.database.info.interface import *
25 from monitor.wrapper import plc
26 from monitor.wrapper import plccache
27 from monitor.wrapper.emailTxt import mailtxt
28 from monitor.nodeconfig import network_config_to_str
29
30 from pcucontrol.util import command as moncommands
31 from pcucontrol.util.command import Sopen
32 from pcucontrol.transports.ssh import pxssh as pxssh
33 from pcucontrol.transports.ssh import fdpexpect as fdpexpect
34 from pcucontrol.transports.ssh import pexpect as pexpect
35
# Authenticated PLC XML-RPC session shared by all helpers in this module.
api = plc.getAuthAPI()
# NOTE(review): 'fb' is never assigned in this chunk -- presumably a
# findbad-records cache populated by callers; confirm before relying on it.
fb = None
38
def bootmanager_log_name(hostname):
    """Return the history-relative filename for a BootManager log.

    The name embeds a minute-resolution timestamp and the node's
    hostname, e.g. 'history/2009-01-31-12:05-bm.node.example.org.log'.
    """
    stamp = time.strftime("%Y-%m-%d-%H:%M")
    return os.path.join('history', "%s-bm.%s.log" % (stamp, hostname))
44
def bootmanager_log_action(hostname, short_log_path, logtype="bm.log"):
    """Record an ActionRecord noting that a log was collected for a node.

    hostname       -- node the log came from
    short_log_path -- history-relative path of the stored log file
    logtype        -- action_type tag stored on the record
    """
    try:
        node = FindbadNodeRecord.get_latest_by(hostname=hostname)
        loginbase = PlcSite.query.get(node.plc_node_stats['site_id']).plc_site_stats['login_base']
        err = ""
    except Exception:
        # Best-effort site lookup: fall back to an unknown loginbase but
        # preserve the traceback in the record.  Narrowed from a bare
        # 'except:' so KeyboardInterrupt/SystemExit are not swallowed.
        loginbase = "unknown"
        err = traceback.format_exc()

    # NOTE(review): constructing the record appears to be what persists it
    # (ORM session side effect); the instance itself is not used further.
    ActionRecord(loginbase=loginbase,
                 hostname=hostname,
                 action='log',
                 action_type=logtype,
                 log_path=short_log_path,
                 error_string=err)
61         
62
class ExceptionDoubleSSHError(Exception):
    """Raised when logging in to a node fails even after its ssh host key
    has been refreshed (i.e. the rsync/ssh attempt failed twice)."""
    pass
64
class NodeConnection:
    """Wrapper around an Rpyc connection to a single node.

    Remote operations go through self.c.modules.<mod>, which proxies
    Python module calls over the Rpyc socket -- e.g.
    self.c.modules.os.system() runs on the node, while a plain
    os.system() runs locally on the monitor host.
    """

    def __init__(self, connection, node, config):
        # connection: Rpyc SocketConnection to the node's server.
        # node: hostname of the node this connection talks to.
        self.node = node
        self.c = connection
        self.config = config

    def get_boot_state(self):
        """Classify the node's state by probing its filesystem.

        Returns "debug" if the BootManager source tree is unpacked in
        /tmp/source, "boot" if /vservers exists (production boot), and
        "unknown" otherwise or on any error.
        """
        try:
            if self.c.modules.os.path.exists('/tmp/source'):
                return "debug"
            elif self.c.modules.os.path.exists('/vservers'): 
                return "boot"
            else:
                return "unknown"
        except EOFError:
            # The remote Rpyc server hung up mid-call; dump what we can.
            traceback.print_exc()
            print self.c.modules.sys.path
        except:
            # NOTE(review): bare except also hides KeyboardInterrupt/SystemExit.
            email_exception()
            traceback.print_exc()

        return "unknown"

    def get_dmesg(self):
        """Snapshot the node's dmesg and return a local read handle on it.

        The snapshot is downloaded to a timestamped file under history/
        and also copied to an un-timestamped 'latest' file, which is the
        one opened and returned.
        """
        t_stamp = time.strftime("%Y-%m-%d-%H:%M")
        self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
        download(self.c, "/var/log/dmesg.bm.log", "%s/history/%s-dmesg.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node))
        os.system("cp %s/history/%s-dmesg.%s.log %s/dmesg.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node, config.MONITOR_BOOTMANAGER_LOG, self.node))
        # NOTE(review): the caller owns (and should close) this handle.
        log = open("%s/dmesg.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, self.node), 'r')
        return log

    def get_bootmanager_log(self):
        """Download /tmp/bm.log from the node, archive it under history/,
        record the collection as an ActionRecord, and return a local
        read handle on the 'latest' copy."""
        bm_name = bootmanager_log_name(self.node)
        download(self.c, "/tmp/bm.log", "%s/%s" % (config.MONITOR_BOOTMANAGER_LOG, bm_name))
        #email_exception(self.node, "collected BM log for %s" % self.node)
        bootmanager_log_action(self.node, bm_name, "collected_bm.log")
        os.system("cp %s/%s %s/bm.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, bm_name, config.MONITOR_BOOTMANAGER_LOG, self.node))
        log = open("%s/bm.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, self.node), 'r')
        return log

    def dump_plconf_file(self):
        """Run BootManager's config-read steps remotely and print the
        resulting VARS dict (the node's view of its PLC configuration)."""
        c = self.c
        # BootManager modules live in the unpacked /tmp/source tree.
        self.c.modules.sys.path.append("/tmp/source/")
        self.c.modules.os.chdir('/tmp/source')

        log = c.modules.BootManager.log('/tmp/new.log')
        bm = c.modules.BootManager.BootManager(log,'boot')

        BootManagerException = c.modules.Exceptions.BootManagerException
        InitializeBootManager = c.modules.BootManager.InitializeBootManager
        ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
        bm_continue = True

        InitializeBootManager.Run(bm.VARS, bm.LOG)
        try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
        except Exception, x:
            bm_continue = False
            print "   ERROR:", x
            print "   Possibly, unable to find valid configuration file"

        if bm_continue:
            for key in bm.VARS.keys():
                print key, " == ", bm.VARS[key]
        else:
            print "   Unable to read Node Configuration"

    def fsck_repair_node(self):
        """Kick off a manual fsck of the node's root and vservers volumes,
        then re-run BootManager in the node's current boot state.

        Everything runs in the background on the node; /tmp/BM_RUNNING
        acts as a lock so only one BootManager/fsck runs at a time.
        """
        c = self.c
        self.c.modules.sys.path.append("/tmp/source/")
        self.c.modules.os.chdir('/tmp/source')
        # TODO: restart
        # TODO: set boot state to node's actually boot state.
        # could be 'boot' or 'safeboot'
        self.c.modules.os.chdir('/tmp/source')
        if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
            print "Running MANUAL FSCK already... try again soon."
        else:
            print "Running MANUAL fsck on %s" % self.node
            cmd = "( touch /tmp/BM_RUNNING ;  " + \
                  "  fsck -v -f -y /dev/planetlab/root &> out.fsck ; " + \
                  "  fsck -v -f -y /dev/planetlab/vservers >> out.fsck 2>&1 ; " + \
                  "  python ./BootManager.py %s &> server.log < /dev/null ; " + \
                  "  rm -f /tmp/BM_RUNNING " + \
                  ") &" 
            # Substitute the node's current PLC boot_state into the
            # BootManager invocation.
            cmd = cmd % self.get_nodestate()
            self.c.modules.os.system(cmd)
        #self.restart_bootmanager('boot')       
        pass

    def compare_and_repair_nodekeys(self):
        """Compare the node's on-disk NODE_KEY with PLC's record and, if
        they differ, push the node's key up to PLC via UpdateNode.

        Returns True when the keys match or the update succeeds, False
        when the update fails, and None (implicitly) when the node
        configuration could not be read at all.
        """
        c = self.c
        self.c.modules.sys.path.append("/tmp/source/")
        self.c.modules.os.chdir('/tmp/source')

        log = c.modules.BootManager.log('/tmp/new.log')
        bm = c.modules.BootManager.BootManager(log,'boot')

        BootManagerException = c.modules.Exceptions.BootManagerException
        InitializeBootManager = c.modules.BootManager.InitializeBootManager
        ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
        bm_continue = True

        plcnode = plccache.GetNodeByName(self.node)

        InitializeBootManager.Run(bm.VARS, bm.LOG)
        try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
        except Exception, x:
            bm_continue = False
            print "exception"
            print x
            print "   Possibly, unable to find valid configuration file"

        if bm_continue:
            print "   NODE: %s" % bm.VARS['NODE_KEY']
            print "   PLC : %s" % plcnode['key']

            if bm.VARS['NODE_KEY'] == plcnode['key']:
                return True
            else:
                if api.UpdateNode(self.node, {'key': bm.VARS['NODE_KEY']}):
                    print "   Successfully updated NODE_KEY with PLC"
                    return True
                else:
                    return False

            #for key in bm.VARS.keys():
            #       print key, " == ", bm.VARS[key]
        else:
            print "   Unable to retrieve NODE_KEY"

    def bootmanager_running(self):
        """Return True when the /tmp/BM_RUNNING lock file exists on the node."""
        if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
            return True
        else:
            return False

    def set_nodestate(self, state='boot'):
        # Record the desired boot_state for this node in PLC.
        return api.UpdateNode(self.node, {'boot_state' : state})

    def get_nodestate(self):
        """Return the node's boot_state from PLC, falling back to the last
        cached Findbad record when the API call fails."""
        try:
            return api.GetNodes(self.node, ['boot_state'])[0]['boot_state']
        except:
            traceback.print_exc()
            # NOTE: use last cached value from plc
            fbnode = FindbadNodeRecord.get_latest_by(hostname=self.node).to_dict()
            return fbnode['plc_node_stats']['boot_state']


    def restart_node(self, state='boot'):
        """Set the node's next boot_state in PLC, then reboot it.

        The first attempt within a 24h window (tracked by the
        'gentlekill' persist flag) kills all slice processes and issues a
        clean 'shutdown -r'.  A repeat attempt within the window forces a
        reboot via the sysrq trigger (sync, unmount, boot).
        """
        api.UpdateNode(self.node, {'boot_state' : state})

        pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags')
        if not pflags.getRecentFlag('gentlekill'):
            print "   Killing all slice processes... : %s" %  self.node
            cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
            self.c.modules.os.system(cmd_slicekill)
            cmd = """ shutdown -r +1 & """
            print "   Restarting %s : %s" % ( self.node, cmd)
            self.c.modules.os.system(cmd)

            pflags.setRecentFlag('gentlekill')
            pflags.save()
        else:
            print "   Restarting with sysrq 'sub' %s" % self.node
            cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """
            self.c.modules.os.system(cmd)

        return

    def restart_bootmanager(self, forceState):
        """Re-run BootManager on the node in the given state (e.g. 'boot'
        or 'reinstall'), unless one is already running.

        The run happens in the background on the node; /tmp/BM_RUNNING is
        the lock file for the duration of the run.
        """
        self.c.modules.os.chdir('/tmp/source')
        if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
            print "   BootManager is already running: try again soon..."
        else:
            print "   Starting 'BootManager.py %s' on %s " % (forceState, self.node)
            cmd = "( touch /tmp/BM_RUNNING ;  " + \
                  "  python ./BootManager.py %s &> server.log < /dev/null ; " + \
                  "  rm -f /tmp/BM_RUNNING " + \
                  ") &" 
            cmd = cmd % forceState
            self.c.modules.os.system(cmd)

        return 
250
251
class PlanetLabSession:
    """Sets up a remote Rpyc server on a node plus an ssh tunnel to it.

    setup_host() rsyncs the Rpyc sources to the node, (re)starts a
    forking Rpyc server there, and opens a local ssh port-forward to it;
    get_connection() then builds a NodeConnection over that tunnel.
    """
    # Each session claims its own local forward port; start at a random
    # offset so concurrent monitor runs are unlikely to collide.
    globalport = 22000 + int(random.random()*1000)

    def __init__(self, node, nosetup, verbose):
        # nosetup: skip rsync/server/tunnel setup (assume it already exists).
        self.verbose = verbose
        self.node = node
        self.port = None
        self.nosetup = nosetup
        self.command = None
        self.setup_host()

    def get_connection(self, config):
        """Return a NodeConnection over the local tunnel port."""
        try:
            conn = NodeConnection(SocketConnection("localhost", self.port), self.node, config)
        except:
            # NOTE: try twice since this can sometimes fail the first time. If
            #               it fails again, let it go.
            conn = NodeConnection(SocketConnection("localhost", self.port), self.node, config)
        return conn

    def setup_host(self):
        """Copy Rpyc to the node, restart its server, and open the tunnel.

        Raises ExceptionDoubleSSHError when the rsync fails even after
        refreshing the node's ssh host key, and a generic Exception when
        the ssh tunnel cannot be established.
        """
        # Claim a unique local port for this session's forward.
        self.port = PlanetLabSession.globalport
        PlanetLabSession.globalport = PlanetLabSession.globalport + 1

        args = {}
        args['port'] = self.port
        args['user'] = 'root'
        args['hostname'] = self.node
        args['monitordir'] = config.MONITOR_SCRIPT_ROOT
        ssh_port = 22

        if self.nosetup:
            print "Skipping setup"
            return 

        # COPY Rpyc files to host
        #cmd = "rsync -vvv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
        cmd = """rsync -vvv -az -e "ssh -o BatchMode=yes" %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc""" % args
        if self.verbose: print cmd
        print cmd
        # TODO: Add timeout
        timeout = 120
        localos = moncommands.CMD()

        ret = localos.system(cmd, timeout)
        print ret
        if ret != 0:
            # First failure is assumed to be a stale/unknown host key:
            # refresh it directly from the node and retry once.
            print "\tUNKNOWN SSH KEY FOR %s; making an exception" % self.node
            #print "MAKE EXPLICIT EXCEPTION FOR %s" % self.node
            k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k
            print "trying: ", cmd
            print [ "%s=%s" % (a, os.environ[a]) for a in filter(lambda x: 'SSH' in x, os.environ.keys()) ]
            ret = localos.system(cmd, timeout)
            print ret
            if ret != 0:
                print "\tFAILED TWICE"
                #email_exception("%s rsync failed twice" % self.node)
                raise ExceptionDoubleSSHError("Failed twice trying to login with updated ssh host key")

        t1 = time.time()
        # KILL any already running servers.
        ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port)
        (ov,ev) = ssh.run_noexcept2("""<<\EOF
            rm -f out.log
            echo "kill server" >> out.log
            ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ; 
            echo "export" >> out.log
            export PYTHONPATH=$HOME  ;
            echo "start server" >> out.log
            python Rpyc/Servers/forking_server.py &> server.log &
            echo "done" >> out.log
EOF""")
        print "setup rpyc server over ssh"
        print ssh.ret

        # TODO: Add timeout
        # This was tricky to make synchronous.  The combination of ssh-clients-4.7p1, 
        # and the following options seems to work well.
        cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \
              """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \
              """-o ConnectTimeout=120 """ + \
              """-n -N -L %(port)s:localhost:18812 """ + \
              """%(user)s@%(hostname)s"""
        cmd = cmd % args
        if self.verbose: print cmd
        print cmd
        self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
        # TODO: the read() here may block indefinitely.  Need a better
        # approach therefore, that includes a timeout.
        #ret = self.command.stdout.read(5)
        ret = moncommands.read_t(self.command.stdout, 5)

        t2 = time.time()
        if 'READY' in ret:
            # NOTE: There is still a slight race for machines that are slow...
            self.timeout = 2*(t2-t1)
            print "Sleeping for %s sec" % self.timeout
            time.sleep(self.timeout)
            return

        if self.command.returncode is not None:
            print "Failed to establish tunnel!"
            raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode))

        raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'")

    def __del__(self):
        # Tear the ssh tunnel down when the session object is collected.
        if self.command:
            if self.verbose: print "Killing SSH session %s" % self.port
            print "Killing SSH session %s" % self.port
            self.command.kill()
363
364         
def steps_to_list(steps, index=1):
    """Project one column out of a sequence of step tuples.

    By default returns each step's second element (index 1).
    """
    return [entry[index] for entry in steps]
367
def index_to_id(steps,index):
    """Return the id (first element) of steps[index], or the sentinel
    string "done" when index is past the end of the step list."""
    return steps[index][0] if index < len(steps) else "done"
373
374 class DebugInterface:
    def __init__(self, hostname):
        # hostname: node to debug; the ssh/Rpyc session is created lazily
        # by getConnection(), so it starts out as None.
        self.hostname = hostname
        self.session = None
378
    def getConnection(self):
        """Establish a NodeConnection to self.hostname.

        Refreshes the node's ssh host key, builds a PlanetLabSession, and
        retries once (after a longer sleep) when the Rpyc connect hits
        EOF.  Returns the NodeConnection, or False on any failure.
        """
        print "Creating session for %s" % self.hostname
        # update known_hosts file (in case the node has rebooted since last run)
        try:
            k = SSHKnownHosts(); k.update(self.hostname); k.write(); del k
        except:
            email_exception()
            # NOTE(review): traceback.print_exc() returns None, so this
            # statement prints "None" after the traceback itself.
            print traceback.print_exc()
            return False

        msg = "ERROR setting up session for %s" % self.hostname
        try:
            # NOTE(review): 'config' is the module-level monitor.config;
            # the comparison would be 'is None' in modern style.
            if config == None:
                self.session = PlanetLabSession(self.hostname, False, True)
            else:
                self.session = PlanetLabSession(self.hostname, config.nosetup, config.verbose)
        except ExceptionDoubleSSHError, e:
            # Login failed even after the host-key refresh; already noisy
            # inside setup_host(), so just report and bail.
            print msg
            return False
        except Exception, e:
            traceback.print_exc()
            email_exception(msg)
            return False

        try:
            conn = self.session.get_connection(config)
        except EOFError:
            # NOTE: sometimes the wait in setup_host() is not long enough.  
            # So, here we try to wait a little longer before giving up entirely.
            try:
                time.sleep(self.session.timeout*5)
                conn = self.session.get_connection(config)
            except EOFError:
                # failed twice... no need to report this really, it's just in a
                # weird state...
                return False
            except:
                traceback.print_exc()
                email_exception(self.hostname)
                return False
        #print "trying to use conn before returning it."
        #print conn.c.modules.sys.path
        #print conn.c.modules.os.path.exists('/tmp/source')
        #time.sleep(1)

        #print "conn: %s" % conn
        return conn
426
427         def getSequences(self):
428
429                 # TODO: This can be replaced with a DB definition at a future time.
430                 #               This would make it possible for an admin to introduce new
431                 #               patterns without touching code.
432                 
433                 sequences = {}
434                 # restart_bootmanager_boot
435                 for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
436                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
437                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",
438
439                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done",
440
441                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
442                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
443                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-protoerror-debug-validate-done",
444                                 "bminit-cfg-auth-protoerror-exception-update-debug-validate-exception-done",
445                                 "bminit-cfg-auth-getplc-update-debug-done",
446                                 "bminit-cfg-auth-protoerror2-debug-done",
447                                 "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
448                                 "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
449                                 "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
450                                 "bminit-cfg-auth-protoerror-exception-update-debug-done",
451                                 "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
452                                 "bminit-cfg-auth-getplc-implementerror-update-debug-done",
453                                 "bminit-cfg-auth-authfail2-protoerror2-debug-done",
454                                 ]:
455                         sequences.update({n : "restart_bootmanager_boot"})
456
457                 #       conn.restart_bootmanager('reinstall')
458                 for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
459                                 "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
460                                 "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
461                                 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
462                                 "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
463                                 "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done",
464                                 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
465                                 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
466                                 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
467                                 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
468                                 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
469                                 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
470                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
471                                 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done",
472                                 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
473                                 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
474                                 "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
475                                 # actual solution appears to involve removing the bad files, and
476                                 # continually trying to boot the node.
477                                 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
478                                 "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
479                                 "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
480                                 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done",
481                                 "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-validate-done",
482                                 "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done",
483                                 "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-debug-validate-bmexceptvgscan-done",
484                                 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-debug-validate-bmexceptvgscan-done",
485                                 ]:
486                         sequences.update({n : "restart_bootmanager_rins"})
487
488                 # repair_node_keys
489                 for n in ["bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-validate-exception-done",
490                                         "bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done",
491                                         "bminit-cfg-auth-bootcheckfail-authfail-exception-update-debug-validate-exception-done",
492                                         "bminit-cfg-auth-bootcheckfail-authfail-exception-authfail-debug-validate-exception-done",
493                                         "bminit-cfg-auth-authfail-debug-done",
494                                         "bminit-cfg-auth-authfail2-authfail-debug-done",
495                                 ]:
496                         sequences.update({n: "repair_node_keys"})
497
498                 #   conn.restart_node('reinstall')
499                 for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
500                                 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
501                                 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
502                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done",
503                                 "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
504                                 "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
505                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done",
506                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done",
507                                 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
508                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
509                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
510                                 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
511                                 ]:
512                         sequences.update({n : "restart_node_rins"})
513
514                 #       restart_node_boot
515                 for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done",
516                                  "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
517                                  "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
518                                  "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
519                                  "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
520                                  "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
521                                  "bminit-cfg-auth-getplc-exception-update-bootupdatefail-debug-done",
522                                  ]:
523                         sequences.update({n: "restart_node_boot"})
524
525                 # fsck_repair
526                 for n in ["bminit-cfg-auth-getplc-update-installinit-validate-fsckabort-exception-fsckfail-bmexceptmount-exception-noinstall-update-debug-validate-fsckabort-exception-fsckfail-bmexceptmount-done",
527                                   "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-noinstall-update-debug-validate-exception-fsckfail-done",
528                                   "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail-exception-noinstall-update-debug-validate-exception-fsckfail-done",
529                                   "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail2-exception-noinstall-update-debug-validate-exception-fsckfail2-done",
530                                   "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail2-exception-debug-validate-done",
531                                   "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail2-exception-debug-validate-exception-fsckfail2-done",
532                                   "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail2-exception-debug-validate-exception-fsckfail-done",
533                                   "bminit-cfg-auth-getplc-update-installinit-validate-fsckabort-exception-fsckfail-exception-debug-validate-fsckabort-exception-fsckfail-done",
534                                   "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail2-exception-debug-validate-exception-fsckfail2-done",
535                                   "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-debug-validate-exception-fsckfail2-done",
536                                   "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-debug-validate-exception-fsckfail-done",
537                                   "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-debug-validate-done",
538                                   "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail-exception-debug-validate-exception-fsckfail-done",
539                                   "bminit-cfg-auth-getplc-update-debug-validate-exception-fsckfail-done",
540                                 ]:
541                         sequences.update({n : "fsck_repair"})
542
543                 # nodeconfig_notice
544                 for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
545                                   "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
546                                   "bminit-cfg-exception-update-bootupdatefail-nonode-debug-validate-exception-done",
547                                   "bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-validate-exception-done",
548                                   "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
549                                   "bminit-cfg-exception-noconfig-nonode-debug-validate-exception-done",
550                                   "bminit-cfg-exception-noconfig-update-debug-validate-exception-done",
551                                 ]:
552                         sequences.update({n : "nodeconfig_notice"})
553
554                 for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", 
555                                    "bminit-cfg-update-exception-nodehostname-update-debug-validate-exception-done",
556                                    "bminit-cfg-update-exception-nodehostname-update-debug-done", 
557                                    "bminit-cfg-exception-nodehostname-debug-validate-exception-done",
558                                 ]:
559                         sequences.update({n : "nodenetwork_email"})
560
561                 # noblockdevice_notice
562                 for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
563                                 "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-validate-bmexceptvgscan-done",
564                                 "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
565                                 "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
566                                 "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
567                                 "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-debug-validate-bmexceptvgscan-done",
568                                 "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-debug-validate-bmexceptvgscan-done",
569                                 ]:
570                         sequences.update({n : "noblockdevice_notice"})
571
572                 # update_bootcd_email
573                 for n in [ "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
574                                 ]:
575                         sequences.update({n : "update_bootcd_email"})
576
577                 for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
578                                 ]:
579                         sequences.update({n: "unknownsequence_notice"})
580
581                 # minimalhardware_notice
582                 sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "minimalhardware_notice"})
583                 sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "minimalhardware_notice"})
584
585                 # baddisk_notice
586                 sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "baddisk_notice"})
587
588                 # baddns_notice
589                 for n in [ 
590                  "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
591                         "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
592                         ]:
593                         sequences.update( { n : "baddns_notice"})
594
595                 return sequences
596
597         def getDiskSteps(self):
598                 steps = [
599                         ('scsierror'  , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
600                         ('ioerror'    , 'end_request: I/O error, dev sd\w+, sector \d+'),
601                         ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION'),
602
603                         ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
604
605                         ('hdaseekerror', 'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'),
606                         ('hdacorrecterror', 'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'),
607
608                         ('atareadyerror'   , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
609                         ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),
610
611                         ('sdXerror'   , 'sd\w: Current: sense key: Medium Error'),
612                         ('ext3error'   , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),
613
614                         ('floppytimeout','floppy0: floppy timeout called'),
615                         ('floppyerror',  'end_request: I/O error, dev fd\w+, sector \d+'),
616
617                         # hda: dma_intr: status=0x51 { DriveReady SeekComplete Error }
618                         # hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263
619
620                         # floppy0: floppy timeout called
621                         # end_request: I/O error, dev fd0, sector 0
622
623                         # Buffer I/O error on device dm-2, logical block 8888896
624                         # ata1: status=0x51 { DriveReady SeekComplete Error }
625                         # ata1: error=0x40 { UncorrectableError }
626                         # SCSI error : <0 0 0 0> return code = 0x8000002
627                         # sda: Current: sense key: Medium Error
628                         #       Additional sense: Unrecovered read error - auto reallocate failed
629
630                         # SCSI error : <0 2 0 0> return code = 0x40001
631                         # end_request: I/O error, dev sda, sector 572489600
632                 ]
633                 return steps
634
635         def getDiskSequence(self, steps, child):
636                 sequence = []
637                 while True:
638                         id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
639                         sequence.append(id)
640
641                         if id == "done":
642                                 break
643                 return sequence
644
645         def getBootManagerStepPatterns(self):
646                 steps = [
647                         ('bminit'               , 'Initializing the BootManager.'),
648                         ('cfg'                  , 'Reading node configuration file.'),
649                         ('auth'                 , 'Authenticating node with PLC.'),
650                         ('getplc'               , 'Retrieving details of node from PLC.'),
651                         ('update'               , 'Updating node boot state at PLC.'),
652                         ('hardware'             , 'Checking if hardware requirements met.'),
653                         ('installinit'  , 'Install: Initializing.'),
654                         ('installdisk'  , 'Install: partitioning disks.'),
655                         ('installbootfs', 'Install: bootstrapfs tarball.'),
656                         ('installcfg'   , 'Install: Writing configuration files.'),
657                         ('installstop'  , 'Install: Shutting down installer.'),
658                         ('update2'              , 'Updating node boot state at PLC.'),
659                         ('installinit2' , 'Install: Initializing.'),
660                         ('validate'             , 'Validating node installation.'),
661                         ('rebuildinitrd', 'Rebuilding initrd'),
662                         ('netcfg'               , 'Install: Writing Network Configuration files.'),
663                         ('update3'              , 'Updating node configuration.'),
664                         ('disk'                 , 'Checking for unused disks to add to LVM.'),
665                         ('update4'              , 'Sending hardware configuration to PLC.'),
666                         ('debug'                , 'Starting debug mode'),
667                         ('bmexceptmount', 'BootManagerException during mount'),
668                         ('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'),
669                         ('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
670                         ('exception'    , 'Exception'),
671                         ('nocfg'        , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
672                         ('protoerror2'  , '500 Internal Server Error'),
673                         ('protoerror'   , 'XML RPC protocol error'),
674                         ('nodehostname' , 'Configured node hostname does not resolve'),
675                         ('implementerror', 'Implementation Error'),
676                         ('fsckabort'    , 'is mounted.  e2fsck: Cannot continue, aborting'),
677                         ('fsckfail'             , 'Running e2fsck -v -p /dev/planetlab/root failed'),
678                         ('fsckfail2'    , 'Running e2fsck -v -p /dev/planetlab/vservers failed'),
679                         ('readonlyfs'   , '\[Errno 30\] Read-only file system'),
680                         ('baddisk'      , "IOError: \[Errno 13\] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"),
681                         ('noinstall'    , 'notinstalled'),
682                         ('bziperror'    , 'bzip2: Data integrity error when decompressing.'),
683                         ('noblockdev'   , "No block devices detected."),
684                         ('dnserror'     , 'Name or service not known'),
685                         ('noconfig'             , "Unable to find and read a node configuration file"),
686                         ('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
687                         ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
688                         ('hardwarerequirefail' , 'Hardware requirements not met'),
689                         ('mkfsfail'         , 'while running: Running mkfs.ext2 -q  -m 0 -j /dev/planetlab/vservers failed'),
690                         ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
691                         ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
692                         ('chrootfail'   , 'Running chroot /tmp/mnt/sysimg'),
693                         ('modulefail'   , 'Unable to get list of system modules'),
694                         ('writeerror'   , 'write error: No space left on device'),
695                         ('nospace'      , "No space left on device"),
696                         ('nonode'       , 'Failed to authenticate call: No such node'),
697                         ('authfail'     , 'Failed to authenticate call: Call could not be authenticated'),
698                         ('authfail2'    , 'Authentication Failed'),
699                         ('bootcheckfail'  , 'BootCheckAuthentication'),
700                         ('bootupdatefail' , 'BootUpdateNode'),
701                 ]
702                 return steps
703
704         def getBootManagerSequenceFromLog(self, steps, child):
705                 sequence = []
706                 while True:
707                         
708                         index = child.expect( steps_to_list(steps) + [ pexpect.EOF ])
709                         id = index_to_id(steps,index)
710                         sequence.append(id)
711
712                         if id == "exception":
713                                 print "...Found An Exception!!!"
714                         elif id == "done": #index == len(steps_to_list(steps)):
715                                 #print "Reached EOF"
716                                 break
717
718                 return sequence
719                 
def restore(sitehist, hostname, config=None, forced_action=None):
	# Public entry point: run the recovery logic for one node, then flush
	# the ORM session so any records created along the way are persisted.
	# Returns whatever restore_basic() returns (an action label or "").
	ret = restore_basic(sitehist, hostname, config, forced_action)
	session.flush()
	return ret
724
def restore_basic(sitehist, hostname, config=None, forced_action=None):
	"""Try to recover *hostname* from the 'debug' boot state.

	Downloads dmesg and the BootManager log from the node, reduces each
	to a sequence of known step ids, and dispatches on the bm.log
	sequence (via getSequences()) to a repair action -- restart
	BootManager, reinstall, fsck, repair keys -- or an owner
	notification.

	Returns a short label describing the action taken, "" when no new
	action record should be added, or a status string such as
	"skipped", "disabled", "connect_failed", "collect" or "exception".

	NOTE(review): forced_action is accepted but never read here --
	confirm whether any caller still relies on it.
	"""

	# NOTE: Nothing works if the bootcd is REALLY old.
	#       So, this is the first step.

	bootman_action = "unknown"

	fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
	recent_actions = sitehist.getRecentActions(hostname=hostname)

	if fbnode['observed_category'] == "OLDBOOTCD":
		print "\t...Notify owner to update BootImage!!!"

		# Rate-limit the notice: only send if none went out in ~3.5 days.
		if not found_within(recent_actions, 'newbootcd_notice', 3.5):
			sitehist.sendMessage('newbootcd_notice', hostname=hostname)

			print "\tDisabling %s due to out-of-date BootImage" % hostname
			api.UpdateNode(hostname, {'boot_state' : 'disabled'})

		# NOTE: nothing else is possible.
		return "disabled"

	debugnode = DebugInterface(hostname)
	conn = debugnode.getConnection()
	# getConnection() signals failure with a bool; bail out if so.
	if type(conn) == type(False): return "connect_failed"

	boot_state = conn.get_boot_state()
	if boot_state != "debug":
		print "... %s in %s state: skipping..." % (hostname , boot_state)
		return "skipped" #boot_state == "boot"

	if conn.bootmanager_running():
		print "...BootManager is currently running.  Skipping host %s" %hostname
		return "skipped" # True

	# Read persistent flags, tagged on one week intervals.

	if config and not config.quiet: print "...downloading dmesg from %s" %hostname
	dmesg = conn.get_dmesg()
	child = fdpexpect.fdspawn(dmesg)

	# First pass: scan dmesg for disk-hardware error signatures.
	steps = debugnode.getDiskSteps()
	sequence = debugnode.getDiskSequence(steps, child)

	s = Set(sequence)
	if config and not config.quiet: print "\tSET: ", s

	# More than just the terminal id means at least one error pattern hit.
	if len(s) > 1:
		print "...Potential drive errors on %s" % hostname
		if len(s) == 2 and 'floppyerror' in s:
			# Floppy errors alone are considered harmless; keep going.
			print "...Should investigate.  Continuing with node."
		else:
			print "...Should investigate.  Skipping node."
			# TODO: send message related to these errors.

			if not found_within(recent_actions, 'baddisk_notice', 7):
				print "baddisk_notice not found recently"

				log=conn.get_dmesg().read()
				sitehist.sendMessage('baddisk_notice', hostname=hostname, log=log)
				return "skipping_baddisk"
			else:
				# NOTE: "" does not add a new action record
				return ""


	print "...Downloading bm.log from %s" %hostname
	log = conn.get_bootmanager_log()
	bm_log_data = log.read() # get data
	log.seek(0)	# reset fd pointer for fdspawn
	child = fdpexpect.fdspawn(log)

	# --collect mode: stop after fetching the logs, take no action.
	if hasattr(config, 'collect') and config.collect: return "collect"

	if config and not config.quiet: print "...Scanning bm.log for errors"

	time.sleep(1)

	# Second pass: reduce bm.log to a "-"-joined sequence of step ids.
	steps = debugnode.getBootManagerStepPatterns()
	sequence = debugnode.getBootManagerSequenceFromLog(steps, child)

	s = "-".join(sequence)
	print "   FOUND SEQUENCE: ", s

	# NOTE: We get or set the flag based on the current sequence identifier.
	#  By using the sequence identifier, we guarantee that there will be no
	#  frequent loops.  I'm guessing there is a better way to track loops,
	#  though.

	sequences = debugnode.getSequences()
	flag_set = True

	if s not in sequences:
		# Unrecognized sequence: report it (so a mapping can be added)
		# and fall back to restarting BootManager in 'boot' mode.
		print "   HOST %s" % hostname
		print "   UNKNOWN SEQUENCE: %s" % s

		args = {}
		args['hostname'] = hostname
		args['sequence'] = s
		args['bmlog'] = bm_log_data
		args['viart'] = False
		args['saveact'] = True
		args['ccemail'] = True

		sitehist.sendMessage('unknownsequence_notice', **args)

		conn.restart_bootmanager('boot')

		bootman_action = "restart_bootmanager"

		# NOTE: Do not set the pflags value for this sequence if it's unknown.
		# This way, we can check it again after we've fixed it.
		flag_set = False

	else:
		bootman_action = sequences[s]

		# Dispatch on the action name chosen for this known sequence.
		if   sequences[s] == "restart_bootmanager_boot":
			print "...Restarting BootManager.py on %s "%hostname
			conn.restart_bootmanager('boot')
		elif sequences[s] == "restart_bootmanager_rins":
			print "...Restarting BootManager.py on %s "%hostname
			conn.restart_bootmanager('reinstall')
		elif sequences[s] == "restart_node_rins":
			conn.restart_node('reinstall')
		elif sequences[s] == "restart_node_boot":
			conn.restart_node('boot')
		elif sequences[s] == "fsck_repair":
			conn.fsck_repair_node()
		elif sequences[s] == "repair_node_keys":
			if conn.compare_and_repair_nodekeys():
				# the keys either are in sync or were forced in sync.
				# so try to start BM again.
				conn.restart_bootmanager(conn.get_nodestate())
			else:
				# there was some failure to synchronize the keys.
				print "...Unable to repair node keys on %s" %hostname
				if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
					args = {}
					args['hostname'] = hostname
					sitehist.sendMessage('nodeconfig_notice', **args)
					conn.dump_plconf_file()
				else:
					# NOTE: do not add a new action record
					return ""

		elif sequences[s] == "unknownsequence_notice":
			args = {}
			args['hostname'] = hostname
			args['sequence'] = s
			args['bmlog'] = bm_log_data
			args['viart'] = False
			args['saveact'] = True
			args['ccemail'] = True

			sitehist.sendMessage('unknownsequence_notice', **args)
			conn.restart_bootmanager('boot')

		elif sequences[s] == "nodeconfig_notice":

			if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
				args = {}
				args['hostname'] = hostname
				sitehist.sendMessage('nodeconfig_notice', **args)
				conn.dump_plconf_file()
			else:
				# NOTE: do not add a new action record
				return ""

		elif sequences[s] == "nodenetwork_email":

			if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
				args = {}
				args['hostname'] = hostname
				args['bmlog'] = bm_log_data
				sitehist.sendMessage('nodeconfig_notice', **args)
				conn.dump_plconf_file()
			else:
				# NOTE: do not add a new action record
				return ""

		elif sequences[s] == "noblockdevice_notice":

			if not found_within(recent_actions, 'noblockdevice_notice', 3.5):
				args = {}
				#args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
				args['hostname'] = hostname

				sitehist.sendMessage('noblockdevice_notice', **args)
			else:
				# NOTE: do not add a new action record
				return ""

		elif sequences[s] == "baddisk_notice":
			# MAKE An ACTION record that this host has failed hardware.  May
			# require either an exception "/minhw" or other manual intervention.
			# Definitely need to send out some more EMAIL.
			# TODO: email notice of broken hardware
			if not found_within(recent_actions, 'baddisk_notice', 7):
				print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
				args = {}
				args['hostname'] = hostname
				args['log'] = conn.get_dmesg().read()

				sitehist.sendMessage('baddisk_notice', **args)
				#conn.set_nodestate('disabled')
			else:
				# NOTE: do not add a new action record
				return ""

		elif sequences[s] == "minimalhardware_notice":
			if not found_within(recent_actions, 'minimalhardware_notice', 7):
				print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
				args = {}
				args['hostname'] = hostname
				args['bmlog'] = bm_log_data
				sitehist.sendMessage('minimalhardware_notice', **args)
			else:
				# NOTE: do not add a new action record
				return ""

		elif sequences[s] == "baddns_notice":
			if not found_within(recent_actions, 'baddns_notice', 1):
				print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
				args = {}
				try:
					node = plccache.GetNodeByName(hostname)
					net = api.GetInterfaces(node['interface_ids'])[0]
				except:
					email_exception()
					print traceback.print_exc()
					# TODO: api error. skip email, b/c all info is not available,
					# flag_set will not be recorded.
					return "exception"
				nodenet_str = network_config_to_str(net)

				args['hostname'] = hostname
				args['network_config'] = nodenet_str
				args['interface_id'] = net['interface_id']

				sitehist.sendMessage('baddns_notice', **args)
			else:
				# NOTE: do not add a new action record
				return ""

	return bootman_action
971         
972
973 # MAIN -------------------------------------------------------------------
974
def main():
	"""Command-line driver: parse options and run restore() on each node.

	Nodes come either from --nodelist (a file) or a single --node; with
	neither, usage is printed and the process exits non-zero.
	"""
	from monitor import parser as parsermodule
	parser = parsermodule.getParser()

	# BUGFIX: nonet was missing from the defaults, so config.nonet would
	# raise when the flag was not given.
	parser.set_defaults(child=False, collect=False, nosetup=False, verbose=False,
						force=None, quiet=False, nonet=False)
	parser.add_option("", "--child", dest="child", action="store_true",
						help="This is the child mode of this process.")
	parser.add_option("", "--force", dest="force", metavar="boot_state",
						help="Force a boot state passed to BootManager.py.")
	parser.add_option("", "--quiet", dest="quiet", action="store_true",
						help="Extra quiet output messages.")
	parser.add_option("", "--verbose", dest="verbose", action="store_true",
						help="Extra debug output messages.")
	parser.add_option("", "--nonet", dest="nonet", action="store_true",
						help="Do not setup the network, use existing log files to re-run a test pass.")
	parser.add_option("", "--collect", dest="collect", action="store_true",
						help="No action, just collect dmesg, and bm.log")
	parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
						help="Do not perform the orginary setup phase.")

	parser = parsermodule.getParser(['nodesets', 'defaults'], parser)
	config = parsermodule.parse_args(parser)

	# Build the node list: from a file, a single node, or show usage.
	if config.nodelist:
		nodes = config.getListFromFile(config.nodelist)
	elif config.node:
		nodes = [ config.node ]
	else:
		parser.print_help()
		sys.exit(1)

	for node in nodes:
		# Map the hostname to its site so notices reach the right owners.
		lb = plccache.plcdb_hn2lb[node]
		sitehist = SiteInterface.get_or_make(loginbase=lb)
		#reboot(node, config)
		# BUGFIX: pass the parsed options through to restore(); the old
		# code passed config=None, forced_action=None, so --quiet,
		# --collect and --force were silently ignored.
		restore(sitehist, node, config=config, forced_action=config.force)
1013
# Script entry point: run only when executed directly, not on import.
if __name__ == "__main__":
	main()