# moved nodequery common code to monitor/query.py
# [monitor.git] / monitor / bootman.py
1 #!/usr/bin/python
2
3 # Attempt to reboot a node in debug state.
4
5 import os
6 import sys
7 import time
8 import random
9 import signal
10 import traceback
11 import subprocess
12 from sets import Set
13
14 from monitor.getsshkeys import SSHKnownHosts
15 from monitor.Rpyc import SocketConnection, Async
16 from monitor.Rpyc.Utils import *
17
18 from monitor import getconf
19 from monitor import config
20 from monitor import const
21 from monitor.model import *
22 from monitor.common import email_exception, found_within
23 from monitor.database.info.model import *
24 from monitor.database.info.interface import *
25 from monitor.wrapper import plc
26 from monitor.wrapper import plccache
27 from monitor.wrapper.emailTxt import mailtxt
28 from monitor.nodeconfig import network_config_to_str
29
30 from pcucontrol.util import command as moncommands
31 from pcucontrol.util.command import Sopen
32 from pcucontrol.transports.ssh import pxssh as pxssh
33 from pcucontrol.transports.ssh import fdpexpect as fdpexpect
34 from pcucontrol.transports.ssh import pexpect as pexpect
35
# Authenticated handle to the PLC XML-RPC API, shared by all helpers below.
api = plc.getAuthAPI()
# Findbad handle; never assigned in this file's visible code — presumably
# set by a caller or legacy leftover (TODO confirm before removing).
fb = None
38
def bootmanager_log_name(hostname):
    """Build the history-relative file name for a bootmanager log.

    The name embeds a minute-resolution timestamp so successive runs for
    the same host archive separately, e.g. 'history/2009-01-02-03:04-bm.<host>.log'.
    """
    stamp = time.strftime("%Y-%m-%d-%H:%M")
    return os.path.join('history', "%s-bm.%s.log" % (stamp, hostname))
44
def bootmanager_log_action(hostname, short_log_path, logtype="bm.log"):
    """Record an ActionRecord noting that a bootmanager log was archived.

    Looks up the node's site loginbase; when that lookup fails for any
    reason, falls back to 'unknown' and stores the traceback in the record.
    """
    try:
        node = FindbadNodeRecord.get_latest_by(hostname=hostname)
        site = PlcSite.query.get(node.plc_node_stats['site_id'])
        loginbase = site.plc_site_stats['login_base']
        err = ""
    except:
        loginbase = "unknown"
        err = traceback.format_exc()

    # The ActionRecord constructor persists the entry as a side effect.
    ActionRecord(loginbase=loginbase,
                 hostname=hostname,
                 action='log',
                 action_type=logtype,
                 log_path=short_log_path,
                 error_string=err)
    return
61         
62
class ExceptionDoubleSSHError(Exception):
    """Raised when an ssh/rsync login fails twice, even after the node's
    known_hosts entry has been refreshed."""
    pass
64
class NodeConnection:
    """Management operations on one node over an established Rpyc connection.

    `self.c.modules.<mod>` is an Rpyc proxy: attribute access imports and
    runs the named module ON THE NODE, so e.g. self.c.modules.os.system()
    executes remotely while plain os.system() runs locally on the monitor.
    """

    def __init__(self, connection, node, config):
        self.node = node        # hostname of the node
        self.c = connection     # Rpyc SocketConnection to the node
        self.config = config

    def get_boot_state(self):
        """Classify the node by probing remote filesystem markers.

        Returns 'debug' when the bootmanager source is unpacked in
        /tmp/source, 'boot' when /vservers exists, else 'unknown'
        (including on any error).
        """
        try:
            if self.c.modules.os.path.exists('/tmp/source'):
                return "debug"
            elif self.c.modules.os.path.exists('/vservers'): 
                return "boot"
            else:
                return "unknown"
        except EOFError:
            # Rpyc stream dropped mid-call; dump remote sys.path for debugging.
            traceback.print_exc()
            print self.c.modules.sys.path
        except:
            email_exception()
            traceback.print_exc()

        return "unknown"

    def get_dmesg(self):
        """Capture the node's dmesg into the local bootmanager log area.

        Keeps a timestamped copy under history/ plus a 'latest' copy, and
        returns an open read handle on the latter.  `download` presumably
        comes from the Rpyc.Utils star import — TODO confirm.
        """
        t_stamp = time.strftime("%Y-%m-%d-%H:%M")
        # remote: write dmesg to a file we can pull back
        self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
        download(self.c, "/var/log/dmesg.bm.log", "%s/history/%s-dmesg.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node))
        # local: refresh the un-timestamped 'latest' copy
        os.system("cp %s/history/%s-dmesg.%s.log %s/dmesg.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node, config.MONITOR_BOOTMANAGER_LOG, self.node))
        log = open("%s/dmesg.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, self.node), 'r')
        return log

    def get_bootmanager_log(self):
        """Download /tmp/bm.log from the node, archive it, record the action.

        Returns an open read handle on the local 'latest' copy.
        """
        bm_name = bootmanager_log_name(self.node)
        download(self.c, "/tmp/bm.log", "%s/%s" % (config.MONITOR_BOOTMANAGER_LOG, bm_name))
        #email_exception(self.node, "collected BM log for %s" % self.node)
        bootmanager_log_action(self.node, bm_name, "collected_bm.log")
        # refresh the un-timestamped 'latest' copy alongside the archive
        os.system("cp %s/%s %s/bm.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, bm_name, config.MONITOR_BOOTMANAGER_LOG, self.node))
        log = open("%s/bm.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, self.node), 'r')
        return log

    def dump_plconf_file(self):
        """Run BootManager's config-read steps remotely and print its VARS.

        Purely diagnostic: shows what node configuration the bootmanager
        would see, or an error when no valid config file can be read.
        """
        c = self.c
        # make the unpacked bootmanager importable on the node
        self.c.modules.sys.path.append("/tmp/source/")
        self.c.modules.os.chdir('/tmp/source')

        log = c.modules.BootManager.log('/tmp/new.log')
        bm = c.modules.BootManager.BootManager(log,'boot')

        BootManagerException = c.modules.Exceptions.BootManagerException
        InitializeBootManager = c.modules.BootManager.InitializeBootManager
        ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
        bm_continue = True

        InitializeBootManager.Run(bm.VARS, bm.LOG)
        try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
        except Exception, x:
            bm_continue = False
            print "   ERROR:", x
            print "   Possibly, unable to find valid configuration file"

        if bm_continue:
            for key in bm.VARS.keys():
                print key, " == ", bm.VARS[key]
        else:
            print "   Unable to read Node Configuration"
        
    def fsck_repair_node(self):
        """Kick off a background fsck of root+vservers LVs, then BootManager.

        Skips (with a message) when /tmp/BM_RUNNING shows a bootmanager run
        is already in progress; otherwise the whole pipeline runs detached
        on the node.
        """
        c = self.c
        self.c.modules.sys.path.append("/tmp/source/")
        self.c.modules.os.chdir('/tmp/source')
        # TODO: restart
        # TODO: set boot state to node's actually boot state.
        # could be 'boot' or 'safeboot'
        self.c.modules.os.chdir('/tmp/source')
        if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
            print "Running MANUAL FSCK already... try again soon."
        else:
            print "Running MANUAL fsck on %s" % self.node
            # BM_RUNNING acts as the lock file for the whole background job
            cmd = "( touch /tmp/BM_RUNNING ;  " + \
                  "  fsck -v -f -y /dev/planetlab/root &> out.fsck ; " + \
                  "  fsck -v -f -y /dev/planetlab/vservers >> out.fsck 2>&1 ; " + \
                  "  python ./BootManager.py %s &> server.log < /dev/null ; " + \
                  "  rm -f /tmp/BM_RUNNING " + \
                  ") &" 
            # the single %s is the boot state BootManager should aim for
            cmd = cmd % self.get_nodestate()
            self.c.modules.os.system(cmd)
        #self.restart_bootmanager('boot')       
        pass

    def compare_and_repair_nodekeys(self):
        """Compare the node's on-disk NODE_KEY against PLC's record.

        When they differ, push the node's key up to PLC via UpdateNode.
        Returns True when the keys match or the update succeeds, False when
        the update fails, and None when the node config cannot be read.
        """
        c = self.c
        self.c.modules.sys.path.append("/tmp/source/")
        self.c.modules.os.chdir('/tmp/source')

        log = c.modules.BootManager.log('/tmp/new.log')
        bm = c.modules.BootManager.BootManager(log,'boot')

        BootManagerException = c.modules.Exceptions.BootManagerException
        InitializeBootManager = c.modules.BootManager.InitializeBootManager
        ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
        bm_continue = True

        plcnode = plccache.GetNodeByName(self.node)

        InitializeBootManager.Run(bm.VARS, bm.LOG)
        try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
        except Exception, x:
            bm_continue = False
            print "exception"
            print x
            print "   Possibly, unable to find valid configuration file"

        if bm_continue:
            print "   NODE: %s" % bm.VARS['NODE_KEY']
            print "   PLC : %s" % plcnode['key']

            if bm.VARS['NODE_KEY'] == plcnode['key']:
                return True
            else:
                if api.UpdateNode(self.node, {'key': bm.VARS['NODE_KEY']}):
                    print "   Successfully updated NODE_KEY with PLC"
                    return True
                else:
                    return False
                
            #for key in bm.VARS.keys():
            #   print key, " == ", bm.VARS[key]
        else:
            print "   Unable to retrieve NODE_KEY"

    def bootmanager_running(self):
        """True when the node's /tmp/BM_RUNNING lock file exists."""
        if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
            return True
        else:
            return False

    def set_nodestate(self, state='boot'):
        """Set the node's boot_state at PLC; returns the API call's result."""
        return api.UpdateNode(self.node, {'boot_state' : state})

    def get_nodestate(self):
        """Return the node's boot_state from PLC, falling back to the cache."""
        try:
            return api.GetNodes(self.node, ['boot_state'])[0]['boot_state']
        except:
            traceback.print_exc()
            # NOTE: use last cached value from plc
            fbnode = FindbadNodeRecord.get_latest_by(hostname=self.node).to_dict()
            return fbnode['plc_node_stats']['boot_state']


    def restart_node(self, state='boot'):
        """Reboot the node into *state*, gently first, forcibly on repeat.

        First attempt within 24h ('gentlekill' flag unset): kill all slice
        processes and schedule a normal shutdown -r.  Subsequent attempts:
        force a sysrq sync/umount/reboot ('s','u','b').
        """
        api.UpdateNode(self.node, {'boot_state' : state})

        # 24h persist flag decides gentle vs. forced restart
        pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags')
        if not pflags.getRecentFlag('gentlekill'):
            print "   Killing all slice processes... : %s" %  self.node
            cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
            self.c.modules.os.system(cmd_slicekill)
            cmd = """ shutdown -r +1 & """
            print "   Restarting %s : %s" % ( self.node, cmd)
            self.c.modules.os.system(cmd)

            pflags.setRecentFlag('gentlekill')
            pflags.save()
        else:
            print "   Restarting with sysrq 'sub' %s" % self.node
            cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """
            self.c.modules.os.system(cmd)

        return

    def restart_bootmanager(self, forceState):
        """Launch BootManager.py on the node in the background with *forceState*.

        Skips (with a message) when /tmp/BM_RUNNING shows a run is already
        in progress.
        """
        self.c.modules.os.chdir('/tmp/source')
        if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
            print "   BootManager is already running: try again soon..."
        else:
            print "   Starting 'BootManager.py %s' on %s " % (forceState, self.node)
            # BM_RUNNING is the lock file; removed when the run finishes
            cmd = "( touch /tmp/BM_RUNNING ;  " + \
                  "  python ./BootManager.py %s &> server.log < /dev/null ; " + \
                  "  rm -f /tmp/BM_RUNNING " + \
                  ") &" 
            cmd = cmd % forceState
            self.c.modules.os.system(cmd)

        return 
250
251
class PlanetLabSession:
    """Manage one node's Rpyc session: rsync the Rpyc code to the node,
    start a forking server there, and open a local ssh port-forward to it.
    """

    # Next local port for the ssh tunnel; bumped per session so concurrent
    # sessions don't collide. Randomized base avoids clashes across runs.
    globalport = 22000 + int(random.random()*1000)

    def __init__(self, node, nosetup, verbose):
        self.verbose = verbose
        self.node = node
        self.port = None          # local tunnel port, assigned in setup_host()
        self.nosetup = nosetup    # when True, skip rsync/server startup
        self.command = None       # Sopen handle for the ssh tunnel process
        self.setup_host()

    def get_connection(self, config):
        """Connect through the local tunnel and wrap it in a NodeConnection."""
        try:
            conn = NodeConnection(SocketConnection("localhost", self.port), self.node, config)
        except:
            # NOTE: try twice since this can sometimes fail the first time. If
            #               it fails again, let it go.
            conn = NodeConnection(SocketConnection("localhost", self.port), self.node, config)
        return conn
    
    def setup_host(self):
        """Prepare the node and open the tunnel.

        Steps: claim a local port; rsync Rpyc to the node (retrying once
        after refreshing the host key — raises ExceptionDoubleSSHError on a
        second failure); kill any stale Rpyc server and start a fresh one;
        open an ssh -L tunnel and wait for its 'READY' LocalCommand marker.
        Raises a plain Exception when the tunnel cannot be established.
        """
        self.port = PlanetLabSession.globalport
        PlanetLabSession.globalport = PlanetLabSession.globalport + 1

        args = {}
        args['port'] = self.port
        args['user'] = 'root'
        args['hostname'] = self.node
        args['monitordir'] = config.MONITOR_SCRIPT_ROOT
        ssh_port = 22

        if self.nosetup:
            print "Skipping setup"
            return 

        # COPY Rpyc files to host
        #cmd = "rsync -vvv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
        cmd = """rsync -vvv -az -e "ssh -o BatchMode=yes" %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc""" % args
        if self.verbose: print cmd
        print cmd
        # TODO: Add timeout
        timeout = 120
        localos = moncommands.CMD()

        ret = localos.system(cmd, timeout)
        print ret
        if ret != 0:
            # first failure is assumed to be a stale host key: refresh and retry
            print "\tUNKNOWN SSH KEY FOR %s; making an exception" % self.node
            #print "MAKE EXPLICIT EXCEPTION FOR %s" % self.node
            k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k
            print "trying: ", cmd
            print [ "%s=%s" % (a, os.environ[a]) for a in filter(lambda x: 'SSH' in x, os.environ.keys()) ]
            ret = localos.system(cmd, timeout)
            print ret
            if ret != 0:
                print "\tFAILED TWICE"
                #email_exception("%s rsync failed twice" % self.node)
                raise ExceptionDoubleSSHError("Failed twice trying to login with updated ssh host key")

        t1 = time.time()
        # KILL any already running servers.
        ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port)
        (ov,ev) = ssh.run_noexcept2("""<<\EOF
            rm -f out.log
            echo "kill server" >> out.log
            ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ; 
            echo "export" >> out.log
            export PYTHONPATH=$HOME  ;
            echo "start server" >> out.log
            python Rpyc/Servers/forking_server.py &> server.log &
            echo "done" >> out.log
EOF""")
        print "setup rpyc server over ssh"
        print ssh.ret

        # TODO: Add timeout
        # This was tricky to make synchronous.  The combination of ssh-clients-4.7p1, 
        # and the following options seems to work well.
        # LocalCommand prints READY on our side once the forward is up.
        cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \
              """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \
              """-o ConnectTimeout=120 """ + \
              """-n -N -L %(port)s:localhost:18812 """ + \
              """%(user)s@%(hostname)s"""
        cmd = cmd % args
        if self.verbose: print cmd
        print cmd
        self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
        # TODO: the read() here may block indefinitely.  Need a better
        # approach therefore, that includes a timeout.
        #ret = self.command.stdout.read(5)
        ret = moncommands.read_t(self.command.stdout, 5)

        t2 = time.time()
        if 'READY' in ret:
            # NOTE: There is still a slight race for machines that are slow...
            # give the remote server twice the setup time to come up
            self.timeout = 2*(t2-t1)
            print "Sleeping for %s sec" % self.timeout
            time.sleep(self.timeout)
            return

        if self.command.returncode is not None:
            print "Failed to establish tunnel!"
            raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode))

        raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'")

    def __del__(self):
        # Best-effort teardown of the ssh tunnel when the session is collected.
        if self.command:
            if self.verbose: print "Killing SSH session %s" % self.port
            print "Killing SSH session %s" % self.port
            self.command.kill()
363
364         
def steps_to_list(steps, index=1):
    """Project column *index* out of each step tuple (default: column 1)."""
    return [entry[index] for entry in steps]
367
def index_to_id(steps, index):
    """Return the id (element 0) of steps[index], or "done" past the end."""
    return steps[index][0] if index < len(steps) else "done"
373
374 class DebugInterface:
375         def __init__(self, hostname):
376                 self.hostname = hostname
377                 self.session = None
378
379         def getConnection(self):
380                 print "Creating session for %s" % self.hostname
381                 # update known_hosts file (in case the node has rebooted since last run)
382                 try:
383                         k = SSHKnownHosts(); k.update(self.hostname); k.write(); del k
384                 except:
385                         email_exception()
386                         print traceback.print_exc()
387                         return False
388
389                 msg = "ERROR setting up session for %s" % self.hostname
390                 try:
391                         if config == None:
392                                 self.session = PlanetLabSession(self.hostname, False, True)
393                         else:
394                                 self.session = PlanetLabSession(self.hostname, config.nosetup, config.verbose)
395                 except ExceptionDoubleSSHError, e:
396                         print msg
397                         return False
398                 except Exception, e:
399                         traceback.print_exc()
400                         email_exception(msg)
401                         return False
402
403                 try:
404                         conn = self.session.get_connection(config)
405                 except EOFError:
406                         # NOTE: sometimes the wait in setup_host() is not long enough.  
407                         # So, here we try to wait a little longer before giving up entirely.
408                         try:
409                                 time.sleep(self.session.timeout*5)
410                                 conn = self.session.get_connection(config)
411                         except EOFError:
412                                 # failed twice... no need to report this really, it's just in a
413                                 # weird state...
414                                 return False
415                         except:
416                                 traceback.print_exc()
417                                 email_exception(self.hostname)
418                                 return False
419                 #print "trying to use conn before returning it."
420                 #print conn.c.modules.sys.path
421                 #print conn.c.modules.os.path.exists('/tmp/source')
422                 #time.sleep(1)
423
424                 #print "conn: %s" % conn
425                 return conn
426
427         def getSequences(self):
428
429                 # TODO: This can be replaced with a DB definition at a future time.
430                 #               This would make it possible for an admin to introduce new
431                 #               patterns without touching code.
432                 
433                 sequences = {}
434                 # restart_bootmanager_boot
435                 for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
436                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
437                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",
438
439                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done",
440
441                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
442                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
443                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-protoerror-debug-validate-done",
444                                 "bminit-cfg-auth-protoerror-exception-update-debug-validate-exception-done",
445                                 "bminit-cfg-auth-getplc-update-debug-done",
446                                 "bminit-cfg-auth-protoerror2-debug-done",
447                                 "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
448                                 "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
449                                 "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
450                                 "bminit-cfg-auth-protoerror-exception-update-debug-done",
451                                 "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
452                                 "bminit-cfg-auth-getplc-implementerror-update-debug-done",
453                                 "bminit-cfg-auth-authfail2-protoerror2-debug-done",
454                                 ]:
455                         sequences.update({n : "restart_bootmanager_boot"})
456
457                 #       conn.restart_bootmanager('reinstall')
458                 for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
459                                 "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
460                                 "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
461                                 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
462                                 "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
463                                 "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done",
464                                 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
465                                 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
466                                 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
467                                 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
468                                 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
469                                 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
470                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
471                                 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done",
472                                 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
473                                 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
474                                 "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
475                                 # actual solution appears to involve removing the bad files, and
476                                 # continually trying to boot the node.
477                                 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
478                                 "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
479                                 "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
480                                 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done",
481                                 "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-validate-done",
482                                 "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done",
483                                 "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-debug-validate-bmexceptvgscan-done",
484                                 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-debug-validate-bmexceptvgscan-done",
485                                 ]:
486                         sequences.update({n : "restart_bootmanager_rins"})
487
488                 # repair_node_keys
489                 for n in ["bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-validate-exception-done",
490                                         "bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done",
491                                         "bminit-cfg-auth-bootcheckfail-authfail-exception-update-debug-validate-exception-done",
492                                         "bminit-cfg-auth-bootcheckfail-authfail-exception-authfail-debug-validate-exception-done",
493                                         "bminit-cfg-auth-authfail-debug-done",
494                                         "bminit-cfg-auth-authfail2-authfail-debug-done",
495                                 ]:
496                         sequences.update({n: "repair_node_keys"})
497
498                 #   conn.restart_node('reinstall')
499                 for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
500                                 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
501                                 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
502                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done",
503                                 "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
504                                 "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
505                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done",
506                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done",
507                                 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
508                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
509                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
510                                 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
511                                 ]:
512                         sequences.update({n : "restart_node_rins"})
513
514                 #       restart_node_boot
515                 for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done",
516                                  "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
517                                  "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
518                                  "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
519                                  "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
520                                  "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
521                                  "bminit-cfg-auth-getplc-exception-update-bootupdatefail-debug-done",
522                                  ]:
523                         sequences.update({n: "restart_node_boot"})
524
525                 # fsck_repair
526                 for n in ["bminit-cfg-auth-getplc-update-installinit-validate-fsckabort-exception-fsckfail-bmexceptmount-exception-noinstall-update-debug-validate-fsckabort-exception-fsckfail-bmexceptmount-done",
527                                   "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-noinstall-update-debug-validate-exception-fsckfail-done",
528                                   "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail-exception-noinstall-update-debug-validate-exception-fsckfail-done",
529                                   "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail2-exception-noinstall-update-debug-validate-exception-fsckfail2-done",
530                                   "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail2-exception-debug-validate-done",
531                                   "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail2-exception-debug-validate-exception-fsckfail2-done",
532                                   "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail2-exception-debug-validate-exception-fsckfail-done",
533                                   "bminit-cfg-auth-getplc-update-installinit-validate-fsckabort-exception-fsckfail-exception-debug-validate-fsckabort-exception-fsckfail-done",
534                                   "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail2-exception-debug-validate-exception-fsckfail2-done",
535                                   "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-debug-validate-exception-fsckfail2-done",
536                                   "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-debug-validate-exception-fsckfail-done",
537                                   "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-debug-validate-done",
538                                   "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail-exception-debug-validate-exception-fsckfail-done",
539                                   "bminit-cfg-auth-getplc-update-debug-validate-exception-fsckfail-done",
540                                 ]:
541                         sequences.update({n : "fsck_repair"})
542
543                 # nodeconfig_notice
544                 for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
545                                   "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
546                                   "bminit-cfg-exception-update-bootupdatefail-nonode-debug-validate-exception-done",
547                                   "bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-validate-exception-done",
548                                   "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
549                                   "bminit-cfg-exception-noconfig-nonode-debug-validate-exception-done",
550                                   "bminit-cfg-exception-noconfig-update-debug-validate-exception-done",
551                                 ]:
552                         sequences.update({n : "nodeconfig_notice"})
553
554                 for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", 
555                                    "bminit-cfg-update-exception-nodehostname-update-debug-validate-exception-done",
556                                    "bminit-cfg-update-exception-nodehostname-update-debug-done", 
557                                    "bminit-cfg-exception-nodehostname-debug-validate-exception-done",
558                                 ]:
559                         sequences.update({n : "nodenetwork_email"})
560
561                 # noblockdevice_notice
562                 for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
563                                 "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-validate-bmexceptvgscan-done",
564                                 "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
565                                 "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
566                                 "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
567                                 "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-debug-validate-bmexceptvgscan-done",
568                                 "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-debug-validate-bmexceptvgscan-done",
569                                 ]:
570                         sequences.update({n : "noblockdevice_notice"})
571
572                 # update_bootcd_email
573                 for n in [ "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
574                                 ]:
575                         sequences.update({n : "update_bootcd_email"})
576
577                 for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
578                                 ]:
579                         sequences.update({n: "unknownsequence_notice"})
580
581                 # minimalhardware_notice
582                 sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "minimalhardware_notice"})
583                 sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "minimalhardware_notice"})
584
585                 # baddisk_notice
586                 sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "baddisk_notice"})
587
588                 # baddns_notice
589                 for n in [ 
590                  "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
591                         "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
592                         ]:
593                         sequences.update( { n : "baddns_notice"})
594
595                 return sequences
596
597         def getDiskSteps(self):
598                 steps = [
599                         ('scsierror'  , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
600                         ('ioerror'    , 'end_request: I/O error, dev sd\w+, sector \d+'),
601                         ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION'),
602
603                         ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
604
605                         ('hdaseekerror', 'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'),
606                         ('hdacorrecterror', 'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'),
607
608                         ('atareadyerror'   , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
609                         ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),
610
611                         ('sdXerror'   , 'sd\w: Current: sense key: Medium Error'),
612                         ('ext3error'   , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),
613
614                         ('floppytimeout','floppy0: floppy timeout called'),
615                         ('floppyerror',  'end_request: I/O error, dev fd\w+, sector \d+'),
616
617                         # hda: dma_intr: status=0x51 { DriveReady SeekComplete Error }
618                         # hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263
619
620                         # floppy0: floppy timeout called
621                         # end_request: I/O error, dev fd0, sector 0
622
623                         # Buffer I/O error on device dm-2, logical block 8888896
624                         # ata1: status=0x51 { DriveReady SeekComplete Error }
625                         # ata1: error=0x40 { UncorrectableError }
626                         # SCSI error : <0 0 0 0> return code = 0x8000002
627                         # sda: Current: sense key: Medium Error
628                         #       Additional sense: Unrecovered read error - auto reallocate failed
629
630                         # SCSI error : <0 2 0 0> return code = 0x40001
631                         # end_request: I/O error, dev sda, sector 572489600
632                 ]
633                 return steps
634
635         def getDiskSequence(self, steps, child):
636                 sequence = []
637                 while True:
638                         id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
639                         sequence.append(id)
640
641                         if id == "done":
642                                 break
643                 return sequence
644
	def getBootManagerStepPatterns(self):
		"""Return the ordered (id, pattern) table used to tokenize bm.log.

		The ids are later joined with '-' to form the sequence key that is
		looked up in the known-sequence table (see restore_basic).

		NOTE(review): ordering appears significant -- pexpect.expect()
		returns the first pattern that matches next in the stream, so
		more specific entries (e.g. 'protoerror2') are listed before their
		generic counterparts ('protoerror'); keep new entries ordered
		accordingly.
		"""
		steps = [
			('bminit'		, 'Initializing the BootManager.'),
			('cfg'			, 'Reading node configuration file.'),
			('auth'			, 'Authenticating node with PLC.'),
			('getplc'		, 'Retrieving details of node from PLC.'),
			('update'		, 'Updating node boot state at PLC.'),
			('hardware'		, 'Checking if hardware requirements met.'),
			('installinit'	, 'Install: Initializing.'),
			('installdisk'	, 'Install: partitioning disks.'),
			('installbootfs', 'Install: bootstrapfs tarball.'),
			('installcfg'	, 'Install: Writing configuration files.'),
			('installstop'	, 'Install: Shutting down installer.'),
			('update2'		, 'Updating node boot state at PLC.'),
			('installinit2'	, 'Install: Initializing.'),
			('validate'		, 'Validating node installation.'),
			('rebuildinitrd', 'Rebuilding initrd'),
			('netcfg'		, 'Install: Writing Network Configuration files.'),
			('update3'		, 'Updating node configuration.'),
			('disk'			, 'Checking for unused disks to add to LVM.'),
			('update4'		, 'Sending hardware configuration to PLC.'),
			('debug'		, 'Starting debug mode'),
			('bmexceptmount', 'BootManagerException during mount'),
			('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'),
			('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
			('exception'	, 'Exception'),
			('nocfg'		, 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
			('protoerror2'	, '500 Internal Server Error'),
			('protoerror'	, 'XML RPC protocol error'),
			('nodehostname'	, 'Configured node hostname does not resolve'),
			('implementerror', 'Implementation Error'),
			('fsckabort'	, 'is mounted.  e2fsck: Cannot continue, aborting'),
			('fsckfail'		, 'Running e2fsck -v -p /dev/planetlab/root failed'),
			('fsckfail2'	, 'Running e2fsck -v -p /dev/planetlab/vservers failed'),
			('readonlyfs'	, '\[Errno 30\] Read-only file system'),
			('baddisk'	, "IOError: \[Errno 13\] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"),
			('noinstall'	, 'notinstalled'),
			('bziperror'	, 'bzip2: Data integrity error when decompressing.'),
			('noblockdev'	, "No block devices detected."),
			('dnserror'	, 'Name or service not known'),
			('noconfig'		, "Unable to find and read a node configuration file"),
			('downloadfail'	, 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
			('disktoosmall'	, 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
			('hardwarerequirefail'	, 'Hardware requirements not met'),
			('mkfsfail'	    , 'while running: Running mkfs.ext2 -q  -m 0 -j /dev/planetlab/vservers failed'),
			('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
			('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
			('chrootfail'	, 'Running chroot /tmp/mnt/sysimg'),
			('modulefail'	, 'Unable to get list of system modules'),
			('writeerror'	, 'write error: No space left on device'),
			('nospace'	, "No space left on device"),
			('nonode'	, 'Failed to authenticate call: No such node'),
			('authfail'	, 'Failed to authenticate call: Call could not be authenticated'),
			('authfail2'	, 'Authentication Failed'),
			('bootcheckfail'  , 'BootCheckAuthentication'),
			('bootupdatefail' , 'BootUpdateNode'),
		]
		return steps
703
704         def getBootManagerSequenceFromLog(self, steps, child):
705                 sequence = []
706                 while True:
707                         
708                         index = child.expect( steps_to_list(steps) + [ pexpect.EOF ])
709                         id = index_to_id(steps,index)
710                         sequence.append(id)
711
712                         if id == "exception":
713                                 print "...Found An Exception!!!"
714                         elif id == "done": #index == len(steps_to_list(steps)):
715                                 #print "Reached EOF"
716                                 break
717
718                 return sequence
719                 
def restore(sitehist, hostname, config=None, forced_action=None):
	"""Run restore_basic() for *hostname*, then flush the DB session so
	any records it created or updated are persisted.  Returns the action
	string produced by restore_basic()."""
	action = restore_basic(sitehist, hostname, config, forced_action)
	session.flush()
	return action
724
def restore_basic(sitehist, hostname, config=None, forced_action=None):
	"""Diagnose a node stuck in 'debug' state and take a recovery action.

	Workflow: check for an out-of-date BootImage, connect to the node,
	scan dmesg for disk errors, then tokenize the BootManager log into a
	dash-joined sequence key which is looked up in the known-sequence
	table from getSequences().  Depending on the match this restarts
	BootManager, reinstalls/reboots the node, repairs filesystems or
	keys, or emails the hosting site.

	Returns a short status string naming the action taken ("disabled",
	"skipped", "connect_failed", "collect", "exception", "" for a
	deliberate no-op when a notice was sent recently, or the action name
	from the sequence table).

	NOTE(review): *forced_action* is accepted but never read in this
	function -- confirm whether it is still needed by callers.
	"""

	# NOTE: Nothing works if the bootcd is REALLY old.
	#       So, this is the first step.

	bootman_action = "unknown"

	# Latest monitoring observation plus the recent action history, used
	# below to rate-limit repeat notifications via found_within().
	fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
	recent_actions = sitehist.getRecentActions(hostname=hostname)

	if fbnode['observed_category'] == "OLDBOOTCD":
		print "\t...Notify owner to update BootImage!!!"

		# Only notify (and disable) if we haven't done so in the last 3.5 days.
		if not found_within(recent_actions, 'newbootcd_notice', 3.5):
			sitehist.sendMessage('newbootcd_notice', hostname=hostname)

			print "\tDisabling %s due to out-of-date BootImage" % hostname
			api.UpdateNode(hostname, {'boot_state' : 'disabled'})

		# NOTE: nothing else is possible.
		return "disabled"

	debugnode = DebugInterface(hostname)
	conn = debugnode.getConnection()
	# getConnection() presumably returns False on failure -- TODO confirm.
	if type(conn) == type(False): return "connect_failed"

	boot_state = conn.get_boot_state()
	if boot_state != "debug":
		print "... %s in %s state: skipping..." % (hostname , boot_state)
		return "skipped" #boot_state == "boot"

	if conn.bootmanager_running():
		print "...BootManager is currently running.  Skipping host %s" %hostname
		return "skipped" # True

	# Read persistent flags, tagged on one week intervals.

	# --- disk-health check: scan dmesg for known disk error patterns ---
	if config and not config.quiet: print "...downloading dmesg from %s" %hostname
	dmesg = conn.get_dmesg()
	child = fdpexpect.fdspawn(dmesg)

	steps = debugnode.getDiskSteps()
	sequence = debugnode.getDiskSequence(steps, child)

	s = Set(sequence)
	if config and not config.quiet: print "\tSET: ", s

	# More than one distinct error id means likely drive trouble; a lone
	# floppy error alongside "done" is tolerated and we continue.
	if len(s) > 1:
		print "...Potential drive errors on %s" % hostname
		if len(s) == 2 and 'floppyerror' in s:
			print "...Should investigate.  Continuing with node."
		else:
			print "...Should investigate.  Skipping node."
			# TODO: send message related to these errors.

			if not found_within(recent_actions, 'baddisk_notice', 7):
				print "baddisk_notice not found recently"

				log=conn.get_dmesg().read()
				sitehist.sendMessage('baddisk_notice', hostname=hostname, log=log)
				#conn.set_nodestate('disabled')

			return "skipping_baddisk"

	# --- BootManager log analysis ---
	print "...Downloading bm.log from %s" %hostname
	log = conn.get_bootmanager_log()
	bm_log_data = log.read() # get data
	log.seek(0)	# reset fd pointer for fdspawn
	child = fdpexpect.fdspawn(log)

	# --collect mode: stop after fetching dmesg and bm.log, take no action.
	if hasattr(config, 'collect') and config.collect: return "collect"

	if config and not config.quiet: print "...Scanning bm.log for errors"

	time.sleep(1)

	steps = debugnode.getBootManagerStepPatterns()
	sequence = debugnode.getBootManagerSequenceFromLog(steps, child)

	# The dash-joined id string is the key into the known-sequence table.
	s = "-".join(sequence)
	print "   FOUND SEQUENCE: ", s

	# NOTE: We get or set the flag based on the current sequence identifier.
	#  By using the sequence identifier, we guarantee that there will be no
	#  frequent loops.  I'm guessing there is a better way to track loops,
	#  though.

	sequences = debugnode.getSequences()
	flag_set = True

	if s not in sequences:
		# Unrecognized sequence: notify operators with the full log, then
		# restart BootManager in 'boot' mode as a best-effort recovery.
		print "   HOST %s" % hostname
		print "   UNKNOWN SEQUENCE: %s" % s

		args = {}
		args['hostname'] = hostname
		args['sequence'] = s
		args['bmlog'] = bm_log_data
		args['viart'] = False
		args['saveact'] = True
		args['ccemail'] = True

		sitehist.sendMessage('unknownsequence_notice', **args)

		conn.restart_bootmanager('boot')

		bootman_action = "restart_bootmanager"

		# NOTE: Do not set the pflags value for this sequence if it's unknown.
		# This way, we can check it again after we've fixed it.
		flag_set = False

	else:
		# Known sequence: dispatch on the action name recorded in the table.
		bootman_action = sequences[s]

		if   sequences[s] == "restart_bootmanager_boot":
			print "...Restarting BootManager.py on %s "%hostname
			conn.restart_bootmanager('boot')
		elif sequences[s] == "restart_bootmanager_rins":
			print "...Restarting BootManager.py on %s "%hostname
			conn.restart_bootmanager('reinstall')
		elif sequences[s] == "restart_node_rins":
			conn.restart_node('reinstall')
		elif sequences[s] == "restart_node_boot":
			conn.restart_node('boot')
		elif sequences[s] == "fsck_repair":
			conn.fsck_repair_node()
		elif sequences[s] == "repair_node_keys":
			if conn.compare_and_repair_nodekeys():
				# the keys either are in sync or were forced in sync.
				# so try to start BM again.
				conn.restart_bootmanager(conn.get_nodestate())
			else:
				# there was some failure to synchronize the keys.
				print "...Unable to repair node keys on %s" %hostname
				if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
					args = {}
					args['hostname'] = hostname
					sitehist.sendMessage('nodeconfig_notice', **args)
					conn.dump_plconf_file()
				else:
					# NOTE: do not add a new action record
					return ""

		elif sequences[s] == "unknownsequence_notice":
			args = {}
			args['hostname'] = hostname
			args['sequence'] = s
			args['bmlog'] = bm_log_data
			args['viart'] = False
			args['saveact'] = True
			args['ccemail'] = True

			sitehist.sendMessage('unknownsequence_notice', **args)
			conn.restart_bootmanager('boot')

		elif sequences[s] == "nodeconfig_notice":

			# Rate-limited to one notice per 3.5 days.
			if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
				args = {}
				args['hostname'] = hostname
				sitehist.sendMessage('nodeconfig_notice', **args)
				conn.dump_plconf_file()
			else:
				# NOTE: do not add a new action record
				return ""

		elif sequences[s] == "nodenetwork_email":

			# Same notice type as above, but includes the bm.log contents.
			if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
				args = {}
				args['hostname'] = hostname
				args['bmlog'] = bm_log_data
				sitehist.sendMessage('nodeconfig_notice', **args)
				conn.dump_plconf_file()
			else:
				# NOTE: do not add a new action record
				return ""

		elif sequences[s] == "noblockdevice_notice":

			if not found_within(recent_actions, 'noblockdevice_notice', 3.5):
				args = {}
				#args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
				args['hostname'] = hostname

				sitehist.sendMessage('noblockdevice_notice', **args)
			else:
				# NOTE: do not add a new action record
				return ""

		elif sequences[s] == "baddisk_notice":
			# MAKE An ACTION record that this host has failed hardware.  May
			# require either an exception "/minhw" or other manual intervention.
			# Definitely need to send out some more EMAIL.
			# TODO: email notice of broken hardware
			if not found_within(recent_actions, 'baddisk_notice', 7):
				print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
				args = {}
				args['hostname'] = hostname
				args['log'] = conn.get_dmesg().read()

				sitehist.sendMessage('baddisk_notice', **args)
				#conn.set_nodestate('disabled')
			else:
				# NOTE: do not add a new action record
				return ""

		elif sequences[s] == "minimalhardware_notice":
			if not found_within(recent_actions, 'minimalhardware_notice', 7):
				print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
				args = {}
				args['hostname'] = hostname
				args['bmlog'] = bm_log_data
				sitehist.sendMessage('minimalhardware_notice', **args)
			else:
				# NOTE: do not add a new action record
				return ""

		elif sequences[s] == "baddns_notice":
			if not found_within(recent_actions, 'baddns_notice', 1):
				print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
				args = {}
				try:
					node = plccache.GetNodeByName(hostname)
					net = api.GetInterfaces(node['interface_ids'])[0]
				except:
					email_exception()
					print traceback.print_exc()
					# TODO: api error. skip email, b/c all info is not available,
					# flag_set will not be recorded.
					return "exception"
				nodenet_str = network_config_to_str(net)

				args['hostname'] = hostname
				args['network_config'] = nodenet_str
				args['interface_id'] = net['interface_id']

				sitehist.sendMessage('baddns_notice', **args)
			else:
				# NOTE: do not add a new action record
				return ""

	return bootman_action
969         
970
971 # MAIN -------------------------------------------------------------------
972
def main():
	"""Command-line entry point.

	Parses node-selection and behaviour options, then runs restore() on
	each requested node.  Exits with status 1 when no nodes are given.

	NOTE(review): --force is parsed into config.force but restore() is
	still called with forced_action=None; confirm whether the option is
	meant to be wired through.
	"""
	from monitor import parser as parsermodule
	parser = parsermodule.getParser()

	parser.set_defaults(child=False, collect=False, nosetup=False, verbose=False, 
						force=None, quiet=False)
	parser.add_option("", "--child", dest="child", action="store_true", 
						help="This is the child mode of this process.")
	parser.add_option("", "--force", dest="force", metavar="boot_state",
						help="Force a boot state passed to BootManager.py.")
	parser.add_option("", "--quiet", dest="quiet", action="store_true", 
						help="Extra quiet output messages.")
	parser.add_option("", "--verbose", dest="verbose", action="store_true", 
						help="Extra debug output messages.")
	parser.add_option("", "--nonet", dest="nonet", action="store_true", 
						help="Do not setup the network, use existing log files to re-run a test pass.")
	parser.add_option("", "--collect", dest="collect", action="store_true", 
						help="No action, just collect dmesg, and bm.log")
	parser.add_option("", "--nosetup", dest="nosetup", action="store_true", 
						help="Do not perform the ordinary setup phase.")

	parser = parsermodule.getParser(['nodesets', 'defaults'], parser)
	config = parsermodule.parse_args(parser)

	if config.nodelist:
		nodes = config.getListFromFile(config.nodelist)
	elif config.node:
		nodes = [ config.node ]
	else:
		parser.print_help()
		sys.exit(1)

	for node in nodes:
		# Look up the owning site so notices are sent to the right contacts.
		lb = plccache.plcdb_hn2lb[node]
		sitehist = SiteInterface.get_or_make(loginbase=lb)
		#reboot(node, config)
		# BUGFIX: was restore(..., config=None, ...), which discarded the
		# parsed options and silently made --quiet/--collect no-ops.
		restore(sitehist, node, config=config, forced_action=None)
1011
# Script entry point: only run when executed directly, not when imported.
if __name__ == "__main__":
	main()