add pcu_name to pcufailed_notice
[monitor.git] / monitor / bootman.py
1 #!/usr/bin/python
2
3 # Attempt to reboot a node in debug state.
4
5
6
7 import os
8 import sys
9 import time
10 import random
11 import signal
12 import traceback
13 import subprocess
14 from sets import Set
15
16 from monitor.getsshkeys import SSHKnownHosts
17
18 from monitor.Rpyc import SocketConnection, Async
19 from monitor.Rpyc.Utils import *
20
21 from monitor import getconf
22 from monitor import config
23 from monitor import const
24 from monitor.model import *
25 from monitor.common import email_exception, found_within
26 from monitor.database.info.model import *
27 from monitor.database.info.interface import *
28 from monitor.wrapper import plc
29 from monitor.wrapper import plccache
30 from monitor.wrapper.emailTxt import mailtxt
31 from monitor.nodeconfig import network_config_to_str
32
33 from pcucontrol.util import command as moncommands
34 from pcucontrol.util.command import Sopen
35 from pcucontrol.transports.ssh import pxssh as pxssh
36 from pcucontrol.transports.ssh import fdpexpect as fdpexpect
37 from pcucontrol.transports.ssh import pexpect as pexpect
38
39
40
# Authenticated handle to the PLC XML-RPC API; used throughout this module
# (e.g. NodeConnection.set_nodestate / get_nodestate call api.UpdateNode /
# api.GetNodes).
api = plc.getAuthAPI()
# NOTE(review): not referenced in this file's visible code — presumably a
# findbad handle assigned by external callers; confirm before removing.
fb = None
43
44
45 class ExceptionDoubleSSHError(Exception): pass
46
class NodeConnection:
    """Wrapper around an Rpyc connection to a single PlanetLab node.

    All remote work goes through ``self.c.modules.*``, which proxies
    module/attribute access onto the node.  Provides inspection helpers
    (boot state, dmesg, BootManager log, node configuration) and repair
    actions (fsck, node-key repair, restarting BootManager or the node).
    """

    def __init__(self, connection, node, config):
        # connection: an Rpyc SocketConnection to the node
        # node:       the node's hostname (used in log filenames and API calls)
        # config:     monitor configuration object
        self.node = node
        self.c = connection
        self.config = config

    def get_boot_state(self):
        """Classify the node's current state by probing its filesystem.

        Returns 'debug' when /tmp/source exists (BootManager debug
        environment), 'boot' when /vservers exists (production install),
        and 'unknown' otherwise or on any error.
        """
        try:
            if self.c.modules.os.path.exists('/tmp/source'):
                return "debug"
            elif self.c.modules.os.path.exists('/vservers'): 
                return "boot"
            else:
                return "unknown"
        except EOFError:
            # Remote connection dropped mid-call; dump the remote sys.path
            # for post-mortem debugging, then fall through to "unknown".
            traceback.print_exc()
            print self.c.modules.sys.path
        except:
            email_exception()
            traceback.print_exc()

        return "unknown"

    def get_dmesg(self):
        """Capture dmesg on the node, download a timestamped copy into
        history/, refresh the current copy, and return it as an open
        read-only file object."""
        t_stamp = time.strftime("%Y-%m-%d-%H:%M")
        self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
        download(self.c, "/var/log/dmesg.bm.log", "%s/history/%s-dmesg.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node))
        os.system("cp %s/history/%s-dmesg.%s.log %s/dmesg.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node, config.MONITOR_BOOTMANAGER_LOG, self.node))
        log = open("%s/dmesg.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, self.node), 'r')
        return log

    def get_bootmanager_log(self):
        """Download /tmp/bm.log from the node into a timestamped history/
        copy, refresh the current copy, and return it as an open
        read-only file object."""
        t_stamp = time.strftime("%Y-%m-%d-%H:%M")
        download(self.c, "/tmp/bm.log", "%s/history/%s-bm.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node))
        os.system("cp %s/history/%s-bm.%s.log %s/bm.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node, config.MONITOR_BOOTMANAGER_LOG, self.node))
        log = open("%s/bm.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, self.node), 'r')
        return log


    # Earlier versions of the two methods above, kept for reference:
#	def get_dmesg(self):
#		self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
#		download(self.c, "/var/log/dmesg.bm.log", "log/dmesg.%s.log" % self.node)
#		log = open("log/dmesg.%s.log" % self.node, 'r')
#		return log
#
#	def get_bootmanager_log(self):
#		download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node)
#		#os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
#		os.system("cp log/bm.%s.log.gz log/bm.%s.log" % (self.node, self.node))
#		log = open("log/bm.%s.log" % self.node, 'r')
#		return log

    def dump_plconf_file(self):
        """Run BootManager's InitializeBootManager and ReadNodeConfiguration
        steps remotely and print every resulting configuration variable
        (or an error message when the node's config file cannot be read)."""
        c = self.c
        self.c.modules.sys.path.append("/tmp/source/")
        self.c.modules.os.chdir('/tmp/source')

        log = c.modules.BootManager.log('/tmp/new.log')
        bm = c.modules.BootManager.BootManager(log,'boot')

        BootManagerException = c.modules.Exceptions.BootManagerException
        InitializeBootManager = c.modules.BootManager.InitializeBootManager
        ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
        bm_continue = True

        InitializeBootManager.Run(bm.VARS, bm.LOG)
        try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
        except Exception, x:
            bm_continue = False
            print "   ERROR:", x
            print "   Possibly, unable to find valid configuration file"

        if bm_continue:
            for key in bm.VARS.keys():
                print key, " == ", bm.VARS[key]
        else:
            print "   Unable to read Node Configuration"
        
    def fsck_repair_node(self):
        """Run a manual, forced fsck on the node's root and vservers
        volumes in the background, then re-run BootManager with the
        node's current PLC boot state.

        Uses /tmp/BM_RUNNING as a lock file; if it already exists the
        repair is skipped and the caller should retry later.
        """
        c = self.c
        self.c.modules.sys.path.append("/tmp/source/")
        self.c.modules.os.chdir('/tmp/source')
        # TODO: restart
        # TODO: set boot state to node's actually boot state.
        # could be 'boot' or 'safeboot'
        self.c.modules.os.chdir('/tmp/source')
        if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
            print "Running MANUAL FSCK already... try again soon."
        else:
            print "Running MANUAL fsck on %s" % self.node
            cmd = "( touch /tmp/BM_RUNNING ;  " + \
                  "  fsck -v -f -y /dev/planetlab/root &> out.fsck ; " + \
                  "  fsck -v -f -y /dev/planetlab/vservers >> out.fsck 2>&1 ; " + \
                  "  python ./BootManager.py %s &> server.log < /dev/null ; " + \
                  "  rm -f /tmp/BM_RUNNING " + \
                  ") &" 
            cmd = cmd % self.get_nodestate()
            self.c.modules.os.system(cmd)
        #self.restart_bootmanager('boot')	
        pass

    def compare_and_repair_nodekeys(self):
        """Compare the node's on-disk NODE_KEY with the key PLC has on
        record; if they differ, push the node's key to PLC.

        Returns True when the keys match or the update succeeds, False
        when the update fails, and None when the node's configuration
        could not be read at all.
        """
        c = self.c
        self.c.modules.sys.path.append("/tmp/source/")
        self.c.modules.os.chdir('/tmp/source')

        log = c.modules.BootManager.log('/tmp/new.log')
        bm = c.modules.BootManager.BootManager(log,'boot')

        BootManagerException = c.modules.Exceptions.BootManagerException
        InitializeBootManager = c.modules.BootManager.InitializeBootManager
        ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
        bm_continue = True

        plcnode = plccache.GetNodeByName(self.node)

        InitializeBootManager.Run(bm.VARS, bm.LOG)
        try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
        except Exception, x:
            bm_continue = False
            print "exception"
            print x
            print "   Possibly, unable to find valid configuration file"

        if bm_continue:
            print "   NODE: %s" % bm.VARS['NODE_KEY']
            print "   PLC : %s" % plcnode['key']

            if bm.VARS['NODE_KEY'] == plcnode['key']:
                return True
            else:
                if api.UpdateNode(self.node, {'key': bm.VARS['NODE_KEY']}):
                    print "   Successfully updated NODE_KEY with PLC"
                    return True
                else:
                    return False
                
            #for key in bm.VARS.keys():
            #	print key, " == ", bm.VARS[key]
        else:
            print "   Unable to retrieve NODE_KEY"

    def bootmanager_running(self):
        """Return True when the node's /tmp/BM_RUNNING lock file exists
        (i.e. BootManager is currently running there)."""
        if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
            return True
        else:
            return False

    def set_nodestate(self, state='boot'):
        """Set the node's boot_state at PLC; returns the API call's result."""
        return api.UpdateNode(self.node, {'boot_state' : state})

    def get_nodestate(self):
        """Return the node's boot_state from PLC, falling back to the
        last cached findbad record when the live API call fails."""
        try:
            return api.GetNodes(self.node, ['boot_state'])[0]['boot_state']
        except:
            traceback.print_exc()
            # NOTE: use last cached value from plc
            fbnode = FindbadNodeRecord.get_latest_by(hostname=self.node).to_dict()
            return fbnode['plc_node_stats']['boot_state']


    def restart_node(self, state='boot'):
        """Reboot the node into *state*.

        The first attempt within 24h (tracked via the 'gentlekill'
        persist-flag) kills all slice processes and issues a delayed
        'shutdown -r'; a repeat attempt within the window forces a
        sysrq sync/unmount/reboot instead.
        """
        api.UpdateNode(self.node, {'boot_state' : state})

        # 24-hour flag window so we escalate only on a repeated restart.
        pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags')
        if not pflags.getRecentFlag('gentlekill'):
            print "   Killing all slice processes... : %s" %  self.node
            cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
            self.c.modules.os.system(cmd_slicekill)
            cmd = """ shutdown -r +1 & """
            print "   Restarting %s : %s" % ( self.node, cmd)
            self.c.modules.os.system(cmd)

            pflags.setRecentFlag('gentlekill')
            pflags.save()
        else:
            print "   Restarting with sysrq 'sub' %s" % self.node
            cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """
            self.c.modules.os.system(cmd)

        return

    def restart_bootmanager(self, forceState):
        """Start BootManager on the node in the background with the given
        forced state, guarded by the /tmp/BM_RUNNING lock file; skips
        (with a message) when BootManager is already running."""

        self.c.modules.os.chdir('/tmp/source')
        if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
            print "   BootManager is already running: try again soon..."
        else:
            print "   Starting 'BootManager.py %s' on %s " % (forceState, self.node)
            cmd = "( touch /tmp/BM_RUNNING ;  " + \
                  "  python ./BootManager.py %s &> server.log < /dev/null ; " + \
                  "  rm -f /tmp/BM_RUNNING " + \
                  ") &" 
            cmd = cmd % forceState
            self.c.modules.os.system(cmd)

        return 
244
245
class PlanetLabSession:
    """Manage an ssh tunnel to a node with a remote Rpyc server behind it.

    setup_host() rsyncs the Rpyc package to the node, (re)starts a
    forking Rpyc server there, and opens an ssh port-forward from a
    locally chosen port to the remote server's port 18812.
    get_connection() then returns a NodeConnection over that tunnel.
    """

    # Shared counter of local forward ports; randomized at import time to
    # reduce collisions between concurrent monitor runs.  Each session
    # instance claims the next port in setup_host().
    globalport = 22000 + int(random.random()*1000)

    def __init__(self, node, nosetup, verbose):
        # node:    hostname of the target node
        # nosetup: when True, skip the rsync/server/tunnel setup steps
        # verbose: when True, echo the ssh/rsync commands being run
        self.verbose = verbose
        self.node = node
        self.port = None
        self.nosetup = nosetup
        self.command = None
        self.setup_host()

    def get_connection(self, config):
        """Return a NodeConnection speaking Rpyc through the local tunnel port."""
        conn = NodeConnection(SocketConnection("localhost", self.port), self.node, config)
        #i = 0
        #while i < 3: 
        #	print i, conn.c.modules.sys.path
        #	print conn.c.modules.os.path.exists('/tmp/source')
        #	i+=1
        #	time.sleep(1)
        return conn
    
    def setup_host(self):
        """Copy Rpyc to the node, restart the remote server, open the tunnel.

        Raises ExceptionDoubleSSHError when rsync fails twice (before and
        after refreshing the node's host key), and a generic Exception
        when the ssh tunnel cannot be established or never reports READY.
        """
        # Claim the next local forward port from the class-wide counter.
        self.port = PlanetLabSession.globalport
        PlanetLabSession.globalport = PlanetLabSession.globalport + 1

        args = {}
        args['port'] = self.port
        args['user'] = 'root'
        args['hostname'] = self.node
        args['monitordir'] = config.MONITOR_SCRIPT_ROOT
        ssh_port = 22

        if self.nosetup:
            print "Skipping setup"
            return 

        # COPY Rpyc files to host
        #cmd = "rsync -vvv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
        cmd = """rsync -vvv -az -e "ssh -o BatchMode=yes" %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc""" % args
        if self.verbose: print cmd
        print cmd
        # TODO: Add timeout
        timeout = 120
        localos = moncommands.CMD()

        ret = localos.system(cmd, timeout)
        print ret
        if ret != 0:
            # First attempt failed — most likely a stale host key after a
            # reinstall.  Refresh known_hosts and retry once.
            print "\tUNKNOWN SSH KEY FOR %s; making an exception" % self.node
            #print "MAKE EXPLICIT EXCEPTION FOR %s" % self.node
            k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k
            print "trying: ", cmd
            print [ "%s=%s" % (a, os.environ[a]) for a in filter(lambda x: 'SSH' in x, os.environ.keys()) ]
            ret = localos.system(cmd, timeout)
            print ret
            if ret != 0:
                print "\tFAILED TWICE"
                #email_exception("%s rsync failed twice" % self.node)
                raise ExceptionDoubleSSHError("Failed twice trying to login with updated ssh host key")

        t1 = time.time()
        # KILL any already running servers.
        ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port)
        (ov,ev) = ssh.run_noexcept2("""<<\EOF
            rm -f out.log
            echo "kill server" >> out.log
            ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ; 
            echo "export" >> out.log
            export PYTHONPATH=$HOME  ;
            echo "start server" >> out.log
            python Rpyc/Servers/forking_server.py &> server.log &
            echo "done" >> out.log
EOF""")
        print "setup rpyc server over ssh"
        print ssh.ret

        # TODO: Add timeout
        # This was tricky to make synchronous.  The combination of ssh-clients-4.7p1, 
        # and the following options seems to work well.
        cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \
              """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \
              """-o ConnectTimeout=120 """ + \
              """-n -N -L %(port)s:localhost:18812 """ + \
              """%(user)s@%(hostname)s"""
        cmd = cmd % args
        if self.verbose: print cmd
        print cmd
        self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
        # TODO: the read() here may block indefinitely.  Need a better
        # approach therefore, that includes a timeout.
        #ret = self.command.stdout.read(5)
        ret = moncommands.read_t(self.command.stdout, 5)

        t2 = time.time()
        if 'READY' in ret:
            # NOTE: There is still a slight race for machines that are slow...
            # Sleep for twice the time the setup took, as a heuristic grace period.
            self.timeout = 2*(t2-t1)
            print "Sleeping for %s sec" % self.timeout
            time.sleep(self.timeout)
            return

        if self.command.returncode is not None:
            print "Failed to establish tunnel!"
            raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode))

        raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'")

    def __del__(self):
        # Tear down the ssh tunnel process when the session is collected.
        if self.command:
            if self.verbose: print "Killing SSH session %s" % self.port
            print "Killing SSH session %s" % self.port
            self.command.kill()
358
359         
def steps_to_list(steps, index=1):
    """Project one column out of a list of step tuples.

    By default returns the second element (index 1) of each step.
    """
    return [step[index] for step in steps]
362
def index_to_id(steps, index):
    """Return the id (first element) of the step at *index*, or the
    sentinel string "done" once *index* has run past the end of *steps*."""
    return steps[index][0] if index < len(steps) else "done"
368
369 class DebugInterface:
    def __init__(self, hostname):
        # hostname: the node to debug; the PlanetLabSession is created
        # lazily by getConnection(), not here.
        self.hostname = hostname
        self.session = None
373
    def getConnection(self):
        """Establish a NodeConnection to self.hostname.

        Refreshes the node's known_hosts entry, builds a PlanetLabSession
        (ssh tunnel + remote Rpyc server), and returns a NodeConnection.
        Returns False on any failure.
        """
        print "Creating session for %s" % self.hostname
        # update known_hosts file (in case the node has rebooted since last run)
        try:
            k = SSHKnownHosts(); k.update(self.hostname); k.write(); del k
        except:
            email_exception()
            print traceback.print_exc()
            return False

        msg = "ERROR setting up session for %s" % self.hostname
        try:
            # `config` here is the module-level monitor.config import.
            if config == None:
                self.session = PlanetLabSession(self.hostname, False, True)
            else:
                self.session = PlanetLabSession(self.hostname, config.nosetup, config.verbose)
        except ExceptionDoubleSSHError, e:
            # Known failure mode (rsync failed twice even with a fresh
            # host key); no traceback email needed.
            print msg
            return False
        except Exception, e:
            traceback.print_exc()
            email_exception(msg)
            return False

        try:
            conn = self.session.get_connection(config)
        except EOFError:
            # NOTE: sometimes the wait in setup_host() is not long enough.  
            # So, here we try to wait a little longer before giving up entirely.
            try:
                time.sleep(self.session.timeout*5)
                conn = self.session.get_connection(config)
            except EOFError:
                # failed twice... no need to report this really, it's just in a
                # weird state...
                return False
            except:
                traceback.print_exc()
                email_exception(self.hostname)
                return False
        #print "trying to use conn before returning it."
        #print conn.c.modules.sys.path
        #print conn.c.modules.os.path.exists('/tmp/source')
        #time.sleep(1)

        #print "conn: %s" % conn
        return conn
421
422         def getSequences(self):
423
424                 # TODO: This can be replaced with a DB definition at a future time.
425                 #               This would make it possible for an admin to introduce new
426                 #               patterns without touching code.
427                 
428                 sequences = {}
429                 # restart_bootmanager_boot
430                 for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
431                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
432                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",
433
434                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done",
435
436                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
437                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
438                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-protoerror-debug-validate-done",
439                                 "bminit-cfg-auth-protoerror-exception-update-debug-validate-exception-done",
440                                 "bminit-cfg-auth-getplc-update-debug-done",
441                                 "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
442                                 "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
443                                 "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
444                                 "bminit-cfg-auth-protoerror-exception-update-debug-done",
445                                 "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
446                                 "bminit-cfg-auth-getplc-implementerror-update-debug-done",
447                                 ]:
448                         sequences.update({n : "restart_bootmanager_boot"})
449
450                 #       conn.restart_bootmanager('reinstall')
451                 for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
452                                 "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
453                                 "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
454                                 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
455                                 "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
456                                 "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done",
457                                 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
458                                 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
459                                 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
460                                 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
461                                 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
462                                 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
463                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
464                                 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done",
465                                 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
466                                 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
467                                 "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
468                                 # actual solution appears to involve removing the bad files, and
469                                 # continually trying to boot the node.
470                                 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
471                                 "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
472                                 "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
473                                 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done",
474                                 "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-validate-done",
475                                 "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done",
476                                 "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-debug-validate-bmexceptvgscan-done",
477                                 ]:
478                         sequences.update({n : "restart_bootmanager_rins"})
479
480                 # repair_node_keys
481                 for n in ["bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-validate-exception-done",
482                                         "bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done",
483                                         "bminit-cfg-auth-bootcheckfail-authfail-exception-update-debug-validate-exception-done",
484                                         "bminit-cfg-auth-bootcheckfail-authfail-exception-authfail-debug-validate-exception-done",
485                                 ]:
486                         sequences.update({n: "repair_node_keys"})
487
488                 #   conn.restart_node('reinstall')
489                 for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
490                                 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
491                                 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
492                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done",
493                                 "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
494                                 "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
495                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done",
496                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done",
497                                 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
498                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
499                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
500                                 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
501                                 ]:
502                         sequences.update({n : "restart_node_rins"})
503
504                 #       restart_node_boot
505                 for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done",
506                                  "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
507                                  "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
508                                  "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
509                                  "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
510                                  "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
511                                  "bminit-cfg-auth-getplc-exception-update-bootupdatefail-debug-done",
512                                  ]:
513                         sequences.update({n: "restart_node_boot"})
514
515                 # fsck_repair
516                 for n in ["bminit-cfg-auth-getplc-update-installinit-validate-fsckabort-exception-fsckfail-bmexceptmount-exception-noinstall-update-debug-validate-fsckabort-exception-fsckfail-bmexceptmount-done",
517                                   "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-noinstall-update-debug-validate-exception-fsckfail-done",
518                                   "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail-exception-noinstall-update-debug-validate-exception-fsckfail-done",
519                                   "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail2-exception-noinstall-update-debug-validate-exception-fsckfail2-done",
520                                   "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail2-exception-debug-validate-done",
521                                   "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail2-exception-debug-validate-exception-fsckfail2-done",
522                                   "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail2-exception-debug-validate-exception-fsckfail-done",
523                                   "bminit-cfg-auth-getplc-update-installinit-validate-fsckabort-exception-fsckfail-exception-debug-validate-fsckabort-exception-fsckfail-done",
524                                   "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail2-exception-debug-validate-exception-fsckfail2-done",
525                                   "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-debug-validate-exception-fsckfail2-done",
526                                   "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-debug-validate-exception-fsckfail-done",
527                                   "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-debug-validate-done",
528                                   "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail-exception-debug-validate-exception-fsckfail-done",
529                                   "bminit-cfg-auth-getplc-update-debug-validate-exception-fsckfail-done",
530                                 ]:
531                         sequences.update({n : "fsck_repair"})
532
533                 # nodeconfig_notice
534                 for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
535                                   "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
536                                   "bminit-cfg-exception-update-bootupdatefail-nonode-debug-validate-exception-done",
537                                   "bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-validate-exception-done",
538                                   "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
539                                   "bminit-cfg-exception-noconfig-nonode-debug-validate-exception-done",
540                                   "bminit-cfg-exception-noconfig-update-debug-validate-exception-done",
541                                 ]:
542                         sequences.update({n : "nodeconfig_notice"})
543
544                 for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", 
545                                    "bminit-cfg-update-exception-nodehostname-update-debug-validate-exception-done",
546                                    "bminit-cfg-update-exception-nodehostname-update-debug-done", 
547                                    "bminit-cfg-exception-nodehostname-debug-validate-exception-done",
548                                 ]:
549                         sequences.update({n : "nodenetwork_email"})
550
551                 # noblockdevice_notice
552                 for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
553                                 "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-validate-bmexceptvgscan-done",
554                                 "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
555                                 "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
556                                 "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
557                                 ]:
558                         sequences.update({n : "noblockdevice_notice"})
559
560                 # update_bootcd_email
561                 for n in [ "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
562                                 ]:
563                         sequences.update({n : "update_bootcd_email"})
564
565                 for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
566                                 ]:
567                         sequences.update({n: "unknownsequence_notice"})
568
569                 # minimalhardware_notice
570                 sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "minimalhardware_notice"})
571                 sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "minimalhardware_notice"})
572
573                 # baddisk_notice
574                 sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "baddisk_notice"})
575
576                 # baddns_notice
577                 for n in [ 
578                  "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
579                         "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
580                         ]:
581                         sequences.update( { n : "baddns_notice"})
582
583                 return sequences
584
585         def getDiskSteps(self):
586                 steps = [
587                         ('scsierror'  , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
588                         ('ioerror'    , 'end_request: I/O error, dev sd\w+, sector \d+'),
589                         ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION  byte \w+ = \w+'),
590
591                         ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
592
593                         ('hdaseekerror', 'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'),
594                         ('hdacorrecterror', 'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'),
595
596                         ('atareadyerror'   , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
597                         ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),
598
599                         ('sdXerror'   , 'sd\w: Current: sense key: Medium Error'),
600                         ('ext3error'   , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),
601
602                         ('floppytimeout','floppy0: floppy timeout called'),
603                         ('floppyerror',  'end_request: I/O error, dev fd\w+, sector \d+'),
604
605                         # hda: dma_intr: status=0x51 { DriveReady SeekComplete Error }
606                         # hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263
607
608                         # floppy0: floppy timeout called
609                         # end_request: I/O error, dev fd0, sector 0
610
611                         # Buffer I/O error on device dm-2, logical block 8888896
612                         # ata1: status=0x51 { DriveReady SeekComplete Error }
613                         # ata1: error=0x40 { UncorrectableError }
614                         # SCSI error : <0 0 0 0> return code = 0x8000002
615                         # sda: Current: sense key: Medium Error
616                         #       Additional sense: Unrecovered read error - auto reallocate failed
617
618                         # SCSI error : <0 2 0 0> return code = 0x40001
619                         # end_request: I/O error, dev sda, sector 572489600
620                 ]
621                 return steps
622
623         def getDiskSequence(self, steps, child):
624                 sequence = []
625                 while True:
626                         id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
627                         sequence.append(id)
628
629                         if id == "done":
630                                 break
631                 return sequence
632
633         def getBootManagerStepPatterns(self):
634                 steps = [
635                         ('bminit'               , 'Initializing the BootManager.'),
636                         ('cfg'                  , 'Reading node configuration file.'),
637                         ('auth'                 , 'Authenticating node with PLC.'),
638                         ('getplc'               , 'Retrieving details of node from PLC.'),
639                         ('update'               , 'Updating node boot state at PLC.'),
640                         ('hardware'             , 'Checking if hardware requirements met.'),
641                         ('installinit'  , 'Install: Initializing.'),
642                         ('installdisk'  , 'Install: partitioning disks.'),
643                         ('installbootfs', 'Install: bootstrapfs tarball.'),
644                         ('installcfg'   , 'Install: Writing configuration files.'),
645                         ('installstop'  , 'Install: Shutting down installer.'),
646                         ('update2'              , 'Updating node boot state at PLC.'),
647                         ('installinit2' , 'Install: Initializing.'),
648                         ('validate'             , 'Validating node installation.'),
649                         ('rebuildinitrd', 'Rebuilding initrd'),
650                         ('netcfg'               , 'Install: Writing Network Configuration files.'),
651                         ('update3'              , 'Updating node configuration.'),
652                         ('disk'                 , 'Checking for unused disks to add to LVM.'),
653                         ('update4'              , 'Sending hardware configuration to PLC.'),
654                         ('debug'                , 'Starting debug mode'),
655                         ('bmexceptmount', 'BootManagerException during mount'),
656                         ('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'),
657                         ('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
658                         ('exception'    , 'Exception'),
659                         ('nocfg'        , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
660                         ('protoerror'   , 'XML RPC protocol error'),
661                         ('nodehostname' , 'Configured node hostname does not resolve'),
662                         ('implementerror', 'Implementation Error'),
663                         ('fsckabort'    , 'is mounted.  e2fsck: Cannot continue, aborting'),
664                         ('fsckfail'             , 'Running e2fsck -v -p /dev/planetlab/root failed'),
665                         ('fsckfail2'    , 'Running e2fsck -v -p /dev/planetlab/vservers failed'),
666                         ('readonlyfs'   , '\[Errno 30\] Read-only file system'),
667                         ('baddisk'      , "IOError: \[Errno 13\] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"),
668                         ('noinstall'    , 'notinstalled'),
669                         ('bziperror'    , 'bzip2: Data integrity error when decompressing.'),
670                         ('noblockdev'   , "No block devices detected."),
671                         ('dnserror'     , 'Name or service not known'),
672                         ('noconfig'             , "Unable to find and read a node configuration file"),
673                         ('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
674                         ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
675                         ('hardwarerequirefail' , 'Hardware requirements not met'),
676                         ('mkfsfail'         , 'while running: Running mkfs.ext2 -q  -m 0 -j /dev/planetlab/vservers failed'),
677                         ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
678                         ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
679                         ('chrootfail'   , 'Running chroot /tmp/mnt/sysimg'),
680                         ('modulefail'   , 'Unable to get list of system modules'),
681                         ('writeerror'   , 'write error: No space left on device'),
682                         ('nospace'      , "No space left on device"),
683                         ('nonode'       , 'Failed to authenticate call: No such node'),
684                         ('authfail'     , 'Failed to authenticate call: Call could not be authenticated'),
685                         ('bootcheckfail'     , 'BootCheckAuthentication'),
686                         ('bootupdatefail'   , 'BootUpdateNode'),
687                 ]
688                 return steps
689
690         def getBootManagerSequenceFromLog(self, steps, child):
691                 sequence = []
692                 while True:
693                         
694                         index = child.expect( steps_to_list(steps) + [ pexpect.EOF ])
695                         id = index_to_id(steps,index)
696                         sequence.append(id)
697
698                         if id == "exception":
699                                 print "...Found An Exception!!!"
700                         elif id == "done": #index == len(steps_to_list(steps)):
701                                 #print "Reached EOF"
702                                 break
703
704                 return sequence
705                 
def restore(sitehist, hostname, config=None, forced_action=None):
	"""Run restore_basic() and flush DB session changes before returning."""
	result = restore_basic(sitehist, hostname, config, forced_action)
	session.flush()
	return result
710
def restore_basic(sitehist, hostname, config=None, forced_action=None):
	"""Diagnose a node stuck in 'debug' state and take a recovery action.

	Downloads dmesg and bm.log from the node, reduces each to a sequence
	of known step ids, then dispatches on the bm.log sequence: restart
	the BootManager, reinstall, fsck, repair keys, or send a notice to
	the site.  Returns a short string naming the action taken, e.g.
	"disabled", "skipped", "error", "collect", "skipping_baddisk",
	"restart_bootmanager", "exception", or a sequence-handler name.

	NOTE(review): forced_action is accepted but never referenced here.
	"""

	# NOTE: Nothing works if the bootcd is REALLY old.
	#       So, this is the first step.

	bootman_action = "unknown"

	fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
	recent_actions = sitehist.getRecentActions(hostname=hostname)

	# An out-of-date BootImage blocks everything else: notify the owner at
	# most once per 3.5 days, disable the node, and stop here.
	if fbnode['observed_category'] == "OLDBOOTCD":
		print "\t...Notify owner to update BootImage!!!"

		if not found_within(recent_actions, 'newbootcd_notice', 3.5):
			sitehist.sendMessage('newbootcd_notice', hostname=hostname)

			print "\tDisabling %s due to out-of-date BootImage" % hostname
			api.UpdateNode(hostname, {'boot_state' : 'disabled'})

		# NOTE: nothing else is possible.
		return "disabled"

	debugnode = DebugInterface(hostname)
	conn = debugnode.getConnection()
	# getConnection() evidently returns a bool on failure; bail out then.
	if type(conn) == type(False): return "error"

	boot_state = conn.get_boot_state()
	if boot_state != "debug":
		print "... %s in %s state: skipping..." % (hostname , boot_state)
		return "skipped" #boot_state == "boot"

	if conn.bootmanager_running():
		print "...BootManager is currently running.  Skipping host %s" %hostname 
		return "skipped" # True

	# Read persistent flags, tagged on one week intervals.

	# --- Pass 1: scan dmesg for disk-hardware error patterns. ---
	if config and not config.quiet: print "...downloading dmesg from %s" %hostname 
	dmesg = conn.get_dmesg()
	child = fdpexpect.fdspawn(dmesg)

	steps = debugnode.getDiskSteps()
	sequence = debugnode.getDiskSequence(steps, child)

	s = Set(sequence)
	if config and not config.quiet: print "\tSET: ", s

	# More than one distinct id means real disk errors beyond the
	# terminating "done"; tolerate the floppy-only case, otherwise notify
	# (throttled to once per 7 days) and skip the node.
	if len(s) > 1:
		print "...Potential drive errors on %s" % hostname 
		if len(s) == 2 and 'floppyerror' in s:
			print "...Should investigate.  Continuing with node."
		else:
			print "...Should investigate.  Skipping node."
			# TODO: send message related to these errors.

			if not found_within(recent_actions, 'baddisk_notice', 7):
				print "baddisk_notice not found recently"

				log=conn.get_dmesg().read()
				sitehist.sendMessage('baddisk_notice', hostname=hostname, log=log)
				#conn.set_nodestate('disabled')

			return "skipping_baddisk"

	# --- Pass 2: scan bm.log for BootManager step patterns. ---
	print "...Downloading bm.log from %s" %hostname 
	log = conn.get_bootmanager_log()
	child = fdpexpect.fdspawn(log)

	# --collect mode: the logs were fetched above; take no further action.
	if hasattr(config, 'collect') and config.collect: return "collect"

	if config and not config.quiet: print "...Scanning bm.log for errors"

	time.sleep(1)

	steps = debugnode.getBootManagerStepPatterns()
	sequence = debugnode.getBootManagerSequenceFromLog(steps, child)

	# The dash-joined sequence is the key into the known-sequence table.
	s = "-".join(sequence)
	print "   FOUND SEQUENCE: ", s

	# NOTE: We get or set the flag based on the current sequence identifier.
	#  By using the sequence identifier, we guarantee that there will be no
	#  frequent loops.  I'm guessing there is a better way to track loops,
	#  though.

	sequences = debugnode.getSequences()
	flag_set = True

	if s not in sequences:
		# Unrecognized sequence: report it so a handler can be added, then
		# retry a plain boot.
		print "   HOST %s" % hostname
		print "   UNKNOWN SEQUENCE: %s" % s

		args = {}
		args['hostname'] = hostname
		args['sequence'] = s
		args['bmlog'] = conn.get_bootmanager_log().read()
		args['viart'] = False
		args['saveact'] = True
		args['ccemail'] = True

		sitehist.sendMessage('unknownsequence_notice', **args)

		conn.restart_bootmanager('boot')

		bootman_action = "restart_bootmanager"

		# NOTE: Do not set the pflags value for this sequence if it's unknown.
		# This way, we can check it again after we've fixed it.
		flag_set = False

	else:
		# Known sequence: the table value names both the action to take and
		# the string returned to the caller.
		bootman_action = sequences[s]

		if   sequences[s] == "restart_bootmanager_boot":
			print "...Restarting BootManager.py on %s "%hostname 
			conn.restart_bootmanager('boot')
		elif sequences[s] == "restart_bootmanager_rins":
			print "...Restarting BootManager.py on %s "%hostname 
			conn.restart_bootmanager('reinstall')
		elif sequences[s] == "restart_node_rins":
			conn.restart_node('reinstall')
		elif sequences[s] == "restart_node_boot":
			conn.restart_node('boot')
		elif sequences[s] == "fsck_repair":
			conn.fsck_repair_node()
		elif sequences[s] == "repair_node_keys":
			if conn.compare_and_repair_nodekeys():
				# the keys either are in sync or were forced in sync.
				# so try to start BM again.
				conn.restart_bootmanager(conn.get_nodestate())
				pass
			else:
				# there was some failure to synchronize the keys.
				print "...Unable to repair node keys on %s" %hostname 

		elif sequences[s] == "unknownsequence_notice":
			args = {}
			args['hostname'] = hostname
			args['sequence'] = s
			args['bmlog'] = conn.get_bootmanager_log().read()
			args['viart'] = False
			args['saveact'] = True
			args['ccemail'] = True

			sitehist.sendMessage('unknownsequence_notice', **args)
			conn.restart_bootmanager('boot')

		elif sequences[s] == "nodeconfig_notice":

			# Throttled to once per 3.5 days.
			if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
				args = {}
				args['hostname'] = hostname
				sitehist.sendMessage('nodeconfig_notice', **args)
				conn.dump_plconf_file()

		elif sequences[s] == "nodenetwork_email":

			# NOTE(review): this branch throttles against, and sends, the
			# 'nodeconfig_notice' message type rather than a distinct
			# network notice -- confirm that is intentional.
			if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
				args = {}
				args['hostname'] = hostname
				args['bmlog'] = conn.get_bootmanager_log().read()
				sitehist.sendMessage('nodeconfig_notice', **args)
				conn.dump_plconf_file()

		elif sequences[s] == "noblockdevice_notice":

			if not found_within(recent_actions, 'noblockdevice_notice', 3.5):
				args = {}
				#args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
				args['hostname'] = hostname

				sitehist.sendMessage('noblockdevice_notice', **args)

		elif sequences[s] == "baddisk_notice":
			# MAKE An ACTION record that this host has failed hardware.  May
			# require either an exception "/minhw" or other manual intervention.
			# Definitely need to send out some more EMAIL.
			# TODO: email notice of broken hardware
			if not found_within(recent_actions, 'baddisk_notice', 7):
				print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
				args = {}
				args['hostname'] = hostname
				args['log'] = conn.get_dmesg().read()

				sitehist.sendMessage('baddisk_notice', **args)
				#conn.set_nodestate('disabled')

		elif sequences[s] == "minimalhardware_notice":
			if not found_within(recent_actions, 'minimalhardware_notice', 7):
				print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
				args = {}
				args['hostname'] = hostname
				args['bmlog'] = conn.get_bootmanager_log().read()
				sitehist.sendMessage('minimalhardware_notice', **args)

		elif sequences[s] == "baddns_notice":
			# DNS problems need network details from PLC to compose the mail.
			if not found_within(recent_actions, 'baddns_notice', 1):
				print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
				args = {}
				try:
					node = plccache.GetNodeByName(hostname)
					net = api.GetInterfaces(node['interface_ids'])[0]
				except:
					email_exception()
					print traceback.print_exc()
					# TODO: api error. skip email, b/c all info is not available,
					# flag_set will not be recorded.
					return "exception"
				nodenet_str = network_config_to_str(net)

				args['hostname'] = hostname
				args['network_config'] = nodenet_str
				args['interface_id'] = net['interface_id']

				sitehist.sendMessage('baddns_notice', **args)

	return bootman_action
928         
929
930 # MAIN -------------------------------------------------------------------
931
def main():
	"""Command-line entry point.

	Parses options, builds the target node list from --nodelist/--node
	(exiting with status 1 when neither is given), and runs restore()
	for every node, passing the parsed options through so flags such as
	--quiet and --collect actually take effect.
	"""
	from monitor import parser as parsermodule
	parser = parsermodule.getParser()

	parser.set_defaults(child=False, collect=False, nosetup=False, verbose=False, 
						force=None, quiet=False)
	parser.add_option("", "--child", dest="child", action="store_true", 
						help="This is the child mode of this process.")
	parser.add_option("", "--force", dest="force", metavar="boot_state",
						help="Force a boot state passed to BootManager.py.")
	parser.add_option("", "--quiet", dest="quiet", action="store_true", 
						help="Extra quiet output messages.")
	parser.add_option("", "--verbose", dest="verbose", action="store_true", 
						help="Extra debug output messages.")
	parser.add_option("", "--nonet", dest="nonet", action="store_true", 
						help="Do not setup the network, use existing log files to re-run a test pass.")
	parser.add_option("", "--collect", dest="collect", action="store_true", 
						help="No action, just collect dmesg, and bm.log")
	parser.add_option("", "--nosetup", dest="nosetup", action="store_true", 
						help="Do not perform the ordinary setup phase.")

	parser = parsermodule.getParser(['nodesets', 'defaults'], parser)
	config = parsermodule.parse_args(parser)

	if config.nodelist:
		nodes = config.getListFromFile(config.nodelist)
	elif config.node:
		nodes = [ config.node ]
	else:
		parser.print_help()
		sys.exit(1)

	for node in nodes:
		# Look up the site (loginbase) that owns this node.
		lb = plccache.plcdb_hn2lb[node]
		sitehist = SiteInterface.get_or_make(loginbase=lb)
		#reboot(node, config)
		# BUGFIX: config was previously passed as None here, which silently
		# discarded every parsed command-line option (--quiet, --collect, ...).
		restore(sitehist, node, config=config, forced_action=None)
970
# Script entry point: only run main() when executed directly.
if __name__ == "__main__":
	main()