[monitor.git] / monitor / bootman.py
#!/usr/bin/python

# Attempt to reboot a node in debug state.

import os
import sys
import time
import random
import signal
import traceback
import subprocess
from sets import Set

from monitor.getsshkeys import SSHKnownHosts

from monitor.Rpyc import SocketConnection, Async
from monitor.Rpyc.Utils import *

from monitor import getconf
from monitor import config
from monitor import const
from monitor.model import *
from monitor.common import email_exception, found_within
from monitor.database.info.model import *
from monitor.database.info.interface import *
from monitor.wrapper import plc
from monitor.wrapper import plccache
from monitor.wrapper.emailTxt import mailtxt
from monitor.nodeconfig import network_config_to_str

from pcucontrol.util import command as moncommands
from pcucontrol.util.command import Sopen
from pcucontrol.transports.ssh import pxssh as pxssh
from pcucontrol.transports.ssh import fdpexpect as fdpexpect
from pcucontrol.transports.ssh import pexpect as pexpect


api = plc.getAuthAPI()
fb = None

class ExceptionDoubleSSHError(Exception): pass

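# NodeConnection wraps an Rpyc connection to a node and exposes the
# operations needed while the node sits in debug mode: inspecting its
# boot state, fetching dmesg and the BootManager log, repairing the
# node key, and restarting either the node or BootManager itself.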
class NodeConnection:
    def __init__(self, connection, node, config):
        self.node = node
        self.c = connection
        self.config = config

    def get_boot_state(self):
        try:
            if self.c.modules.os.path.exists('/tmp/source'):
                return "debug"
            elif self.c.modules.os.path.exists('/vservers'):
                return "boot"
            else:
                return "unknown"
        except EOFError:
            traceback.print_exc()
            print self.c.modules.sys.path
        except:
            email_exception()
            traceback.print_exc()

        return "unknown"

    def get_dmesg(self):
        self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
        download(self.c, "/var/log/dmesg.bm.log", "log/dmesg.%s.log" % self.node)
        log = open("log/dmesg.%s.log" % self.node, 'r')
        return log

    def get_bootmanager_log(self):
        download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node)
        #os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
        os.system("cp log/bm.%s.log.gz log/bm.%s.log" % (self.node, self.node))
        log = open("log/bm.%s.log" % self.node, 'r')
        return log

    def dump_plconf_file(self):
        c = self.c
        self.c.modules.sys.path.append("/tmp/source/")
        self.c.modules.os.chdir('/tmp/source')

        log = c.modules.BootManager.log('/tmp/new.log')
        bm = c.modules.BootManager.BootManager(log, 'boot')

        BootManagerException = c.modules.Exceptions.BootManagerException
        InitializeBootManager = c.modules.BootManager.InitializeBootManager
        ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
        bm_continue = True

        InitializeBootManager.Run(bm.VARS, bm.LOG)
        try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
        except Exception, x:
            bm_continue = False
            print "   ERROR:", x
            print "   Possibly, unable to find valid configuration file"

        if bm_continue:
            for key in bm.VARS.keys():
                print key, " == ", bm.VARS[key]
        else:
            print "   Unable to read Node Configuration"

    def compare_and_repair_nodekeys(self):
        c = self.c
        self.c.modules.sys.path.append("/tmp/source/")
        self.c.modules.os.chdir('/tmp/source')

        log = c.modules.BootManager.log('/tmp/new.log')
        bm = c.modules.BootManager.BootManager(log, 'boot')

        BootManagerException = c.modules.Exceptions.BootManagerException
        InitializeBootManager = c.modules.BootManager.InitializeBootManager
        ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
        bm_continue = True

        plcnode = plccache.GetNodeByName(self.node)

        InitializeBootManager.Run(bm.VARS, bm.LOG)
        try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
        except Exception, x:
            bm_continue = False
            print "exception"
            print x
            print "   Possibly, unable to find valid configuration file"

        if bm_continue:
            print "   NODE: %s" % bm.VARS['NODE_KEY']
            print "   PLC : %s" % plcnode['key']

            if bm.VARS['NODE_KEY'] == plcnode['key']:
                return True
            else:
                if api.UpdateNode(self.node, {'key': bm.VARS['NODE_KEY']}):
                    print "   Successfully updated NODE_KEY with PLC"
                    return True
                else:
                    return False

            #for key in bm.VARS.keys():
            #    print key, " == ", bm.VARS[key]
        else:
            print "   Unable to retrieve NODE_KEY"

    def bootmanager_running(self):
        if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
            return True
        else:
            return False

    def set_nodestate(self, state='boot'):
        return api.UpdateNode(self.node, {'boot_state': state})

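    # Reboot strategy: the first attempt within a 24-hour window kills all
    # slice processes and issues a clean 'shutdown -r'; if the 'gentlekill'
    # flag is already set from a prior attempt, fall back to a forced
    # sysrq sync/umount/reboot ('s', 'u', 'b').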
    def restart_node(self, state='boot'):
        api.UpdateNode(self.node, {'boot_state': state})

        pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags')
        if not pflags.getRecentFlag('gentlekill'):
            print "   Killing all slice processes... : %s" % self.node
            cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
            self.c.modules.os.system(cmd_slicekill)
            cmd = """ shutdown -r +1 & """
            print "   Restarting %s : %s" % (self.node, cmd)
            self.c.modules.os.system(cmd)

            pflags.setRecentFlag('gentlekill')
            pflags.save()
        else:
            print "   Restarting with sysrq 'sub' %s" % self.node
            cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """
            self.c.modules.os.system(cmd)

        return

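    # /tmp/BM_RUNNING acts as a lock file so only one BootManager run is
    # active at a time: it is created before BootManager.py starts and
    # removed when the run completes.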
    def restart_bootmanager(self, forceState):

        self.c.modules.os.chdir('/tmp/source')
        if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
            print "   BootManager is already running: try again soon..."
        else:
            print "   Starting 'BootManager.py %s' on %s " % (forceState, self.node)
            cmd = "( touch /tmp/BM_RUNNING ;  " + \
                  "  python ./BootManager.py %s &> server.log < /dev/null ; " + \
                  "  rm -f /tmp/BM_RUNNING " + \
                  ") &"
            cmd = cmd % forceState
            self.c.modules.os.system(cmd)

        return

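# PlanetLabSession copies the Rpyc tree to the node, starts a forking
# Rpyc server there, and opens an SSH tunnel from a unique local port
# (22000 + a random offset, incremented per session) to the server's
# port 18812 on the node.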
class PlanetLabSession:
    globalport = 22000 + int(random.random()*1000)

    def __init__(self, node, nosetup, verbose):
        self.verbose = verbose
        self.node = node
        self.port = None
        self.nosetup = nosetup
        self.command = None
        self.setup_host()

    def get_connection(self, config):
        conn = NodeConnection(SocketConnection("localhost", self.port), self.node, config)
        #i = 0
        #while i < 3:
        #    print i, conn.c.modules.sys.path
        #    print conn.c.modules.os.path.exists('/tmp/source')
        #    i+=1
        #    time.sleep(1)
        return conn

    def setup_host(self):
        self.port = PlanetLabSession.globalport
        PlanetLabSession.globalport = PlanetLabSession.globalport + 1

        args = {}
        args['port'] = self.port
        args['user'] = 'root'
        args['hostname'] = self.node
        args['monitordir'] = config.MONITOR_SCRIPT_ROOT
        ssh_port = 22

        if self.nosetup:
            print "Skipping setup"
            return

        # COPY Rpyc files to host
        cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
        if self.verbose: print cmd
        print cmd
        # TODO: Add timeout
        timeout = 120
        localos = moncommands.CMD()

        ret = localos.system(cmd, timeout)
        print ret
        if ret != 0:
            print "\tUNKNOWN SSH KEY FOR %s; making an exception" % self.node
            #print "MAKE EXPLICIT EXCEPTION FOR %s" % self.node
            k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k
            ret = localos.system(cmd, timeout)
            print ret
            if ret != 0:
                print "\tFAILED TWICE"
                #sys.exit(1)
                raise ExceptionDoubleSSHError("Failed twice trying to login with updated ssh host key")

        t1 = time.time()
        # KILL any already running servers.
        ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port)
        (ov,ev) = ssh.run_noexcept2("""<<\EOF
            rm -f out.log
            echo "kill server" >> out.log
            ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ;
            echo "export" >> out.log
            export PYTHONPATH=$HOME  ;
            echo "start server" >> out.log
            python Rpyc/Servers/forking_server.py &> server.log &
            echo "done" >> out.log
EOF""")
        #cmd = """ssh %(user)s@%(hostname)s """ + \
        #      """'ps ax | grep Rpyc | grep -v grep | awk "{print \$1}" | xargs kill 2> /dev/null' """
        #cmd = cmd % args
        #if self.verbose: print cmd
        ## TODO: Add timeout
        #print localos.system(cmd,timeout)

        ## START a new rpyc server.
        #cmd = """ssh -n %(user)s@%(hostname)s "export PYTHONPATH=\$HOME; """ + \
        #      """python Rpyc/Servers/forking_server.py &> server.log < /dev/null &" """
        #cmd = cmd % args
        #if self.verbose: print cmd
        #print localos.system(cmd,timeout)
        print "setup rpyc server over ssh"
        print ssh.ret

        # TODO: Add timeout
        # This was tricky to make synchronous.  The combination of ssh-clients-4.7p1,
        # and the following options seems to work well.
        cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \
              """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \
              """-o ConnectTimeout=120 """ + \
              """-n -N -L %(port)s:localhost:18812 """ + \
              """%(user)s@%(hostname)s"""
        cmd = cmd % args
        if self.verbose: print cmd
        print cmd
        self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
        # TODO: the read() here may block indefinitely.  Need a better
        # approach therefore, that includes a timeout.
        #ret = self.command.stdout.read(5)
        ret = moncommands.read_t(self.command.stdout, 5)

        t2 = time.time()
        if 'READY' in ret:
            # NOTE: There is still a slight race for machines that are slow...
            self.timeout = 2*(t2-t1)
            print "Sleeping for %s sec" % self.timeout
            time.sleep(self.timeout)
            return

        if self.command.returncode is not None:
            print "Failed to establish tunnel!"
            raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode))

        raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'")

    def __del__(self):
        if self.command:
            if self.verbose: print "Killing SSH session %s" % self.port
            print "Killing SSH session %s" % self.port
            self.command.kill()

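# Helpers for the (id, pattern) step tables defined below.  For example
# (values taken from getBootManagerStepPatterns; output illustrative):
#   steps = [('bminit', 'Initializing the BootManager.'), ...]
#   steps_to_list(steps)    -> ['Initializing the BootManager.', ...]
#   steps_to_list(steps, 0) -> ['bminit', ...]
#   index_to_id(steps, len(steps)) -> 'done'  # pexpect.EOF falls past the end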
def steps_to_list(steps, index=1):
    return map(lambda x: x[index], steps)

def index_to_id(steps, index):
    if index < len(steps):
        return steps[index][0]
    else:
        return "done"

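# DebugInterface drives the diagnosis of a single node: it establishes
# the Rpyc session, defines the known step patterns for dmesg and bm.log,
# and maps observed step sequences to repair actions.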
class DebugInterface:
    def __init__(self, hostname):
        self.hostname = hostname
        self.session = None

    def getConnection(self):
        print "Creating session for %s" % self.hostname
        # update known_hosts file (in case the node has rebooted since last run)
        try:
            k = SSHKnownHosts(); k.update(self.hostname); k.write(); del k
        except:
            email_exception()
            traceback.print_exc()
            return False

        # Define msg up front so that both except clauses below can use it.
        msg = "ERROR setting up session for %s" % self.hostname
        try:
            if config is None:
                self.session = PlanetLabSession(self.hostname, False, True)
            else:
                self.session = PlanetLabSession(self.hostname, config.nosetup, config.verbose)
        except ExceptionDoubleSSHError, e:
            print msg
            return False
        except Exception, e:
            traceback.print_exc()
            email_exception(msg)
            return False

        try:
            conn = self.session.get_connection(config)
        except EOFError:
            # NOTE: sometimes the wait in setup_host() is not long enough.
            # So, here we try to wait a little longer before giving up entirely.
            try:
                time.sleep(self.session.timeout*5)
                conn = self.session.get_connection(config)
            except EOFError:
                # failed twice... no need to report this really, it's just in a
                # weird state...
                return False
            except:
                traceback.print_exc()
                email_exception(self.hostname)
                return False
        #print "trying to use conn before returning it."
        #print conn.c.modules.sys.path
        #print conn.c.modules.os.path.exists('/tmp/source')
        #time.sleep(1)

        #print "conn: %s" % conn
        return conn

    def getSequences(self):

        # TODO: This can be replaced with a DB definition at a future time.
        #       This would make it possible for an admin to introduce new
        #       patterns without touching code.

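        # Each key below is a '-'-joined sequence of BootManager step ids,
        # as recovered from bm.log by getBootManagerSequenceFromLog(); the
        # value names the repair action that restore() takes for it.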
        sequences = {}
        # restart_bootmanager_boot
        for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
                  "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
                  "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",

                  "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done",

                  "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
                  "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
                  "bminit-cfg-auth-getplc-update-debug-done",
                  "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
                  "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
                  "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
                  "bminit-cfg-auth-protoerror-exception-update-debug-done",
                  "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
                  "bminit-cfg-auth-getplc-implementerror-update-debug-done",
                 ]:
            sequences.update({n: "restart_bootmanager_boot"})

        # restart_bootmanager_rins: conn.restart_bootmanager('reinstall')
        for n in ["bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
                  "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
                  "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
                  "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
                  "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
                  "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done",
                  "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
                  "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
                  "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
                  "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
                  "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
                  "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
                  "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
                  "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done",
                  "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
                  "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
                  "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
                  # actual solution appears to involve removing the bad files, and
                  # continually trying to boot the node.
                  "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
                  "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
                  "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
                  "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done",
                  "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-validate-done",
                 ]:
            sequences.update({n: "restart_bootmanager_rins"})

        # repair_node_keys
        for n in ["bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-validate-exception-done",
                  "bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done",
                 ]:
            sequences.update({n: "repair_node_keys"})

        # restart_node_rins: conn.restart_node('reinstall')
        for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
                  "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
                  "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
                  "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done",
                  "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
                  "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
                  "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done",
                  "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done",
                  "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
                  "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
                  "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
                  "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
                 ]:
            sequences.update({n: "restart_node_rins"})

        # restart_node_boot
        for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done",
                  "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
                  "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
                  "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
                  "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
                  "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
                  "bminit-cfg-auth-getplc-exception-update-bootupdatefail-debug-done",
                 ]:
            sequences.update({n: "restart_node_boot"})

        # update_node_config_email
        for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
                  "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
                  "bminit-cfg-exception-update-bootupdatefail-nonode-debug-validate-exception-done",
                  "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
                 ]:
            sequences.update({n: "update_node_config_email"})

        # nodenetwork_email
        for n in ["bminit-cfg-exception-nodehostname-update-debug-done",
                  "bminit-cfg-update-exception-nodehostname-update-debug-done",
                 ]:
            sequences.update({n: "nodenetwork_email"})

        # update_bootcd_email
        for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
                  "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
                  "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
                  "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
                  "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
                 ]:
            sequences.update({n: "update_bootcd_email"})

        # suspect_error_email
        for n in ["bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
                 ]:
            sequences.update({n: "suspect_error_email"})

        # update_hardware_email
        sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done": "update_hardware_email"})
        sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done": "update_hardware_email"})

        # broken_hardware_email
        sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done": "broken_hardware_email"})

        # bad_dns_email
        for n in ["bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
                  "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
                 ]:
            sequences.update({n: "bad_dns_email"})

        return sequences

    def getDiskSteps(self):
        steps = [
            ('scsierror'  , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
            ('ioerror'    , 'end_request: I/O error, dev sd\w+, sector \d+'),
            ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION  byte \w+ = \w+'),

            ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),

            ('hdaseekerror', 'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'),
            ('hdacorrecterror', 'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'),

            ('atareadyerror'   , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
            ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),

            ('sdXerror'   , 'sd\w: Current: sense key: Medium Error'),
            ('ext3error'   , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),

            ('floppytimeout','floppy0: floppy timeout called'),
            ('floppyerror',  'end_request: I/O error, dev fd\w+, sector \d+'),

            # Sample dmesg lines the patterns above are meant to match:
            # hda: dma_intr: status=0x51 { DriveReady SeekComplete Error }
            # hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263

            # floppy0: floppy timeout called
            # end_request: I/O error, dev fd0, sector 0

            # Buffer I/O error on device dm-2, logical block 8888896
            # ata1: status=0x51 { DriveReady SeekComplete Error }
            # ata1: error=0x40 { UncorrectableError }
            # SCSI error : <0 0 0 0> return code = 0x8000002
            # sda: Current: sense key: Medium Error
            #      Additional sense: Unrecovered read error - auto reallocate failed

            # SCSI error : <0 2 0 0> return code = 0x40001
            # end_request: I/O error, dev sda, sector 572489600
        ]
        return steps

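    # Scan the dmesg log with pexpect, collecting the id of every disk
    # error pattern that matches, until EOF (which maps to 'done').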
    def getDiskSequence(self, steps, child):
        sequence = []
        while True:
            id = index_to_id(steps, child.expect(steps_to_list(steps) + [pexpect.EOF]))
            sequence.append(id)

            if id == "done":
                break
        return sequence

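    # One (id, pattern) pair per recognizable BootManager log message.
    # The ids are the tokens that make up the '-'-joined sequence strings
    # matched in getSequences() above.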
    def getBootManagerStepPatterns(self):
        steps = [
            ('bminit'       , 'Initializing the BootManager.'),
            ('cfg'          , 'Reading node configuration file.'),
            ('auth'         , 'Authenticating node with PLC.'),
            ('getplc'       , 'Retrieving details of node from PLC.'),
            ('update'       , 'Updating node boot state at PLC.'),
            ('hardware'     , 'Checking if hardware requirements met.'),
            ('installinit'  , 'Install: Initializing.'),
            ('installdisk'  , 'Install: partitioning disks.'),
            ('installbootfs', 'Install: bootstrapfs tarball.'),
            ('installcfg'   , 'Install: Writing configuration files.'),
            ('installstop'  , 'Install: Shutting down installer.'),
            ('update2'      , 'Updating node boot state at PLC.'),
            ('installinit2' , 'Install: Initializing.'),
            ('validate'     , 'Validating node installation.'),
            ('rebuildinitrd', 'Rebuilding initrd'),
            ('netcfg'       , 'Install: Writing Network Configuration files.'),
            ('update3'      , 'Updating node configuration.'),
            ('disk'         , 'Checking for unused disks to add to LVM.'),
            ('update4'      , 'Sending hardware configuration to PLC.'),
            ('debug'        , 'Starting debug mode'),
            ('bmexceptmount', 'BootManagerException during mount'),
            ('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'),
            ('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
            ('exception'    , 'Exception'),
            ('nocfg'        , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
            ('protoerror'   , 'XML RPC protocol error'),
            ('nodehostname' , 'Configured node hostname does not resolve'),
            ('implementerror', 'Implementation Error'),
            ('readonlyfs'   , '[Errno 30] Read-only file system'),
            ('baddisk'      , "IOError: [Errno 13] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"),
            ('noinstall'    , 'notinstalled'),
            ('bziperror'    , 'bzip2: Data integrity error when decompressing.'),
            ('noblockdev'   , "No block devices detected."),
            ('dnserror'     , 'Name or service not known'),
            ('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
            ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
            ('hardwarerequirefail' , 'Hardware requirements not met'),
            ('mkfsfail'     , 'while running: Running mkfs.ext2 -q  -m 0 -j /dev/planetlab/vservers failed'),
            ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
            ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
            ('chrootfail'   , 'Running chroot /tmp/mnt/sysimg'),
            ('modulefail'   , 'Unable to get list of system modules'),
            ('writeerror'   , 'write error: No space left on device'),
            ('nospace'      , "No space left on device"),
            ('nonode'       , 'Failed to authenticate call: No such node'),
            ('authfail'     , 'Failed to authenticate call: Call could not be authenticated'),
            ('bootcheckfail', 'BootCheckAuthentication'),
            ('bootupdatefail', 'BootUpdateNode'),
        ]
        return steps

    def getBootManagerSequenceFromLog(self, steps, child):
        sequence = []
        while True:
            index = child.expect(steps_to_list(steps) + [pexpect.EOF])
            id = index_to_id(steps, index)
            sequence.append(id)

            if id == "exception":
                print "...Found An Exception!!!"
            elif id == "done":  # i.e. index == len(steps_to_list(steps)), EOF
                #print "Reached EOF"
                break

        return sequence

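# restore() is the per-node entry point: refuse nodes with a really old
# BootImage, connect to the node, skip it unless it is in the 'debug'
# state, scan dmesg for disk errors, recover the BootManager step
# sequence from bm.log, and dispatch the repair action that
# getSequences() maps that sequence to.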
def restore(sitehist, hostname, config=None, forced_action=None):

    # NOTE: Nothing works if the bootcd is REALLY old.
    #       So, this is the first step.

    fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
    recent_actions = sitehist.getRecentActions(hostname=hostname)

    if fbnode['observed_category'] == "OLDBOOTCD":
        print "\t...Notify owner to update BootImage!!!"

        if not found_within(recent_actions, 'newbootcd_notice', 3):
            sitehist.sendMessage('newbootcd_notice', hostname=hostname)

            print "\tDisabling %s due to out-of-date BootImage" % hostname
            api.UpdateNode(hostname, {'boot_state': 'disable'})

        # NOTE: nothing else is possible.
        return True

    debugnode = DebugInterface(hostname)
    conn = debugnode.getConnection()
    #print "conn: %s" % conn
    #print "trying to use conn after returning it."
    #print conn.c.modules.sys.path
    #print conn.c.modules.os.path.exists('/tmp/source')
    if conn is False: return False

    #if forced_action == "reboot":
    #    conn.restart_node('reinstall')
    #    return True

    boot_state = conn.get_boot_state()
    if boot_state != "debug":
        print "... %s in %s state: skipping..." % (hostname, boot_state)
        return boot_state == "boot"

    if conn.bootmanager_running():
        print "...BootManager is currently running.  Skipping host %s" % hostname
        return True

    # Read persistent flags, tagged on one week intervals.

    if config and not config.quiet: print "...downloading dmesg from %s" % hostname
    dmesg = conn.get_dmesg()
    child = fdpexpect.fdspawn(dmesg)

    steps = debugnode.getDiskSteps()
    sequence = debugnode.getDiskSequence(steps, child)

    s = Set(sequence)
    if config and not config.quiet: print "\tSET: ", s

    if len(s) > 1:
        print "...Potential drive errors on %s" % hostname
        if len(s) == 2 and 'floppyerror' in s:
            print "...Should investigate.  Continuing with node."
        else:
            print "...Should investigate.  Skipping node."
            # TODO: send message related to these errors.

            if not found_within(recent_actions, 'newbootcd_notice', 3):

                log = conn.get_dmesg().read()
                sitehist.sendMessage('baddisk_notice', hostname=hostname, log=log)
                conn.set_nodestate('disable')

            return False

    print "...Downloading bm.log from %s" % hostname
    log = conn.get_bootmanager_log()
    child = fdpexpect.fdspawn(log)

    if hasattr(config, 'collect') and config.collect: return True

    if config and not config.quiet: print "...Scanning bm.log for errors"

    time.sleep(1)

    steps = debugnode.getBootManagerStepPatterns()
    sequence = debugnode.getBootManagerSequenceFromLog(steps, child)

    s = "-".join(sequence)
    print "   FOUND SEQUENCE: ", s

    # NOTE: We get or set the flag based on the current sequence identifier.
    #  By using the sequence identifier, we guarantee that there will be no
    #  frequent loops.  I'm guessing there is a better way to track loops,
    #  though.

    sequences = debugnode.getSequences()
    flag_set = True

    if s not in sequences:
        print "   HOST %s" % hostname
        print "   UNKNOWN SEQUENCE: %s" % s

        args = {}
        args['hostname'] = hostname
        args['sequence'] = s
        args['bmlog'] = conn.get_bootmanager_log().read()
        args['viart'] = False

        sitehist.sendMessage('unknownsequence_notice', **args)

        conn.restart_bootmanager('boot')

        # NOTE: Do not set the pflags value for this sequence if it's unknown.
        # This way, we can check it again after we've fixed it.
        flag_set = False

    else:

        if sequences[s] == "restart_bootmanager_boot":
            print "...Restarting BootManager.py on %s " % hostname
            conn.restart_bootmanager('boot')
        elif sequences[s] == "restart_bootmanager_rins":
            print "...Restarting BootManager.py on %s " % hostname
            conn.restart_bootmanager('reinstall')
        elif sequences[s] == "restart_node_rins":
            conn.restart_node('reinstall')
        elif sequences[s] == "restart_node_boot":
            conn.restart_node('boot')
        elif sequences[s] == "repair_node_keys":
            if conn.compare_and_repair_nodekeys():
                # the keys either are in sync or were forced in sync.
                # so try to reboot the node again.
                # TODO: why was this originally 'reinstall' instead of 'boot'??
                conn.restart_bootmanager('boot')
            else:
                # there was some failure to synchronize the keys.
                print "...Unable to repair node keys on %s" % hostname

        elif sequences[s] == "suspect_error_email":
            args = {}
            args['hostname'] = hostname
            args['sequence'] = s
            args['bmlog'] = conn.get_bootmanager_log().read()
            args['viart'] = False

            sitehist.sendMessage('unknownsequence_notice', **args)
            conn.restart_bootmanager('boot')

        # TODO: differentiate this and the 'nodenetwork_email' actions.
        elif sequences[s] == "update_node_config_email":

            if not found_within(recent_actions, 'nodeconfig_notice', 3):
                args = {}
                args['hostname'] = hostname
                sitehist.sendMessage('nodeconfig_notice', **args)
                conn.dump_plconf_file()

        elif sequences[s] == "nodenetwork_email":

            if not found_within(recent_actions, 'nodeconfig_notice', 3):
                args = {}
                args['hostname'] = hostname
                args['bmlog'] = conn.get_bootmanager_log().read()
                sitehist.sendMessage('nodeconfig_notice', **args)
                conn.dump_plconf_file()

        elif sequences[s] == "update_bootcd_email":

            if not found_within(recent_actions, 'newalphacd_notice', 3):
                args = {}
                args.update(getconf.getconf(hostname))  # NOTE: Generates boot images for the user
                args['hostname'] = hostname

                sitehist.sendMessage('newalphacd_notice', **args)

                print "\tDisabling %s due to out-of-date BOOTCD" % hostname
                # NOTE: the original code printed the message above without
                # actually disabling the node; disable it to match the
                # announcement.
                conn.set_nodestate('disable')

        elif sequences[s] == "broken_hardware_email":
            # MAKE An ACTION record that this host has failed hardware.  May
            # require either an exception "/minhw" or other manual intervention.
            # Definitely need to send out some more EMAIL.
            # TODO: email notice of broken hardware
            if not found_within(recent_actions, 'baddisk_notice', 1):
                print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
                args = {}
                args['hostname'] = hostname
                args['log'] = conn.get_dmesg().read()

                sitehist.sendMessage('baddisk_notice', **args)
                conn.set_nodestate('disable')

        elif sequences[s] == "update_hardware_email":
            if not found_within(recent_actions, 'minimalhardware_notice', 1):
                print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
                args = {}
                args['hostname'] = hostname
                args['bmlog'] = conn.get_bootmanager_log().read()
                sitehist.sendMessage('minimalhardware_notice', **args)

        elif sequences[s] == "bad_dns_email":
            if not found_within(recent_actions, 'baddns_notice', 1):
                print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
                args = {}
                try:
                    node = plccache.GetNodeByName(hostname)
                    net = api.GetInterfaces(node['interface_ids'])[0]
                except:
                    email_exception()
                    traceback.print_exc()
                    # TODO: api error. skip email, b/c all info is not available,
                    # flag_set will not be recorded.
                    return False
                nodenet_str = network_config_to_str(net)

                args['hostname'] = hostname
                args['network_config'] = nodenet_str
                args['interface_id'] = net['interface_id']

                sitehist.sendMessage('baddns_notice', **args)

    return True

# MAIN -------------------------------------------------------------------

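# Typical invocations (assuming the 'nodesets' option group supplies
# --node/--nodelist; the hostname below is illustrative only):
#   ./bootman.py --node planetlab1.example.edu
#   ./bootman.py --nodelist nodes.txt --collect   # just fetch dmesg and bm.log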
def main():
    from monitor import parser as parsermodule
    parser = parsermodule.getParser()

    parser.set_defaults(child=False, collect=False, nosetup=False, verbose=False,
                        force=None, quiet=False)
    parser.add_option("", "--child", dest="child", action="store_true",
                        help="This is the child mode of this process.")
    parser.add_option("", "--force", dest="force", metavar="boot_state",
                        help="Force a boot state passed to BootManager.py.")
    parser.add_option("", "--quiet", dest="quiet", action="store_true",
                        help="Extra quiet output messages.")
    parser.add_option("", "--verbose", dest="verbose", action="store_true",
                        help="Extra debug output messages.")
    parser.add_option("", "--nonet", dest="nonet", action="store_true",
                        help="Do not setup the network, use existing log files to re-run a test pass.")
    parser.add_option("", "--collect", dest="collect", action="store_true",
                        help="No action, just collect dmesg, and bm.log")
    parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
                        help="Do not perform the ordinary setup phase.")

    parser = parsermodule.getParser(['nodesets', 'defaults'], parser)
    config = parsermodule.parse_args(parser)

    if config.nodelist:
        nodes = config.getListFromFile(config.nodelist)
    elif config.node:
        nodes = [config.node]
    else:
        parser.print_help()
        sys.exit(1)

    for node in nodes:
        # get sitehist
        lb = plccache.plcdb_hn2lb[node]
        sitehist = SiteInterface.get_or_make(loginbase=lb)
        #reboot(node, config)
        # Forward the parsed options so --quiet/--collect/--force take effect
        # (the original passed config=None, leaving those options unused).
        restore(sitehist, node, config=config, forced_action=config.force)

if __name__ == "__main__":
    main()