# bug fix in summary template
# [monitor.git] / monitor / bootman.py
1 #!/usr/bin/python
2
3 # Attempt to reboot a node in debug state.
4
5
6
7 import os
8 import sys
9 import time
10 import random
11 import signal
12 import traceback
13 import subprocess
14 from sets import Set
15
16 from monitor.getsshkeys import SSHKnownHosts
17
18 from monitor.Rpyc import SocketConnection, Async
19 from monitor.Rpyc.Utils import *
20
21 from monitor import getconf
22 from monitor import config
23 from monitor import const
24 from monitor.model import *
25 from monitor.common import email_exception, found_within
26 from monitor.database.info.model import *
27 from monitor.database.info.interface import *
28 from monitor.wrapper import plc
29 from monitor.wrapper import plccache
30 from monitor.wrapper.emailTxt import mailtxt
31 from monitor.nodeconfig import network_config_to_str
32
33 from pcucontrol.util import command as moncommands
34 from pcucontrol.util.command import Sopen
35 from pcucontrol.transports.ssh import pxssh as pxssh
36 from pcucontrol.transports.ssh import fdpexpect as fdpexpect
37 from pcucontrol.transports.ssh import pexpect as pexpect
38
39
40
# Shared authenticated connection to the PLC API; used below to update node
# boot_state and node keys (see NodeConnection.set_nodestate / restart_node).
api = plc.getAuthAPI()
# NOTE(review): fb is initialized here but never assigned in this chunk --
# presumably populated elsewhere ("findbad" records?); confirm before use.
fb = None
43
44
class ExceptionDoubleSSHError(Exception):
    """Raised when ssh login to a node fails even after refreshing its host key."""
46
class NodeConnection:
    """Remote-control handle for a single node, driven over an Rpyc socket.

    Every remote operation goes through ``self.c.modules.<mod>``, which
    proxies the named module *on the node* (the connection comes from
    PlanetLabSession.get_connection), so e.g. ``self.c.modules.os.system``
    runs on the node, not on the monitor host.
    """

    def __init__(self, connection, node, config):
        # connection: Rpyc SocketConnection tunnelled to the node
        # node:       hostname; used for logging and PLC API calls
        # config:     monitor configuration object
        self.node = node
        self.c = connection
        self.config = config

    def get_boot_state(self):
        """Probe the node's filesystem to classify its boot state.

        Returns "debug" when /tmp/source exists (BootManager ramdisk),
        "boot" when /vservers exists (production root), else "unknown".
        Remote failures are logged and also yield "unknown".
        """
        try:
            if self.c.modules.os.path.exists('/tmp/source'):
                return "debug"
            elif self.c.modules.os.path.exists('/vservers'): 
                return "boot"
            else:
                return "unknown"
        except EOFError:
            # Rpyc channel died mid-call; dump the remote sys.path for clues.
            traceback.print_exc()
            print self.c.modules.sys.path
        except:
            email_exception()
            traceback.print_exc()

        return "unknown"

    def get_dmesg(self):
        """Capture the node's dmesg output and return an open local file.

        The log is archived under history/ with a timestamp and also copied
        to a fixed per-node path.  The caller is responsible for closing the
        returned file handle.
        """
        t_stamp = time.strftime("%Y-%m-%d-%H:%M")
        # Run dmesg remotely, then pull the file back to the monitor host.
        self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
        download(self.c, "/var/log/dmesg.bm.log", "%s/history/%s-dmesg.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node))
        os.system("cp %s/history/%s-dmesg.%s.log %s/dmesg.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node, config.MONITOR_BOOTMANAGER_LOG, self.node))
        log = open("%s/dmesg.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, self.node), 'r')
        return log

    def get_bootmanager_log(self):
        """Download /tmp/bm.log from the node and return an open local file.

        Same archiving scheme as get_dmesg(): timestamped history copy plus
        a fixed per-node path; the caller closes the handle.
        """
        t_stamp = time.strftime("%Y-%m-%d-%H:%M")
        download(self.c, "/tmp/bm.log", "%s/history/%s-bm.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node))
        os.system("cp %s/history/%s-bm.%s.log %s/bm.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node, config.MONITOR_BOOTMANAGER_LOG, self.node))
        log = open("%s/bm.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, self.node), 'r')
        return log

    def dump_plconf_file(self):
        """Run BootManager's config-read steps on the node and print VARS.

        Diagnostic aid: prints every BootManager variable when the node's
        configuration file can be read, or an error message otherwise.
        """
        c = self.c
        # BootManager sources live in the debug ramdisk under /tmp/source.
        self.c.modules.sys.path.append("/tmp/source/")
        self.c.modules.os.chdir('/tmp/source')

        log = c.modules.BootManager.log('/tmp/new.log')
        bm = c.modules.BootManager.BootManager(log,'boot')

        # NOTE(review): BootManagerException is looked up but never used here.
        BootManagerException = c.modules.Exceptions.BootManagerException
        InitializeBootManager = c.modules.BootManager.InitializeBootManager
        ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
        bm_continue = True

        InitializeBootManager.Run(bm.VARS, bm.LOG)
        try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
        except Exception, x:
            bm_continue = False
            print "   ERROR:", x
            print "   Possibly, unable to find valid configuration file"

        if bm_continue:
            for key in bm.VARS.keys():
                print key, " == ", bm.VARS[key]
        else:
            print "   Unable to read Node Configuration"


    def compare_and_repair_nodekeys(self):
        """Compare the node's on-disk NODE_KEY with PLC's record; repair PLC.

        When the keys differ, PLC's record is updated with the node's key via
        api.UpdateNode().  Returns True when the keys match or the update
        succeeds, False when the update fails, and None (falsy) when the node
        configuration cannot be read at all.
        """
        c = self.c
        self.c.modules.sys.path.append("/tmp/source/")
        self.c.modules.os.chdir('/tmp/source')

        log = c.modules.BootManager.log('/tmp/new.log')
        bm = c.modules.BootManager.BootManager(log,'boot')

        # NOTE(review): BootManagerException is looked up but never used here.
        BootManagerException = c.modules.Exceptions.BootManagerException
        InitializeBootManager = c.modules.BootManager.InitializeBootManager
        ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
        bm_continue = True

        plcnode = plccache.GetNodeByName(self.node)

        InitializeBootManager.Run(bm.VARS, bm.LOG)
        try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
        except Exception, x:
            bm_continue = False
            print "exception"
            print x
            print "   Possibly, unable to find valid configuration file"

        if bm_continue:
            print "   NODE: %s" % bm.VARS['NODE_KEY']
            print "   PLC : %s" % plcnode['key']

            if bm.VARS['NODE_KEY'] == plcnode['key']:
                return True
            else:
                # Push the node's key into PLC so future auth succeeds.
                if api.UpdateNode(self.node, {'key': bm.VARS['NODE_KEY']}):
                    print "   Successfully updated NODE_KEY with PLC"
                    return True
                else:
                    return False
        else:
            print "   Unable to retrieve NODE_KEY"

    def bootmanager_running(self):
        """Return True when a BootManager run is in progress on the node
        (signalled by the /tmp/BM_RUNNING sentinel)."""
        if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
            return True
        else:
            return False

    def set_nodestate(self, state='boot'):
        """Set the node's boot_state in PLC; returns the API call's result."""
        return api.UpdateNode(self.node, {'boot_state' : state})

    def restart_node(self, state='boot'):
        """Set boot_state in PLC, then reboot the node.

        First attempt (tracked by the 'gentlekill' persist flag over a 24h
        window): kill all slice processes and schedule a clean
        ``shutdown -r``.  If a gentle kill was already tried recently, force
        a reboot through the sysrq triggers (sync, unmount, boot).
        """
        api.UpdateNode(self.node, {'boot_state' : state})

        pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags')
        if not pflags.getRecentFlag('gentlekill'):
            print "   Killing all slice processes... : %s" %  self.node
            cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
            self.c.modules.os.system(cmd_slicekill)
            cmd = """ shutdown -r +1 & """
            print "   Restarting %s : %s" % ( self.node, cmd)
            self.c.modules.os.system(cmd)

            pflags.setRecentFlag('gentlekill')
            pflags.save()
        else:
            print "   Restarting with sysrq 'sub' %s" % self.node
            cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """
            self.c.modules.os.system(cmd)

        return

    def restart_bootmanager(self, forceState):
        """Launch BootManager.py on the node with the given forced state.

        The /tmp/BM_RUNNING sentinel guards against concurrent runs: it is
        created before the run and removed when BootManager exits.
        """
        self.c.modules.os.chdir('/tmp/source')
        if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
            print "   BootManager is already running: try again soon..."
        else:
            print "   Starting 'BootManager.py %s' on %s " % (forceState, self.node)
            # The single %s below is filled with forceState.
            cmd = "( touch /tmp/BM_RUNNING ;  " + \
                  "  python ./BootManager.py %s &> server.log < /dev/null ; " + \
                  "  rm -f /tmp/BM_RUNNING " + \
                  ") &" 
            cmd = cmd % forceState
            self.c.modules.os.system(cmd)

        return 
212
213
class PlanetLabSession:
    """Own an ssh tunnel plus a remote Rpyc server for one node.

    setup_host() copies the Rpyc sources to the node, (re)starts a forking
    Rpyc server there, and opens a local port-forward to it; get_connection()
    then builds a NodeConnection over that forwarded port.  The tunnel is
    torn down when this object is garbage-collected.
    """
    # Base local port for tunnels; randomized per process and incremented per
    # session so concurrent sessions do not collide.
    globalport = 22000 + int(random.random()*1000)

    def __init__(self, node, nosetup, verbose):
        # node:    hostname to connect to
        # nosetup: skip the rsync/server-start steps (reuse existing server)
        # verbose: echo the commands being run
        self.verbose = verbose
        self.node = node
        self.port = None
        self.nosetup = nosetup
        self.command = None  # Sopen handle for the ssh tunnel process
        self.setup_host()

    def get_connection(self, config):
        """Return a NodeConnection speaking Rpyc over the local tunnel port."""
        conn = NodeConnection(SocketConnection("localhost", self.port), self.node, config)
        return conn
    
    def setup_host(self):
        """Prepare the node and open the tunnel; raises on failure.

        Steps: rsync Rpyc to the node (refreshing the ssh host key once on
        failure and raising ExceptionDoubleSSHError if it still fails), kill
        stale Rpyc servers and start a fresh forking server, then open an
        ``ssh -L`` port-forward and wait for its 'READY' handshake.
        """
        self.port = PlanetLabSession.globalport
        PlanetLabSession.globalport = PlanetLabSession.globalport + 1

        args = {}
        args['port'] = self.port
        args['user'] = 'root'
        args['hostname'] = self.node
        args['monitordir'] = config.MONITOR_SCRIPT_ROOT
        ssh_port = 22

        if self.nosetup:
            print "Skipping setup"
            return 

        # COPY Rpyc files to host
        cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
        if self.verbose: print cmd
        print cmd
        # TODO: Add timeout
        timeout = 120
        localos = moncommands.CMD()

        ret = localos.system(cmd, timeout)
        print ret
        if ret != 0:
            # Likely a stale known_hosts entry: refresh the node's host key
            # and retry once before giving up.
            print "\tUNKNOWN SSH KEY FOR %s; making an exception" % self.node
            k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k
            ret = localos.system(cmd, timeout)
            print ret
            if ret != 0:
                print "\tFAILED TWICE"
                raise ExceptionDoubleSSHError("Failed twice trying to login with updated ssh host key")

        t1 = time.time()
        # KILL any already running servers, then start a fresh forking server
        # with PYTHONPATH set so it can import the Rpyc tree rsync'd above.
        ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port)
        (ov,ev) = ssh.run_noexcept2("""<<\EOF
            rm -f out.log
            echo "kill server" >> out.log
            ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ; 
            echo "export" >> out.log
            export PYTHONPATH=$HOME  ;
            echo "start server" >> out.log
            python Rpyc/Servers/forking_server.py &> server.log &
            echo "done" >> out.log
EOF""")
        print "setup rpyc server over ssh"
        print ssh.ret

        # TODO: Add timeout
        # This was tricky to make synchronous.  The combination of ssh-clients-4.7p1, 
        # and the following options seems to work well.  LocalCommand prints
        # 'READY' on our side once the forward is established.
        cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \
              """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \
              """-o ConnectTimeout=120 """ + \
              """-n -N -L %(port)s:localhost:18812 """ + \
              """%(user)s@%(hostname)s"""
        cmd = cmd % args
        if self.verbose: print cmd
        print cmd
        self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
        # TODO: the read() here may block indefinitely.  Need a better
        # approach therefore, that includes a timeout.
        ret = moncommands.read_t(self.command.stdout, 5)

        t2 = time.time()
        if 'READY' in ret:
            # NOTE: There is still a slight race for machines that are slow...
            # Heuristic: give the remote server twice the setup time to come up.
            self.timeout = 2*(t2-t1)
            print "Sleeping for %s sec" % self.timeout
            time.sleep(self.timeout)
            return

        if self.command.returncode is not None:
            print "Failed to establish tunnel!"
            raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode))

        raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'")

    def __del__(self):
        """Kill the ssh tunnel process when the session is collected."""
        if self.command:
            if self.verbose: print "Killing SSH session %s" % self.port
            print "Killing SSH session %s" % self.port
            self.command.kill()
336
337         
def steps_to_list(steps, index=1):
    """Collect the *index*-th field of every step tuple (default: the pattern)."""
    return [step[index] for step in steps]
340
def index_to_id(steps, index):
    """Return the id (first field) of the step at *index*, or "done" when out of range."""
    if index >= len(steps):
        return "done"
    return steps[index][0]
346
347 class DebugInterface:
348         def __init__(self, hostname):
349                 self.hostname = hostname
350                 self.session = None
351
352         def getConnection(self):
353                 print "Creating session for %s" % self.hostname
354                 # update known_hosts file (in case the node has rebooted since last run)
355                 try:
356                         k = SSHKnownHosts(); k.update(self.hostname); k.write(); del k
357                 except:
358                         email_exception()
359                         print traceback.print_exc()
360                         return False
361
362                 try:
363                         if config == None:
364                                 self.session = PlanetLabSession(self.hostname, False, True)
365                         else:
366                                 self.session = PlanetLabSession(self.hostname, config.nosetup, config.verbose)
367                 except ExceptionDoubleSSHError, e:
368                         msg = "ERROR setting up session for %s" % self.hostname
369                         print msg
370                         return False
371                 except Exception, e:
372                         traceback.print_exc()
373                         email_exception(msg)
374                         return False
375
376                 try:
377                         conn = self.session.get_connection(config)
378                 except EOFError:
379                         # NOTE: sometimes the wait in setup_host() is not long enough.  
380                         # So, here we try to wait a little longer before giving up entirely.
381                         try:
382                                 time.sleep(self.session.timeout*5)
383                                 conn = self.session.get_connection(config)
384                         except EOFError:
385                                 # failed twice... no need to report this really, it's just in a
386                                 # weird state...
387                                 return False
388                         except:
389                                 traceback.print_exc()
390                                 email_exception(self.hostname)
391                                 return False
392                 #print "trying to use conn before returning it."
393                 #print conn.c.modules.sys.path
394                 #print conn.c.modules.os.path.exists('/tmp/source')
395                 #time.sleep(1)
396
397                 #print "conn: %s" % conn
398                 return conn
399
400         def getSequences(self):
401
402                 # TODO: This can be replaced with a DB definition at a future time.
403                 #               This would make it possible for an admin to introduce new
404                 #               patterns without touching code.
405                 
406                 sequences = {}
407                 # restart_bootmanager_boot
408                 for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
409                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
410                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",
411
412                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done",
413
414                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
415                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
416                                 "bminit-cfg-auth-getplc-update-debug-done",
417                                 "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
418                                 "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
419                                 "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
420                                 "bminit-cfg-auth-protoerror-exception-update-debug-done",
421                                 "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
422                                 "bminit-cfg-auth-getplc-implementerror-update-debug-done",
423                                 ]:
424                         sequences.update({n : "restart_bootmanager_boot"})
425
426                 #       conn.restart_bootmanager('reinstall')
427                 for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
428                                 "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
429                                 "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
430                                 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
431                                 "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
432                                 "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done",
433                                 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
434                                 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
435                                 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
436                                 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
437                                 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
438                                 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
439                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
440                                 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done",
441                                 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
442                                 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
443                                 "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
444                                 # actual solution appears to involve removing the bad files, and
445                                 # continually trying to boot the node.
446                                 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
447                                 "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
448                                 "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
449                                 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done",
450                                 "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-validate-done",
451                                 ]:
452                         sequences.update({n : "restart_bootmanager_rins"})
453
454                 # repair_node_keys
455                 for n in ["bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-validate-exception-done",
456                                         "bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done",
457                                 ]:
458                         sequences.update({n: "repair_node_keys"})
459
460                 #   conn.restart_node('reinstall')
461                 for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
462                                 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
463                                 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
464                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done",
465                                 "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
466                                 "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
467                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done",
468                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done",
469                                 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
470                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
471                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
472                                 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
473                                 ]:
474                         sequences.update({n : "restart_node_rins"})
475
476                 #       restart_node_boot
477                 for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done",
478                                  "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
479                                  "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
480                                  "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
481                                  "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
482                                  "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
483                                  "bminit-cfg-auth-getplc-exception-update-bootupdatefail-debug-done",
484                                  ]:
485                         sequences.update({n: "restart_node_boot"})
486
487                 # update_node_config_email
488                 for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
489                                   "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
490                                   "bminit-cfg-exception-update-bootupdatefail-nonode-debug-validate-exception-done",
491                                   "bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-validate-exception-done",
492                                   "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
493                                 ]:
494                         sequences.update({n : "update_node_config_email"})
495
496                 for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", 
497                                    "bminit-cfg-update-exception-nodehostname-update-debug-done", 
498                                 ]:
499                         sequences.update({n : "nodenetwork_email"})
500
501                 # update_bootcd_email
502                 for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
503                                 "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
504                                 "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
505                                 "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
506                                 "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
507                                 ]:
508                         sequences.update({n : "update_bootcd_email"})
509
510                 for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
511                                 ]:
512                         sequences.update({n: "suspect_error_email"})
513
514                 # update_hardware_email
515                 sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
516                 sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
517
518                 # broken_hardware_email
519                 sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})
520
521                 # bad_dns_email
522                 for n in [ 
523                  "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
524                         "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
525                         ]:
526                         sequences.update( { n : "bad_dns_email"})
527
528                 return sequences
529
530         def getDiskSteps(self):
531                 steps = [
532                         ('scsierror'  , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
533                         ('ioerror'    , 'end_request: I/O error, dev sd\w+, sector \d+'),
534                         ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION  byte \w+ = \w+'),
535
536                         ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
537
538                         ('hdaseekerror', 'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'),
539                         ('hdacorrecterror', 'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'),
540
541                         ('atareadyerror'   , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
542                         ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),
543
544                         ('sdXerror'   , 'sd\w: Current: sense key: Medium Error'),
545                         ('ext3error'   , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),
546
547                         ('floppytimeout','floppy0: floppy timeout called'),
548                         ('floppyerror',  'end_request: I/O error, dev fd\w+, sector \d+'),
549
550                         # hda: dma_intr: status=0x51 { DriveReady SeekComplete Error }
551                         # hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263
552
553                         # floppy0: floppy timeout called
554                         # end_request: I/O error, dev fd0, sector 0
555
556                         # Buffer I/O error on device dm-2, logical block 8888896
557                         # ata1: status=0x51 { DriveReady SeekComplete Error }
558                         # ata1: error=0x40 { UncorrectableError }
559                         # SCSI error : <0 0 0 0> return code = 0x8000002
560                         # sda: Current: sense key: Medium Error
561                         #       Additional sense: Unrecovered read error - auto reallocate failed
562
563                         # SCSI error : <0 2 0 0> return code = 0x40001
564                         # end_request: I/O error, dev sda, sector 572489600
565                 ]
566                 return steps
567
568         def getDiskSequence(self, steps, child):
569                 sequence = []
570                 while True:
571                         id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
572                         sequence.append(id)
573
574                         if id == "done":
575                                 break
576                 return sequence
577
578         def getBootManagerStepPatterns(self):
579                 steps = [
580                         ('bminit'               , 'Initializing the BootManager.'),
581                         ('cfg'                  , 'Reading node configuration file.'),
582                         ('auth'                 , 'Authenticating node with PLC.'),
583                         ('getplc'               , 'Retrieving details of node from PLC.'),
584                         ('update'               , 'Updating node boot state at PLC.'),
585                         ('hardware'             , 'Checking if hardware requirements met.'),
586                         ('installinit'  , 'Install: Initializing.'),
587                         ('installdisk'  , 'Install: partitioning disks.'),
588                         ('installbootfs', 'Install: bootstrapfs tarball.'),
589                         ('installcfg'   , 'Install: Writing configuration files.'),
590                         ('installstop'  , 'Install: Shutting down installer.'),
591                         ('update2'              , 'Updating node boot state at PLC.'),
592                         ('installinit2' , 'Install: Initializing.'),
593                         ('validate'             , 'Validating node installation.'),
594                         ('rebuildinitrd', 'Rebuilding initrd'),
595                         ('netcfg'               , 'Install: Writing Network Configuration files.'),
596                         ('update3'              , 'Updating node configuration.'),
597                         ('disk'                 , 'Checking for unused disks to add to LVM.'),
598                         ('update4'              , 'Sending hardware configuration to PLC.'),
599                         ('debug'                , 'Starting debug mode'),
600                         ('bmexceptmount', 'BootManagerException during mount'),
601                         ('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'),
602                         ('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
603                         ('exception'    , 'Exception'),
604                         ('nocfg'        , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
605                         ('protoerror'   , 'XML RPC protocol error'),
606                         ('nodehostname' , 'Configured node hostname does not resolve'),
607                         ('implementerror', 'Implementation Error'),
608                         ('readonlyfs'   , '[Errno 30] Read-only file system'),
609                         ('baddisk'      , "IOError: [Errno 13] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"),
610                         ('noinstall'    , 'notinstalled'),
611                         ('bziperror'    , 'bzip2: Data integrity error when decompressing.'),
612                         ('noblockdev'   , "No block devices detected."),
613                         ('dnserror'     , 'Name or service not known'),
614                         ('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
615                         ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
616                         ('hardwarerequirefail' , 'Hardware requirements not met'),
617                         ('mkfsfail'         , 'while running: Running mkfs.ext2 -q  -m 0 -j /dev/planetlab/vservers failed'),
618                         ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
619                         ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
620                         ('chrootfail'   , 'Running chroot /tmp/mnt/sysimg'),
621                         ('modulefail'   , 'Unable to get list of system modules'),
622                         ('writeerror'   , 'write error: No space left on device'),
623                         ('nospace'      , "No space left on device"),
624                         ('nonode'       , 'Failed to authenticate call: No such node'),
625                         ('authfail'     , 'Failed to authenticate call: Call could not be authenticated'),
626                         ('bootcheckfail'     , 'BootCheckAuthentication'),
627                         ('bootupdatefail'   , 'BootUpdateNode'),
628                 ]
629                 return steps
630
631         def getBootManagerSequenceFromLog(self, steps, child):
632                 sequence = []
633                 while True:
634                         
635                         index = child.expect( steps_to_list(steps) + [ pexpect.EOF ])
636                         id = index_to_id(steps,index)
637                         sequence.append(id)
638
639                         if id == "exception":
640                                 print "...Found An Exception!!!"
641                         elif id == "done": #index == len(steps_to_list(steps)):
642                                 #print "Reached EOF"
643                                 break
644
645                 return sequence
646                 
def restore(sitehist, hostname, config=None, forced_action=None):
	"""Run restore_basic() for hostname, flush the DB session, and return its result."""
	result = restore_basic(sitehist, hostname, config, forced_action)
	session.flush()
	return result
651
def restore_basic(sitehist, hostname, config=None, forced_action=None):
	"""Diagnose a node stuck in 'debug' state and take a recovery action.

	Downloads dmesg and bm.log from the node, tokenizes them against the
	known step patterns, and dispatches on the resulting sequence id:
	restarting BootManager, rebooting/reinstalling the node, or emailing
	the site owners.  Returns True when the node was handled (or needs no
	action), False when it is unreachable or needs manual investigation.
	"""

	# NOTE: Nothing works if the bootcd is REALLY old.
	#       So, this is the first step.

	fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
	recent_actions = sitehist.getRecentActions(hostname=hostname)

	if fbnode['observed_category'] == "OLDBOOTCD":
		print "\t...Notify owner to update BootImage!!!"

		# Throttle the notice: at most once every 3.5 days.
		if not found_within(recent_actions, 'newbootcd_notice', 3.5):
			sitehist.sendMessage('newbootcd_notice', hostname=hostname)

			print "\tDisabling %s due to out-of-date BootImage" % hostname
			api.UpdateNode(hostname, {'boot_state' : 'disabled'})

		# NOTE: nothing else is possible.
		return True

	debugnode = DebugInterface(hostname)
	conn = debugnode.getConnection()
	# getConnection() appears to return False (a bool) on failure, hence
	# the type check rather than a truthiness test -- TODO confirm.
	if type(conn) == type(False): return False

	boot_state = conn.get_boot_state()
	if boot_state != "debug":
		# Only debug-state nodes are repairable here; a node already in
		# "boot" counts as success.
		print "... %s in %s state: skipping..." % (hostname , boot_state)
		return boot_state == "boot"

	if conn.bootmanager_running():
		print "...BootManager is currently running.  Skipping host %s" %hostname 
		return True

	# Read persistent flags, tagged on one week intervals.

	# First pass: scan dmesg for disk/hardware error patterns.
	if config and not config.quiet: print "...downloading dmesg from %s" %hostname 
	dmesg = conn.get_dmesg()
	child = fdpexpect.fdspawn(dmesg)

	steps = debugnode.getDiskSteps()
	sequence = debugnode.getDiskSequence(steps, child)

	s = Set(sequence)
	if config and not config.quiet: print "\tSET: ", s

	# More than just the terminal id in the set means at least one disk
	# error pattern matched.
	if len(s) > 1:
		print "...Potential drive errors on %s" % hostname 
		# A lone floppy error is tolerated; anything else aborts the run.
		if len(s) == 2 and 'floppyerror' in s:
			print "...Should investigate.  Continuing with node."
		else:
			print "...Should investigate.  Skipping node."
			# TODO: send message related to these errors.

			if not found_within(recent_actions, 'baddisk_notice', 7):
				print "baddisk_notice not found recently"

				log=conn.get_dmesg().read()
				sitehist.sendMessage('baddisk_notice', hostname=hostname, log=log)
				conn.set_nodestate('disabled')

			return False

	# Second pass: scan the BootManager log to classify the failure.
	print "...Downloading bm.log from %s" %hostname 
	log = conn.get_bootmanager_log()
	child = fdpexpect.fdspawn(log)

	# --collect mode: we only wanted the logs; stop here.
	if hasattr(config, 'collect') and config.collect: return True

	if config and not config.quiet: print "...Scanning bm.log for errors"

	time.sleep(1)

	steps = debugnode.getBootManagerStepPatterns()
	sequence = debugnode.getBootManagerSequenceFromLog(steps, child)
		
	s = "-".join(sequence)
	print "   FOUND SEQUENCE: ", s

	# NOTE: We get or set the flag based on the current sequence identifier.
	#  By using the sequence identifier, we guarantee that there will be no
	#  frequent loops.  I'm guessing there is a better way to track loops,
	#  though.

	sequences = debugnode.getSequences()
	flag_set = True
	
	if s not in sequences:
		# Unrecognized failure signature: notify, then retry a normal boot.
		print "   HOST %s" % hostname
		print "   UNKNOWN SEQUENCE: %s" % s

		args = {}
		args['hostname'] = hostname
		args['sequence'] = s
		args['bmlog'] = conn.get_bootmanager_log().read()
		args['viart'] = False
		args['saveact'] = True
		args['ccemail'] = True

		sitehist.sendMessage('unknownsequence_notice', **args)

		conn.restart_bootmanager('boot')

		# NOTE: Do not set the pflags value for this sequence if it's unknown.
		# This way, we can check it again after we've fixed it.
		flag_set = False

	else:

		# Known sequence: dispatch on the action name it maps to.
		if   sequences[s] == "restart_bootmanager_boot":
			print "...Restarting BootManager.py on %s "%hostname 
			conn.restart_bootmanager('boot')
		elif sequences[s] == "restart_bootmanager_rins":
			print "...Restarting BootManager.py on %s "%hostname 
			conn.restart_bootmanager('reinstall')
		elif sequences[s] == "restart_node_rins":
			conn.restart_node('reinstall')
		elif sequences[s] == "restart_node_boot":
			conn.restart_node('boot')
		elif sequences[s] == "repair_node_keys":
			if conn.compare_and_repair_nodekeys():
				# the keys either are in sync or were forced in sync.
				# so try to reboot the node again.
				# TODO: why was this originally 'reinstall' instead of 'boot'??
				conn.restart_bootmanager('boot')
				pass
			else:
				# there was some failure to synchronize the keys.
				print "...Unable to repair node keys on %s" %hostname 

		elif sequences[s] == "suspect_error_email":
			# Suspicious-but-recognized sequence: reuse the unknown-sequence
			# notice, then retry a normal boot.
			args = {}
			args['hostname'] = hostname
			args['sequence'] = s
			args['bmlog'] = conn.get_bootmanager_log().read()
			args['viart'] = False
			args['saveact'] = True
			args['ccemail'] = True

			sitehist.sendMessage('unknownsequence_notice', **args)
			conn.restart_bootmanager('boot')

		# TODO: differentiate this and the 'nodenetwork_email' actions.
		elif sequences[s] == "update_node_config_email":

			if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
				args = {}
				args['hostname'] = hostname
				sitehist.sendMessage('nodeconfig_notice', **args)
				conn.dump_plconf_file()

		elif sequences[s] == "nodenetwork_email":

			if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
				args = {}
				args['hostname'] = hostname
				args['bmlog'] = conn.get_bootmanager_log().read()
				sitehist.sendMessage('nodeconfig_notice', **args)
				conn.dump_plconf_file()

		elif sequences[s] == "update_bootcd_email":

			if not found_within(recent_actions, 'newalphacd_notice', 3.5):
				args = {}
				args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
				args['hostname'] = hostname
			
				sitehist.sendMessage('newalphacd_notice', **args)

				# NOTE(review): message says "Disabling" but no UpdateNode
				# call follows here (unlike the OLDBOOTCD path) -- confirm
				# whether that is intentional.
				print "\tDisabling %s due to out-of-date BOOTCD" % hostname

		elif sequences[s] == "broken_hardware_email":
			# MAKE An ACTION record that this host has failed hardware.  May
			# require either an exception "/minhw" or other manual intervention.
			# Definitely need to send out some more EMAIL.
			# TODO: email notice of broken hardware
			if not found_within(recent_actions, 'baddisk_notice', 7):
				print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
				args = {}
				args['hostname'] = hostname
				args['log'] = conn.get_dmesg().read()

				sitehist.sendMessage('baddisk_notice', **args)
				conn.set_nodestate('disabled')

		elif sequences[s] == "update_hardware_email":
			if not found_within(recent_actions, 'minimalhardware_notice', 7):
				print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
				args = {}
				args['hostname'] = hostname
				args['bmlog'] = conn.get_bootmanager_log().read()
				sitehist.sendMessage('minimalhardware_notice', **args)

		elif sequences[s] == "bad_dns_email":
			if not found_within(recent_actions, 'baddns_notice', 1):
				print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
				args = {}
				try:
					node = plccache.GetNodeByName(hostname)
					net = api.GetInterfaces(node['interface_ids'])[0]
				except:
					email_exception()
					print traceback.print_exc()
					# TODO: api error. skip email, b/c all info is not available,
					# flag_set will not be recorded.
					return False
				nodenet_str = network_config_to_str(net)

				args['hostname'] = hostname
				args['network_config'] = nodenet_str
				args['interface_id'] = net['interface_id']

				sitehist.sendMessage('baddns_notice', **args)

	return True
866         
867
868 # MAIN -------------------------------------------------------------------
869
def main():
	"""Parse command-line options and attempt to restore each selected node.

	Nodes come from either --nodelist (a file) or --node (a single host);
	with neither, usage is printed and the process exits with status 1.
	"""
	from monitor import parser as parsermodule
	parser = parsermodule.getParser()

	parser.set_defaults(child=False, collect=False, nosetup=False, verbose=False, 
						force=None, quiet=False)
	parser.add_option("", "--child", dest="child", action="store_true", 
						help="This is the child mode of this process.")
	parser.add_option("", "--force", dest="force", metavar="boot_state",
						help="Force a boot state passed to BootManager.py.")
	parser.add_option("", "--quiet", dest="quiet", action="store_true", 
						help="Extra quiet output messages.")
	parser.add_option("", "--verbose", dest="verbose", action="store_true", 
						help="Extra debug output messages.")
	parser.add_option("", "--nonet", dest="nonet", action="store_true", 
						help="Do not setup the network, use existing log files to re-run a test pass.")
	parser.add_option("", "--collect", dest="collect", action="store_true", 
						help="No action, just collect dmesg, and bm.log")
	parser.add_option("", "--nosetup", dest="nosetup", action="store_true", 
						help="Do not perform the ordinary setup phase.")

	parser = parsermodule.getParser(['nodesets', 'defaults'], parser)
	config = parsermodule.parse_args(parser)

	# Determine the set of nodes to operate on.
	if config.nodelist:
		nodes = config.getListFromFile(config.nodelist)
	elif config.node:
		nodes = [ config.node ]
	else:
		parser.print_help()
		sys.exit(1)

	for node in nodes:
		# get sitehist: look up the site (by loginbase) that owns this node
		lb = plccache.plcdb_hn2lb[node]
		sitehist = SiteInterface.get_or_make(loginbase=lb)
		#reboot(node, config)
		# BUG FIX: pass the parsed options through -- this was hard-coded to
		# config=None, which silently disabled the --quiet/--collect handling
		# inside restore_basic().
		restore(sitehist, node, config=config, forced_action=None)

if __name__ == "__main__":
	main()