add a 'delete_recent' function to ActionRecord
[monitor.git] / monitor / bootman.py
1 #!/usr/bin/python
2
3 # Attempt to reboot a node in debug state.
4
5
6
7 import os
8 import sys
9 import time
10 import random
11 import signal
12 import traceback
13 import subprocess
14 from sets import Set
15
16 from monitor.getsshkeys import SSHKnownHosts
17
18 from monitor.Rpyc import SocketConnection, Async
19 from monitor.Rpyc.Utils import *
20
21 from monitor import getconf
22 from monitor import config
23 from monitor import const
24 from monitor.model import *
25 from monitor.common import email_exception, found_within
26 from monitor.database.info.model import *
27 from monitor.database.info.interface import *
28 from monitor.wrapper import plc
29 from monitor.wrapper import plccache
30 from monitor.wrapper.emailTxt import mailtxt
31 from monitor.nodeconfig import network_config_to_str
32
33 from pcucontrol.util import command as moncommands
34 from pcucontrol.util.command import Sopen
35 from pcucontrol.transports.ssh import pxssh as pxssh
36 from pcucontrol.transports.ssh import fdpexpect as fdpexpect
37 from pcucontrol.transports.ssh import pexpect as pexpect
38
39
40
# Authenticated PLC XML-RPC proxy shared by every helper in this module.
api = plc.getAuthAPI()
# NOTE(review): 'fb' is never assigned in the visible portion of this file --
# presumably a findbad-records cache filled in elsewhere before use; verify.
fb = None
43
44
class ExceptionDoubleSSHError(Exception):
	"""Raised when ssh/rsync to a node fails even after refreshing its host key."""
	pass
46
class NodeConnection:
	"""Remote-control handle for one node over an established Rpyc connection.

	Calls made through self.c.modules.* are proxied by Rpyc and execute on
	the remote node; bare os.* / time.* calls below run locally on the
	monitor host.  'node' is the node's hostname, 'config' the monitor
	configuration object.
	"""
	def __init__(self, connection, node, config):
		self.node = node        # hostname of the remote node
		self.c = connection     # Rpyc SocketConnection to the node
		self.config = config

	def get_boot_state(self):
		"""Classify the node's current state from filesystem markers:
		'debug' if the BootManager source is unpacked in /tmp/source,
		'boot' if /vservers exists, else 'unknown' (also on any error)."""
		try:
			if self.c.modules.os.path.exists('/tmp/source'):
				return "debug"
			elif self.c.modules.os.path.exists('/vservers'): 
				return "boot"
			else:
				return "unknown"
		except EOFError:
			# Remote connection dropped; dump the remote sys.path to help debug.
			traceback.print_exc()
			print self.c.modules.sys.path
		except:
			email_exception()
			traceback.print_exc()

		return "unknown"

	def get_dmesg(self):
		"""Snapshot the node's dmesg output, archive it locally, and return
		an open read handle on the local 'latest' copy."""
		t_stamp = time.strftime("%Y-%m-%d-%H:%M")
		# Dump dmesg to a file on the node, then pull it down.
		self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
		# Keep a time-stamped copy under history/ plus an un-stamped 'latest'.
		download(self.c, "/var/log/dmesg.bm.log", "%s/history/%s-dmesg.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node))
		os.system("cp %s/history/%s-dmesg.%s.log %s/dmesg.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node, config.MONITOR_BOOTMANAGER_LOG, self.node))
		log = open("%s/dmesg.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, self.node), 'r')
		return log

	def get_bootmanager_log(self):
		"""Download /tmp/bm.log from the node, archive it locally, and
		return an open read handle on the local 'latest' copy."""
		t_stamp = time.strftime("%Y-%m-%d-%H:%M")
		download(self.c, "/tmp/bm.log", "%s/history/%s-bm.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node))
		os.system("cp %s/history/%s-bm.%s.log %s/bm.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node, config.MONITOR_BOOTMANAGER_LOG, self.node))
		log = open("%s/bm.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, self.node), 'r')
		return log

	def dump_plconf_file(self):
		"""Run the BootManager initialization + config-read steps on the
		node and print the resulting VARS, to inspect the node's
		configuration file."""
		c = self.c
		self.c.modules.sys.path.append("/tmp/source/")
		self.c.modules.os.chdir('/tmp/source')

		log = c.modules.BootManager.log('/tmp/new.log')
		bm = c.modules.BootManager.BootManager(log,'boot')

		BootManagerException = c.modules.Exceptions.BootManagerException
		InitializeBootManager = c.modules.BootManager.InitializeBootManager
		ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
		bm_continue = True

		InitializeBootManager.Run(bm.VARS, bm.LOG)
		try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
		except Exception, x:
			bm_continue = False
			print "   ERROR:", x
			print "   Possibly, unable to find valid configuration file"

		if bm_continue:
			for key in bm.VARS.keys():
				print key, " == ", bm.VARS[key]
		else:
			print "   Unable to read Node Configuration"
		
	def fsck_repair_node(self):
		"""Manually fsck the node's root and vserver volumes, then re-run
		BootManager with the node's current PLC boot state."""
		c = self.c
		self.c.modules.sys.path.append("/tmp/source/")
		self.c.modules.os.chdir('/tmp/source')
		# TODO: restart
		# TODO: set boot state to node's actually boot state.
		# could be 'boot' or 'safeboot'
		self.c.modules.os.chdir('/tmp/source')
		# /tmp/BM_RUNNING acts as a lock so two BootManager runs don't overlap.
		if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
			print "Running MANUAL FSCK already... try again soon."
		else:
			print "Running MANUAL fsck on %s" % self.node
			cmd = "( touch /tmp/BM_RUNNING ;  " + \
				  "  fsck -v -f -y /dev/planetlab/root &> out.fsck ; " + \
				  "  fsck -v -f -y /dev/planetlab/vserver >> out.fsck 2>&1 ; " + \
				  "  python ./BootManager.py %s &> server.log < /dev/null ; " + \
				  "  rm -f /tmp/BM_RUNNING " + \
				  ") &" 
			cmd = cmd % self.get_nodestate()
			self.c.modules.os.system(cmd)
		#self.restart_bootmanager('boot')       
		pass

	def compare_and_repair_nodekeys(self):
		"""Compare the node's locally-configured NODE_KEY with PLC's record;
		on mismatch, push the node's key to PLC via UpdateNode.  Returns
		True on match or successful update, False if the update fails
		(implicitly None when the config cannot be read)."""
		c = self.c
		self.c.modules.sys.path.append("/tmp/source/")
		self.c.modules.os.chdir('/tmp/source')

		log = c.modules.BootManager.log('/tmp/new.log')
		bm = c.modules.BootManager.BootManager(log,'boot')

		BootManagerException = c.modules.Exceptions.BootManagerException
		InitializeBootManager = c.modules.BootManager.InitializeBootManager
		ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
		bm_continue = True

		plcnode = plccache.GetNodeByName(self.node)

		InitializeBootManager.Run(bm.VARS, bm.LOG)
		try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
		except Exception, x:
			bm_continue = False
			print "exception"
			print x
			print "   Possibly, unable to find valid configuration file"

		if bm_continue:
			print "   NODE: %s" % bm.VARS['NODE_KEY']
			print "   PLC : %s" % plcnode['key']

			if bm.VARS['NODE_KEY'] == plcnode['key']:
				return True
			else:
				# Trust the key on the node; record it at PLC.
				if api.UpdateNode(self.node, {'key': bm.VARS['NODE_KEY']}):
					print "   Successfully updated NODE_KEY with PLC"
					return True
				else:
					return False
		else:
			print "   Unable to retrieve NODE_KEY"

	def bootmanager_running(self):
		"""True iff the /tmp/BM_RUNNING lock file exists on the node."""
		if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
			return True
		else:
			return False

	def set_nodestate(self, state='boot'):
		"""Set the node's boot_state at PLC; returns the UpdateNode result."""
		return api.UpdateNode(self.node, {'boot_state' : state})

	def get_nodestate(self):
		"""Return the node's boot_state from PLC, falling back to the last
		cached findbad record if the live query fails."""
		try:
			return api.GetNodes(self.node, ['boot_state'])[0]['boot_state']
		except:
			traceback.print_exc()
			# NOTE: use last cached value from plc
			fbnode = FindbadNodeRecord.get_latest_by(hostname=self.node).to_dict()
			return fbnode['plc_node_stats']['boot_state']

	def restart_node(self, state='boot'):
		"""Set boot_state at PLC and reboot the node.  The first attempt
		within 24h kills slice processes and does a clean 'shutdown -r';
		if the 'gentlekill' flag shows that was already tried recently,
		force a reboot via sysrq s/u/b instead."""
		api.UpdateNode(self.node, {'boot_state' : state})

		# Persist flag records whether a gentle kill was attempted in the
		# last 24 hours.
		pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags')
		if not pflags.getRecentFlag('gentlekill'):
			print "   Killing all slice processes... : %s" %  self.node
			cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
			self.c.modules.os.system(cmd_slicekill)
			cmd = """ shutdown -r +1 & """
			print "   Restarting %s : %s" % ( self.node, cmd)
			self.c.modules.os.system(cmd)

			pflags.setRecentFlag('gentlekill')
			pflags.save()
		else:
			print "   Restarting with sysrq 'sub' %s" % self.node
			cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """
			self.c.modules.os.system(cmd)

		return

	def restart_bootmanager(self, forceState):
		"""Re-run BootManager.py on the node with the given forced state,
		guarded by the /tmp/BM_RUNNING lock file."""
		self.c.modules.os.chdir('/tmp/source')
		if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
			print "   BootManager is already running: try again soon..."
		else:
			print "   Starting 'BootManager.py %s' on %s " % (forceState, self.node)
			cmd = "( touch /tmp/BM_RUNNING ;  " + \
			      "  python ./BootManager.py %s &> server.log < /dev/null ; " + \
				  "  rm -f /tmp/BM_RUNNING " + \
				  ") &" 
			cmd = cmd % forceState
			self.c.modules.os.system(cmd)

		return 
244
245
class PlanetLabSession:
	"""Sets up an Rpyc server on a node plus a local ssh tunnel to reach it.

	Each session takes a distinct local port by incrementing the shared
	class counter 'globalport' (seeded randomly in 22000-22999 so parallel
	monitor runs are unlikely to collide).
	"""
	globalport = 22000 + int(random.random()*1000)

	def __init__(self, node, nosetup, verbose):
		self.verbose = verbose
		self.node = node        # hostname of the target node
		self.port = None        # local end of the ssh tunnel; set by setup_host()
		self.nosetup = nosetup  # when True, skip server install/start on the node
		self.command = None     # Sopen handle for the ssh tunnel process
		self.setup_host()

	def get_connection(self, config):
		"""Return a NodeConnection speaking Rpyc through the local tunnel port."""
		conn = NodeConnection(SocketConnection("localhost", self.port), self.node, config)
		return conn
	
	def setup_host(self):
		"""Copy Rpyc to the node, (re)start the remote Rpyc server, and open
		an ssh tunnel from self.port to the node's Rpyc port 18812.

		Raises ExceptionDoubleSSHError if the rsync fails even after
		refreshing the node's known_hosts entry, and a plain Exception if
		the tunnel cannot be established.
		"""
		self.port = PlanetLabSession.globalport
		PlanetLabSession.globalport = PlanetLabSession.globalport + 1

		args = {}
		args['port'] = self.port
		args['user'] = 'root'
		args['hostname'] = self.node
		args['monitordir'] = config.MONITOR_SCRIPT_ROOT
		ssh_port = 22

		if self.nosetup:
			print "Skipping setup"
			return 

		# COPY Rpyc files to host
		cmd = """rsync -vvv -az -e "ssh -o BatchMode=yes" %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc""" % args
		if self.verbose: print cmd
		print cmd
		# TODO: Add timeout
		timeout = 120
		localos = moncommands.CMD()

		ret = localos.system(cmd, timeout)
		print ret
		if ret != 0:
			# First failure is commonly a stale host key: refresh the
			# known_hosts entry directly from the node and retry once.
			print "\tUNKNOWN SSH KEY FOR %s; making an exception" % self.node
			k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k
			print "trying: ", cmd
			print [ "%s=%s" % (a, os.environ[a]) for a in filter(lambda x: 'SSH' in x, os.environ.keys()) ]
			ret = localos.system(cmd, timeout)
			print ret
			if ret != 0:
				print "\tFAILED TWICE"
				raise ExceptionDoubleSSHError("Failed twice trying to login with updated ssh host key")

		t1 = time.time()
		# KILL any already running Rpyc servers on the node, then start a
		# fresh forking server (PYTHONPATH=$HOME so 'Rpyc' is importable).
		ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port)
		(ov,ev) = ssh.run_noexcept2("""<<\EOF
            rm -f out.log
            echo "kill server" >> out.log
            ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ; 
            echo "export" >> out.log
            export PYTHONPATH=$HOME  ;
            echo "start server" >> out.log
            python Rpyc/Servers/forking_server.py &> server.log &
            echo "done" >> out.log
EOF""")
		print "setup rpyc server over ssh"
		print ssh.ret

		# TODO: Add timeout
		# This was tricky to make synchronous.  The combination of ssh-clients-4.7p1, 
		# and the following options seems to work well.  LocalCommand echoes
		# READY once the forward is up; ExitOnForwardFailure makes a bad
		# forward kill the client instead of hanging.
		cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \
			  """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \
			  """-o ConnectTimeout=120 """ + \
			  """-n -N -L %(port)s:localhost:18812 """ + \
			  """%(user)s@%(hostname)s"""
		cmd = cmd % args
		if self.verbose: print cmd
		print cmd
		self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
		# TODO: the read() here may block indefinitely.  Need a better
		# approach therefore, that includes a timeout.
		ret = moncommands.read_t(self.command.stdout, 5)

		t2 = time.time()
		if 'READY' in ret:
			# NOTE: There is still a slight race for machines that are slow...
			# wait roughly twice as long as the setup itself took.
			self.timeout = 2*(t2-t1)
			print "Sleeping for %s sec" % self.timeout
			time.sleep(self.timeout)
			return

		if self.command.returncode is not None:
			print "Failed to establish tunnel!"
			raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode))

		raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'")

	def __del__(self):
		# Tear down the ssh tunnel process when the session goes away.
		if self.command:
			if self.verbose: print "Killing SSH session %s" % self.port
			print "Killing SSH session %s" % self.port
			self.command.kill()
371
372         
def steps_to_list(steps, index=1):
	"""Project one column out of a list of step tuples.

	steps is a sequence of indexable records (e.g. (id, name) pairs);
	index selects which element to extract from each (default 1).
	Returns a list.

	FIX: replaced map(lambda x: x[index], steps) with a comprehension --
	same result under Python 2, but still a real list (not a lazy map
	object) under Python 3, and clearer either way.
	"""
	return [step[index] for step in steps]
375
def index_to_id(steps, index):
	"""Return the id (element 0) of step number `index`, or the sentinel
	string "done" when index is past the end of the step list."""
	if index >= len(steps):
		return "done"
	return steps[index][0]
381
382 class DebugInterface:
383         def __init__(self, hostname):
384                 self.hostname = hostname
385                 self.session = None
386
387         def getConnection(self):
388                 print "Creating session for %s" % self.hostname
389                 # update known_hosts file (in case the node has rebooted since last run)
390                 try:
391                         k = SSHKnownHosts(); k.update(self.hostname); k.write(); del k
392                 except:
393                         email_exception()
394                         print traceback.print_exc()
395                         return False
396
397                 msg = "ERROR setting up session for %s" % self.hostname
398                 try:
399                         if config == None:
400                                 self.session = PlanetLabSession(self.hostname, False, True)
401                         else:
402                                 self.session = PlanetLabSession(self.hostname, config.nosetup, config.verbose)
403                 except ExceptionDoubleSSHError, e:
404                         print msg
405                         return False
406                 except Exception, e:
407                         traceback.print_exc()
408                         email_exception(msg)
409                         return False
410
411                 try:
412                         conn = self.session.get_connection(config)
413                 except EOFError:
414                         # NOTE: sometimes the wait in setup_host() is not long enough.  
415                         # So, here we try to wait a little longer before giving up entirely.
416                         try:
417                                 time.sleep(self.session.timeout*5)
418                                 conn = self.session.get_connection(config)
419                         except EOFError:
420                                 # failed twice... no need to report this really, it's just in a
421                                 # weird state...
422                                 return False
423                         except:
424                                 traceback.print_exc()
425                                 email_exception(self.hostname)
426                                 return False
427                 #print "trying to use conn before returning it."
428                 #print conn.c.modules.sys.path
429                 #print conn.c.modules.os.path.exists('/tmp/source')
430                 #time.sleep(1)
431
432                 #print "conn: %s" % conn
433                 return conn
434
435         def getSequences(self):
436
437                 # TODO: This can be replaced with a DB definition at a future time.
438                 #               This would make it possible for an admin to introduce new
439                 #               patterns without touching code.
440                 
441                 sequences = {}
442                 # restart_bootmanager_boot
443                 for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
444                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
445                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",
446
447                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done",
448
449                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
450                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
451                                 "bminit-cfg-auth-getplc-update-debug-done",
452                                 "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
453                                 "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
454                                 "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
455                                 "bminit-cfg-auth-protoerror-exception-update-debug-done",
456                                 "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
457                                 "bminit-cfg-auth-getplc-implementerror-update-debug-done",
458                                 ]:
459                         sequences.update({n : "restart_bootmanager_boot"})
460
461                 #       conn.restart_bootmanager('reinstall')
462                 for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
463                                 "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
464                                 "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
465                                 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
466                                 "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
467                                 "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done",
468                                 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
469                                 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
470                                 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
471                                 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
472                                 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
473                                 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
474                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
475                                 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done",
476                                 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
477                                 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
478                                 "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
479                                 # actual solution appears to involve removing the bad files, and
480                                 # continually trying to boot the node.
481                                 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
482                                 "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
483                                 "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
484                                 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done",
485                                 "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-validate-done",
486                                 "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done",
487                                 ]:
488                         sequences.update({n : "restart_bootmanager_rins"})
489
490                 # repair_node_keys
491                 for n in ["bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-validate-exception-done",
492                                         "bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done",
493                                         "bminit-cfg-auth-bootcheckfail-authfail-exception-update-debug-validate-exception-done",
494                                 ]:
495                         sequences.update({n: "repair_node_keys"})
496
497                 #   conn.restart_node('reinstall')
498                 for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
499                                 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
500                                 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
501                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done",
502                                 "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
503                                 "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
504                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done",
505                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done",
506                                 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
507                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
508                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
509                                 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
510                                 ]:
511                         sequences.update({n : "restart_node_rins"})
512
513                 #       restart_node_boot
514                 for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done",
515                                  "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
516                                  "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
517                                  "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
518                                  "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
519                                  "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
520                                  "bminit-cfg-auth-getplc-exception-update-bootupdatefail-debug-done",
521                                  ]:
522                         sequences.update({n: "restart_node_boot"})
523
524                 # fsck_repair
525                 for n in ["bminit-cfg-auth-getplc-update-installinit-validate-fsckabort-exception-fsckfail-bmexceptmount-exception-noinstall-update-debug-validate-fsckabort-exception-fsckfail-bmexceptmount-done",
526                                   "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-noinstall-update-debug-validate-exception-fsckfail-done",
527                                   "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail-exception-noinstall-update-debug-validate-exception-fsckfail-done"
528                                 ]:
529                         sequences.update({n : "fsck_repair"})
530
531                 # update_node_config_email
532                 for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
533                                   "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
534                                   "bminit-cfg-exception-update-bootupdatefail-nonode-debug-validate-exception-done",
535                                   "bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-validate-exception-done",
536                                   "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
537                                   "bminit-cfg-exception-noconfig-update-debug-validate-exception-done",
538                                 ]:
539                         sequences.update({n : "update_node_config_email"})
540
541                 for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", 
542                                    "bminit-cfg-update-exception-nodehostname-update-debug-validate-exception-done",
543                                    "bminit-cfg-update-exception-nodehostname-update-debug-done", 
544                                 ]:
545                         sequences.update({n : "nodenetwork_email"})
546
547                 # update_bootcd_email
548                 for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
549                                 "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
550                                 "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
551                                 "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
552                                 "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
553                                 ]:
554                         sequences.update({n : "update_bootcd_email"})
555
556                 for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
557                                 ]:
558                         sequences.update({n: "suspect_error_email"})
559
560                 # update_hardware_email
561                 sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
562                 sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
563
564                 # broken_hardware_email
565                 sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})
566
567                 # bad_dns_email
568                 for n in [ 
569                  "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
570                         "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
571                         ]:
572                         sequences.update( { n : "bad_dns_email"})
573
574                 return sequences
575
576         def getDiskSteps(self):
577                 steps = [
578                         ('scsierror'  , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
579                         ('ioerror'    , 'end_request: I/O error, dev sd\w+, sector \d+'),
580                         ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION  byte \w+ = \w+'),
581
582                         ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
583
584                         ('hdaseekerror', 'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'),
585                         ('hdacorrecterror', 'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'),
586
587                         ('atareadyerror'   , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
588                         ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),
589
590                         ('sdXerror'   , 'sd\w: Current: sense key: Medium Error'),
591                         ('ext3error'   , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),
592
593                         ('floppytimeout','floppy0: floppy timeout called'),
594                         ('floppyerror',  'end_request: I/O error, dev fd\w+, sector \d+'),
595
596                         # hda: dma_intr: status=0x51 { DriveReady SeekComplete Error }
597                         # hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263
598
599                         # floppy0: floppy timeout called
600                         # end_request: I/O error, dev fd0, sector 0
601
602                         # Buffer I/O error on device dm-2, logical block 8888896
603                         # ata1: status=0x51 { DriveReady SeekComplete Error }
604                         # ata1: error=0x40 { UncorrectableError }
605                         # SCSI error : <0 0 0 0> return code = 0x8000002
606                         # sda: Current: sense key: Medium Error
607                         #       Additional sense: Unrecovered read error - auto reallocate failed
608
609                         # SCSI error : <0 2 0 0> return code = 0x40001
610                         # end_request: I/O error, dev sda, sector 572489600
611                 ]
612                 return steps
613
614         def getDiskSequence(self, steps, child):
615                 sequence = []
616                 while True:
617                         id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
618                         sequence.append(id)
619
620                         if id == "done":
621                                 break
622                 return sequence
623
624         def getBootManagerStepPatterns(self):
625                 steps = [
626                         ('bminit'               , 'Initializing the BootManager.'),
627                         ('cfg'                  , 'Reading node configuration file.'),
628                         ('auth'                 , 'Authenticating node with PLC.'),
629                         ('getplc'               , 'Retrieving details of node from PLC.'),
630                         ('update'               , 'Updating node boot state at PLC.'),
631                         ('hardware'             , 'Checking if hardware requirements met.'),
632                         ('installinit'  , 'Install: Initializing.'),
633                         ('installdisk'  , 'Install: partitioning disks.'),
634                         ('installbootfs', 'Install: bootstrapfs tarball.'),
635                         ('installcfg'   , 'Install: Writing configuration files.'),
636                         ('installstop'  , 'Install: Shutting down installer.'),
637                         ('update2'              , 'Updating node boot state at PLC.'),
638                         ('installinit2' , 'Install: Initializing.'),
639                         ('validate'             , 'Validating node installation.'),
640                         ('rebuildinitrd', 'Rebuilding initrd'),
641                         ('netcfg'               , 'Install: Writing Network Configuration files.'),
642                         ('update3'              , 'Updating node configuration.'),
643                         ('disk'                 , 'Checking for unused disks to add to LVM.'),
644                         ('update4'              , 'Sending hardware configuration to PLC.'),
645                         ('debug'                , 'Starting debug mode'),
646                         ('bmexceptmount', 'BootManagerException during mount'),
647                         ('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'),
648                         ('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
649                         ('exception'    , 'Exception'),
650                         ('nocfg'        , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
651                         ('protoerror'   , 'XML RPC protocol error'),
652                         ('nodehostname' , 'Configured node hostname does not resolve'),
653                         ('implementerror', 'Implementation Error'),
654                         ('fsckabort'    , 'is mounted.  e2fsck: Cannot continue, aborting'),
655                         ('fsckfail'             , 'Running e2fsck -v -p /dev/planetlab/root failed'),
656                         ('readonlyfs'   , '\[Errno 30\] Read-only file system'),
657                         ('baddisk'      , "IOError: \[Errno 13\] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"),
658                         ('noinstall'    , 'notinstalled'),
659                         ('bziperror'    , 'bzip2: Data integrity error when decompressing.'),
660                         ('noblockdev'   , "No block devices detected."),
661                         ('dnserror'     , 'Name or service not known'),
662                         ('noconfig'             , "Unable to find and read a node configuration file"),
663                         ('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
664                         ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
665                         ('hardwarerequirefail' , 'Hardware requirements not met'),
666                         ('mkfsfail'         , 'while running: Running mkfs.ext2 -q  -m 0 -j /dev/planetlab/vservers failed'),
667                         ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
668                         ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
669                         ('chrootfail'   , 'Running chroot /tmp/mnt/sysimg'),
670                         ('modulefail'   , 'Unable to get list of system modules'),
671                         ('writeerror'   , 'write error: No space left on device'),
672                         ('nospace'      , "No space left on device"),
673                         ('nonode'       , 'Failed to authenticate call: No such node'),
674                         ('authfail'     , 'Failed to authenticate call: Call could not be authenticated'),
675                         ('bootcheckfail'     , 'BootCheckAuthentication'),
676                         ('bootupdatefail'   , 'BootUpdateNode'),
677                 ]
678                 return steps
679
680         def getBootManagerSequenceFromLog(self, steps, child):
681                 sequence = []
682                 while True:
683                         
684                         index = child.expect( steps_to_list(steps) + [ pexpect.EOF ])
685                         id = index_to_id(steps,index)
686                         sequence.append(id)
687
688                         if id == "exception":
689                                 print "...Found An Exception!!!"
690                         elif id == "done": #index == len(steps_to_list(steps)):
691                                 #print "Reached EOF"
692                                 break
693
694                 return sequence
695                 
def restore(sitehist, hostname, config=None, forced_action=None):
	"""Try to recover *hostname* from debug state, then flush the DB session.

	Thin wrapper around restore_basic() that ensures any database records
	created while handling the node are persisted before returning its
	result.
	"""
	result = restore_basic(sitehist, hostname, config, forced_action)
	session.flush()
	return result
700
def restore_basic(sitehist, hostname, config=None, forced_action=None):
	"""Diagnose a node stuck in debug state and take one recovery action.

	Steps, in order:
	  1. If the node's bootcd is too old, notify the owner, disable the
	     node at PLC, and stop (nothing else can work).
	  2. Connect to the node; bail out unless it is actually in debug
	     state with no BootManager already running.
	  3. Scan dmesg for disk errors; on suspected bad disks, notify the
	     owner, disable the node, and stop.
	  4. Scan bm.log, classify the observed step sequence via
	     getSequences(), and perform the mapped action (restart
	     BootManager/node, fsck, repair keys, or send a targeted notice).

	Returns True when the node was handled (or needs no action), False on
	failure (bad disk, no connection, API error).  *forced_action* is
	accepted for interface compatibility but not used in this body.
	"""

	# NOTE: Nothing works if the bootcd is REALLY old.
	#       So, this is the first step.

	fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
	recent_actions = sitehist.getRecentActions(hostname=hostname)

	if fbnode['observed_category'] == "OLDBOOTCD":
		print "\t...Notify owner to update BootImage!!!"

		# Rate-limit the notice to one per 3.5 days.
		if not found_within(recent_actions, 'newbootcd_notice', 3.5):
			sitehist.sendMessage('newbootcd_notice', hostname=hostname)

			print "\tDisabling %s due to out-of-date BootImage" % hostname
			api.UpdateNode(hostname, {'boot_state' : 'disabled'})

		# NOTE: nothing else is possible.
		return True

	debugnode = DebugInterface(hostname)
	conn = debugnode.getConnection()
	# getConnection() returns False (a bool) on failure rather than raising.
	if type(conn) == type(False): return False

	boot_state = conn.get_boot_state()
	if boot_state != "debug":
		print "... %s in %s state: skipping..." % (hostname , boot_state)
		# "boot" counts as success; any other non-debug state is a failure.
		return boot_state == "boot"

	if conn.bootmanager_running():
		print "...BootManager is currently running.  Skipping host %s" %hostname
		return True

	# Read persistent flags, tagged on one week intervals.

	# --- Disk-error scan: run the dmesg output through the disk patterns. ---
	if config and not config.quiet: print "...downloading dmesg from %s" %hostname
	dmesg = conn.get_dmesg()
	child = fdpexpect.fdspawn(dmesg)

	steps = debugnode.getDiskSteps()
	sequence = debugnode.getDiskSequence(steps, child)

	s = Set(sequence)
	if config and not config.quiet: print "\tSET: ", s

	# More than just the terminal id in the set means at least one disk
	# pattern matched.
	if len(s) > 1:
		print "...Potential drive errors on %s" % hostname
		# A lone floppy error is tolerated; anything else aborts.
		if len(s) == 2 and 'floppyerror' in s:
			print "...Should investigate.  Continuing with node."
		else:
			print "...Should investigate.  Skipping node."
			# TODO: send message related to these errors.

			if not found_within(recent_actions, 'baddisk_notice', 7):
				print "baddisk_notice not found recently"

				log=conn.get_dmesg().read()
				sitehist.sendMessage('baddisk_notice', hostname=hostname, log=log)
				conn.set_nodestate('disabled')

			return False

	# --- BootManager-log scan: classify the step sequence seen in bm.log. ---
	print "...Downloading bm.log from %s" %hostname
	log = conn.get_bootmanager_log()
	child = fdpexpect.fdspawn(log)

	# With --collect, downloading the logs above is the whole job.
	if hasattr(config, 'collect') and config.collect: return True

	if config and not config.quiet: print "...Scanning bm.log for errors"

	time.sleep(1)

	steps = debugnode.getBootManagerStepPatterns()
	sequence = debugnode.getBootManagerSequenceFromLog(steps, child)

	# The sequence identifier is the dash-joined list of matched step ids.
	s = "-".join(sequence)
	print "   FOUND SEQUENCE: ", s

	# NOTE: We get or set the flag based on the current sequence identifier.
	#  By using the sequence identifier, we guarantee that there will be no
	#  frequent loops.  I'm guessing there is a better way to track loops,
	#  though.

	sequences = debugnode.getSequences()
	# NOTE(review): flag_set is assigned below but never read in this
	# function's visible code — confirm whether persistent-flag recording
	# was removed.
	flag_set = True

	if s not in sequences:
		# Unrecognized sequence: report it (with the full bm.log) and retry
		# a plain boot.
		print "   HOST %s" % hostname
		print "   UNKNOWN SEQUENCE: %s" % s

		args = {}
		args['hostname'] = hostname
		args['sequence'] = s
		args['bmlog'] = conn.get_bootmanager_log().read()
		args['viart'] = False
		args['saveact'] = True
		args['ccemail'] = True

		sitehist.sendMessage('unknownsequence_notice', **args)

		conn.restart_bootmanager('boot')

		# NOTE: Do not set the pflags value for this sequence if it's unknown.
		# This way, we can check it again after we've fixed it.
		flag_set = False

	else:

		# Known sequence: dispatch on the mapped action name.
		if   sequences[s] == "restart_bootmanager_boot":
			print "...Restarting BootManager.py on %s "%hostname
			conn.restart_bootmanager('boot')
		elif sequences[s] == "restart_bootmanager_rins":
			print "...Restarting BootManager.py on %s "%hostname
			conn.restart_bootmanager('reinstall')
		elif sequences[s] == "restart_node_rins":
			conn.restart_node('reinstall')
		elif sequences[s] == "restart_node_boot":
			conn.restart_node('boot')
		elif sequences[s] == "fsck_repair":
			conn.fsck_repair_node()
		elif sequences[s] == "repair_node_keys":
			if conn.compare_and_repair_nodekeys():
				# the keys either are in sync or were forced in sync.
				# so try to start BM again.
				conn.restart_bootmanager(conn.get_nodestate())
				pass
			else:
				# there was some failure to synchronize the keys.
				print "...Unable to repair node keys on %s" %hostname

		elif sequences[s] == "suspect_error_email":
			# Same notice/restart as the unknown-sequence path, but for a
			# recognized suspicious sequence.
			args = {}
			args['hostname'] = hostname
			args['sequence'] = s
			args['bmlog'] = conn.get_bootmanager_log().read()
			args['viart'] = False
			args['saveact'] = True
			args['ccemail'] = True

			sitehist.sendMessage('unknownsequence_notice', **args)
			conn.restart_bootmanager('boot')

		# TODO: differentiate this and the 'nodenetwork_email' actions.
		elif sequences[s] == "update_node_config_email":

			if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
				args = {}
				args['hostname'] = hostname
				sitehist.sendMessage('nodeconfig_notice', **args)
				conn.dump_plconf_file()

		elif sequences[s] == "nodenetwork_email":

			if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
				args = {}
				args['hostname'] = hostname
				args['bmlog'] = conn.get_bootmanager_log().read()
				sitehist.sendMessage('nodeconfig_notice', **args)
				conn.dump_plconf_file()

		elif sequences[s] == "update_bootcd_email":

			if not found_within(recent_actions, 'newalphacd_notice', 3.5):
				args = {}
				args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
				args['hostname'] = hostname

				sitehist.sendMessage('newalphacd_notice', **args)

				# NOTE(review): this prints "Disabling" but no UpdateNode /
				# set_nodestate call follows — confirm whether disabling was
				# intended here.
				print "\tDisabling %s due to out-of-date BOOTCD" % hostname

		elif sequences[s] == "broken_hardware_email":
			# MAKE An ACTION record that this host has failed hardware.  May
			# require either an exception "/minhw" or other manual intervention.
			# Definitely need to send out some more EMAIL.
			# TODO: email notice of broken hardware
			if not found_within(recent_actions, 'baddisk_notice', 7):
				print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
				args = {}
				args['hostname'] = hostname
				args['log'] = conn.get_dmesg().read()

				sitehist.sendMessage('baddisk_notice', **args)
				conn.set_nodestate('disabled')

		elif sequences[s] == "update_hardware_email":
			if not found_within(recent_actions, 'minimalhardware_notice', 7):
				print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
				args = {}
				args['hostname'] = hostname
				args['bmlog'] = conn.get_bootmanager_log().read()
				sitehist.sendMessage('minimalhardware_notice', **args)

		elif sequences[s] == "bad_dns_email":
			if not found_within(recent_actions, 'baddns_notice', 1):
				print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
				args = {}
				try:
					node = plccache.GetNodeByName(hostname)
					net = api.GetInterfaces(node['interface_ids'])[0]
				except:
					email_exception()
					print traceback.print_exc()
					# TODO: api error. skip email, b/c all info is not available,
					# flag_set will not be recorded.
					return False
				nodenet_str = network_config_to_str(net)

				args['hostname'] = hostname
				args['network_config'] = nodenet_str
				args['interface_id'] = net['interface_id']

				sitehist.sendMessage('baddns_notice', **args)

	return True
916         
917
918 # MAIN -------------------------------------------------------------------
919
def main():
	"""Command-line entry point: restore each requested node from debug state.

	Parses the node selection (--node / --nodelist from the 'nodesets'
	parser group) and behavior flags, then looks up each node's site and
	runs restore() against it.  Exits with status 1 when no nodes are
	given.
	"""
	from monitor import parser as parsermodule
	parser = parsermodule.getParser()

	parser.set_defaults(child=False, collect=False, nosetup=False, verbose=False,
						force=None, quiet=False)
	parser.add_option("", "--child", dest="child", action="store_true",
						help="This is the child mode of this process.")
	parser.add_option("", "--force", dest="force", metavar="boot_state",
						help="Force a boot state passed to BootManager.py.")
	parser.add_option("", "--quiet", dest="quiet", action="store_true",
						help="Extra quiet output messages.")
	parser.add_option("", "--verbose", dest="verbose", action="store_true",
						help="Extra debug output messages.")
	parser.add_option("", "--nonet", dest="nonet", action="store_true",
						help="Do not setup the network, use existing log files to re-run a test pass.")
	parser.add_option("", "--collect", dest="collect", action="store_true",
						help="No action, just collect dmesg, and bm.log")
	parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
						help="Do not perform the orginary setup phase.")

	parser = parsermodule.getParser(['nodesets', 'defaults'], parser)
	config = parsermodule.parse_args(parser)

	# Determine the node list: an explicit file wins over a single node.
	if config.nodelist:
		nodes = config.getListFromFile(config.nodelist)
	elif config.node:
		nodes = [ config.node ]
	else:
		parser.print_help()
		sys.exit(1)

	for node in nodes:
		# Map the hostname to its site loginbase to fetch the site history.
		lb = plccache.plcdb_hn2lb[node]
		sitehist = SiteInterface.get_or_make(loginbase=lb)
		#reboot(node, config)
		# BUGFIX: previously passed config=None, which silently disabled the
		# --quiet and --collect options parsed above (restore_basic consults
		# config.quiet and config.collect).  Pass the parsed config through.
		restore(sitehist, node, config=config, forced_action=None)
958
# Run as a script: dispatch to main() (no side effects on import).
if __name__ == "__main__":
	main()