More convenience functions to create an API interface as a user or node.
[monitor.git] / monitor / bootman.py
1 #!/usr/bin/python
2
3 # Attempt to reboot a node in debug state.
4
5
6
7 import os
8 import sys
9 import time
10 import random
11 import signal
12 import traceback
13 import subprocess
14 from sets import Set
15
16 from monitor.getsshkeys import SSHKnownHosts
17
18 from monitor.Rpyc import SocketConnection, Async
19 from monitor.Rpyc.Utils import *
20
21 from monitor import getconf
22 from monitor import config
23 from monitor import const
24 from monitor.model import *
25 from monitor.common import email_exception, found_within
26 from monitor.database.info.model import *
27 from monitor.database.info.interface import *
28 from monitor.wrapper import plc
29 from monitor.wrapper import plccache
30 from monitor.wrapper.emailTxt import mailtxt
31 from monitor.nodeconfig import network_config_to_str
32
33 from pcucontrol.util import command as moncommands
34 from pcucontrol.util.command import Sopen
35 from pcucontrol.transports.ssh import pxssh as pxssh
36 from pcucontrol.transports.ssh import fdpexpect as fdpexpect
37 from pcucontrol.transports.ssh import pexpect as pexpect
38
39
40
# Shared, authenticated PLCAPI handle used throughout this module
# (e.g. the UpdateNode calls in NodeConnection).
api = plc.getAuthAPI()
# Never assigned anywhere in this module besides this line; presumably a
# findbad/status cache populated by importing code -- TODO confirm.
fb = None
43
44
class NodeConnection:
        """Remote-control handle for a single node over an Rpyc connection.

        Every ``self.c.modules.<mod>`` attribute access is proxied by Rpyc and
        executes on the *remote* node, so e.g. ``self.c.modules.os.system(...)``
        runs the command on the node itself, not on the monitor host.
        """

        def __init__(self, connection, node, config):
                # connection -- an Rpyc SocketConnection (normally through the
                #               SSH tunnel built by PlanetLabSession)
                # node       -- the node's hostname
                # config     -- options object passed through from the caller
                self.node = node
                self.c = connection
                self.config = config

        def get_boot_state(self):
                """Classify the node's state by probing marker paths on it.

                Returns "debug" when the BootManager source tree is unpacked in
                /tmp/source, "boot" when /vservers exists (normal production
                boot), and "unknown" otherwise or on any error.
                """
                try:
                        if self.c.modules.os.path.exists('/tmp/source'):
                                return "debug"
                        elif self.c.modules.os.path.exists('/vservers'):
                                return "boot"
                        else:
                                return "unknown"
                except EOFError:
                        # The Rpyc channel died mid-request; dump what we can.
                        traceback.print_exc()
                        print self.c.modules.sys.path
                except:
                        email_exception()
                        traceback.print_exc()

                return "unknown"

        def get_dmesg(self):
                """Snapshot the node's dmesg to a file, download it, and return
                an open read handle on the local copy log/dmesg.<node>.log."""
                self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
                download(self.c, "/var/log/dmesg.bm.log", "log/dmesg.%s.log" % self.node)
                log = open("log/dmesg.%s.log" % self.node, 'r')
                return log

        def get_bootmanager_log(self):
                """Download the node's BootManager log and return an open read
                handle on the local copy log/bm.<node>.log."""
                download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node)
                #os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
                # NOTE(review): despite the .gz suffix the file is copied as-is
                # rather than decompressed -- presumably /tmp/bm.log is plain
                # text; confirm before relying on the .gz name.
                os.system("cp log/bm.%s.log.gz log/bm.%s.log" % (self.node, self.node))
                log = open("log/bm.%s.log" % self.node, 'r')
                return log

        def dump_plconf_file(self):
                """Run the BootManager initialization/config-read steps on the
                node and print every resulting VARS key/value, showing the
                node's own view of its configuration."""
                c = self.c
                self.c.modules.sys.path.append("/tmp/source/")
                self.c.modules.os.chdir('/tmp/source')

                log = c.modules.BootManager.log('/tmp/new.log')
                bm = c.modules.BootManager.BootManager(log,'boot')

                BootManagerException = c.modules.Exceptions.BootManagerException
                InitializeBootManager = c.modules.BootManager.InitializeBootManager
                ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
                bm_continue = True

                InitializeBootManager.Run(bm.VARS, bm.LOG)
                try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
                except Exception, x:
                        bm_continue = False
                        print "   ERROR:", x
                        print "   Possibly, unable to find valid configuration file"

                if bm_continue:
                        for key in bm.VARS.keys():
                                print key, " == ", bm.VARS[key]
                else:
                        print "   Unable to read Node Configuration"


        def compare_and_repair_nodekeys(self):
                """Compare the node's on-disk NODE_KEY with PLC's record and,
                when they differ, push the node's key up to PLC.

                Returns True when the keys already match or the UpdateNode call
                succeeds, False when the update fails, and falls through
                (returning None) when the node config cannot be read.
                """
                c = self.c
                self.c.modules.sys.path.append("/tmp/source/")
                self.c.modules.os.chdir('/tmp/source')

                log = c.modules.BootManager.log('/tmp/new.log')
                bm = c.modules.BootManager.BootManager(log,'boot')

                BootManagerException = c.modules.Exceptions.BootManagerException
                InitializeBootManager = c.modules.BootManager.InitializeBootManager
                ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
                bm_continue = True

                plcnode = plccache.GetNodeByName(self.node)

                InitializeBootManager.Run(bm.VARS, bm.LOG)
                try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
                except Exception, x:
                        bm_continue = False
                        print "exception"
                        print x
                        print "   Possibly, unable to find valid configuration file"

                if bm_continue:
                        print "   NODE: %s" % bm.VARS['NODE_KEY']
                        print "   PLC : %s" % plcnode['key']

                        if bm.VARS['NODE_KEY'] == plcnode['key']:
                                return True
                        else:
                                if api.UpdateNode(self.node, {'key': bm.VARS['NODE_KEY']}):
                                        print "   Successfully updated NODE_KEY with PLC"
                                        return True
                                else:
                                        return False

                        #for key in bm.VARS.keys():
                        #       print key, " == ", bm.VARS[key]
                else:
                        print "   Unable to retrieve NODE_KEY"

        def bootmanager_running(self):
                """Return True iff a BootManager run has left its
                /tmp/BM_RUNNING marker on the node."""
                if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
                        return True
                else:
                        return False

        def set_nodestate(self, state='boot'):
                """Record *state* as the node's boot_state at PLC (no reboot)."""
                return api.UpdateNode(self.node, {'boot_state' : state})

        def restart_node(self, state='boot'):
                """Reboot the node into *state*, escalating on repeated tries.

                First attempt within a 24h window: kill all slice processes and
                schedule a clean ``shutdown -r +1``.  Any further attempt while
                the 'gentlekill' flag is still recent skips straight to a sysrq
                's' (sync), 'u' (remount-ro), 'b' (reboot) hammer.
                """
                api.UpdateNode(self.node, {'boot_state' : state})

                pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags')
                if not pflags.getRecentFlag('gentlekill'):
                        print "   Killing all slice processes... : %s" %  self.node
                        cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
                        self.c.modules.os.system(cmd_slicekill)
                        cmd = """ shutdown -r +1 & """
                        print "   Restarting %s : %s" % ( self.node, cmd)
                        self.c.modules.os.system(cmd)

                        # Remember that the gentle path was tried, so the next
                        # call inside the window escalates to sysrq.
                        pflags.setRecentFlag('gentlekill')
                        pflags.save()
                else:
                        print "   Restarting with sysrq 'sub' %s" % self.node
                        cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """
                        self.c.modules.os.system(cmd)

                return

        def restart_bootmanager(self, forceState):
                """Re-run BootManager.py on the node forced into *forceState*
                (e.g. 'boot' or 'rins'), unless one is already running.

                The /tmp/BM_RUNNING marker guards against concurrent runs and
                is removed by the backgrounded shell when BootManager exits.
                """
                self.c.modules.os.chdir('/tmp/source')
                if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
                        print "   BootManager is already running: try again soon..."
                else:
                        print "   Starting 'BootManager.py %s' on %s " % (forceState, self.node)
                        cmd = "( touch /tmp/BM_RUNNING ;  " + \
                              "  python ./BootManager.py %s &> server.log < /dev/null ; " + \
                                  "  rm -f /tmp/BM_RUNNING " + \
                                  ") &"
                        cmd = cmd % forceState
                        self.c.modules.os.system(cmd)

                return
194
195
class PlanetLabSession:
        """Stands up an Rpyc server on a node and an SSH tunnel to reach it.

        setup_host() copies the Rpyc library to the node, (re)starts a forking
        Rpyc server there, then opens a local port forward to the server's
        port 18812.  get_connection() then wraps that tunnel in a
        NodeConnection.
        """
        # Class-wide local-port counter: randomized base so concurrent monitor
        # processes are unlikely to collide, incremented once per session.
        globalport = 22000 + int(random.random()*1000)

        def __init__(self, node, nosetup, verbose):
                # node    -- hostname to set up and tunnel to
                # nosetup -- skip the rsync/remote-server setup steps
                # verbose -- echo the commands being run
                self.verbose = verbose
                self.node = node
                self.port = None
                self.nosetup = nosetup
                self.command = None
                self.setup_host()

        def get_connection(self, config):
                """Return a NodeConnection speaking Rpyc through the local end
                of the SSH tunnel created by setup_host()."""
                conn = NodeConnection(SocketConnection("localhost", self.port), self.node, config)
                #i = 0
                #while i < 3:
                #       print i, conn.c.modules.sys.path
                #       print conn.c.modules.os.path.exists('/tmp/source')
                #       i+=1
                #       time.sleep(1)
                return conn

        def setup_host(self):
                """Copy Rpyc to the node, restart its Rpyc server, and open a
                forwarded SSH tunnel.  Raises on unrecoverable ssh failures."""
                # Claim a unique local port for this session's tunnel.
                self.port = PlanetLabSession.globalport
                PlanetLabSession.globalport = PlanetLabSession.globalport + 1

                args = {}
                args['port'] = self.port
                args['user'] = 'root'
                args['hostname'] = self.node
                args['monitordir'] = config.MONITOR_SCRIPT_ROOT
                ssh_port = 22

                if self.nosetup:
                        print "Skipping setup"
                        return

                # COPY Rpyc files to host
                cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
                if self.verbose: print cmd
                print cmd
                # TODO: Add timeout
                timeout = 120
                localos = moncommands.CMD()

                ret = localos.system(cmd, timeout)
                print ret
                if ret != 0:
                        # First failure is assumed to be a stale host key:
                        # refresh known_hosts for this node and retry once.
                        print "\tUNKNOWN SSH KEY FOR %s; making an exception" % self.node
                        #print "MAKE EXPLICIT EXCEPTION FOR %s" % self.node
                        k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k
                        ret = localos.system(cmd, timeout)
                        print ret
                        if ret != 0:
                                print "\tFAILED TWICE"
                                #sys.exit(1)
                                raise Exception("Failed twice trying to login with updated ssh host key")

                t1 = time.time()
                # KILL any already running servers, then start a fresh one.
                ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port)
                (ov,ev) = ssh.run_noexcept2("""<<\EOF
            rm -f out.log
            echo "kill server" >> out.log
            ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ;
            echo "export" >> out.log
            export PYTHONPATH=$HOME  ;
            echo "start server" >> out.log
            python Rpyc/Servers/forking_server.py &> server.log &
            echo "done" >> out.log
EOF""")
                #cmd = """ssh %(user)s@%(hostname)s """ + \
                #        """'ps ax | grep Rpyc | grep -v grep | awk "{print \$1}" | xargs kill 2> /dev/null' """
                #cmd = cmd % args
                #if self.verbose: print cmd
                ## TODO: Add timeout
                #print localos.system(cmd,timeout)

                ## START a new rpyc server.
                #cmd = """ssh -n %(user)s@%(hostname)s "export PYTHONPATH=\$HOME; """ + \
                #        """python Rpyc/Servers/forking_server.py &> server.log < /dev/null &" """
                #cmd = cmd % args
                #if self.verbose: print cmd
                #print localos.system(cmd,timeout)
                print "setup rpyc server over ssh"
                print ssh.ret

                # TODO: Add timeout
                # This was tricky to make synchronous.  The combination of ssh-clients-4.7p1,
                # and the following options seems to work well:  LocalCommand
                # prints READY on stdout once the connection is established,
                # and ExitOnForwardFailure makes a failed forward fatal.
                cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \
                          """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \
                          """-o ConnectTimeout=120 """ + \
                          """-n -N -L %(port)s:localhost:18812 """ + \
                          """%(user)s@%(hostname)s"""
                cmd = cmd % args
                if self.verbose: print cmd
                print cmd
                self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
                # TODO: the read() here may block indefinitely.  Need a better
                # approach therefore, that includes a timeout.
                #ret = self.command.stdout.read(5)
                ret = moncommands.read_t(self.command.stdout, 5)

                t2 = time.time()
                if 'READY' in ret:
                        # NOTE: There is still a slight race for machines that are slow...
                        # Wait roughly twice as long as setup took before use.
                        self.timeout = 2*(t2-t1)
                        print "Sleeping for %s sec" % self.timeout
                        time.sleep(self.timeout)
                        return

                if self.command.returncode is not None:
                        print "Failed to establish tunnel!"
                        raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode))

                raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'")

        def __del__(self):
                """Tear down the tunnel subprocess when the session is GC'd."""
                if self.command:
                        if self.verbose: print "Killing SSH session %s" % self.port
                        print "Killing SSH session %s" % self.port
                        self.command.kill()
318
319         
def steps_to_list(steps, index=1):
        """Project column *index* (default 1: the pattern text) out of a list
        of (id, pattern) step tuples."""
        return [step[index] for step in steps]
322
def index_to_id(steps, index):
        """Map a pexpect match *index* back to its step id; any index past the
        end of *steps* (e.g. an EOF match) is reported as "done"."""
        return steps[index][0] if index < len(steps) else "done"
328
329 class DebugInterface:
330         def __init__(self, hostname):
331                 self.hostname = hostname
332                 self.session = None
333
        def getConnection(self):
                """Open a debug session to self.hostname and return a ready
                NodeConnection, or False on any failure (failures are emailed
                via email_exception)."""
                print "Creating session for %s" % self.hostname
                # update known_hosts file (in case the node has rebooted since last run)
                try:
                        k = SSHKnownHosts(); k.update(self.hostname); k.write(); del k
                except:
                        email_exception()
                        print traceback.print_exc()
                        return False

                try:
                        # Fall back to conservative defaults (do setup, be
                        # verbose) when no config module/object is available.
                        if config == None:
                                self.session = PlanetLabSession(self.hostname, False, True)
                        else:
                                self.session = PlanetLabSession(self.hostname, config.nosetup, config.verbose)
                except Exception, e:
                        msg = "ERROR setting up session for %s" % self.hostname
                        print msg
                        traceback.print_exc()
                        email_exception(msg)
                        return False

                try:
                        conn = self.session.get_connection(config)
                except EOFError:
                        # NOTE: sometimes the wait in setup_host() is not long enough.
                        # So, here we try to wait a little longer before giving up entirely.
                        try:
                                time.sleep(self.session.timeout*5)
                                conn = self.session.get_connection(config)
                        except:
                                traceback.print_exc()
                                email_exception(self.hostname)
                                return False
                #print "trying to use conn before returning it."
                #print conn.c.modules.sys.path
                #print conn.c.modules.os.path.exists('/tmp/source')
                #time.sleep(1)

                #print "conn: %s" % conn
                return conn
375
        def getSequences(self):
                """Return the table mapping an observed BootManager step
                sequence (the step ids from getBootManagerStepPatterns joined
                with '-') to the repair action or notification to take.

                Later entries win if a sequence were listed twice, since the
                table is built with dict.update in source order.
                """
                # TODO: This can be replaced with a DB definition at a future time.
                #               This would make it possible for an admin to introduce new
                #               patterns without touching code.

                sequences = {}
                # restart_bootmanager_boot
                for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
                                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
                                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",

                                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done",

                                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
                                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
                                "bminit-cfg-auth-getplc-update-debug-done",
                                "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
                                "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
                                "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
                                "bminit-cfg-auth-protoerror-exception-update-debug-done",
                                "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
                                "bminit-cfg-auth-getplc-implementerror-update-debug-done",
                                ]:
                        sequences.update({n : "restart_bootmanager_boot"})

                #       conn.restart_bootmanager('rins')
                for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
                                "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
                                "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
                                "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
                                "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
                                "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done",
                                "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
                                "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
                                "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
                                "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
                                "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
                                "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
                                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
                                "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done",
                                "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
                                "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
                                "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
                                # actual solution appears to involve removing the bad files, and
                                # continually trying to boot the node.
                                "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
                                "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
                                "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
                                ]:
                        sequences.update({n : "restart_bootmanager_rins"})

                # repair_node_keys
                sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"})

                #   conn.restart_node('rins')
                for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
                                "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
                                "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
                                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done",
                                "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
                                "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
                                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done",
                                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done",
                                "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
                                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
                                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
                                "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
                                ]:
                        sequences.update({n : "restart_node_rins"})

                #       restart_node_boot
                for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done",
                                 "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
                                 "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
                                 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
                                 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
                                 "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
                                 ]:
                        sequences.update({n: "restart_node_boot"})

                # update_node_config_email
                for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
                                  "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
                                  "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
                                ]:
                        sequences.update({n : "update_node_config_email"})

                for n in [ "bminit-cfg-exception-nodehostname-update-debug-done",
                                   "bminit-cfg-update-exception-nodehostname-update-debug-done",
                                ]:
                        sequences.update({n : "nodenetwork_email"})

                # update_bootcd_email
                for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
                                "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
                                "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
                                "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
                                "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
                                ]:
                        sequences.update({n : "update_bootcd_email"})

                for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
                                ]:
                        sequences.update({n: "suspect_error_email"})

                # update_hardware_email
                sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
                sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"})

                # broken_hardware_email
                sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})

                # bad_dns_email
                for n in [
                 "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
                        "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
                        ]:
                        sequences.update( { n : "bad_dns_email"})

                return sequences
497
498         def getDiskSteps(self):
499                 steps = [
500                         ('scsierror'  , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
501                         ('ioerror'    , 'end_request: I/O error, dev sd\w+, sector \d+'),
502                         ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION  byte \w+ = \w+'),
503
504                         ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
505
506                         ('hdaseekerror', 'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'),
507                         ('hdacorrecterror', 'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'),
508
509                         ('atareadyerror'   , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
510                         ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),
511
512                         ('sdXerror'   , 'sd\w: Current: sense key: Medium Error'),
513                         ('ext3error'   , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),
514
515                         ('floppytimeout','floppy0: floppy timeout called'),
516                         ('floppyerror',  'end_request: I/O error, dev fd\w+, sector \d+'),
517
518                         # hda: dma_intr: status=0x51 { DriveReady SeekComplete Error }
519                         # hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263
520
521                         # floppy0: floppy timeout called
522                         # end_request: I/O error, dev fd0, sector 0
523
524                         # Buffer I/O error on device dm-2, logical block 8888896
525                         # ata1: status=0x51 { DriveReady SeekComplete Error }
526                         # ata1: error=0x40 { UncorrectableError }
527                         # SCSI error : <0 0 0 0> return code = 0x8000002
528                         # sda: Current: sense key: Medium Error
529                         #       Additional sense: Unrecovered read error - auto reallocate failed
530
531                         # SCSI error : <0 2 0 0> return code = 0x40001
532                         # end_request: I/O error, dev sda, sector 572489600
533                 ]
534                 return steps
535
536         def getDiskSequence(self, steps, child):
537                 sequence = []
538                 while True:
539                         id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
540                         sequence.append(id)
541
542                         if id == "done":
543                                 break
544                 return sequence
545
	def getBootManagerStepPatterns(self):
		"""Return the ordered (id, pattern) pairs matched against bm.log.

		The ids of all patterns matched in the log are later joined with
		'-' to form a sequence identifier, which is looked up in
		getSequences() to choose a repair action for the node.
		"""
		steps = [
			# Normal BootManager progress messages, in boot/install order.
			('bminit'		, 'Initializing the BootManager.'),
			('cfg'			, 'Reading node configuration file.'),
			('auth'			, 'Authenticating node with PLC.'),
			('getplc'		, 'Retrieving details of node from PLC.'),
			('update'		, 'Updating node boot state at PLC.'),
			('hardware'		, 'Checking if hardware requirements met.'),
			('installinit'	, 'Install: Initializing.'),
			('installdisk'	, 'Install: partitioning disks.'),
			('installbootfs', 'Install: bootstrapfs tarball.'),
			('installcfg'	, 'Install: Writing configuration files.'),
			('installstop'	, 'Install: Shutting down installer.'),
			('update2'		, 'Updating node boot state at PLC.'),
			('installinit2'	, 'Install: Initializing.'),
			('validate'		, 'Validating node installation.'),
			('rebuildinitrd', 'Rebuilding initrd'),
			('netcfg'		, 'Install: Writing Network Configuration files.'),
			('update3'		, 'Updating node configuration.'),
			('disk'			, 'Checking for unused disks to add to LVM.'),
			('update4'		, 'Sending hardware configuration to PLC.'),
			('debug'		, 'Starting debug mode'),
			# Known error signatures; their presence in the sequence
			# identifier selects the corresponding repair/notify action.
			('bmexceptmount', 'BootManagerException during mount'),
			('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'),
			('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
			('exception'	, 'Exception'),
			('nocfg'        , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
			('protoerror'	, 'XML RPC protocol error'),
			('nodehostname'	, 'Configured node hostname does not resolve'),
			('implementerror', 'Implementation Error'),
			('readonlyfs'	, '[Errno 30] Read-only file system'),
			('baddisk'      , "IOError: [Errno 13] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"),
			('noinstall'	, 'notinstalled'),
			('bziperror'	, 'bzip2: Data integrity error when decompressing.'),
			('noblockdev'	, "No block devices detected."),
			('dnserror'     , 'Name or service not known'),
			('downloadfail'	, 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
			('disktoosmall'	, 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
			('hardwarerequirefail' , 'Hardware requirements not met'),
			('mkfsfail'         , 'while running: Running mkfs.ext2 -q  -m 0 -j /dev/planetlab/vservers failed'),
			('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
			('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
			('chrootfail'	, 'Running chroot /tmp/mnt/sysimg'),
			('modulefail'	, 'Unable to get list of system modules'),
			('writeerror'	, 'write error: No space left on device'),
			('nospace'	, "No space left on device"),
			('nonode'	, 'Failed to authenticate call: No such node'),
			('authfail'	, 'Failed to authenticate call: Call could not be authenticated'),
			('bootcheckfail'     , 'BootCheckAuthentication'),
			('bootupdatefail'   , 'BootUpdateNode'),
		]
		return steps
598
599         def getBootManagerSequenceFromLog(self, steps, child):
600                 sequence = []
601                 while True:
602                         
603                         index = child.expect( steps_to_list(steps) + [ pexpect.EOF ])
604                         id = index_to_id(steps,index)
605                         sequence.append(id)
606
607                         if id == "exception":
608                                 print "...Found An Exception!!!"
609                         elif id == "done": #index == len(steps_to_list(steps)):
610                                 #print "Reached EOF"
611                                 break
612
613                 return sequence
614                 
615
616 def restore(sitehist, hostname, config=None, forced_action=None):
617
618         # NOTE: Nothing works if the bootcd is REALLY old.
619         #       So, this is the first step.
620
621         fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
622         recent_actions = sitehist.getRecentActions(hostname=hostname)
623
624         if fbnode['observed_category'] == "OLDBOOTCD":
625                 print "\t...Notify owner to update BootImage!!!"
626
627                 if not found_within(recent_actions, 'newbootcd_notice', 3):
628                         sitehist.sendMessage('newbootcd_notice', hostname=hostname)
629
630                         print "\tDisabling %s due to out-of-date BootImage" % hostname
631                         api.UpdateNode(hostname, {'boot_state' : 'disable'})
632
633                 # NOTE: nothing else is possible.
634                 return True
635
636         debugnode = DebugInterface(hostname)
637         conn = debugnode.getConnection()
638         #print "conn: %s" % conn
639         #print "trying to use conn after returning it."
640         #print conn.c.modules.sys.path
641         #print conn.c.modules.os.path.exists('/tmp/source')
642         if type(conn) == type(False): return False
643
644         #if forced_action == "reboot":
645         #       conn.restart_node('rins')
646         #       return True
647
648         boot_state = conn.get_boot_state()
649         if boot_state != "debug":
650                 print "... %s in %s state: skipping..." % (hostname , boot_state)
651                 return boot_state == "boot"
652
653         if conn.bootmanager_running():
654                 print "...BootManager is currently running.  Skipping host %s" %hostname 
655                 return True
656
657         # Read persistent flags, tagged on one week intervals.
658
659         if config and not config.quiet: print "...downloading dmesg from %s" %hostname 
660         dmesg = conn.get_dmesg()
661         child = fdpexpect.fdspawn(dmesg)
662
663         steps = debugnode.getDiskSteps()
664         sequence = debugnode.getDiskSequence(steps, child)
665
666         s = Set(sequence)
667         if config and not config.quiet: print "\tSET: ", s
668
669         if len(s) > 1:
670                 print "...Potential drive errors on %s" % hostname 
671                 if len(s) == 2 and 'floppyerror' in s:
672                         print "...Should investigate.  Continuing with node."
673                 else:
674                         print "...Should investigate.  Skipping node."
675                         # TODO: send message related to these errors.
676
677                         if not found_within(recent_actions, 'newbootcd_notice', 3):
678
679                                 log=conn.get_dmesg().read()
680                                 sitehist.sendMessage('baddisk_notice', hostname=hostname, log=log)
681                                 conn.set_nodestate('disable')
682
683                         return False
684
685         print "...Downloading bm.log from %s" %hostname 
686         log = conn.get_bootmanager_log()
687         child = fdpexpect.fdspawn(log)
688
689         if hasattr(config, 'collect') and config.collect: return True
690
691         if config and not config.quiet: print "...Scanning bm.log for errors"
692
693         time.sleep(1)
694
695         steps = debugnode.getBootManagerStepPatterns()
696         sequence = debugnode.getBootManagerSequenceFromLog(steps, child)
697                 
698         s = "-".join(sequence)
699         print "   FOUND SEQUENCE: ", s
700
701         # NOTE: We get or set the flag based on the current sequence identifier.
702         #  By using the sequence identifier, we guarantee that there will be no
703         #  frequent loops.  I'm guessing there is a better way to track loops,
704         #  though.
705
706         sequences = debugnode.getSequences()
707         flag_set = True
708         
709         if s not in sequences:
710                 print "   HOST %s" % hostname
711                 print "   UNKNOWN SEQUENCE: %s" % s
712
713                 args = {}
714                 args['hostname'] = hostname
715                 args['sequence'] = s
716                 args['bmlog'] = conn.get_bootmanager_log().read()
717                 args['viart'] = False
718
719                 sitehist.sendMessage('unknownsequence_notice', **args)
720
721                 conn.restart_bootmanager('boot')
722
723                 # NOTE: Do not set the pflags value for this sequence if it's unknown.
724                 # This way, we can check it again after we've fixed it.
725                 flag_set = False
726
727         else:
728
729                 if   sequences[s] == "restart_bootmanager_boot":
730                         print "...Restarting BootManager.py on %s "%hostname 
731                         conn.restart_bootmanager('boot')
732                 elif sequences[s] == "restart_bootmanager_rins":
733                         print "...Restarting BootManager.py on %s "%hostname 
734                         conn.restart_bootmanager('rins')
735                 elif sequences[s] == "restart_node_rins":
736                         conn.restart_node('rins')
737                 elif sequences[s] == "restart_node_boot":
738                         conn.restart_node('boot')
739                 elif sequences[s] == "repair_node_keys":
740                         if conn.compare_and_repair_nodekeys():
741                                 # the keys either are in sync or were forced in sync.
742                                 # so try to reboot the node again.
743                                 conn.restart_bootmanager('rins')
744                                 pass
745                         else:
746                                 # there was some failure to synchronize the keys.
747                                 print "...Unable to repair node keys on %s" %hostname 
748
749                 elif sequences[s] == "suspect_error_email":
750                         args = {}
751                         args['hostname'] = hostname
752                         args['sequence'] = s
753                         args['bmlog'] = conn.get_bootmanager_log().read()
754                         args['viart'] = False
755
756                         sitehist.sendMessage('unknownsequence_notice', **args)
757                         conn.restart_bootmanager('boot')
758
759                 # TODO: differentiate this and the 'nodenetwork_email' actions.
760                 elif sequences[s] == "update_node_config_email":
761
762                         if not found_within(recent_actions, 'nodeconfig_notice', 3):
763                                 args = {}
764                                 args['hostname'] = hostname
765                                 sitehist.sendMessage('nodeconfig_notice', **args)
766                                 conn.dump_plconf_file()
767
768                 elif sequences[s] == "nodenetwork_email":
769
770                         if not found_within(recent_actions, 'nodeconfig_notice', 3):
771                                 args = {}
772                                 args['hostname'] = hostname
773                                 args['bmlog'] = conn.get_bootmanager_log().read()
774                                 sitehist.sendMessage('nodeconfig_notice', **args)
775                                 conn.dump_plconf_file()
776
777                 elif sequences[s] == "update_bootcd_email":
778
779                         if not found_within(recent_actions, 'newalphacd_notice', 3):
780                                 args = {}
781                                 args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
782                                 args['hostname'] = hostname
783                         
784                                 sitehist.sendMessage('newalphacd_notice', **args)
785
786                                 print "\tDisabling %s due to out-of-date BOOTCD" % hostname
787
788                 elif sequences[s] == "broken_hardware_email":
789                         # MAKE An ACTION record that this host has failed hardware.  May
790                         # require either an exception "/minhw" or other manual intervention.
791                         # Definitely need to send out some more EMAIL.
792                         # TODO: email notice of broken hardware
793                         if not found_within(recent_actions, 'baddisk_notice', 1):
794                                 print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
795                                 args = {}
796                                 args['hostname'] = hostname
797                                 args['log'] = conn.get_dmesg().read()
798
799                                 sitehist.sendMessage('baddisk_notice', **args)
800                                 conn.set_nodestate('disable')
801
802                 elif sequences[s] == "update_hardware_email":
803                         if not found_within(recent_actions, 'minimalhardware_notice', 1):
804                                 print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
805                                 args = {}
806                                 args['hostname'] = hostname
807                                 args['bmlog'] = conn.get_bootmanager_log().read()
808                                 sitehist.sendMessage('minimalhardware_notice', **args)
809
810                 elif sequences[s] == "bad_dns_email":
811                         if not found_within(recent_actions, 'baddns_notice', 1):
812                                 print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
813                                 args = {}
814                                 try:
815                                         node = plccache.GetNodeByName(hostname)
816                                         net = api.GetInterfaces(node['interface_ids'])[0]
817                                 except:
818                                         email_exception()
819                                         print traceback.print_exc()
820                                         # TODO: api error. skip email, b/c all info is not available,
821                                         # flag_set will not be recorded.
822                                         return False
823                                 nodenet_str = network_config_to_str(net)
824
825                                 args['hostname'] = hostname
826                                 args['network_config'] = nodenet_str
827                                 args['nodenetwork_id'] = net['nodenetwork_id']
828
829                                 sitehist.sendMessage('baddns_notice', **args)
830
831         return True
832         
833
834 # MAIN -------------------------------------------------------------------
835
def main():
	"""Command-line entry point.

	Parses the monitor option set, resolves the target node list (from
	--nodelist, a single --node, or prints usage and exits), and runs
	restore() on each node with its site's SiteInterface record.
	"""
	from monitor import parser as parsermodule
	parser = parsermodule.getParser()

	parser.set_defaults(child=False, collect=False, nosetup=False, verbose=False, 
						force=None, quiet=False)
	parser.add_option("", "--child", dest="child", action="store_true", 
						help="This is the child mode of this process.")
	parser.add_option("", "--force", dest="force", metavar="boot_state",
						help="Force a boot state passed to BootManager.py.")
	parser.add_option("", "--quiet", dest="quiet", action="store_true", 
						help="Extra quiet output messages.")
	parser.add_option("", "--verbose", dest="verbose", action="store_true", 
						help="Extra debug output messages.")
	parser.add_option("", "--nonet", dest="nonet", action="store_true", 
						help="Do not setup the network, use existing log files to re-run a test pass.")
	parser.add_option("", "--collect", dest="collect", action="store_true", 
						help="No action, just collect dmesg, and bm.log")
	parser.add_option("", "--nosetup", dest="nosetup", action="store_true", 
						help="Do not perform the orginary setup phase.")

	parser = parsermodule.getParser(['nodesets', 'defaults'], parser)
	config = parsermodule.parse_args(parser)

	# Resolve the node list: an explicit file, a single node, or usage error.
	if config.nodelist:
		nodes = config.getListFromFile(config.nodelist)
	elif config.node:
		nodes = [ config.node ]
	else:
		parser.print_help()
		sys.exit(1)

	for node in nodes:
		# Look up the site (loginbase) owning this node.
		lb = plccache.plcdb_hn2lb[node]
		sitehist = SiteInterface.get_or_make(loginbase=lb)
		# BUGFIX: the parsed options were previously discarded by passing
		# config=None, forced_action=None, so --quiet/--collect/--force
		# had no effect.  Forward them to restore().
		restore(sitehist, node, config=config, forced_action=config.force)

if __name__ == "__main__":
	main()