1a04ef059813d0e16e09378ad6f6ead59dc9eef2
[monitor.git] / bootman.py
1 #!/usr/bin/python
2
3 # Attempt to reboot a node in debug state.
4
5
6
7 import os
8 import sys
9 import time
10 import random
11 import signal
12 import traceback
13 import subprocess
14 from sets import Set
15
16 from getsshkeys import SSHKnownHosts
17
18 from Rpyc import SocketConnection, Async
19 from Rpyc.Utils import *
20
21 import getconf
22 from monitor import config
23 from monitor import const
24 from monitor.model import *
25 from monitor.common import email_exception, found_within
26 from monitor.database.info.model import *
27 from monitor.wrapper import plc
28 from monitor.wrapper import plccache
29 from monitor.wrapper.emailTxt import mailtxt
30
31 from pcucontrol.util import command as moncommands
32 from pcucontrol.util.command import Sopen
33 from pcucontrol.transports.ssh import pxssh as pxssh
34 from pcucontrol.transports.ssh import fdpexpect as fdpexpect
35 from pcucontrol.transports.ssh import pexpect as pexpect
36
37 from nodeconfig import network_config_to_str
38
39
# Authenticated XML-RPC handle to the PLC API; used by NodeConnection
# methods below to update node records (boot_state, node key).
api = plc.getAuthAPI()
# NOTE(review): fb is never assigned or read anywhere in this chunk --
# presumably a "findbad" records handle populated elsewhere; confirm
# against the rest of the file before relying on it.
fb = None
42
43
class NodeConnection:
    """Remote-control wrapper around an Rpyc connection to a node.

    Every operation runs on the remote host through the ``self.c.modules.*``
    proxies (remote ``os``/``sys``/``BootManager`` modules).  The node is
    expected to be either in debug mode (BootManager unpacked under
    /tmp/source) or normally booted (/vservers present).
    """

    def __init__(self, connection, node, config):
        self.node = node      # hostname of the remote node
        self.c = connection   # Rpyc SocketConnection to the node
        self.config = config

    def get_boot_state(self):
        """Classify the node's current state.

        Returns "debug" when the BootManager source tree is unpacked in
        /tmp/source, "boot" when /vservers exists (normal production boot),
        and "unknown" otherwise or on any error while probing.
        """
        try:
            if self.c.modules.os.path.exists('/tmp/source'):
                return "debug"
            elif self.c.modules.os.path.exists('/vservers'):
                return "boot"
            else:
                return "unknown"
        except EOFError:
            # Connection dropped mid-probe; dump the remote sys.path for
            # post-mortem (may itself fail if the link is truly dead).
            traceback.print_exc()
            print(self.c.modules.sys.path)
        except Exception:
            # Best-effort probe: was a bare except; narrowed so that
            # KeyboardInterrupt/SystemExit still propagate.
            traceback.print_exc()

        return "unknown"

    def get_dmesg(self):
        """Capture the node's dmesg into a local log file; return it open."""
        self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
        download(self.c, "/var/log/dmesg.bm.log", "log/dmesg.%s.log" % self.node)
        return open("log/dmesg.%s.log" % self.node, 'r')

    def get_bootmanager_log(self):
        """Fetch and decompress the remote BootManager log; return it open."""
        download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node)
        os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
        return open("log/bm.%s.log" % self.node, 'r')

    def _start_bootmanager(self):
        """Prepare the remote BootManager environment (shared setup).

        Appends /tmp/source to the remote sys.path, chdirs there, and
        returns ``(connection, bm)`` with a fresh remote BootManager
        instance logging to /tmp/new.log.
        """
        c = self.c
        self.c.modules.sys.path.append("/tmp/source/")
        self.c.modules.os.chdir('/tmp/source')
        log = c.modules.BootManager.log('/tmp/new.log')
        bm = c.modules.BootManager.BootManager(log, 'boot')
        return c, bm

    def dump_plconf_file(self):
        """Run the BootManager config-read steps remotely and print VARS."""
        c, bm = self._start_bootmanager()
        InitializeBootManager = c.modules.BootManager.InitializeBootManager
        ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration

        InitializeBootManager.Run(bm.VARS, bm.LOG)
        try:
            ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
        except Exception as x:
            print("   ERROR: %s" % x)
            print("   Possibly, unable to find valid configuration file")
            print("   Unable to read Node Configuration")
            return

        for key in bm.VARS.keys():
            # Match the original comma-print spacing: "key  ==  value".
            print("%s  ==  %s" % (key, bm.VARS[key]))

    def compare_and_repair_nodekeys(self):
        """Compare the node's on-disk NODE_KEY against PLC's record.

        If they differ, push the node's key to PLC via UpdateNode.
        Returns True when the keys match or were repaired, False on any
        failure (including being unable to read the node configuration,
        which previously fell through returning None).
        """
        c, bm = self._start_bootmanager()
        InitializeBootManager = c.modules.BootManager.InitializeBootManager
        ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration

        plcnode = plccache.GetNodeByName(self.node)

        InitializeBootManager.Run(bm.VARS, bm.LOG)
        try:
            ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
        except Exception as x:
            print("exception")
            print(x)
            print("   Possibly, unable to find valid configuration file")
            print("   Unable to retrieve NODE_KEY")
            return False

        print("   NODE: %s" % bm.VARS['NODE_KEY'])
        print("   PLC : %s" % plcnode['key'])

        if bm.VARS['NODE_KEY'] == plcnode['key']:
            return True
        if api.UpdateNode(self.node, {'key': bm.VARS['NODE_KEY']}):
            print("   Successfully updated NODE_KEY with PLC")
            return True
        return False

    def bootmanager_running(self):
        """True when the remote /tmp/BM_RUNNING lock file exists."""
        return self.c.modules.os.path.exists('/tmp/BM_RUNNING')

    def set_nodestate(self, state='boot'):
        """Set the node's boot_state at PLC; returns the API result."""
        return api.UpdateNode(self.node, {'boot_state': state})

    def restart_node(self, state='boot'):
        """Reboot the node into *state*.

        First attempt (per 24h window, tracked by a persist flag) is a
        gentle kill of slice processes plus a scheduled shutdown; if a
        gentle kill was already tried recently, force a sysrq
        sync/umount/reboot instead.
        """
        api.UpdateNode(self.node, {'boot_state': state})

        pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags')
        if not pflags.getRecentFlag('gentlekill'):
            print("   Killing all slice processes... : %s" % self.node)
            cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
            self.c.modules.os.system(cmd_slicekill)
            cmd = """ shutdown -r +1 & """
            print("   Restarting %s : %s" % (self.node, cmd))
            self.c.modules.os.system(cmd)

            pflags.setRecentFlag('gentlekill')
            pflags.save()
        else:
            print("   Restarting with sysrq 'sub' %s" % self.node)
            cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """
            self.c.modules.os.system(cmd)

        return

    def restart_bootmanager(self, forceState):
        """Launch BootManager.py remotely with *forceState* ('boot'/'rins').

        Guarded by the /tmp/BM_RUNNING lock file so two BootManager runs
        never overlap; the lock is removed when the run finishes.
        """
        self.c.modules.os.chdir('/tmp/source')
        if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
            print("   BootManager is already running: try again soon...")
        else:
            print("   Starting 'BootManager.py %s' on %s " % (forceState, self.node))
            cmd = "( touch /tmp/BM_RUNNING ;  " + \
                  "  python ./BootManager.py %s &> server.log < /dev/null ; " + \
                  "  rm -f /tmp/BM_RUNNING " + \
                  ") &"
            cmd = cmd % forceState
            self.c.modules.os.system(cmd)

        return
191
192
class PlanetLabSession:
    """Set up an Rpyc session to a node, tunnelled over SSH.

    setup_host() rsyncs the Rpyc package to the node, (re)starts a
    forking Rpyc server there, and opens a local port-forward to it.
    get_connection() then returns a NodeConnection over that tunnel.
    """

    # Each session claims a distinct local tunnel port; randomized base
    # reduces collisions between concurrent monitor processes.
    globalport = 22000 + int(random.random()*1000)

    def __init__(self, node, nosetup, verbose):
        self.verbose = verbose
        self.node = node
        self.port = None       # local end of the ssh tunnel
        self.nosetup = nosetup # skip remote setup (server assumed running)
        self.command = None    # Sopen handle for the tunnel process
        self.setup_host()

    def get_connection(self, config):
        """Return a NodeConnection speaking Rpyc through the tunnel."""
        return NodeConnection(SocketConnection("localhost", self.port), self.node, config)

    def setup_host(self):
        """Copy Rpyc to the node, restart its server, open the tunnel.

        Raises Exception when the ssh login fails twice (even after a
        known_hosts refresh) or when the tunnel never reports READY.
        """
        self.port = PlanetLabSession.globalport
        PlanetLabSession.globalport = PlanetLabSession.globalport + 1

        if self.nosetup:
            # Caller asserts the remote server/tunnel already exist.
            # (Checked before touching config so nosetup mode has no
            # config dependency.)
            print("Skipping setup")
            return

        args = {}
        args['port'] = self.port
        args['user'] = 'root'
        args['hostname'] = self.node
        args['monitordir'] = config.MONITOR_SCRIPT_ROOT
        ssh_port = 22

        # COPY Rpyc files to host
        cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
        if self.verbose: print(cmd)
        # TODO: Add timeout
        timeout = 120
        localos = moncommands.CMD()

        ret = localos.system(cmd, timeout)
        print(ret)
        if ret != 0:
            # Likely a stale/missing host key: refresh it and retry once.
            print("\tUNKNOWN SSH KEY FOR %s; making an exception" % self.node)
            k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k
            ret = localos.system(cmd, timeout)
            print(ret)
            if ret != 0:
                print("\tFAILED TWICE")
                raise Exception("Failed twice trying to login with updated ssh host key")

        t1 = time.time()
        # KILL any already running servers, then start a fresh one with
        # PYTHONPATH pointing at the freshly rsynced Rpyc tree.
        ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port)
        (ov,ev) = ssh.run_noexcept2("""<<\EOF
            rm -f out.log
            echo "kill server" >> out.log
            ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ; 
            echo "export" >> out.log
            export PYTHONPATH=$HOME  ;
            echo "start server" >> out.log
            python Rpyc/Servers/forking_server.py &> server.log &
            echo "done" >> out.log
EOF""")
        print("setup rpyc server over ssh")
        print(ssh.ret)

        # TODO: Add timeout
        # This was tricky to make synchronous.  The combination of ssh-clients-4.7p1,
        # and the following options seems to work well:  LocalCommand
        # echoes READY on our side once the forward is established.
        cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \
              """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \
              """-o ConnectTimeout=120 """ + \
              """-n -N -L %(port)s:localhost:18812 """ + \
              """%(user)s@%(hostname)s"""
        cmd = cmd % args
        if self.verbose: print(cmd)
        self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
        # Bounded read so a wedged tunnel cannot block us forever.
        ret = moncommands.read_t(self.command.stdout, 5)

        t2 = time.time()
        if 'READY' in ret:
            # NOTE: There is still a slight race for machines that are slow...
            self.timeout = 2*(t2-t1)
            print("Sleeping for %s sec" % self.timeout)
            time.sleep(self.timeout)
            return

        if self.command.returncode is not None:
            print("Failed to establish tunnel!")
            raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode))

        raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'")

    def __del__(self):
        # Tear down the tunnel process when the session is collected.
        if self.command:
            if self.verbose: print("Killing SSH session %s" % self.port)
            self.command.kill()
316         
def steps_to_list(steps, index=1):
    """Project column *index* from a list of (id, pattern) step tuples.

    Returns a real list (not a lazy ``map`` object) because callers
    concatenate the result with other lists, e.g.
    ``steps_to_list(steps) + [pexpect.EOF]``.
    """
    return [step[index] for step in steps]
319
def index_to_id(steps, index):
    """Map a pexpect match index back to its step id.

    Indexes past the end of *steps* (e.g. the appended EOF sentinel)
    translate to the terminal id "done".
    """
    return steps[index][0] if index < len(steps) else "done"
325
326 class DebugInterface:
327         def __init__(self, hostname):
328                 self.hostname = hostname
329                 self.session = None
330
331         def getConnection(self):
332                 print "Creating session for %s" % self.hostname
333                 # update known_hosts file (in case the node has rebooted since last run)
334                 try:
335                         k = SSHKnownHosts(); k.update(self.hostname); k.write(); del k
336                 except:
337                         email_exception()
338                         print traceback.print_exc()
339                         return False
340
341                 try:
342                         if config == None:
343                                 self.session = PlanetLabSession(self.hostname, False, True)
344                         else:
345                                 self.session = PlanetLabSession(self.hostname, config.nosetup, config.verbose)
346                 except Exception, e:
347                         msg = "ERROR setting up session for %s" % self.hostname
348                         print msg
349                         traceback.print_exc()
350                         email_exception(msg)
351                         return False
352
353                 try:
354                         conn = self.session.get_connection(config)
355                 except EOFError:
356                         # NOTE: sometimes the wait in setup_host() is not long enough.  
357                         # So, here we try to wait a little longer before giving up entirely.
358                         try:
359                                 time.sleep(self.session.timeout*5)
360                                 conn = self.session.get_connection(config)
361                         except:
362                                 traceback.print_exc()
363                                 email_exception(self.hostname)
364                                 return False
365                 #print "trying to use conn before returning it."
366                 #print conn.c.modules.sys.path
367                 #print conn.c.modules.os.path.exists('/tmp/source')
368                 #time.sleep(1)
369
370                 #print "conn: %s" % conn
371                 return conn
372
    def getSequences(self):
        """Return the map from an observed BootManager step sequence to the
        name of the repair action to take for it.

        Keys are dash-joined step ids as produced by matching a BootManager
        log against getBootManagerStepPatterns(); values are action names
        ("restart_bootmanager_boot", "repair_node_keys",
        "update_bootcd_email", ...) interpreted by the caller.
        """
        # TODO: This can be replaced with a DB definition at a future time.
        #               This would make it possible for an admin to introduce new
        #               patterns without touching code.

        sequences = {}
        # restart_bootmanager_boot: transient failures; re-run BootManager
        # with the 'boot' target.
        for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",

                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done",

                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
                "bminit-cfg-auth-getplc-update-debug-done",
                "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
                "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
                "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
                "bminit-cfg-auth-protoerror-exception-update-debug-done",
                "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
                "bminit-cfg-auth-getplc-implementerror-update-debug-done",
                ]:
            sequences.update({n : "restart_bootmanager_boot"})

        # restart_bootmanager_rins: install-phase failures; re-run
        # BootManager with the 'rins' (reinstall) target.
        #       conn.restart_bootmanager('rins')
        for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
                "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
                "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
                "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
                "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
                "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done",
                "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
                "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
                "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
                "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
                "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
                "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
                "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done",
                "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
                "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
                "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
                # actual solution appears to involve removing the bad files, and
                # continually trying to boot the node.
                "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
                "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
                "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
                ]:
            sequences.update({n : "restart_bootmanager_rins"})

        # repair_node_keys: node/PLC key mismatch during authentication.
        sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"})

        # restart_node_rins: reboot the node itself into reinstall.
        #   conn.restart_node('rins')
        for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
                "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
                "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done",
                "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
                "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done",
                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done",
                "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
                "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
                ]:
            sequences.update({n : "restart_node_rins"})

        #       restart_node_boot: reboot the node into normal boot.
        for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done",
                 "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
                 "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
                 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
                 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
                 "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
                 ]:
            sequences.update({n: "restart_node_boot"})

        # update_node_config_email: node config file missing/unparseable;
        # a human must fix the configuration.
        for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
                  "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
                  "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
                ]:
            sequences.update({n : "update_node_config_email"})

        # nodenetwork_email: configured hostname fails to resolve.
        for n in [ "bminit-cfg-exception-nodehostname-update-debug-done",
                   "bminit-cfg-update-exception-nodehostname-update-debug-done",
                ]:
            sequences.update({n : "nodenetwork_email"})

        # update_bootcd_email: hardware not recognized (no block device);
        # likely needs a newer BootCD.
        for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
                "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
                "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
                "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
                "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
                ]:
            sequences.update({n : "update_bootcd_email"})

        # suspect_error_email: pattern whose cause is not yet understood.
        for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
                ]:
            sequences.update({n: "suspect_error_email"})

        # update_hardware_email: disk too small for the install.
        sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
        sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"})

        # broken_hardware_email: generic hardware-requirements failure.
        sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})

        # bad_dns_email: DNS errors during boot check/update.
        for n in [
         "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
            "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
            ]:
            sequences.update( { n : "bad_dns_email"})

        return sequences
494
    def getDiskSteps(self):
        """Return the (id, regex) pairs used to classify disk-related errors
        in a node's dmesg output.

        Each tuple is (step id, pexpect-compatible regex); the ids feed
        getDiskSequence() via steps_to_list()/index_to_id().  The trailing
        comment block shows sample raw dmesg lines the patterns target.
        """
        steps = [
            ('scsierror'  , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
            ('ioerror'    , 'end_request: I/O error, dev sd\w+, sector \d+'),
            ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION  byte \w+ = \w+'),

            ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),

            ('hdaseekerror', 'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'),
            ('hdacorrecterror', 'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'),

            ('atareadyerror'   , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
            ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),

            ('sdXerror'   , 'sd\w: Current: sense key: Medium Error'),
            ('ext3error'   , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),

            ('floppytimeout','floppy0: floppy timeout called'),
            ('floppyerror',  'end_request: I/O error, dev fd\w+, sector \d+'),

            # hda: dma_intr: status=0x51 { DriveReady SeekComplete Error }
            # hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263

            # floppy0: floppy timeout called
            # end_request: I/O error, dev fd0, sector 0

            # Buffer I/O error on device dm-2, logical block 8888896
            # ata1: status=0x51 { DriveReady SeekComplete Error }
            # ata1: error=0x40 { UncorrectableError }
            # SCSI error : <0 0 0 0> return code = 0x8000002
            # sda: Current: sense key: Medium Error
            #       Additional sense: Unrecovered read error - auto reallocate failed

            # SCSI error : <0 2 0 0> return code = 0x40001
            # end_request: I/O error, dev sda, sector 572489600
        ]
        return steps
532
533         def getDiskSequence(self, steps, child):
534                 sequence = []
535                 while True:
536                         id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
537                         sequence.append(id)
538
539                         if id == "done":
540                                 break
541                 return sequence
542
543         def getBootManagerStepPatterns(self):
544                 steps = [
545                         ('bminit'               , 'Initializing the BootManager.'),
546                         ('cfg'                  , 'Reading node configuration file.'),
547                         ('auth'                 , 'Authenticating node with PLC.'),
548                         ('getplc'               , 'Retrieving details of node from PLC.'),
549                         ('update'               , 'Updating node boot state at PLC.'),
550                         ('hardware'             , 'Checking if hardware requirements met.'),
551                         ('installinit'  , 'Install: Initializing.'),
552                         ('installdisk'  , 'Install: partitioning disks.'),
553                         ('installbootfs', 'Install: bootstrapfs tarball.'),
554                         ('installcfg'   , 'Install: Writing configuration files.'),
555                         ('installstop'  , 'Install: Shutting down installer.'),
556                         ('update2'              , 'Updating node boot state at PLC.'),
557                         ('installinit2' , 'Install: Initializing.'),
558                         ('validate'             , 'Validating node installation.'),
559                         ('rebuildinitrd', 'Rebuilding initrd'),
560                         ('netcfg'               , 'Install: Writing Network Configuration files.'),
561                         ('update3'              , 'Updating node configuration.'),
562                         ('disk'                 , 'Checking for unused disks to add to LVM.'),
563                         ('update4'              , 'Sending hardware configuration to PLC.'),
564                         ('debug'                , 'Starting debug mode'),
565                         ('bmexceptmount', 'BootManagerException during mount'),
566                         ('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'),
567                         ('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
568                         ('exception'    , 'Exception'),
569                         ('nocfg'        , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
570                         ('protoerror'   , 'XML RPC protocol error'),
571                         ('nodehostname' , 'Configured node hostname does not resolve'),
572                         ('implementerror', 'Implementation Error'),
573                         ('readonlyfs'   , '[Errno 30] Read-only file system'),
574                         ('baddisk'      , "IOError: [Errno 13] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"),
575                         ('noinstall'    , 'notinstalled'),
576                         ('bziperror'    , 'bzip2: Data integrity error when decompressing.'),
577                         ('noblockdev'   , "No block devices detected."),
578                         ('dnserror'     , 'Name or service not known'),
579                         ('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
580                         ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
581                         ('hardwarerequirefail' , 'Hardware requirements not met'),
582                         ('mkfsfail'         , 'while running: Running mkfs.ext2 -q  -m 0 -j /dev/planetlab/vservers failed'),
583                         ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
584                         ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
585                         ('chrootfail'   , 'Running chroot /tmp/mnt/sysimg'),
586                         ('modulefail'   , 'Unable to get list of system modules'),
587                         ('writeerror'   , 'write error: No space left on device'),
588                         ('nospace'      , "No space left on device"),
589                         ('nonode'       , 'Failed to authenticate call: No such node'),
590                         ('authfail'     , 'Failed to authenticate call: Call could not be authenticated'),
591                         ('bootcheckfail'     , 'BootCheckAuthentication'),
592                         ('bootupdatefail'   , 'BootUpdateNode'),
593                 ]
594                 return steps
595
596         def getBootManagerSequenceFromLog(self, steps, child):
597                 sequence = []
598                 while True:
599                         
600                         index = child.expect( steps_to_list(steps) + [ pexpect.EOF ])
601                         id = index_to_id(steps,index)
602                         sequence.append(id)
603
604                         if id == "exception":
605                                 print "...Found An Exception!!!"
606                         elif id == "done": #index == len(steps_to_list(steps)):
607                                 #print "Reached EOF"
608                                 break
609
610                 return sequence
611                 
612
613 def restore(sitehist, hostname, config=None, forced_action=None):
614
615         # NOTE: Nothing works if the bootcd is REALLY old.
616         #       So, this is the first step.
617
618         fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
619         recent_actions = sitehist.getRecentActions(hostname=hostname)
620
621         if fbnode['observed_category'] == "OLDBOOTCD":
622                 print "\t...Notify owner to update BootImage!!!"
623
624                 if not found_within(recent_actions, 'newbootcd_notice', 3):
625                         sitehist.sendMessage('newbootcd_notice', hostname=hostname)
626
627                         print "\tDisabling %s due to out-of-date BootImage" % hostname
628                         api.UpdateNode(hostname, {'boot_state' : 'disable'})
629
630                 # NOTE: nothing else is possible.
631                 return True
632
633         debugnode = DebugInterface(hostname)
634         conn = debugnode.getConnection()
635         #print "conn: %s" % conn
636         #print "trying to use conn after returning it."
637         #print conn.c.modules.sys.path
638         #print conn.c.modules.os.path.exists('/tmp/source')
639         if type(conn) == type(False): return False
640
641         #if forced_action == "reboot":
642         #       conn.restart_node('rins')
643         #       return True
644
645         boot_state = conn.get_boot_state()
646         if boot_state != "debug":
647                 print "... %s in %s state: skipping..." % (hostname , boot_state)
648                 return boot_state == "boot"
649
650         if conn.bootmanager_running():
651                 print "...BootManager is currently running.  Skipping host %s" %hostname 
652                 return True
653
654         # Read persistent flags, tagged on one week intervals.
655
656         if config and not config.quiet: print "...downloading dmesg from %s" %hostname 
657         dmesg = conn.get_dmesg()
658         child = fdpexpect.fdspawn(dmesg)
659
660         steps = debugnode.getDiskSteps()
661         sequence = debugnode.getDiskSequence(steps, child)
662
663         s = Set(sequence)
664         if config and not config.quiet: print "\tSET: ", s
665
666         if len(s) > 1:
667                 print "...Potential drive errors on %s" % hostname 
668                 if len(s) == 2 and 'floppyerror' in s:
669                         print "...Should investigate.  Continuing with node."
670                 else:
671                         print "...Should investigate.  Skipping node."
672                         # TODO: send message related to these errors.
673
674                         if not found_within(recent_actions, 'newbootcd_notice', 3):
675
676                                 log=conn.get_dmesg().read()
677                                 sitehist.sendMessage('baddisk_notice', hostname=hostname, log=log)
678                                 conn.set_nodestate('disable')
679
680                         return False
681
682         print "...Downloading bm.log from %s" %hostname 
683         log = conn.get_bootmanager_log()
684         child = fdpexpect.fdspawn(log)
685
686         if hasattr(config, 'collect') and config.collect: return True
687
688         if config and not config.quiet: print "...Scanning bm.log for errors"
689
690         time.sleep(1)
691
692         steps = debugnode.getBootManagerStepPatterns()
693         sequence = debugnode.getBootManagerSequenceFromLog(steps, child)
694                 
695         s = "-".join(sequence)
696         print "   FOUND SEQUENCE: ", s
697
698         # NOTE: We get or set the flag based on the current sequence identifier.
699         #  By using the sequence identifier, we guarantee that there will be no
700         #  frequent loops.  I'm guessing there is a better way to track loops,
701         #  though.
702
703         sequences = debugnode.getSequences()
704         flag_set = True
705         
706         if s not in sequences:
707                 print "   HOST %s" % hostname
708                 print "   UNKNOWN SEQUENCE: %s" % s
709
710                 args = {}
711                 args['hostname'] = hostname
712                 args['sequence'] = s
713                 args['bmlog'] = conn.get_bootmanager_log().read()
714                 args['viart'] = False
715
716                 sitehist.sendMessage('unknownsequence_notice', **args)
717
718                 conn.restart_bootmanager('boot')
719
720                 # NOTE: Do not set the pflags value for this sequence if it's unknown.
721                 # This way, we can check it again after we've fixed it.
722                 flag_set = False
723
724         else:
725
726                 if   sequences[s] == "restart_bootmanager_boot":
727                         print "...Restarting BootManager.py on %s "%hostname 
728                         conn.restart_bootmanager('boot')
729                 elif sequences[s] == "restart_bootmanager_rins":
730                         print "...Restarting BootManager.py on %s "%hostname 
731                         conn.restart_bootmanager('rins')
732                 elif sequences[s] == "restart_node_rins":
733                         conn.restart_node('rins')
734                 elif sequences[s] == "restart_node_boot":
735                         conn.restart_node('boot')
736                 elif sequences[s] == "repair_node_keys":
737                         if conn.compare_and_repair_nodekeys():
738                                 # the keys either are in sync or were forced in sync.
739                                 # so try to reboot the node again.
740                                 conn.restart_bootmanager('rins')
741                                 pass
742                         else:
743                                 # there was some failure to synchronize the keys.
744                                 print "...Unable to repair node keys on %s" %hostname 
745
746                 elif sequences[s] == "suspect_error_email":
747                         args = {}
748                         args['hostname'] = hostname
749                         args['sequence'] = s
750                         args['bmlog'] = conn.get_bootmanager_log().read()
751                         args['viart'] = False
752
753                         sitehist.sendMessage('unknownsequence_notice', **args)
754                         conn.restart_bootmanager('boot')
755
756                 # TODO: differentiate this and the 'nodenetwork_email' actions.
757                 elif sequences[s] == "update_node_config_email":
758
759                         if not found_within(recent_actions, 'nodeconfig_notice', 3):
760                                 args = {}
761                                 args['hostname'] = hostname
762                                 sitehist.sendMessage('nodeconfig_notice', **args)
763                                 conn.dump_plconf_file()
764
765                 elif sequences[s] == "nodenetwork_email":
766
767                         if not found_within(recent_actions, 'nodeconfig_notice', 3):
768                                 args = {}
769                                 args['hostname'] = hostname
770                                 args['bmlog'] = conn.get_bootmanager_log().read()
771                                 sitehist.sendMessage('nodeconfig_notice', **args)
772                                 conn.dump_plconf_file()
773
774                 elif sequences[s] == "update_bootcd_email":
775
776                         if not found_within(recent_actions, 'newalphacd_notice', 3):
777                                 args = {}
778                                 args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
779                                 args['hostname'] = hostname
780                         
781                                 sitehist.sendMessage('newalphacd_notice', **args)
782
783                                 print "\tDisabling %s due to out-of-date BOOTCD" % hostname
784
785                 elif sequences[s] == "broken_hardware_email":
786                         # MAKE An ACTION record that this host has failed hardware.  May
787                         # require either an exception "/minhw" or other manual intervention.
788                         # Definitely need to send out some more EMAIL.
789                         # TODO: email notice of broken hardware
790                         if not found_within(recent_actions, 'baddisk_notice', 1):
791                                 print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
792                                 args = {}
793                                 args['hostname'] = hostname
794                                 args['log'] = conn.get_dmesg().read()
795
796                                 sitehist.sendMessage('baddisk_notice', **args)
797                                 conn.set_nodestate('disable')
798
799                 elif sequences[s] == "update_hardware_email":
800                         if not found_within(recent_actions, 'minimalhardware_notice', 1):
801                                 print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
802                                 args = {}
803                                 args['hostname'] = hostname
804                                 args['bmlog'] = conn.get_bootmanager_log().read()
805                                 sitehist.sendMessage('minimalhardware_notice', **args)
806
807                 elif sequences[s] == "bad_dns_email":
808                         if not found_within(recent_actions, 'baddns_notice', 1):
809                                 print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
810                                 args = {}
811                                 try:
812                                         node = plccache.GetNodeByName(hostname)
813                                         net = api.GetNodeNetworks(node['nodenetwork_ids'])[0]
814                                 except:
815                                         email_exception()
816                                         print traceback.print_exc()
817                                         # TODO: api error. skip email, b/c all info is not available,
818                                         # flag_set will not be recorded.
819                                         return False
820                                 nodenet_str = network_config_to_str(net)
821
822                                 args['hostname'] = hostname
823                                 args['network_config'] = nodenet_str
824                                 args['nodenetwork_id'] = net['nodenetwork_id']
825
826                                 sitehist.sendMessage('baddns_notice', **args)
827
828         return True
829         
830
831 # MAIN -------------------------------------------------------------------
832
def main():
	"""Parse command-line options and run the debug-state recovery pass
	(reboot) over the requested set of nodes.

	Exits with status 1 (after printing usage) when neither --nodelist nor
	a single node is supplied.
	"""
	from monitor import parser as parsermodule
	parser = parsermodule.getParser()

	# Give every boolean option an explicit default; previously 'nonet' was
	# omitted and relied on optparse's implicit None.
	parser.set_defaults(child=False, collect=False, nosetup=False, nonet=False,
						verbose=False, force=None, quiet=False)
	parser.add_option("", "--child", dest="child", action="store_true",
						help="This is the child mode of this process.")
	parser.add_option("", "--force", dest="force", metavar="boot_state",
						help="Force a boot state passed to BootManager.py.")
	parser.add_option("", "--quiet", dest="quiet", action="store_true",
						help="Extra quiet output messages.")
	parser.add_option("", "--verbose", dest="verbose", action="store_true",
						help="Extra debug output messages.")
	parser.add_option("", "--nonet", dest="nonet", action="store_true",
						help="Do not setup the network, use existing log files to re-run a test pass.")
	parser.add_option("", "--collect", dest="collect", action="store_true",
						help="No action, just collect dmesg, and bm.log")
	parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
						help="Do not perform the ordinary setup phase.")

	parser = parsermodule.getParser(['nodesets', 'defaults'], parser)
	config = parsermodule.parse_args(parser)

	# Determine the node set: a file of hostnames, a single node, or bail
	# out with usage information.
	if config.nodelist:
		nodes = config.getListFromFile(config.nodelist)
	elif config.node:
		nodes = [ config.node ]
	else:
		parser.print_help()
		sys.exit(1)

	for node in nodes:
		reboot(node, config)
867
# Script entry point: only run when executed directly, not when imported.
if __name__ == "__main__":
	main()