Added policy.py and updated bootman.py to work with the new policy framework.
[monitor.git] / bootman.py
1 #!/usr/bin/python
2
3 # Attempt to reboot a node in debug state.
4
5
6
7 import os
8 import sys
9 import time
10 import random
11 import signal
12 import traceback
13 import subprocess
14 from sets import Set
15
16 from getsshkeys import SSHKnownHosts
17
18 from Rpyc import SocketConnection, Async
19 from Rpyc.Utils import *
20
21 import getconf
22 from monitor import config
23 from monitor import const
24 from monitor.model import *
25 from monitor.common import email_exception, found_within
26 from monitor.database.info.model import *
27 from monitor.wrapper import plc
28 from monitor.wrapper.emailTxt import mailtxt
29
30 from pcucontrol.util import command as moncommands
31 from pcucontrol.util.command import Sopen
32 from pcucontrol.transports.ssh import pxssh as pxssh
33 from pcucontrol.transports.ssh import fdpexpect as fdpexpect
34 from pcucontrol.transports.ssh import pexpect as pexpect
35
36 from nodeconfig import network_config_to_str
37
38
# Authenticated handle onto the PLC XML-RPC API; used throughout this file
# for GetNodes/UpdateNode calls.
api = plc.getAuthAPI()
# NOTE(review): fb is never reassigned in this chunk — presumably a
# "findbad" results handle populated elsewhere; confirm before relying on it.
fb = None
41
42
class NodeConnection:
    """Remote control for a single node over an established Rpyc connection.

    Every operation goes through ``self.c.modules.<mod>``, an Rpyc proxy
    that executes the named module's functions on the node itself, so the
    code below reads like local os/sys calls but acts on the remote host.
    """

    def __init__(self, connection, node, config):
        # connection : Rpyc SocketConnection tunneled to the node
        # node       : node hostname, used for PLC calls and log filenames
        # config     : monitor configuration (stored; not read in this class)
        self.node = node
        self.c = connection
        self.config = config

    def get_boot_state(self):
        """Return 'debug', 'boot', or 'unknown' for the node's current state.

        /tmp/source (unpacked BootManager source) marks debug mode, while
        /vservers marks a normally booted production node.
        """
        try:
            if self.c.modules.os.path.exists('/tmp/source'):
                return "debug"
            elif self.c.modules.os.path.exists('/vservers'): 
                return "boot"
            else:
                return "unknown"
        except EOFError:
            # The Rpyc connection died mid-call; dump the remote sys.path
            # for post-mortem, then fall through to 'unknown'.
            traceback.print_exc()
            print self.c.modules.sys.path
        except:
            traceback.print_exc()

        return "unknown"

    def get_dmesg(self):
        """Snapshot the node's dmesg into a local log file and return that
        file opened for reading (the caller is responsible for closing it)."""
        self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
        download(self.c, "/var/log/dmesg.bm.log", "log/dmesg.%s.log" % self.node)
        log = open("log/dmesg.%s.log" % self.node, 'r')
        return log

    def get_bootmanager_log(self):
        """Fetch the node's BootManager log and return it opened for reading.

        NOTE(review): the remote /tmp/bm.log is decompressed locally with
        zcat, i.e. it is assumed to be gzip data — confirm the node side
        writes it compressed.
        """
        download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node)
        os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
        log = open("log/bm.%s.log" % self.node, 'r')
        return log

    def dump_plconf_file(self):
        """Run BootManager's config-reading steps on the node and print the
        resulting VARS, showing the node configuration as the node sees it."""
        c = self.c
        # The BootManager modules live in the unpacked /tmp/source tree.
        self.c.modules.sys.path.append("/tmp/source/")
        self.c.modules.os.chdir('/tmp/source')

        log = c.modules.BootManager.log('/tmp/new.log')
        bm = c.modules.BootManager.BootManager(log,'boot')

        BootManagerException = c.modules.Exceptions.BootManagerException
        InitializeBootManager = c.modules.BootManager.InitializeBootManager
        ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
        bm_continue = True

        InitializeBootManager.Run(bm.VARS, bm.LOG)
        try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
        except Exception, x:
            bm_continue = False
            print "   ERROR:", x
            print "   Possibly, unable to find valid configuration file"

        if bm_continue:
            for key in bm.VARS.keys():
                print key, " == ", bm.VARS[key]
        else:
            print "   Unable to read Node Configuration"

    def compare_and_repair_nodekeys(self):
        """Compare the node's on-disk NODE_KEY against the key PLC records.

        Returns True when they already match or when PLC is successfully
        updated with the node's key; False when the PLC update fails; and
        falls off the end (None) when the node config cannot be read.
        """
        c = self.c
        self.c.modules.sys.path.append("/tmp/source/")
        self.c.modules.os.chdir('/tmp/source')

        log = c.modules.BootManager.log('/tmp/new.log')
        bm = c.modules.BootManager.BootManager(log,'boot')

        BootManagerException = c.modules.Exceptions.BootManagerException
        InitializeBootManager = c.modules.BootManager.InitializeBootManager
        ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
        bm_continue = True

        plcnode = api.GetNodes({'hostname': self.node}, None)[0]

        InitializeBootManager.Run(bm.VARS, bm.LOG)
        try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
        except Exception, x:
            bm_continue = False
            print "exception"
            print x
            print "   Possibly, unable to find valid configuration file"

        if bm_continue:
            print "   NODE: %s" % bm.VARS['NODE_KEY']
            print "   PLC : %s" % plcnode['key']

            if bm.VARS['NODE_KEY'] == plcnode['key']:
                return True
            else:
                # The node's key is treated as authoritative: push it to PLC.
                if api.UpdateNode(self.node, {'key': bm.VARS['NODE_KEY']}):
                    print "   Successfully updated NODE_KEY with PLC"
                    return True
                else:
                    return False

            #for key in bm.VARS.keys():
            #       print key, " == ", bm.VARS[key]
        else:
            print "   Unable to retrieve NODE_KEY"

    def bootmanager_running(self):
        """True if the /tmp/BM_RUNNING marker exists, i.e. a BootManager
        started by restart_bootmanager() has not finished yet."""
        if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
            return True
        else:
            return False

    def set_nodestate(self, state='boot'):
        """Record *state* as the node's boot_state at PLC."""
        return api.UpdateNode(self.node, {'boot_state' : state})

    def restart_node(self, state='boot'):
        """Set the node's PLC boot_state to *state* and reboot the node.

        The first attempt within a 24h window kills all slice processes and
        schedules a clean ``shutdown -r +1``; if a 'gentlekill' was already
        recorded recently, escalate to forced sysrq sync/umount/boot.
        """
        api.UpdateNode(self.node, {'boot_state' : state})

        # 24-hour persistence window for the 'gentlekill' flag.
        pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags')
        if not pflags.getRecentFlag('gentlekill'):
            print "   Killing all slice processes... : %s" %  self.node
            cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
            self.c.modules.os.system(cmd_slicekill)
            cmd = """ shutdown -r +1 & """
            print "   Restarting %s : %s" % ( self.node, cmd)
            self.c.modules.os.system(cmd)

            # Remember that the gentle path was tried, so the next call
            # within the window escalates.
            pflags.setRecentFlag('gentlekill')
            pflags.save()
        else:
            print "   Restarting with sysrq 'sub' %s" % self.node
            cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """
            self.c.modules.os.system(cmd)

        return

    def restart_bootmanager(self, forceState):
        """Re-run BootManager.py on the node with boot state *forceState*.

        The /tmp/BM_RUNNING marker serializes runs: if it exists, do
        nothing; otherwise launch BootManager in the background and remove
        the marker when it exits.
        """
        self.c.modules.os.chdir('/tmp/source')
        if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
            print "   BootManager is already running: try again soon..."
        else:
            print "   Starting 'BootManager.py %s' on %s " % (forceState, self.node)
            cmd = "( touch /tmp/BM_RUNNING ;  " + \
                  "  python ./BootManager.py %s &> server.log < /dev/null ; " + \
                  "  rm -f /tmp/BM_RUNNING " + \
                  ") &" 
            cmd = cmd % forceState
            self.c.modules.os.system(cmd)

        return 
190
191
class PlanetLabSession:
    """Set up and own the Rpyc transport to one node.

    setup_host() copies the Rpyc package to the node, (re)starts a forking
    Rpyc server there, and opens an SSH tunnel from a local port to the
    server's port 18812; get_connection() then builds a NodeConnection on
    top of that tunnel.
    """
    # Local tunnel port counter; the random base lowers the chance of two
    # monitor processes colliding, and each session takes the next port.
    globalport = 22000 + int(random.random()*1000)

    def __init__(self, node, nosetup, verbose):
        # node    : target hostname
        # nosetup : skip the rsync/server-start steps (reuse existing setup)
        # verbose : echo the commands being run
        self.verbose = verbose
        self.node = node
        self.port = None
        self.nosetup = nosetup
        self.command = None
        self.setup_host()

    def get_connection(self, config):
        """Return a NodeConnection speaking Rpyc through the local tunnel."""
        conn = NodeConnection(SocketConnection("localhost", self.port), self.node, config)
        #i = 0
        #while i < 3: 
        #       print i, conn.c.modules.sys.path
        #       print conn.c.modules.os.path.exists('/tmp/source')
        #       i+=1
        #       time.sleep(1)
        return conn

    def setup_host(self):
        """Prepare the node for Rpyc access.

        Steps: rsync the Rpyc tree to the node (retrying once after a
        known_hosts refresh), kill any Rpyc servers already running there,
        start a fresh forking server, then open a local ssh port-forward to
        it and wait for the 'READY' marker.  Raises Exception when the
        rsync fails twice or the tunnel does not come up.
        """
        self.port = PlanetLabSession.globalport
        PlanetLabSession.globalport = PlanetLabSession.globalport + 1

        args = {}
        args['port'] = self.port
        args['user'] = 'root'
        args['hostname'] = self.node
        args['monitordir'] = config.MONITOR_SCRIPT_ROOT
        ssh_port = 22

        if self.nosetup:
            print "Skipping setup"
            return 

        # COPY Rpyc files to host
        cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
        if self.verbose: print cmd
        print cmd
        # TODO: Add timeout
        timeout = 120
        localos = moncommands.CMD()

        ret = localos.system(cmd, timeout)
        print ret
        if ret != 0:
            # A first failure is assumed to be a stale ssh host key:
            # refresh it from the node directly and retry once.
            print "\tUNKNOWN SSH KEY FOR %s; making an exception" % self.node
            #print "MAKE EXPLICIT EXCEPTION FOR %s" % self.node
            k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k
            ret = localos.system(cmd, timeout)
            print ret
            if ret != 0:
                print "\tFAILED TWICE"
                #sys.exit(1)
                raise Exception("Failed twice trying to login with updated ssh host key")

        t1 = time.time()
        # KILL any already running servers, then start a fresh one with
        # $HOME on PYTHONPATH so the copied Rpyc package is importable.
        ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port)
        (ov,ev) = ssh.run_noexcept2("""<<\EOF
            rm -f out.log
            echo "kill server" >> out.log
            ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ; 
            echo "export" >> out.log
            export PYTHONPATH=$HOME  ;
            echo "start server" >> out.log
            python Rpyc/Servers/forking_server.py &> server.log &
            echo "done" >> out.log
EOF""")
        #cmd = """ssh %(user)s@%(hostname)s """ + \
        #        """'ps ax | grep Rpyc | grep -v grep | awk "{print \$1}" | xargs kill 2> /dev/null' """
        #cmd = cmd % args
        #if self.verbose: print cmd
        ## TODO: Add timeout
        #print localos.system(cmd,timeout)

        ## START a new rpyc server.
        #cmd = """ssh -n %(user)s@%(hostname)s "export PYTHONPATH=\$HOME; """ + \
        #        """python Rpyc/Servers/forking_server.py &> server.log < /dev/null &" """ 
        #cmd = cmd % args
        #if self.verbose: print cmd
        #print localos.system(cmd,timeout)
        print "setup rpyc server over ssh"
        print ssh.ret

        # TODO: Add timeout
        # This was tricky to make synchronous.  The combination of ssh-clients-4.7p1, 
        # and the following options seems to work well.
        # LocalCommand prints READY on our side once the connection (and
        # therefore the -L forward, with ExitOnForwardFailure) is up.
        cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \
              """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \
              """-o ConnectTimeout=120 """ + \
              """-n -N -L %(port)s:localhost:18812 """ + \
              """%(user)s@%(hostname)s"""
        cmd = cmd % args
        if self.verbose: print cmd
        print cmd
        self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
        # TODO: the read() here may block indefinitely.  Need a better
        # approach therefore, that includes a timeout.
        #ret = self.command.stdout.read(5)
        ret = moncommands.read_t(self.command.stdout, 5)

        t2 = time.time()
        if 'READY' in ret:
            # NOTE: There is still a slight race for machines that are slow...
            # Give the remote server twice as long as setup took so far.
            self.timeout = 2*(t2-t1)
            print "Sleeping for %s sec" % self.timeout
            time.sleep(self.timeout)
            return

        if self.command.returncode is not None:
            print "Failed to establish tunnel!"
            raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode))

        raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'")

    def __del__(self):
        # Tear the ssh tunnel down when the session object is collected.
        if self.command:
            if self.verbose: print "Killing SSH session %s" % self.port
            print "Killing SSH session %s" % self.port
            self.command.kill()
315         
def steps_to_list(steps, index=1):
    """Project column *index* out of a list of step tuples.

    *steps* is a list of (id, pattern) pairs: the default index=1 yields
    the patterns (fed to pexpect), index=0 the ids.  Returns a real list —
    callers concatenate the result with ``+ [pexpect.EOF]`` — so a list
    comprehension is used instead of map(lambda ...), which is both the
    idiomatic form and still a list on Python 3.
    """
    return [step[index] for step in steps]
318
def index_to_id(steps, index):
    """Map a pexpect match index back to its step id.

    Any index at or past the end of *steps* (i.e. the EOF pattern that
    callers append after the real patterns) maps to the sentinel "done".
    """
    return steps[index][0] if index < len(steps) else "done"
324
325 class DebugInterface:
326         def __init__(self, hostname):
327                 self.hostname = hostname
328                 self.session = None
329
    def getConnection(self):
        """Build and return a NodeConnection to self.hostname, or False on
        any failure (known_hosts update, session setup, or Rpyc connect).

        Failures are emailed via email_exception() rather than raised.
        """
        print "Creating session for %s" % self.hostname
        # update known_hosts file (in case the node has rebooted since last run)
        try:
            k = SSHKnownHosts(); k.update(self.hostname); k.write(); del k
        except:
            email_exception()
            print traceback.print_exc()
            return False

        try:
            # NOTE(review): config here is the imported monitor.config
            # module, so the == None branch looks unreachable — confirm.
            if config == None:
                self.session = PlanetLabSession(self.hostname, False, True)
            else:
                self.session = PlanetLabSession(self.hostname, config.nosetup, config.verbose)
        except Exception, e:
            msg = "ERROR setting up session for %s" % self.hostname
            print msg
            traceback.print_exc()
            email_exception(msg)
            return False

        try:
            conn = self.session.get_connection(config)
        except EOFError:
            # NOTE: sometimes the wait in setup_host() is not long enough.  
            # So, here we try to wait a little longer before giving up entirely.
            try:
                time.sleep(self.session.timeout*5)
                conn = self.session.get_connection(config)
            except:
                traceback.print_exc()
                email_exception(self.hostname)
                return False
        #print "trying to use conn before returning it."
        #print conn.c.modules.sys.path
        #print conn.c.modules.os.path.exists('/tmp/source')
        #time.sleep(1)

        #print "conn: %s" % conn
        return conn
371
    def getSequences(self):
        """Return the table mapping observed BootManager step sequences to
        repair actions.

        Keys are '-'-joined step ids (as produced by walking a node's log
        against the patterns from getBootManagerStepPatterns()); values
        name the handler to run: restart_bootmanager_* re-runs BootManager,
        restart_node_* power-cycles via restart_node(), and the *_email
        actions notify an operator instead of acting automatically.
        """
        # TODO: This can be replaced with a DB definition at a future time.
        #       This would make it possible for an admin to introduce new
        #       patterns without touching code.

        sequences = {}
        # restart_bootmanager_boot
        for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",

                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done",

                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
                "bminit-cfg-auth-getplc-update-debug-done",
                "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
                "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
                "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
                "bminit-cfg-auth-protoerror-exception-update-debug-done",
                "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
                "bminit-cfg-auth-getplc-implementerror-update-debug-done",
                ]:
            sequences.update({n : "restart_bootmanager_boot"})

        # restart_bootmanager_rins: conn.restart_bootmanager('rins')
        for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
                "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
                "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
                "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
                "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
                "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done",
                "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
                "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
                "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
                "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
                "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
                "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
                "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done",
                "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
                "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
                "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
                # actual solution appears to involve removing the bad files, and
                # continually trying to boot the node.
                "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
                "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
                "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
                ]:
            sequences.update({n : "restart_bootmanager_rins"})

        # repair_node_keys
        sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"})

        # restart_node_rins: conn.restart_node('rins')
        for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
                "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
                "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done",
                "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
                "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done",
                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done",
                "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
                "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
                ]:
            sequences.update({n : "restart_node_rins"})

        # restart_node_boot
        for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done",
                 "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
                 "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
                 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
                 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
                 "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
                 ]:
            sequences.update({n: "restart_node_boot"})

        # update_node_config_email
        for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
                  "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
                  "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
                ]:
            sequences.update({n : "update_node_config_email"})

        # nodenetwork_email
        for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", 
                   "bminit-cfg-update-exception-nodehostname-update-debug-done", 
                ]:
            sequences.update({n : "nodenetwork_email"})

        # update_bootcd_email
        for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
                "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
                "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
                "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
                "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
                ]:
            sequences.update({n : "update_bootcd_email"})

        # suspect_error_email
        for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
                ]:
            sequences.update({n: "suspect_error_email"})

        # update_hardware_email
        sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
        sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"})

        # broken_hardware_email
        sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})

        # bad_dns_email
        for n in [ 
         "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
            "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
            ]:
            sequences.update( { n : "bad_dns_email"})

        return sequences
493
    def getDiskSteps(self):
        """Return (id, regex) pairs recognizing disk-error lines in dmesg.

        The ids become steps in a node's sequence string.  The commented
        block at the bottom keeps the raw example dmesg lines the regexes
        were derived from.  NOTE(review): patterns are plain (non-raw)
        strings; \d and \w survive only because they are not recognized
        string escapes — raw strings would be safer.
        """
        steps = [
            ('scsierror'  , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
            ('ioerror'    , 'end_request: I/O error, dev sd\w+, sector \d+'),
            ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION  byte \w+ = \w+'),

            ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),

            ('hdaseekerror', 'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'),
            ('hdacorrecterror', 'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'),

            ('atareadyerror'   , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
            ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),

            ('sdXerror'   , 'sd\w: Current: sense key: Medium Error'),
            ('ext3error'   , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),

            ('floppytimeout','floppy0: floppy timeout called'),
            ('floppyerror',  'end_request: I/O error, dev fd\w+, sector \d+'),

            # hda: dma_intr: status=0x51 { DriveReady SeekComplete Error }
            # hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263

            # floppy0: floppy timeout called
            # end_request: I/O error, dev fd0, sector 0

            # Buffer I/O error on device dm-2, logical block 8888896
            # ata1: status=0x51 { DriveReady SeekComplete Error }
            # ata1: error=0x40 { UncorrectableError }
            # SCSI error : <0 0 0 0> return code = 0x8000002
            # sda: Current: sense key: Medium Error
            #       Additional sense: Unrecovered read error - auto reallocate failed

            # SCSI error : <0 2 0 0> return code = 0x40001
            # end_request: I/O error, dev sda, sector 572489600
        ]
        return steps
531
532         def getDiskSequence(self, steps, child):
533                 sequence = []
534                 while True:
535                         id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
536                         sequence.append(id)
537
538                         if id == "done":
539                                 break
540                 return sequence
541
542         def getBootManagerStepPatterns(self):
543                 steps = [
544                         ('bminit'               , 'Initializing the BootManager.'),
545                         ('cfg'                  , 'Reading node configuration file.'),
546                         ('auth'                 , 'Authenticating node with PLC.'),
547                         ('getplc'               , 'Retrieving details of node from PLC.'),
548                         ('update'               , 'Updating node boot state at PLC.'),
549                         ('hardware'             , 'Checking if hardware requirements met.'),
550                         ('installinit'  , 'Install: Initializing.'),
551                         ('installdisk'  , 'Install: partitioning disks.'),
552                         ('installbootfs', 'Install: bootstrapfs tarball.'),
553                         ('installcfg'   , 'Install: Writing configuration files.'),
554                         ('installstop'  , 'Install: Shutting down installer.'),
555                         ('update2'              , 'Updating node boot state at PLC.'),
556                         ('installinit2' , 'Install: Initializing.'),
557                         ('validate'             , 'Validating node installation.'),
558                         ('rebuildinitrd', 'Rebuilding initrd'),
559                         ('netcfg'               , 'Install: Writing Network Configuration files.'),
560                         ('update3'              , 'Updating node configuration.'),
561                         ('disk'                 , 'Checking for unused disks to add to LVM.'),
562                         ('update4'              , 'Sending hardware configuration to PLC.'),
563                         ('debug'                , 'Starting debug mode'),
564                         ('bmexceptmount', 'BootManagerException during mount'),
565                         ('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'),
566                         ('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
567                         ('exception'    , 'Exception'),
568                         ('nocfg'        , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
569                         ('protoerror'   , 'XML RPC protocol error'),
570                         ('nodehostname' , 'Configured node hostname does not resolve'),
571                         ('implementerror', 'Implementation Error'),
572                         ('readonlyfs'   , '[Errno 30] Read-only file system'),
573                         ('baddisk'      , "IOError: [Errno 13] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"),
574                         ('noinstall'    , 'notinstalled'),
575                         ('bziperror'    , 'bzip2: Data integrity error when decompressing.'),
576                         ('noblockdev'   , "No block devices detected."),
577                         ('dnserror'     , 'Name or service not known'),
578                         ('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
579                         ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
580                         ('hardwarerequirefail' , 'Hardware requirements not met'),
581                         ('mkfsfail'         , 'while running: Running mkfs.ext2 -q  -m 0 -j /dev/planetlab/vservers failed'),
582                         ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
583                         ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
584                         ('chrootfail'   , 'Running chroot /tmp/mnt/sysimg'),
585                         ('modulefail'   , 'Unable to get list of system modules'),
586                         ('writeerror'   , 'write error: No space left on device'),
587                         ('nospace'      , "No space left on device"),
588                         ('nonode'       , 'Failed to authenticate call: No such node'),
589                         ('authfail'     , 'Failed to authenticate call: Call could not be authenticated'),
590                         ('bootcheckfail'     , 'BootCheckAuthentication'),
591                         ('bootupdatefail'   , 'BootUpdateNode'),
592                 ]
593                 return steps
594
595         def getBootManagerSequenceFromLog(self, steps, child):
596                 sequence = []
597                 while True:
598                         
599                         index = child.expect( steps_to_list(steps) + [ pexpect.EOF ])
600                         id = index_to_id(steps,index)
601                         sequence.append(id)
602
603                         if id == "exception":
604                                 print "...Found An Exception!!!"
605                         elif id == "done": #index == len(steps_to_list(steps)):
606                                 #print "Reached EOF"
607                                 break
608
609                 return sequence
610                 
611
def restore(sitehist, hostname, config=None, forced_action=None):
	"""Try to recover a node that is stuck in debug state.

	Workflow:
	  1. Bail out early (notice + disable) if the node's BootCD is too old.
	  2. Open a debug connection; skip nodes not actually in 'debug' state
	     or whose BootManager is already running.
	  3. Scan dmesg for disk errors; if the drive looks bad, notify owners
	     and stop.
	  4. Scan bm.log, reduce it to a '-'-joined step-id sequence, and
	     dispatch on the action registered for that sequence (restart
	     BootManager, reinstall, or send an owner notice).

	sitehist      -- site-history object used to look up recent actions and
	                 send notices
	hostname      -- node to act on
	config        -- optional parsed command-line options (quiet/collect)
	forced_action -- currently unused (see the commented-out block below)

	Returns True when the node was handled or deliberately skipped, False
	when recovery cannot proceed (no connection, disk errors, or an API
	failure while gathering DNS info).
	"""

	# NOTE: Nothing works if the bootcd is REALLY old.
	#       So, this is the first step.

	fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
	recent_actions = sitehist.getRecentActions(hostname=hostname)

	if fbnode['observed_category'] == "OLDBOOTCD":
		print "\t...Notify owner to update BootImage!!!"

		# Skip the notice if one was already sent within the last 3
		# units of found_within's window (presumably days -- confirm).
		if not found_within(recent_actions, 'newbootcd_notice', 3):
			sitehist.sendMessage('newbootcd_notice', hostname=hostname)

			print "\tDisabling %s due to out-of-date BootImage" % hostname
			api.UpdateNode(hostname, {'boot_state' : 'disable'})

		# NOTE: nothing else is possible.
		return True

	debugnode = DebugInterface(hostname)
	conn = debugnode.getConnection()
	#print "conn: %s" % conn
	#print "trying to use conn after returning it."
	#print conn.c.modules.sys.path
	#print conn.c.modules.os.path.exists('/tmp/source')
	# getConnection() evidently returns False on failure; anything else is
	# treated as a usable NodeConnection.
	if type(conn) == type(False): return False

	#if forced_action == "reboot":
	#	conn.restart_node('rins')
	#	return True

	boot_state = conn.get_boot_state()
	if boot_state != "debug":
		print "... %s in %s state: skipping..." % (hostname , boot_state)
		# A node already in 'boot' counts as success; any other non-debug
		# state is reported as failure.
		return boot_state == "boot"

	if conn.bootmanager_running():
		print "...BootManager is currently running.  Skipping host %s" %hostname 
		return True

	# Read persistent flags, tagged on one week intervals.
	#pflags = PersistFlags(hostname, 3*60*60*24, db='debug_persistflags')

	if config and not config.quiet: print "...downloading dmesg from %s" %hostname 
	dmesg = conn.get_dmesg()
	child = fdpexpect.fdspawn(dmesg)

	# Classify the dmesg stream into disk-error step ids.
	steps = debugnode.getDiskSteps()
	sequence = debugnode.getDiskSequence(steps, child)

	s = Set(sequence)
	if config and not config.quiet: print "\tSET: ", s

	# The sequence always ends in 'done'; more than one distinct id means
	# at least one disk-error pattern matched.
	if len(s) > 1:
		print "...Potential drive errors on %s" % hostname 
		if len(s) == 2 and 'floppyerror' in s:
			# Floppy timeouts alone are tolerated; continue with the node.
			print "...Should investigate.  Continuing with node."
		else:
			print "...Should investigate.  Skipping node."
			# TODO: send message related to these errors.

			if not found_within(recent_actions, 'newbootcd_notice', 3):

				log=conn.get_dmesg().read()
				sitehist.sendMessage('baddisk_notice', hostname=hostname, log=log)
				conn.set_nodestate('disable')

			return False

	print "...Downloading bm.log from %s" %hostname 
	log = conn.get_bootmanager_log()
	child = fdpexpect.fdspawn(log)

	# --collect mode: the logs themselves were the goal, so stop here.
	if hasattr(config, 'collect') and config.collect: return True

	if config and not config.quiet: print "...Scanning bm.log for errors"

	time.sleep(1)

	steps = debugnode.getBootManagerStepPatterns()
	sequence = debugnode.getBootManagerSequenceFromLog(steps, child)
		
	# The '-'-joined step ids form the key used to select a repair action.
	s = "-".join(sequence)
	print "   FOUND SEQUENCE: ", s

	# NOTE: We get or set the flag based on the current sequence identifier.
	#  By using the sequence identifier, we guarantee that there will be no
	#  frequent loops.  I'm guessing there is a better way to track loops,
	#  though.

	sequences = debugnode.getSequences()
	flag_set = True
	
	if s not in sequences:
		print "   HOST %s" % hostname
		print "   UNKNOWN SEQUENCE: %s" % s

		args = {}
		args['hostname'] = hostname
		args['sequence'] = s
		args['bmlog'] = conn.get_bootmanager_log().read()
		args['viart'] = False

		sitehist.sendMessage('unknownsequence_notice', **args)

		conn.restart_bootmanager('boot')

		# NOTE: Do not set the pflags value for this sequence if it's unknown.
		# This way, we can check it again after we've fixed it.
		flag_set = False

	else:

		# Dispatch on the action string registered for this known sequence.
		if   sequences[s] == "restart_bootmanager_boot":
			print "...Restarting BootManager.py on %s "%hostname 
			conn.restart_bootmanager('boot')
		elif sequences[s] == "restart_bootmanager_rins":
			print "...Restarting BootManager.py on %s "%hostname 
			conn.restart_bootmanager('rins')
		elif sequences[s] == "restart_node_rins":
			conn.restart_node('rins')
		elif sequences[s] == "restart_node_boot":
			conn.restart_node('boot')
		elif sequences[s] == "repair_node_keys":
			if conn.compare_and_repair_nodekeys():
				# the keys either are in sync or were forced in sync.
				# so try to reboot the node again.
				conn.restart_bootmanager('rins')
				pass
			else:
				# there was some failure to synchronize the keys.
				print "...Unable to repair node keys on %s" %hostname 

		elif sequences[s] == "suspect_error_email":
			args = {}
			args['hostname'] = hostname
			args['sequence'] = s
			args['bmlog'] = conn.get_bootmanager_log().read()
			args['viart'] = False

			sitehist.sendMessage('unknownsequence_notice', **args)
			conn.restart_bootmanager('boot')

		# TODO: differentiate this and the 'nodenetwork_email' actions.
		elif sequences[s] == "update_node_config_email":

			if not found_within(recent_actions, 'nodeconfig_notice', 3):
				args = {}
				args['hostname'] = hostname
				sitehist.sendMessage('nodeconfig_notice', **args)
				conn.dump_plconf_file()

		elif sequences[s] == "nodenetwork_email":

			if not found_within(recent_actions, 'nodeconfig_notice', 3):
				args = {}
				args['hostname'] = hostname
				args['bmlog'] = conn.get_bootmanager_log().read()
				sitehist.sendMessage('nodeconfig_notice', **args)
				conn.dump_plconf_file()

		elif sequences[s] == "update_bootcd_email":

			if not found_within(recent_actions, 'newalphacd_notice', 3):
				args = {}
				args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
				args['hostname'] = hostname
			
				sitehist.sendMessage('newalphacd_notice', **args)

				# NOTE(review): message says "Disabling" but no UpdateNode /
				# set_nodestate call follows here -- confirm intent.
				print "\tDisabling %s due to out-of-date BOOTCD" % hostname

		elif sequences[s] == "broken_hardware_email":
			# MAKE An ACTION record that this host has failed hardware.  May
			# require either an exception "/minhw" or other manual intervention.
			# Definitely need to send out some more EMAIL.
			# TODO: email notice of broken hardware
			if not found_within(recent_actions, 'baddisk_notice', 1):
				print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
				args = {}
				args['hostname'] = hostname
				args['log'] = conn.get_dmesg().read()

				sitehist.sendMessage('baddisk_notice', **args)
				conn.set_nodestate('disable')

		elif sequences[s] == "update_hardware_email":
			if not found_within(recent_actions, 'minimalhardware_notice', 1):
				print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
				args = {}
				args['hostname'] = hostname
				args['bmlog'] = conn.get_bootmanager_log().read()
				sitehist.sendMessage('minimalhardware_notice', **args)

		elif sequences[s] == "bad_dns_email":
			if not found_within(recent_actions, 'baddns_notice', 1):
				print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
				args = {}
				try:
					node = api.GetNodes(hostname)[0]
					net = api.GetNodeNetworks(node['nodenetwork_ids'])[0]
				except:
					email_exception()
					print traceback.print_exc()
					# TODO: api error. skip email, b/c all info is not available,
					# flag_set will not be recorded.
					return False
				nodenet_str = network_config_to_str(net)

				args['hostname'] = hostname
				args['network_config'] = nodenet_str
				args['nodenetwork_id'] = net['nodenetwork_id']

				sitehist.sendMessage('baddns_notice', **args)

	# NOTE(review): flag_set is assigned above but never read in this
	# function -- presumably a leftover from the commented-out PersistFlags
	# bookkeeping.  Confirm before removing.
	return True
829         
830
831 # MAIN -------------------------------------------------------------------
832
def main():
	"""Command-line entry point: parse options, resolve the target node
	list (--nodelist file or a single node), then run reboot() on each."""
	from monitor import parser as parsermodule

	opt_parser = parsermodule.getParser()
	opt_parser.set_defaults(child=False, collect=False, nosetup=False, verbose=False,
	                        force=None, quiet=False)
	opt_parser.add_option("", "--child", dest="child", action="store_true",
	                      help="This is the child mode of this process.")
	opt_parser.add_option("", "--force", dest="force", metavar="boot_state",
	                      help="Force a boot state passed to BootManager.py.")
	opt_parser.add_option("", "--quiet", dest="quiet", action="store_true",
	                      help="Extra quiet output messages.")
	opt_parser.add_option("", "--verbose", dest="verbose", action="store_true",
	                      help="Extra debug output messages.")
	opt_parser.add_option("", "--nonet", dest="nonet", action="store_true",
	                      help="Do not setup the network, use existing log files to re-run a test pass.")
	opt_parser.add_option("", "--collect", dest="collect", action="store_true",
	                      help="No action, just collect dmesg, and bm.log")
	opt_parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
	                      help="Do not perform the orginary setup phase.")

	# Layer the shared 'nodesets'/'defaults' option groups on top.
	opt_parser = parsermodule.getParser(['nodesets', 'defaults'], opt_parser)
	options = parsermodule.parse_args(opt_parser)

	# A node-list file takes precedence over a single --node; with neither,
	# show usage and exit non-zero.
	if options.nodelist:
		targets = options.getListFromFile(options.nodelist)
	elif options.node:
		targets = [options.node]
	else:
		opt_parser.print_help()
		sys.exit(1)

	for target in targets:
		reboot(target, options)
867
# Script entry point: only run when executed directly, not when imported.
if __name__ == "__main__":
	main()