remove old blacklist
[monitor.git] / bootman.py
1 #!/usr/bin/python
2
3 # Attempt to reboot a node in debug state.
4
5
6
7 import os
8 import sys
9 import time
10 import random
11 import signal
12 import traceback
13 import subprocess
14 from sets import Set
15
16 from getsshkeys import SSHKnownHosts
17
18 from Rpyc import SocketConnection, Async
19 from Rpyc.Utils import *
20
21 import getconf
22 from monitor import config
23 from monitor import const
24 from monitor.model import *
25 from monitor.common import email_exception, found_within
26 from monitor.database.info.model import *
27 from monitor.wrapper import plc
28 from monitor.wrapper.emailTxt import mailtxt
29
30 from pcucontrol.util import command as moncommands
31 from pcucontrol.util.command import Sopen
32 from pcucontrol.transports.ssh import pxssh as pxssh
33 from pcucontrol.transports.ssh import fdpexpect as fdpexpect
34 from pcucontrol.transports.ssh import pexpect as pexpect
35
36 from nodeconfig import network_config_to_str
37
38
# Authenticated PLC API session shared by all node operations in this module.
api = plc.getAuthAPI()
# NOTE(review): fb is never assigned or read in this chunk; presumably a
# findbad-records handle populated elsewhere -- confirm before removing.
fb = None
41
42
class NodeConnection:
    """Remote-control handle for a single node, layered over an Rpyc connection.

    Every attribute reached through ``self.c.modules.<mod>`` is that module
    imported ON THE NODE, so e.g. ``self.c.modules.os.system(...)`` runs the
    command remotely.  Node state changes go through the module-level ``api``
    (authenticated PLC API).
    """

    def __init__(self, connection, node, config):
        # connection: Rpyc SocketConnection to the node (via the ssh tunnel)
        # node:       hostname of the target node
        # config:     monitor configuration object
        self.node = node
        self.c = connection
        self.config = config

    def get_boot_state(self):
        """Classify the node's current state by probing remote marker paths.

        Returns "debug" if /tmp/source exists (BootManager debug environment),
        "boot" if /vservers exists (normal production boot), else "unknown".
        """
        try:
            if self.c.modules.os.path.exists('/tmp/source'):
                return "debug"
            elif self.c.modules.os.path.exists('/vservers'):
                return "boot"
            else:
                return "unknown"
        except EOFError:
            # Rpyc connection died mid-call; dump the remote sys.path to help
            # diagnose what the far end was running.
            traceback.print_exc()
            print self.c.modules.sys.path
        except:
            # NOTE(review): bare except swallows every error and falls through
            # to "unknown" -- consider narrowing.
            traceback.print_exc()

        return "unknown"

    def get_dmesg(self):
        """Capture the node's dmesg into a remote file, download it locally,
        and return an open read-mode file object on the local copy."""
        self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
        download(self.c, "/var/log/dmesg.bm.log", "log/dmesg.%s.log" % self.node)
        log = open("log/dmesg.%s.log" % self.node, 'r')
        return log

    def get_bootmanager_log(self):
        """Download the BootManager log and return an open read-mode file
        object on the decompressed local copy."""
        # NOTE(review): /tmp/bm.log is saved as .gz and unpacked with zcat,
        # so the remote file is presumably gzip-compressed -- confirm.
        download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node)
        os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
        log = open("log/bm.%s.log" % self.node, 'r')
        return log

    def dump_plconf_file(self):
        """Run BootManager's config-reading steps on the node and print the
        resulting VARS dictionary (or an error if the config is unreadable)."""
        c = self.c
        # Make the BootManager source importable on the remote interpreter.
        self.c.modules.sys.path.append("/tmp/source/")
        self.c.modules.os.chdir('/tmp/source')

        log = c.modules.BootManager.log('/tmp/new.log')
        bm = c.modules.BootManager.BootManager(log,'boot')

        BootManagerException = c.modules.Exceptions.BootManagerException
        InitializeBootManager = c.modules.BootManager.InitializeBootManager
        ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
        bm_continue = True

        InitializeBootManager.Run(bm.VARS, bm.LOG)
        try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
        except Exception, x:
            bm_continue = False
            print "   ERROR:", x
            print "   Possibly, unable to find valid configuration file"

        if bm_continue:
            for key in bm.VARS.keys():
                print key, " == ", bm.VARS[key]
        else:
            print "   Unable to read Node Configuration"

    def compare_and_repair_nodekeys(self):
        """Compare the node's on-disk NODE_KEY against PLC's record and push
        the node's key to PLC on mismatch.

        Returns True when the keys match or the PLC update succeeds, False
        when the update fails, and None when the node config is unreadable.
        """
        c = self.c
        self.c.modules.sys.path.append("/tmp/source/")
        self.c.modules.os.chdir('/tmp/source')

        log = c.modules.BootManager.log('/tmp/new.log')
        bm = c.modules.BootManager.BootManager(log,'boot')

        BootManagerException = c.modules.Exceptions.BootManagerException
        InitializeBootManager = c.modules.BootManager.InitializeBootManager
        ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
        bm_continue = True

        plcnode = api.GetNodes({'hostname': self.node}, None)[0]

        InitializeBootManager.Run(bm.VARS, bm.LOG)
        try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
        except Exception, x:
            bm_continue = False
            print "exception"
            print x
            print "   Possibly, unable to find valid configuration file"

        if bm_continue:
            print "   NODE: %s" % bm.VARS['NODE_KEY']
            print "   PLC : %s" % plcnode['key']

            if bm.VARS['NODE_KEY'] == plcnode['key']:
                return True
            else:
                # Mismatch: trust the node's key and update PLC's record.
                if api.UpdateNode(self.node, {'key': bm.VARS['NODE_KEY']}):
                    print "   Successfully updated NODE_KEY with PLC"
                    return True
                else:
                    return False
        else:
            print "   Unable to retrieve NODE_KEY"

    def bootmanager_running(self):
        """Return True when the BootManager's /tmp/BM_RUNNING lock file exists."""
        if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
            return True
        else:
            return False

    def set_nodestate(self, state='boot'):
        """Set the node's boot_state at PLC; returns the API call's result."""
        return api.UpdateNode(self.node, {'boot_state' : state})

    def restart_node(self, state='boot'):
        """Reboot the node into ``state``.

        First attempt within a 24h window ('gentlekill' persist-flag unset):
        kill all slice processes and schedule a clean ``shutdown -r``.
        Subsequent attempts: force a sync/umount/reboot via sysrq 's','u','b'.
        """
        api.UpdateNode(self.node, {'boot_state' : state})

        pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags')
        if not pflags.getRecentFlag('gentlekill'):
            print "   Killing all slice processes... : %s" %  self.node
            cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
            self.c.modules.os.system(cmd_slicekill)
            cmd = """ shutdown -r +1 & """
            print "   Restarting %s : %s" % ( self.node, cmd)
            self.c.modules.os.system(cmd)

            # Remember that the gentle path was tried so the next attempt
            # escalates to sysrq.
            pflags.setRecentFlag('gentlekill')
            pflags.save()
        else:
            print "   Restarting with sysrq 'sub' %s" % self.node
            cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """
            self.c.modules.os.system(cmd)

        return

    def restart_bootmanager(self, forceState):
        """Launch BootManager.py on the node with the given forced state,
        unless an instance is already running (/tmp/BM_RUNNING lock)."""
        self.c.modules.os.chdir('/tmp/source')
        if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
            print "   BootManager is already running: try again soon..."
        else:
            print "   Starting 'BootManager.py %s' on %s " % (forceState, self.node)
            # Lock file is created before and removed after the run, so
            # bootmanager_running() stays accurate for the duration.
            cmd = "( touch /tmp/BM_RUNNING ;  " + \
                  "  python ./BootManager.py %s &> server.log < /dev/null ; " + \
                  "  rm -f /tmp/BM_RUNNING " + \
                  ") &"
            cmd = cmd % forceState
            self.c.modules.os.system(cmd)

        return
190
191
class PlanetLabSession:
    """Manages the ssh plumbing needed to reach an Rpyc server on a node.

    setup_host() copies the Rpyc package to the node, (re)starts a forking
    Rpyc server there, and opens a local ssh port-forward to it.  Each
    instance claims a distinct local port by bumping the class-level
    ``globalport`` counter.
    """

    # Base local tunnel port; randomized so concurrent monitor runs on the
    # same host are unlikely to collide.
    globalport = 22000 + int(random.random()*1000)

    def __init__(self, node, nosetup, verbose):
        # node:    hostname to reach
        # nosetup: when True, skip the rsync/server-start/tunnel setup
        # verbose: echo the shell commands being run
        self.verbose = verbose
        self.node = node
        self.port = None
        self.nosetup = nosetup
        self.command = None
        self.setup_host()

    def get_connection(self, config):
        """Return a NodeConnection speaking Rpyc through the local tunnel port."""
        conn = NodeConnection(SocketConnection("localhost", self.port), self.node, config)
        return conn

    def setup_host(self):
        """Copy Rpyc to the node, restart the remote Rpyc server, and open
        the ssh tunnel.

        Raises Exception when the rsync fails twice (after refreshing the
        ssh host key) or when the tunnel cannot be established.
        """
        # Claim a unique local port for this session's tunnel.
        self.port = PlanetLabSession.globalport
        PlanetLabSession.globalport = PlanetLabSession.globalport + 1

        args = {}
        args['port'] = self.port
        args['user'] = 'root'
        args['hostname'] = self.node
        args['monitordir'] = config.MONITOR_SCRIPT_ROOT
        ssh_port = 22

        if self.nosetup:
            print "Skipping setup"
            return

        # COPY Rpyc files to host
        cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
        if self.verbose: print cmd
        print cmd
        # TODO: Add timeout
        timeout = 120
        localos = moncommands.CMD()

        ret = localos.system(cmd, timeout)
        print ret
        if ret != 0:
            # First failure is assumed to be a stale/unknown host key:
            # refresh known_hosts for this node and retry once.
            print "\tUNKNOWN SSH KEY FOR %s; making an exception" % self.node
            k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k
            ret = localos.system(cmd, timeout)
            print ret
            if ret != 0:
                print "\tFAILED TWICE"
                raise Exception("Failed twice trying to login with updated ssh host key")

        t1 = time.time()
        # KILL any already running servers, then start a fresh forking Rpyc
        # server on the node (out.log traces progress remotely).
        ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port)
        (ov,ev) = ssh.run_noexcept2("""<<\EOF
            rm -f out.log
            echo "kill server" >> out.log
            ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ; 
            echo "export" >> out.log
            export PYTHONPATH=$HOME  ;
            echo "start server" >> out.log
            python Rpyc/Servers/forking_server.py &> server.log &
            echo "done" >> out.log
EOF""")
        print "setup rpyc server over ssh"
        print ssh.ret

        # Forward the local port to the node's Rpyc server (port 18812).
        # This was tricky to make synchronous.  The combination of
        # ssh-clients-4.7p1 and the following options seems to work well:
        # LocalCommand echoes READY once the forward is actually up.
        cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \
              """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \
              """-o ConnectTimeout=120 """ + \
              """-n -N -L %(port)s:localhost:18812 """ + \
              """%(user)s@%(hostname)s"""
        cmd = cmd % args
        if self.verbose: print cmd
        print cmd
        self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
        # read_t() bounds the wait for the READY marker; a plain read()
        # could block indefinitely.
        ret = moncommands.read_t(self.command.stdout, 5)

        t2 = time.time()
        if 'READY' in ret:
            # NOTE: There is still a slight race for machines that are slow...
            # Sleep roughly twice the setup time before first use.
            self.timeout = 2*(t2-t1)
            print "Sleeping for %s sec" % self.timeout
            time.sleep(self.timeout)
            return

        if self.command.returncode is not None:
            print "Failed to establish tunnel!"
            raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode))

        raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'")

    def __del__(self):
        """Tear down the ssh tunnel process when the session is collected."""
        if self.command:
            if self.verbose: print "Killing SSH session %s" % self.port
            print "Killing SSH session %s" % self.port
            self.command.kill()
314
315         
def steps_to_list(steps, index=1):
    """Project one column out of a list of step tuples (default: the
    pattern text at position 1)."""
    return [entry[index] for entry in steps]
318
def index_to_id(steps, index):
    """Map a pexpect match index back to its step id; an out-of-range
    index (e.g. the EOF entry appended after the steps) maps to "done"."""
    if index >= len(steps):
        return "done"
    return steps[index][0]
324
325 class DebugInterface:
    def __init__(self, hostname):
        # Target node hostname; the ssh/Rpyc session is created lazily by
        # getConnection().
        self.hostname = hostname
        self.session = None
329
    def getConnection(self):
        """Create a PlanetLabSession for this host and return a NodeConnection.

        Returns False on any failure (host-key refresh, session setup, or
        Rpyc connect); failures are emailed via email_exception().
        """
        print "Creating session for %s" % self.hostname
        # update known_hosts file (in case the node has rebooted since last run)
        try:
            k = SSHKnownHosts(); k.update(self.hostname); k.write(); del k
        except:
            email_exception()
            print traceback.print_exc()
            return False

        try:
            # Fall back to defaults (setup enabled, verbose) when no module
            # config is available.
            if config == None:
                self.session = PlanetLabSession(self.hostname, False, True)
            else:
                self.session = PlanetLabSession(self.hostname, config.nosetup, config.verbose)
        except Exception, e:
            msg = "ERROR setting up session for %s" % self.hostname
            print msg
            traceback.print_exc()
            email_exception(msg)
            return False

        try:
            conn = self.session.get_connection(config)
        except EOFError:
            # NOTE: sometimes the wait in setup_host() is not long enough.
            # So, here we try to wait a little longer before giving up entirely.
            try:
                time.sleep(self.session.timeout*5)
                conn = self.session.get_connection(config)
            except:
                traceback.print_exc()
                email_exception(self.hostname)
                return False

        return conn
371
372         def getSequences(self):
373
374                 # TODO: This can be replaced with a DB definition at a future time.
375                 #               This would make it possible for an admin to introduce new
376                 #               patterns without touching code.
377                 
378                 sequences = {}
379                 # restart_bootmanager_boot
380                 for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
381                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
382                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",
383
384                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done",
385
386                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
387                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
388                                 "bminit-cfg-auth-getplc-update-debug-done",
389                                 "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
390                                 "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
391                                 "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
392                                 "bminit-cfg-auth-protoerror-exception-update-debug-done",
393                                 "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
394                                 "bminit-cfg-auth-getplc-implementerror-update-debug-done",
395                                 ]:
396                         sequences.update({n : "restart_bootmanager_boot"})
397
398                 #       conn.restart_bootmanager('rins')
399                 for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
400                                 "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
401                                 "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
402                                 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
403                                 "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
404                                 "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done",
405                                 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
406                                 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
407                                 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
408                                 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
409                                 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
410                                 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
411                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
412                                 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done",
413                                 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
414                                 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
415                                 "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
416                                 # actual solution appears to involve removing the bad files, and
417                                 # continually trying to boot the node.
418                                 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
419                                 "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
420                                 "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
421                                 ]:
422                         sequences.update({n : "restart_bootmanager_rins"})
423
424                 # repair_node_keys
425                 sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"})
426
427                 #   conn.restart_node('rins')
428                 for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
429                                 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
430                                 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
431                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done",
432                                 "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
433                                 "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
434                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done",
435                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done",
436                                 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
437                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
438                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
439                                 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
440                                 ]:
441                         sequences.update({n : "restart_node_rins"})
442
443                 #       restart_node_boot
444                 for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done",
445                                  "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
446                                  "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
447                                  "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
448                                  "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
449                                  "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
450                                  ]:
451                         sequences.update({n: "restart_node_boot"})
452
453                 # update_node_config_email
454                 for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
455                                   "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
456                                   "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
457                                 ]:
458                         sequences.update({n : "update_node_config_email"})
459
460                 for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", 
461                                    "bminit-cfg-update-exception-nodehostname-update-debug-done", 
462                                 ]:
463                         sequences.update({n : "nodenetwork_email"})
464
465                 # update_bootcd_email
466                 for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
467                                 "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
468                                 "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
469                                 "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
470                                 "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
471                                 ]:
472                         sequences.update({n : "update_bootcd_email"})
473
474                 for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
475                                 ]:
476                         sequences.update({n: "suspect_error_email"})
477
478                 # update_hardware_email
479                 sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
480                 sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
481
482                 # broken_hardware_email
483                 sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})
484
485                 # bad_dns_email
486                 for n in [ 
487                  "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
488                         "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
489                         ]:
490                         sequences.update( { n : "bad_dns_email"})
491
492                 return sequences
493
494         def getDiskSteps(self):
495                 steps = [
496                         ('scsierror'  , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
497                         ('ioerror'    , 'end_request: I/O error, dev sd\w+, sector \d+'),
498                         ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION  byte \w+ = \w+'),
499
500                         ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
501
502                         ('hdaseekerror', 'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'),
503                         ('hdacorrecterror', 'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'),
504
505                         ('atareadyerror'   , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
506                         ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),
507
508                         ('sdXerror'   , 'sd\w: Current: sense key: Medium Error'),
509                         ('ext3error'   , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),
510
511                         ('floppytimeout','floppy0: floppy timeout called'),
512                         ('floppyerror',  'end_request: I/O error, dev fd\w+, sector \d+'),
513
514                         # hda: dma_intr: status=0x51 { DriveReady SeekComplete Error }
515                         # hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263
516
517                         # floppy0: floppy timeout called
518                         # end_request: I/O error, dev fd0, sector 0
519
520                         # Buffer I/O error on device dm-2, logical block 8888896
521                         # ata1: status=0x51 { DriveReady SeekComplete Error }
522                         # ata1: error=0x40 { UncorrectableError }
523                         # SCSI error : <0 0 0 0> return code = 0x8000002
524                         # sda: Current: sense key: Medium Error
525                         #       Additional sense: Unrecovered read error - auto reallocate failed
526
527                         # SCSI error : <0 2 0 0> return code = 0x40001
528                         # end_request: I/O error, dev sda, sector 572489600
529                 ]
530                 return steps
531
532         def getDiskSequence(self, steps, child):
533                 sequence = []
534                 while True:
535                         id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
536                         sequence.append(id)
537
538                         if id == "done":
539                                 break
540                 return sequence
541
542         def getBootManagerStepPatterns(self):
543                 steps = [
544                         ('bminit'               , 'Initializing the BootManager.'),
545                         ('cfg'                  , 'Reading node configuration file.'),
546                         ('auth'                 , 'Authenticating node with PLC.'),
547                         ('getplc'               , 'Retrieving details of node from PLC.'),
548                         ('update'               , 'Updating node boot state at PLC.'),
549                         ('hardware'             , 'Checking if hardware requirements met.'),
550                         ('installinit'  , 'Install: Initializing.'),
551                         ('installdisk'  , 'Install: partitioning disks.'),
552                         ('installbootfs', 'Install: bootstrapfs tarball.'),
553                         ('installcfg'   , 'Install: Writing configuration files.'),
554                         ('installstop'  , 'Install: Shutting down installer.'),
555                         ('update2'              , 'Updating node boot state at PLC.'),
556                         ('installinit2' , 'Install: Initializing.'),
557                         ('validate'             , 'Validating node installation.'),
558                         ('rebuildinitrd', 'Rebuilding initrd'),
559                         ('netcfg'               , 'Install: Writing Network Configuration files.'),
560                         ('update3'              , 'Updating node configuration.'),
561                         ('disk'                 , 'Checking for unused disks to add to LVM.'),
562                         ('update4'              , 'Sending hardware configuration to PLC.'),
563                         ('debug'                , 'Starting debug mode'),
564                         ('bmexceptmount', 'BootManagerException during mount'),
565                         ('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'),
566                         ('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
567                         ('exception'    , 'Exception'),
568                         ('nocfg'        , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
569                         ('protoerror'   , 'XML RPC protocol error'),
570                         ('nodehostname' , 'Configured node hostname does not resolve'),
571                         ('implementerror', 'Implementation Error'),
572                         ('readonlyfs'   , '[Errno 30] Read-only file system'),
573                         ('baddisk'      , "IOError: [Errno 13] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"),
574                         ('noinstall'    , 'notinstalled'),
575                         ('bziperror'    , 'bzip2: Data integrity error when decompressing.'),
576                         ('noblockdev'   , "No block devices detected."),
577                         ('dnserror'     , 'Name or service not known'),
578                         ('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
579                         ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
580                         ('hardwarerequirefail' , 'Hardware requirements not met'),
581                         ('mkfsfail'         , 'while running: Running mkfs.ext2 -q  -m 0 -j /dev/planetlab/vservers failed'),
582                         ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
583                         ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
584                         ('chrootfail'   , 'Running chroot /tmp/mnt/sysimg'),
585                         ('modulefail'   , 'Unable to get list of system modules'),
586                         ('writeerror'   , 'write error: No space left on device'),
587                         ('nospace'      , "No space left on device"),
588                         ('nonode'       , 'Failed to authenticate call: No such node'),
589                         ('authfail'     , 'Failed to authenticate call: Call could not be authenticated'),
590                         ('bootcheckfail'     , 'BootCheckAuthentication'),
591                         ('bootupdatefail'   , 'BootUpdateNode'),
592                 ]
593                 return steps
594
595         def getBootManagerSequenceFromLog(self, steps, child):
596                 sequence = []
597                 while True:
598                         
599                         index = child.expect( steps_to_list(steps) + [ pexpect.EOF ])
600                         id = index_to_id(steps,index)
601                         sequence.append(id)
602
603                         if id == "exception":
604                                 print "...Found An Exception!!!"
605                         elif id == "done": #index == len(steps_to_list(steps)):
606                                 #print "Reached EOF"
607                                 break
608
609                 return sequence
610                 
611
def restore(sitehist, hostname, config=None, forced_action=None):
	"""Attempt to repair a node stuck in the 'debug' boot state.

	Downloads dmesg and bm.log from the node over the debug-mode
	connection, matches them against the known disk-error and
	BootManager step patterns, and dispatches a repair action (restart
	BootManager, reinstall, repair keys, notify owners, or disable the
	node) keyed on the observed step sequence.

	sitehist      -- site-history object; used to send notices and list
	                 recent actions
	hostname      -- node to diagnose/repair
	config        -- optional parsed options; only .quiet and .collect
	                 are read here
	forced_action -- accepted but unused (the reboot shortcut below is
	                 commented out)

	Returns True when the node was handled (or needs no further
	action), False when diagnosis/repair could not proceed.
	"""

	# NOTE: Nothing works if the bootcd is REALLY old.
	#       So, this is the first step.

	fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
	recent_actions = sitehist.getRecentActions(hostname=hostname)

	if fbnode['observed_category'] == "OLDBOOTCD":
		print "\t...Notify owner to update BootImage!!!"

		# Rate-limit: only one notice per three units of recent history.
		if not found_within(recent_actions, 'newbootcd_notice', 3):
			sitehist.sendMessage('newbootcd_notice', hostname=hostname)

			print "\tDisabling %s due to out-of-date BootImage" % hostname
			api.UpdateNode(hostname, {'boot_state' : 'disable'})

		# NOTE: nothing else is possible.
		return True

	debugnode = DebugInterface(hostname)
	conn = debugnode.getConnection()
	#print "conn: %s" % conn
	#print "trying to use conn after returning it."
	#print conn.c.modules.sys.path
	#print conn.c.modules.os.path.exists('/tmp/source')
	# getConnection() signals failure by returning False rather than raising.
	if type(conn) == type(False): return False

	#if forced_action == "reboot":
	#	conn.restart_node('rins')
	#	return True

	boot_state = conn.get_boot_state()
	if boot_state != "debug":
		print "... %s in %s state: skipping..." % (hostname , boot_state)
		# Only 'boot' counts as success; any other non-debug state is failure.
		return boot_state == "boot"

	if conn.bootmanager_running():
		print "...BootManager is currently running.  Skipping host %s" %hostname 
		return True

	# Read persistent flags, tagged on one week intervals.

	# --- Pass 1: scan dmesg for disk/hardware errors --------------------
	if config and not config.quiet: print "...downloading dmesg from %s" %hostname 
	dmesg = conn.get_dmesg()
	child = fdpexpect.fdspawn(dmesg)

	steps = debugnode.getDiskSteps()
	sequence = debugnode.getDiskSequence(steps, child)

	# Unique disk-error ids found in dmesg ('done' is always present).
	s = Set(sequence)
	if config and not config.quiet: print "\tSET: ", s

	if len(s) > 1:
		print "...Potential drive errors on %s" % hostname 
		if len(s) == 2 and 'floppyerror' in s:
			# A lone floppy timeout is considered benign; keep going.
			print "...Should investigate.  Continuing with node."
		else:
			print "...Should investigate.  Skipping node."
			# TODO: send message related to these errors.

			# NOTE(review): this rate-limits on the 'newbootcd_notice'
			# key while sending 'baddisk_notice' -- confirm intended.
			if not found_within(recent_actions, 'newbootcd_notice', 3):

				log=conn.get_dmesg().read()
				sitehist.sendMessage('baddisk_notice', hostname=hostname, log=log)
				conn.set_nodestate('disable')

			return False

	# --- Pass 2: scan bm.log for the BootManager step sequence ----------
	print "...Downloading bm.log from %s" %hostname 
	log = conn.get_bootmanager_log()
	child = fdpexpect.fdspawn(log)

	# In --collect mode we only wanted the logs; stop before acting.
	if hasattr(config, 'collect') and config.collect: return True

	if config and not config.quiet: print "...Scanning bm.log for errors"

	time.sleep(1)

	steps = debugnode.getBootManagerStepPatterns()
	sequence = debugnode.getBootManagerSequenceFromLog(steps, child)

	# Canonical sequence identifier, e.g. "bminit-cfg-auth-...-done".
	s = "-".join(sequence)
	print "   FOUND SEQUENCE: ", s

	# NOTE: We get or set the flag based on the current sequence identifier.
	#  By using the sequence identifier, we guarantee that there will be no
	#  frequent loops.  I'm guessing there is a better way to track loops,
	#  though.

	sequences = debugnode.getSequences()
	flag_set = True

	if s not in sequences:
		print "   HOST %s" % hostname
		print "   UNKNOWN SEQUENCE: %s" % s

		args = {}
		args['hostname'] = hostname
		args['sequence'] = s
		args['bmlog'] = conn.get_bootmanager_log().read()
		args['viart'] = False

		sitehist.sendMessage('unknownsequence_notice', **args)

		conn.restart_bootmanager('boot')

		# NOTE: Do not set the pflags value for this sequence if it's unknown.
		# This way, we can check it again after we've fixed it.
		flag_set = False

	else:

		# Dispatch on the repair action registered for this sequence.
		if   sequences[s] == "restart_bootmanager_boot":
			print "...Restarting BootManager.py on %s "%hostname 
			conn.restart_bootmanager('boot')
		elif sequences[s] == "restart_bootmanager_rins":
			print "...Restarting BootManager.py on %s "%hostname 
			conn.restart_bootmanager('rins')
		elif sequences[s] == "restart_node_rins":
			conn.restart_node('rins')
		elif sequences[s] == "restart_node_boot":
			conn.restart_node('boot')
		elif sequences[s] == "repair_node_keys":
			if conn.compare_and_repair_nodekeys():
				# the keys either are in sync or were forced in sync.
				# so try to reboot the node again.
				conn.restart_bootmanager('rins')
				pass
			else:
				# there was some failure to synchronize the keys.
				print "...Unable to repair node keys on %s" %hostname 

		elif sequences[s] == "suspect_error_email":
			args = {}
			args['hostname'] = hostname
			args['sequence'] = s
			args['bmlog'] = conn.get_bootmanager_log().read()
			args['viart'] = False

			sitehist.sendMessage('unknownsequence_notice', **args)
			conn.restart_bootmanager('boot')

		# TODO: differentiate this and the 'nodenetwork_email' actions.
		elif sequences[s] == "update_node_config_email":

			if not found_within(recent_actions, 'nodeconfig_notice', 3):
				args = {}
				args['hostname'] = hostname
				sitehist.sendMessage('nodeconfig_notice', **args)
				conn.dump_plconf_file()

		elif sequences[s] == "nodenetwork_email":

			if not found_within(recent_actions, 'nodeconfig_notice', 3):
				args = {}
				args['hostname'] = hostname
				args['bmlog'] = conn.get_bootmanager_log().read()
				sitehist.sendMessage('nodeconfig_notice', **args)
				conn.dump_plconf_file()

		elif sequences[s] == "update_bootcd_email":

			if not found_within(recent_actions, 'newalphacd_notice', 3):
				args = {}
				args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
				args['hostname'] = hostname

				sitehist.sendMessage('newalphacd_notice', **args)

				# NOTE(review): message says "Disabling" but no
				# UpdateNode/set_nodestate call follows -- confirm.
				print "\tDisabling %s due to out-of-date BOOTCD" % hostname

		elif sequences[s] == "broken_hardware_email":
			# MAKE An ACTION record that this host has failed hardware.  May
			# require either an exception "/minhw" or other manual intervention.
			# Definitely need to send out some more EMAIL.
			# TODO: email notice of broken hardware
			if not found_within(recent_actions, 'baddisk_notice', 1):
				print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
				args = {}
				args['hostname'] = hostname
				args['log'] = conn.get_dmesg().read()

				sitehist.sendMessage('baddisk_notice', **args)
				conn.set_nodestate('disable')

		elif sequences[s] == "update_hardware_email":
			if not found_within(recent_actions, 'minimalhardware_notice', 1):
				print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
				args = {}
				args['hostname'] = hostname
				args['bmlog'] = conn.get_bootmanager_log().read()
				sitehist.sendMessage('minimalhardware_notice', **args)

		elif sequences[s] == "bad_dns_email":
			if not found_within(recent_actions, 'baddns_notice', 1):
				print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
				args = {}
				try:
					node = api.GetNodes(hostname)[0]
					net = api.GetNodeNetworks(node['nodenetwork_ids'])[0]
				except:
					email_exception()
					print traceback.print_exc()
					# TODO: api error. skip email, b/c all info is not available,
					# flag_set will not be recorded.
					return False
				nodenet_str = network_config_to_str(net)

				args['hostname'] = hostname
				args['network_config'] = nodenet_str
				args['nodenetwork_id'] = net['nodenetwork_id']

				sitehist.sendMessage('baddns_notice', **args)

	# NOTE(review): flag_set is computed but never consumed in the
	# visible code -- confirm whether pflags recording was removed.
	return True
828         
829
830 # MAIN -------------------------------------------------------------------
831
def main():
	"""Command-line entry point.

	Parses bootman-specific options plus the shared 'nodesets' and
	'defaults' option groups, builds the node list (from a --nodelist
	file or a single --node), and runs reboot() on each node.  Prints
	usage and exits with status 1 when no nodes are given.
	"""
	from monitor import parser as parsermodule
	parser = parsermodule.getParser()

	parser.set_defaults(child=False, collect=False, nosetup=False, verbose=False, 
						force=None, quiet=False)
	parser.add_option("", "--child", dest="child", action="store_true", 
						help="This is the child mode of this process.")
	parser.add_option("", "--force", dest="force", metavar="boot_state",
						help="Force a boot state passed to BootManager.py.")
	parser.add_option("", "--quiet", dest="quiet", action="store_true", 
						help="Extra quiet output messages.")
	parser.add_option("", "--verbose", dest="verbose", action="store_true", 
						help="Extra debug output messages.")
	parser.add_option("", "--nonet", dest="nonet", action="store_true", 
						help="Do not setup the network, use existing log files to re-run a test pass.")
	parser.add_option("", "--collect", dest="collect", action="store_true", 
						help="No action, just collect dmesg, and bm.log")
	# BUGFIX: help text previously read "orginary".
	parser.add_option("", "--nosetup", dest="nosetup", action="store_true", 
						help="Do not perform the ordinary setup phase.")

	# Fold in the shared option groups, then parse the command line.
	parser = parsermodule.getParser(['nodesets', 'defaults'], parser)
	config = parsermodule.parse_args(parser)

	if config.nodelist:
		nodes = config.getListFromFile(config.nodelist)
	elif config.node:
		nodes = [ config.node ]
	else:
		parser.print_help()
		sys.exit(1)

	for node in nodes:
		reboot(node, config)
866
# Run the command-line driver only when executed as a script.
if __name__ == "__main__":
	main()