# git commit: 531f883e6adad05612e458f1235a7f5395643662
# [monitor.git] / monitor / bootman.py
1 #!/usr/bin/python
2
3 # Attempt to reboot a node in debug state.
4
5
6
7 import os
8 import sys
9 import time
10 import random
11 import signal
12 import traceback
13 import subprocess
14 from sets import Set
15
16 from monitor.getsshkeys import SSHKnownHosts
17
18 from monitor.Rpyc import SocketConnection, Async
19 from monitor.Rpyc.Utils import *
20
21 from monitor import getconf
22 from monitor import config
23 from monitor import const
24 from monitor.model import *
25 from monitor.common import email_exception, found_within
26 from monitor.database.info.model import *
27 from monitor.database.info.interface import *
28 from monitor.wrapper import plc
29 from monitor.wrapper import plccache
30 from monitor.wrapper.emailTxt import mailtxt
31 from monitor.nodeconfig import network_config_to_str
32
33 from pcucontrol.util import command as moncommands
34 from pcucontrol.util.command import Sopen
35 from pcucontrol.transports.ssh import pxssh as pxssh
36 from pcucontrol.transports.ssh import fdpexpect as fdpexpect
37 from pcucontrol.transports.ssh import pexpect as pexpect
38
39
40
# Authenticated PLC API handle shared by every helper in this module.
api = plc.getAuthAPI()
# NOTE(review): presumably a findbad-record cache assigned by callers; it is
# never written in the code visible here -- confirm before relying on it.
fb = None


class ExceptionDoubleSSHError(Exception):
    """Raised when ssh login fails twice, even after refreshing the node's host key."""
    pass
46
class NodeConnection:
    """A live Rpyc connection to a node, plus helpers to inspect and control
    its boot state.

    Every access through ``self.c.modules.X`` proxies module ``X`` on the
    *remote* node, so e.g. ``self.c.modules.os.system(...)`` runs the command
    on the node itself, not locally.
    """

    def __init__(self, connection, node, config):
        # connection: an established Rpyc SocketConnection to the node
        # node:       node hostname (used for log file names and PLC calls)
        # config:     monitor configuration object (may be None)
        self.node = node
        self.c = connection
        self.config = config

    def get_boot_state(self):
        # Classify the node: "debug" when the BootManager source tree is
        # unpacked in /tmp/source, "boot" when the production /vservers
        # tree exists, otherwise "unknown".
        try:
            if self.c.modules.os.path.exists('/tmp/source'):
                return "debug"
            elif self.c.modules.os.path.exists('/vservers'):
                return "boot"
            else:
                return "unknown"
        except EOFError:
            # The remote side dropped the connection mid-call; dump the
            # remote sys.path for post-mortem context.
            traceback.print_exc()
            print self.c.modules.sys.path
        except:
            # Best-effort probe: report the failure but fall through to
            # "unknown" rather than crash the caller.
            email_exception()
            traceback.print_exc()

        return "unknown"

    def get_dmesg(self):
        # Capture the node's kernel ring buffer to a file on the node, pull
        # it into the local log/ directory, and return an open read handle
        # on the local copy (caller is responsible for closing it).
        self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
        download(self.c, "/var/log/dmesg.bm.log", "log/dmesg.%s.log" % self.node)
        log = open("log/dmesg.%s.log" % self.node, 'r')
        return log

    def get_bootmanager_log(self):
        # Fetch the BootManager log from the node.  Despite the .gz suffix
        # the file is copied verbatim (the zcat variant is disabled below),
        # then reopened locally and returned as an open read handle.
        download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node)
        #os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
        os.system("cp log/bm.%s.log.gz log/bm.%s.log" % (self.node, self.node))
        log = open("log/bm.%s.log" % self.node, 'r')
        return log

    def dump_plconf_file(self):
        # Drive the remote BootManager's configuration-reading steps and
        # print every resulting VARS entry, showing the configuration the
        # node would boot with.  Output goes to stdout only.
        c = self.c
        self.c.modules.sys.path.append("/tmp/source/")
        self.c.modules.os.chdir('/tmp/source')

        log = c.modules.BootManager.log('/tmp/new.log')
        bm = c.modules.BootManager.BootManager(log,'boot')

        BootManagerException = c.modules.Exceptions.BootManagerException
        InitializeBootManager = c.modules.BootManager.InitializeBootManager
        ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
        bm_continue = True

        InitializeBootManager.Run(bm.VARS, bm.LOG)
        try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
        except Exception, x:
            bm_continue = False
            print "   ERROR:", x
            print "   Possibly, unable to find valid configuration file"

        if bm_continue:
            for key in bm.VARS.keys():
                print key, " == ", bm.VARS[key]
        else:
            print "   Unable to read Node Configuration"

    def compare_and_repair_nodekeys(self):
        # Compare the node's on-disk NODE_KEY with the key PLC has on
        # record; on mismatch, push the node's key to PLC.  Returns True
        # when the keys match or the update succeeds, False when the update
        # fails, and None (implicitly) when the node config cannot be read.
        c = self.c
        self.c.modules.sys.path.append("/tmp/source/")
        self.c.modules.os.chdir('/tmp/source')

        log = c.modules.BootManager.log('/tmp/new.log')
        bm = c.modules.BootManager.BootManager(log,'boot')

        BootManagerException = c.modules.Exceptions.BootManagerException
        InitializeBootManager = c.modules.BootManager.InitializeBootManager
        ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
        bm_continue = True

        plcnode = plccache.GetNodeByName(self.node)

        InitializeBootManager.Run(bm.VARS, bm.LOG)
        try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
        except Exception, x:
            bm_continue = False
            print "exception"
            print x
            print "   Possibly, unable to find valid configuration file"

        if bm_continue:
            print "   NODE: %s" % bm.VARS['NODE_KEY']
            print "   PLC : %s" % plcnode['key']

            if bm.VARS['NODE_KEY'] == plcnode['key']:
                return True
            else:
                # The node's key wins: record it at PLC.
                if api.UpdateNode(self.node, {'key': bm.VARS['NODE_KEY']}):
                    print "   Successfully updated NODE_KEY with PLC"
                    return True
                else:
                    return False

            #for key in bm.VARS.keys():
            #       print key, " == ", bm.VARS[key]
        else:
            print "   Unable to retrieve NODE_KEY"

    def bootmanager_running(self):
        # /tmp/BM_RUNNING is the flag file created/removed by
        # restart_bootmanager() below.
        if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
            return True
        else:
            return False

    def set_nodestate(self, state='boot'):
        # Record the desired boot state at PLC without touching the node.
        return api.UpdateNode(self.node, {'boot_state' : state})

    def restart_node(self, state='boot'):
        """Set the node's PLC boot state and reboot it.

        The first attempt within a 24h window ('gentlekill' flag) kills all
        slice processes and schedules a clean ``shutdown -r +1``; a repeat
        attempt inside the window forces a reboot via sysrq s/u/b.
        """
        api.UpdateNode(self.node, {'boot_state' : state})

        # Flag persists for 24 hours (1*60*60*24 seconds).
        pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags')
        if not pflags.getRecentFlag('gentlekill'):
            print "   Killing all slice processes... : %s" %  self.node
            cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
            self.c.modules.os.system(cmd_slicekill)
            cmd = """ shutdown -r +1 & """
            print "   Restarting %s : %s" % ( self.node, cmd)
            self.c.modules.os.system(cmd)

            pflags.setRecentFlag('gentlekill')
            pflags.save()
        else:
            # Gentle restart already tried recently: sync, unmount, boot.
            print "   Restarting with sysrq 'sub' %s" % self.node
            cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """
            self.c.modules.os.system(cmd)

        return

    def restart_bootmanager(self, forceState):
        """Launch BootManager.py on the node with the given forced state,
        unless an instance is already running (flag file /tmp/BM_RUNNING).
        """
        self.c.modules.os.chdir('/tmp/source')
        if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
            print "   BootManager is already running: try again soon..."
        else:
            print "   Starting 'BootManager.py %s' on %s " % (forceState, self.node)
            # NOTE(review): '&>' is bash syntax; this assumes the remote
            # os.system() shell understands it -- confirm on the node image.
            cmd = "( touch /tmp/BM_RUNNING ;  " + \
                  "  python ./BootManager.py %s &> server.log < /dev/null ; " + \
                  "  rm -f /tmp/BM_RUNNING " + \
                  ") &"
            cmd = cmd % forceState
            self.c.modules.os.system(cmd)

        return
196
197
class PlanetLabSession:
    """Owns an ssh tunnel to an Rpyc server running on a node.

    setup_host() rsyncs the Rpyc sources to the node, (re)starts a forking
    Rpyc server there, and opens a local port-forward to it; get_connection()
    then returns a NodeConnection speaking Rpyc over that tunnel.
    """

    # Base local port for tunnels; randomized per process (22000-22999) so
    # concurrent monitor runs are unlikely to collide, then incremented once
    # per session so each session gets its own local port.
    globalport = 22000 + int(random.random()*1000)

    def __init__(self, node, nosetup, verbose):
        # node:    hostname to reach
        # nosetup: when True, skip the host-setup/tunnel phase entirely
        # verbose: echo the commands being run
        self.verbose = verbose
        self.node = node
        self.port = None
        self.nosetup = nosetup
        self.command = None
        self.setup_host()

    def get_connection(self, config):
        # Connect to the local end of the tunnel; sessions forward
        # localhost:self.port to the Rpyc server on the node.
        conn = NodeConnection(SocketConnection("localhost", self.port), self.node, config)
        #i = 0
        #while i < 3: 
        #       print i, conn.c.modules.sys.path
        #       print conn.c.modules.os.path.exists('/tmp/source')
        #       i+=1
        #       time.sleep(1)
        return conn

    def setup_host(self):
        """Prepare the node and open the ssh tunnel.

        Raises ExceptionDoubleSSHError when ssh fails even after refreshing
        the node's host key, and a generic Exception when the tunnel cannot
        be established.  On success sets self.timeout to twice the observed
        setup time and sleeps that long to let the remote server come up.
        """
        # Claim a unique local port for this session.
        self.port = PlanetLabSession.globalport
        PlanetLabSession.globalport = PlanetLabSession.globalport + 1

        args = {}
        args['port'] = self.port
        args['user'] = 'root'
        args['hostname'] = self.node
        # NOTE: 'config' here is the imported monitor.config module.
        args['monitordir'] = config.MONITOR_SCRIPT_ROOT
        ssh_port = 22

        if self.nosetup:
            print "Skipping setup"
            return

        # COPY Rpyc files to host
        cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
        if self.verbose: print cmd
        print cmd
        # TODO: Add timeout
        timeout = 120
        localos = moncommands.CMD()

        ret = localos.system(cmd, timeout)
        print ret
        if ret != 0:
            # First failure is assumed to be a stale/unknown host key:
            # refresh it straight from the node and retry exactly once.
            print "\tUNKNOWN SSH KEY FOR %s; making an exception" % self.node
            #print "MAKE EXPLICIT EXCEPTION FOR %s" % self.node
            k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k
            ret = localos.system(cmd, timeout)
            print ret
            if ret != 0:
                print "\tFAILED TWICE"
                #sys.exit(1)
                raise ExceptionDoubleSSHError("Failed twice trying to login with updated ssh host key")

        t1 = time.time()
        # KILL any already running servers, then start a fresh Rpyc forking
        # server on the node with PYTHONPATH pointing at $HOME.
        ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port)
        (ov,ev) = ssh.run_noexcept2("""<<\EOF
            rm -f out.log
            echo "kill server" >> out.log
            ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ; 
            echo "export" >> out.log
            export PYTHONPATH=$HOME  ;
            echo "start server" >> out.log
            python Rpyc/Servers/forking_server.py &> server.log &
            echo "done" >> out.log
EOF""")
        #cmd = """ssh %(user)s@%(hostname)s """ + \
        #        """'ps ax | grep Rpyc | grep -v grep | awk "{print \$1}" | xargs kill 2> /dev/null' """
        #cmd = cmd % args
        #if self.verbose: print cmd
        ## TODO: Add timeout
        #print localos.system(cmd,timeout)

        ## START a new rpyc server.
        #cmd = """ssh -n %(user)s@%(hostname)s "export PYTHONPATH=\$HOME; """ + \
        #        """python Rpyc/Servers/forking_server.py &> server.log < /dev/null &" """ 
        #cmd = cmd % args
        #if self.verbose: print cmd
        #print localos.system(cmd,timeout)
        print "setup rpyc server over ssh"
        print ssh.ret

        # TODO: Add timeout
        # This was tricky to make synchronous.  The combination of ssh-clients-4.7p1, 
        # and the following options seems to work well.
        # LocalCommand echoes "READY" on our side once the forward is up,
        # and ExitOnForwardFailure makes a failed -L fatal instead of silent.
        cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \
              """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \
              """-o ConnectTimeout=120 """ + \
              """-n -N -L %(port)s:localhost:18812 """ + \
              """%(user)s@%(hostname)s"""
        cmd = cmd % args
        if self.verbose: print cmd
        print cmd
        self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
        # TODO: the read() here may block indefinitely.  Need a better
        # approach therefore, that includes a timeout.
        #ret = self.command.stdout.read(5)
        ret = moncommands.read_t(self.command.stdout, 5)

        t2 = time.time()
        if 'READY' in ret:
            # NOTE: There is still a slight race for machines that are slow...
            self.timeout = 2*(t2-t1)
            print "Sleeping for %s sec" % self.timeout
            time.sleep(self.timeout)
            return

        if self.command.returncode is not None:
            print "Failed to establish tunnel!"
            raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode))

        raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'")

    def __del__(self):
        # Tear the tunnel down when the session object is collected.
        if self.command:
            if self.verbose: print "Killing SSH session %s" % self.port
            print "Killing SSH session %s" % self.port
            self.command.kill()
320
321         
def steps_to_list(steps, index=1):
    """Project column *index* out of a list of step tuples (default: the pattern column)."""
    return [step[index] for step in steps]
324
def index_to_id(steps, index):
    """Return the identifier of step *index*, or "done" when *index* is past the end."""
    if index >= len(steps):
        return "done"
    return steps[index][0]
330
331 class DebugInterface:
332         def __init__(self, hostname):
333                 self.hostname = hostname
334                 self.session = None
335
336         def getConnection(self):
337                 print "Creating session for %s" % self.hostname
338                 # update known_hosts file (in case the node has rebooted since last run)
339                 try:
340                         k = SSHKnownHosts(); k.update(self.hostname); k.write(); del k
341                 except:
342                         email_exception()
343                         print traceback.print_exc()
344                         return False
345
346                 try:
347                         if config == None:
348                                 self.session = PlanetLabSession(self.hostname, False, True)
349                         else:
350                                 self.session = PlanetLabSession(self.hostname, config.nosetup, config.verbose)
351                 except ExceptionDoubleSSHError, e:
352                         msg = "ERROR setting up session for %s" % self.hostname
353                         print msg
354                         return False
355                 except Exception, e:
356                         traceback.print_exc()
357                         email_exception(msg)
358                         return False
359
360                 try:
361                         conn = self.session.get_connection(config)
362                 except EOFError:
363                         # NOTE: sometimes the wait in setup_host() is not long enough.  
364                         # So, here we try to wait a little longer before giving up entirely.
365                         try:
366                                 time.sleep(self.session.timeout*5)
367                                 conn = self.session.get_connection(config)
368                         except EOFError:
369                                 # failed twice... no need to report this really, it's just in a
370                                 # weird state...
371                                 return False
372                         except:
373                                 traceback.print_exc()
374                                 email_exception(self.hostname)
375                                 return False
376                 #print "trying to use conn before returning it."
377                 #print conn.c.modules.sys.path
378                 #print conn.c.modules.os.path.exists('/tmp/source')
379                 #time.sleep(1)
380
381                 #print "conn: %s" % conn
382                 return conn
383
384         def getSequences(self):
385
386                 # TODO: This can be replaced with a DB definition at a future time.
387                 #               This would make it possible for an admin to introduce new
388                 #               patterns without touching code.
389                 
390                 sequences = {}
391                 # restart_bootmanager_boot
392                 for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
393                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
394                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",
395
396                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done",
397
398                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
399                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
400                                 "bminit-cfg-auth-getplc-update-debug-done",
401                                 "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
402                                 "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
403                                 "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
404                                 "bminit-cfg-auth-protoerror-exception-update-debug-done",
405                                 "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
406                                 "bminit-cfg-auth-getplc-implementerror-update-debug-done",
407                                 ]:
408                         sequences.update({n : "restart_bootmanager_boot"})
409
410                 #       conn.restart_bootmanager('reinstall')
411                 for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
412                                 "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
413                                 "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
414                                 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
415                                 "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
416                                 "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done",
417                                 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
418                                 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
419                                 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
420                                 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
421                                 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
422                                 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
423                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
424                                 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done",
425                                 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
426                                 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
427                                 "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
428                                 # actual solution appears to involve removing the bad files, and
429                                 # continually trying to boot the node.
430                                 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
431                                 "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
432                                 "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
433                                 ]:
434                         sequences.update({n : "restart_bootmanager_rins"})
435
436                 # repair_node_keys
437                 sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"})
438
439                 #   conn.restart_node('reinstall')
440                 for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
441                                 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
442                                 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
443                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done",
444                                 "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
445                                 "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
446                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done",
447                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done",
448                                 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
449                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
450                                 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
451                                 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
452                                 ]:
453                         sequences.update({n : "restart_node_rins"})
454
455                 #       restart_node_boot
456                 for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done",
457                                  "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
458                                  "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
459                                  "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
460                                  "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
461                                  "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
462                                  ]:
463                         sequences.update({n: "restart_node_boot"})
464
465                 # update_node_config_email
466                 for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
467                                   "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
468                                   "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
469                                 ]:
470                         sequences.update({n : "update_node_config_email"})
471
472                 for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", 
473                                    "bminit-cfg-update-exception-nodehostname-update-debug-done", 
474                                 ]:
475                         sequences.update({n : "nodenetwork_email"})
476
477                 # update_bootcd_email
478                 for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
479                                 "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
480                                 "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
481                                 "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
482                                 "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
483                                 ]:
484                         sequences.update({n : "update_bootcd_email"})
485
486                 for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
487                                 ]:
488                         sequences.update({n: "suspect_error_email"})
489
490                 # update_hardware_email
491                 sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
492                 sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
493
494                 # broken_hardware_email
495                 sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})
496
497                 # bad_dns_email
498                 for n in [ 
499                  "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
500                         "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
501                         ]:
502                         sequences.update( { n : "bad_dns_email"})
503
504                 return sequences
505
506         def getDiskSteps(self):
507                 steps = [
508                         ('scsierror'  , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
509                         ('ioerror'    , 'end_request: I/O error, dev sd\w+, sector \d+'),
510                         ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION  byte \w+ = \w+'),
511
512                         ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
513
514                         ('hdaseekerror', 'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'),
515                         ('hdacorrecterror', 'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'),
516
517                         ('atareadyerror'   , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
518                         ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),
519
520                         ('sdXerror'   , 'sd\w: Current: sense key: Medium Error'),
521                         ('ext3error'   , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),
522
523                         ('floppytimeout','floppy0: floppy timeout called'),
524                         ('floppyerror',  'end_request: I/O error, dev fd\w+, sector \d+'),
525
526                         # hda: dma_intr: status=0x51 { DriveReady SeekComplete Error }
527                         # hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263
528
529                         # floppy0: floppy timeout called
530                         # end_request: I/O error, dev fd0, sector 0
531
532                         # Buffer I/O error on device dm-2, logical block 8888896
533                         # ata1: status=0x51 { DriveReady SeekComplete Error }
534                         # ata1: error=0x40 { UncorrectableError }
535                         # SCSI error : <0 0 0 0> return code = 0x8000002
536                         # sda: Current: sense key: Medium Error
537                         #       Additional sense: Unrecovered read error - auto reallocate failed
538
539                         # SCSI error : <0 2 0 0> return code = 0x40001
540                         # end_request: I/O error, dev sda, sector 572489600
541                 ]
542                 return steps
543
544         def getDiskSequence(self, steps, child):
545                 sequence = []
546                 while True:
547                         id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
548                         sequence.append(id)
549
550                         if id == "done":
551                                 break
552                 return sequence
553
554         def getBootManagerStepPatterns(self):
555                 steps = [
556                         ('bminit'               , 'Initializing the BootManager.'),
557                         ('cfg'                  , 'Reading node configuration file.'),
558                         ('auth'                 , 'Authenticating node with PLC.'),
559                         ('getplc'               , 'Retrieving details of node from PLC.'),
560                         ('update'               , 'Updating node boot state at PLC.'),
561                         ('hardware'             , 'Checking if hardware requirements met.'),
562                         ('installinit'  , 'Install: Initializing.'),
563                         ('installdisk'  , 'Install: partitioning disks.'),
564                         ('installbootfs', 'Install: bootstrapfs tarball.'),
565                         ('installcfg'   , 'Install: Writing configuration files.'),
566                         ('installstop'  , 'Install: Shutting down installer.'),
567                         ('update2'              , 'Updating node boot state at PLC.'),
568                         ('installinit2' , 'Install: Initializing.'),
569                         ('validate'             , 'Validating node installation.'),
570                         ('rebuildinitrd', 'Rebuilding initrd'),
571                         ('netcfg'               , 'Install: Writing Network Configuration files.'),
572                         ('update3'              , 'Updating node configuration.'),
573                         ('disk'                 , 'Checking for unused disks to add to LVM.'),
574                         ('update4'              , 'Sending hardware configuration to PLC.'),
575                         ('debug'                , 'Starting debug mode'),
576                         ('bmexceptmount', 'BootManagerException during mount'),
577                         ('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'),
578                         ('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
579                         ('exception'    , 'Exception'),
580                         ('nocfg'        , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
581                         ('protoerror'   , 'XML RPC protocol error'),
582                         ('nodehostname' , 'Configured node hostname does not resolve'),
583                         ('implementerror', 'Implementation Error'),
584                         ('readonlyfs'   , '[Errno 30] Read-only file system'),
585                         ('baddisk'      , "IOError: [Errno 13] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"),
586                         ('noinstall'    , 'notinstalled'),
587                         ('bziperror'    , 'bzip2: Data integrity error when decompressing.'),
588                         ('noblockdev'   , "No block devices detected."),
589                         ('dnserror'     , 'Name or service not known'),
590                         ('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
591                         ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
592                         ('hardwarerequirefail' , 'Hardware requirements not met'),
593                         ('mkfsfail'         , 'while running: Running mkfs.ext2 -q  -m 0 -j /dev/planetlab/vservers failed'),
594                         ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
595                         ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
596                         ('chrootfail'   , 'Running chroot /tmp/mnt/sysimg'),
597                         ('modulefail'   , 'Unable to get list of system modules'),
598                         ('writeerror'   , 'write error: No space left on device'),
599                         ('nospace'      , "No space left on device"),
600                         ('nonode'       , 'Failed to authenticate call: No such node'),
601                         ('authfail'     , 'Failed to authenticate call: Call could not be authenticated'),
602                         ('bootcheckfail'     , 'BootCheckAuthentication'),
603                         ('bootupdatefail'   , 'BootUpdateNode'),
604                 ]
605                 return steps
606
607         def getBootManagerSequenceFromLog(self, steps, child):
608                 sequence = []
609                 while True:
610                         
611                         index = child.expect( steps_to_list(steps) + [ pexpect.EOF ])
612                         id = index_to_id(steps,index)
613                         sequence.append(id)
614
615                         if id == "exception":
616                                 print "...Found An Exception!!!"
617                         elif id == "done": #index == len(steps_to_list(steps)):
618                                 #print "Reached EOF"
619                                 break
620
621                 return sequence
622                 
623
def restore(sitehist, hostname, config=None, forced_action=None):
	"""Attempt to diagnose and recover *hostname*, a node expected to be
	in the 'debug' boot state.

	Workflow: refuse to act on an out-of-date BootCD, open a connection
	to the node, scan dmesg for disk-error signatures, then scan bm.log
	for a known sequence of BootManager step ids and dispatch the
	matching canned action (restart BootManager, reinstall, repair node
	keys, or notify the site owners).

	Arguments:
		sitehist      -- SiteInterface object used for message sending and
		                 recent-action history
		hostname      -- node hostname to recover
		config        -- parsed command-line options (quiet/collect), or None
		forced_action -- accepted but currently unused (see the
		                 commented-out 'reboot' block below)

	Returns True when the node was handled (or skipped in a good state),
	False when recovery could not proceed (no connection, disk errors,
	or an API failure while composing a DNS notice).
	"""

	# NOTE: Nothing works if the bootcd is REALLY old.
	#       So, this is the first step.

	fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
	recent_actions = sitehist.getRecentActions(hostname=hostname)

	if fbnode['observed_category'] == "OLDBOOTCD":
		print "\t...Notify owner to update BootImage!!!"

		# Rate-limit: only send if no notice went out within the last 3
		# units (presumably days -- confirm found_within() semantics).
		if not found_within(recent_actions, 'newbootcd_notice', 3):
			sitehist.sendMessage('newbootcd_notice', hostname=hostname)

			print "\tDisabling %s due to out-of-date BootImage" % hostname
			api.UpdateNode(hostname, {'boot_state' : 'disable'})

		# NOTE: nothing else is possible.
		return True

	debugnode = DebugInterface(hostname)
	conn = debugnode.getConnection()
	#print "conn: %s" % conn
	#print "trying to use conn after returning it."
	#print conn.c.modules.sys.path
	#print conn.c.modules.os.path.exists('/tmp/source')
	# getConnection() signals failure by returning a bool, not raising.
	if type(conn) == type(False): return False

	#if forced_action == "reboot":
	#	conn.restart_node('reinstall')
	#	return True

	boot_state = conn.get_boot_state()
	if boot_state != "debug":
		# Nothing to recover; report success only if it is fully booted.
		print "... %s in %s state: skipping..." % (hostname , boot_state)
		return boot_state == "boot"

	if conn.bootmanager_running():
		print "...BootManager is currently running.  Skipping host %s" %hostname 
		return True

	# Read persistent flags, tagged on one week intervals.

	# --- dmesg scan: check for disk errors before touching BootManager ---
	if config and not config.quiet: print "...downloading dmesg from %s" %hostname 
	dmesg = conn.get_dmesg()
	child = fdpexpect.fdspawn(dmesg)

	steps = debugnode.getDiskSteps()
	sequence = debugnode.getDiskSequence(steps, child)

	s = Set(sequence)
	if config and not config.quiet: print "\tSET: ", s

	# More than just the terminating 'done' id means at least one
	# disk-error pattern matched.
	if len(s) > 1:
		print "...Potential drive errors on %s" % hostname 
		# A lone floppy error is treated as benign enough to continue.
		if len(s) == 2 and 'floppyerror' in s:
			print "...Should investigate.  Continuing with node."
		else:
			print "...Should investigate.  Skipping node."
			# TODO: send message related to these errors.

			# NOTE(review): rate-limited on the 'newbootcd_notice' key
			# even though a 'baddisk_notice' is sent -- confirm intended.
			if not found_within(recent_actions, 'newbootcd_notice', 3):

				log=conn.get_dmesg().read()
				sitehist.sendMessage('baddisk_notice', hostname=hostname, log=log)
				conn.set_nodestate('disable')

			return False

	# --- bm.log scan: identify where BootManager stopped/failed ---
	print "...Downloading bm.log from %s" %hostname 
	log = conn.get_bootmanager_log()
	child = fdpexpect.fdspawn(log)

	# In --collect mode we only wanted the logs; stop here.
	if hasattr(config, 'collect') and config.collect: return True

	if config and not config.quiet: print "...Scanning bm.log for errors"

	time.sleep(1)

	steps = debugnode.getBootManagerStepPatterns()
	sequence = debugnode.getBootManagerSequenceFromLog(steps, child)
		
	# Collapse the observed step ids into a single identifier string
	# that keys the known-sequence -> action table.
	s = "-".join(sequence)
	print "   FOUND SEQUENCE: ", s

	# NOTE: We get or set the flag based on the current sequence identifier.
	#  By using the sequence identifier, we guarantee that there will be no
	#  frequent loops.  I'm guessing there is a better way to track loops,
	#  though.

	sequences = debugnode.getSequences()
	# NOTE(review): flag_set is computed but never consumed in this
	# function -- possibly leftover from the pflags mechanism above.
	flag_set = True
	
	if s not in sequences:
		# Unknown sequence: notify, try a plain BootManager restart, and
		# leave the flag unset so the sequence is re-examined next pass.
		print "   HOST %s" % hostname
		print "   UNKNOWN SEQUENCE: %s" % s

		args = {}
		args['hostname'] = hostname
		args['sequence'] = s
		args['bmlog'] = conn.get_bootmanager_log().read()
		args['viart'] = False

		sitehist.sendMessage('unknownsequence_notice', **args)

		conn.restart_bootmanager('boot')

		# NOTE: Do not set the pflags value for this sequence if it's unknown.
		# This way, we can check it again after we've fixed it.
		flag_set = False

	else:

		# Dispatch the canned action associated with this known sequence.
		if   sequences[s] == "restart_bootmanager_boot":
			print "...Restarting BootManager.py on %s "%hostname 
			conn.restart_bootmanager('boot')
		elif sequences[s] == "restart_bootmanager_rins":
			print "...Restarting BootManager.py on %s "%hostname 
			conn.restart_bootmanager('reinstall')
		elif sequences[s] == "restart_node_rins":
			conn.restart_node('reinstall')
		elif sequences[s] == "restart_node_boot":
			conn.restart_node('boot')
		elif sequences[s] == "repair_node_keys":
			if conn.compare_and_repair_nodekeys():
				# the keys either are in sync or were forced in sync.
				# so try to reboot the node again.
				conn.restart_bootmanager('reinstall')
				pass
			else:
				# there was some failure to synchronize the keys.
				print "...Unable to repair node keys on %s" %hostname 

		elif sequences[s] == "suspect_error_email":
			# Same treatment as an unknown sequence: mail the log and
			# retry a normal boot.
			args = {}
			args['hostname'] = hostname
			args['sequence'] = s
			args['bmlog'] = conn.get_bootmanager_log().read()
			args['viart'] = False

			sitehist.sendMessage('unknownsequence_notice', **args)
			conn.restart_bootmanager('boot')

		# TODO: differentiate this and the 'nodenetwork_email' actions.
		elif sequences[s] == "update_node_config_email":

			if not found_within(recent_actions, 'nodeconfig_notice', 3):
				args = {}
				args['hostname'] = hostname
				sitehist.sendMessage('nodeconfig_notice', **args)
				conn.dump_plconf_file()

		elif sequences[s] == "nodenetwork_email":

			if not found_within(recent_actions, 'nodeconfig_notice', 3):
				args = {}
				args['hostname'] = hostname
				args['bmlog'] = conn.get_bootmanager_log().read()
				sitehist.sendMessage('nodeconfig_notice', **args)
				conn.dump_plconf_file()

		elif sequences[s] == "update_bootcd_email":

			if not found_within(recent_actions, 'newalphacd_notice', 3):
				args = {}
				args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
				args['hostname'] = hostname
			
				sitehist.sendMessage('newalphacd_notice', **args)

				# NOTE(review): prints "Disabling" but no UpdateNode /
				# set_nodestate call follows -- confirm intended.
				print "\tDisabling %s due to out-of-date BOOTCD" % hostname

		elif sequences[s] == "broken_hardware_email":
			# MAKE An ACTION record that this host has failed hardware.  May
			# require either an exception "/minhw" or other manual intervention.
			# Definitely need to send out some more EMAIL.
			# TODO: email notice of broken hardware
			if not found_within(recent_actions, 'baddisk_notice', 1):
				print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
				args = {}
				args['hostname'] = hostname
				args['log'] = conn.get_dmesg().read()

				sitehist.sendMessage('baddisk_notice', **args)
				conn.set_nodestate('disable')

		elif sequences[s] == "update_hardware_email":
			if not found_within(recent_actions, 'minimalhardware_notice', 1):
				print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
				args = {}
				args['hostname'] = hostname
				args['bmlog'] = conn.get_bootmanager_log().read()
				sitehist.sendMessage('minimalhardware_notice', **args)

		elif sequences[s] == "bad_dns_email":
			if not found_within(recent_actions, 'baddns_notice', 1):
				print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
				args = {}
				try:
					node = plccache.GetNodeByName(hostname)
					net = api.GetInterfaces(node['interface_ids'])[0]
				except:
					email_exception()
					print traceback.print_exc()
					# TODO: api error. skip email, b/c all info is not available,
					# flag_set will not be recorded.
					return False
				nodenet_str = network_config_to_str(net)

				args['hostname'] = hostname
				args['network_config'] = nodenet_str
				args['interface_id'] = net['interface_id']

				sitehist.sendMessage('baddns_notice', **args)

	return True
840         
841
842 # MAIN -------------------------------------------------------------------
843
def main():
	"""Command-line entry point: parse options, resolve the node list,
	and run restore() over each node.

	Exits with status 1 (after printing usage) when neither --nodelist
	nor a single node is given.
	"""
	from monitor import parser as parsermodule
	parser = parsermodule.getParser()

	parser.set_defaults(child=False, collect=False, nosetup=False, verbose=False, 
						force=None, quiet=False)
	parser.add_option("", "--child", dest="child", action="store_true", 
						help="This is the child mode of this process.")
	parser.add_option("", "--force", dest="force", metavar="boot_state",
						help="Force a boot state passed to BootManager.py.")
	parser.add_option("", "--quiet", dest="quiet", action="store_true", 
						help="Extra quiet output messages.")
	parser.add_option("", "--verbose", dest="verbose", action="store_true", 
						help="Extra debug output messages.")
	parser.add_option("", "--nonet", dest="nonet", action="store_true", 
						help="Do not setup the network, use existing log files to re-run a test pass.")
	parser.add_option("", "--collect", dest="collect", action="store_true", 
						help="No action, just collect dmesg, and bm.log")
	parser.add_option("", "--nosetup", dest="nosetup", action="store_true", 
						help="Do not perform the ordinary setup phase.")

	parser = parsermodule.getParser(['nodesets', 'defaults'], parser)
	config = parsermodule.parse_args(parser)

	# Resolve the node list: a file of hostnames, a single node, or usage.
	if config.nodelist:
		nodes = config.getListFromFile(config.nodelist)
	elif config.node:
		nodes = [ config.node ]
	else:
		parser.print_help()
		sys.exit(1)

	for node in nodes:
		# Map hostname -> site loginbase to fetch the site history record.
		lb = plccache.plcdb_hn2lb[node]
		sitehist = SiteInterface.get_or_make(loginbase=lb)
		#reboot(node, config)
		# BUGFIX: previously called restore(..., config=None,
		# forced_action=None), which silently discarded the parsed
		# --quiet/--collect/--force options.
		restore(sitehist, node, config=config, forced_action=config.force)

if __name__ == "__main__":
	main()