# added comonquery command-line tool.
# [monitor.git] / monitor / bootman.py
1 #!/usr/bin/python
2
3 # Attempt to reboot a node in debug state.
4
5
6
7 import os
8 import sys
9 import time
10 import random
11 import signal
12 import traceback
13 import subprocess
14 from sets import Set
15
16 from monitor.getsshkeys import SSHKnownHosts
17
18 from monitor.Rpyc import SocketConnection, Async
19 from monitor.Rpyc.Utils import *
20
21 from monitor import getconf
22 from monitor import config
23 from monitor import const
24 from monitor.model import *
25 from monitor.common import email_exception, found_within
26 from monitor.database.info.model import *
27 from monitor.database.info.interface import *
28 from monitor.wrapper import plc
29 from monitor.wrapper import plccache
30 from monitor.wrapper.emailTxt import mailtxt
31 from monitor.nodeconfig import network_config_to_str
32
33 from pcucontrol.util import command as moncommands
34 from pcucontrol.util.command import Sopen
35 from pcucontrol.transports.ssh import pxssh as pxssh
36 from pcucontrol.transports.ssh import fdpexpect as fdpexpect
37 from pcucontrol.transports.ssh import pexpect as pexpect
38
39
40
# Shared authenticated PLCAPI proxy used throughout this module
# (UpdateNode calls in NodeConnection, etc.).
api = plc.getAuthAPI()
# NOTE(review): findbad-data handle; None at import time and never assigned
# in this chunk — presumably populated elsewhere or vestigial. TODO confirm.
fb = None
43
44
45 class ExceptionDoubleSSHError(Exception): pass
46
class NodeConnection:
        """Wrapper around an Rpyc connection to a node.

        All remote operations go through the mirrored module tree
        (self.c.modules.os, self.c.modules.BootManager, ...), so code that
        looks local actually executes on the node."""

        def __init__(self, connection, node, config):
                # connection: Rpyc SocketConnection (see PlanetLabSession.get_connection)
                # node:       hostname of the remote node
                # config:     monitor configuration object
                self.node = node
                self.c = connection
                self.config = config

        def get_boot_state(self):
                """Classify the node's state from filesystem markers:
                /tmp/source (unpacked BootManager) -> "debug",
                /vservers (installed system)       -> "boot",
                anything else, or any error        -> "unknown"."""
                try:
                        if self.c.modules.os.path.exists('/tmp/source'):
                                return "debug"
                        elif self.c.modules.os.path.exists('/vservers'): 
                                return "boot"
                        else:
                                return "unknown"
                except EOFError:
                        # Remote side closed the Rpyc stream mid-request.
                        traceback.print_exc()
                        print self.c.modules.sys.path
                except:
                        email_exception()
                        traceback.print_exc()

                return "unknown"

        def get_dmesg(self):
                """Dump the remote kernel ring buffer, download it to
                log/dmesg.<node>.log, and return an open read handle on it."""
                self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
                download(self.c, "/var/log/dmesg.bm.log", "log/dmesg.%s.log" % self.node)
                log = open("log/dmesg.%s.log" % self.node, 'r')
                return log

        def get_bootmanager_log(self):
                """Download /tmp/bm.log from the node and return an open read
                handle on the local copy (log/bm.<node>.log)."""
                download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node)
                #os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
                # NOTE(review): despite the .gz name the file is copied, not
                # decompressed — presumably bm.log is plain text; confirm.
                os.system("cp log/bm.%s.log.gz log/bm.%s.log" % (self.node, self.node))
                log = open("log/bm.%s.log" % self.node, 'r')
                return log

        def dump_plconf_file(self):
                """Run the BootManager config-reading steps remotely and print
                the resulting VARS dict, for diagnosis. Prints an error note
                when the node configuration cannot be read."""
                c = self.c
                self.c.modules.sys.path.append("/tmp/source/")
                self.c.modules.os.chdir('/tmp/source')

                log = c.modules.BootManager.log('/tmp/new.log')
                bm = c.modules.BootManager.BootManager(log,'boot')

                # NOTE(review): BootManagerException is bound but unused here.
                BootManagerException = c.modules.Exceptions.BootManagerException
                InitializeBootManager = c.modules.BootManager.InitializeBootManager
                ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
                bm_continue = True

                InitializeBootManager.Run(bm.VARS, bm.LOG)
                try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
                except Exception, x:
                        bm_continue = False
                        print "   ERROR:", x
                        print "   Possibly, unable to find valid configuration file"

                if bm_continue:
                        for key in bm.VARS.keys():
                                print key, " == ", bm.VARS[key]
                else:
                        print "   Unable to read Node Configuration"
                

        def compare_and_repair_nodekeys(self):
                """Compare the node's on-disk NODE_KEY with the key registered
                at PLC; if they differ, push the node's key up to PLC.

                Returns True when the keys match or the update succeeds,
                False when the update fails, and None (implicitly) when the
                node configuration cannot be read."""
                c = self.c
                self.c.modules.sys.path.append("/tmp/source/")
                self.c.modules.os.chdir('/tmp/source')

                log = c.modules.BootManager.log('/tmp/new.log')
                bm = c.modules.BootManager.BootManager(log,'boot')

                # NOTE(review): BootManagerException is bound but unused here.
                BootManagerException = c.modules.Exceptions.BootManagerException
                InitializeBootManager = c.modules.BootManager.InitializeBootManager
                ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
                bm_continue = True

                # PLC's view of this node, from the local cache.
                plcnode = plccache.GetNodeByName(self.node)

                InitializeBootManager.Run(bm.VARS, bm.LOG)
                try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
                except Exception, x:
                        bm_continue = False
                        print "exception"
                        print x
                        print "   Possibly, unable to find valid configuration file"

                if bm_continue:
                        print "   NODE: %s" % bm.VARS['NODE_KEY']
                        print "   PLC : %s" % plcnode['key']

                        if bm.VARS['NODE_KEY'] == plcnode['key']:
                                return True
                        else:
                                # Keys differ: make PLC agree with the node.
                                if api.UpdateNode(self.node, {'key': bm.VARS['NODE_KEY']}):
                                        print "   Successfully updated NODE_KEY with PLC"
                                        return True
                                else:
                                        return False
                                
                        #for key in bm.VARS.keys():
                        #       print key, " == ", bm.VARS[key]
                else:
                        print "   Unable to retrieve NODE_KEY"

        def bootmanager_running(self):
                # BootManager touches /tmp/BM_RUNNING while active
                # (see restart_bootmanager below).
                if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
                        return True
                else:
                        return False

        def set_nodestate(self, state='boot'):
                # Record the desired boot state at PLC without touching the node.
                return api.UpdateNode(self.node, {'boot_state' : state})

        def restart_node(self, state='boot'):
                """Set the PLC boot state and reboot the node.

                First attempt within a 24h window: kill slice processes and do a
                graceful 'shutdown -r +1'. If that was already tried recently
                (per-node persist flag), force a sysrq sync/umount/boot."""
                api.UpdateNode(self.node, {'boot_state' : state})

                # Per-node flag remembered for 24 hours.
                pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags')
                if not pflags.getRecentFlag('gentlekill'):
                        print "   Killing all slice processes... : %s" %  self.node
                        cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
                        self.c.modules.os.system(cmd_slicekill)
                        cmd = """ shutdown -r +1 & """
                        print "   Restarting %s : %s" % ( self.node, cmd)
                        self.c.modules.os.system(cmd)

                        pflags.setRecentFlag('gentlekill')
                        pflags.save()
                else:
                        # Gentle path tried recently; force sync ('s'),
                        # unmount ('u'), then reboot ('b') via sysrq.
                        print "   Restarting with sysrq 'sub' %s" % self.node
                        cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """
                        self.c.modules.os.system(cmd)

                return

        def restart_bootmanager(self, forceState):
                """Launch 'BootManager.py <forceState>' on the node in the
                background, unless one is already running (marker file
                /tmp/BM_RUNNING, removed when BootManager exits)."""
                self.c.modules.os.chdir('/tmp/source')
                if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
                        print "   BootManager is already running: try again soon..."
                else:
                        print "   Starting 'BootManager.py %s' on %s " % (forceState, self.node)
                        cmd = "( touch /tmp/BM_RUNNING ;  " + \
                              "  python ./BootManager.py %s &> server.log < /dev/null ; " + \
                                  "  rm -f /tmp/BM_RUNNING " + \
                                  ") &" 
                        cmd = cmd % forceState
                        self.c.modules.os.system(cmd)

                return 
196
197
class PlanetLabSession:
        """Bootstrap an Rpyc server on a node over ssh and keep an ssh tunnel
        (localhost:<port> -> node:18812) open for the life of the object."""
        # Class-wide counter for local tunnel ports; the randomized base
        # reduces collisions between concurrent monitor processes.
        globalport = 22000 + int(random.random()*1000)

        def __init__(self, node, nosetup, verbose):
                # node:    hostname to reach
                # nosetup: skip copying Rpyc / starting the remote server
                # verbose: print the commands being run
                self.verbose = verbose
                self.node = node
                self.port = None
                self.nosetup = nosetup
                self.command = None
                self.setup_host()

        def get_connection(self, config):
                """Return a NodeConnection speaking Rpyc through the local end
                of the tunnel opened by setup_host()."""
                conn = NodeConnection(SocketConnection("localhost", self.port), self.node, config)
                #i = 0
                #while i < 3: 
                #       print i, conn.c.modules.sys.path
                #       print conn.c.modules.os.path.exists('/tmp/source')
                #       i+=1
                #       time.sleep(1)
                return conn
        
        def setup_host(self):
                """Copy the Rpyc package to the node, (re)start a forking Rpyc
                server there, and open a local port-forward to it.

                Raises ExceptionDoubleSSHError when ssh login fails even after
                the host key is refreshed, and a generic Exception when the
                tunnel cannot be established."""
                # Claim the next local port for this session's tunnel.
                self.port = PlanetLabSession.globalport
                PlanetLabSession.globalport = PlanetLabSession.globalport + 1

                args = {}
                args['port'] = self.port
                args['user'] = 'root'
                args['hostname'] = self.node
                args['monitordir'] = config.MONITOR_SCRIPT_ROOT
                ssh_port = 22

                if self.nosetup:
                        print "Skipping setup"
                        return 

                # COPY Rpyc files to host
                cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
                if self.verbose: print cmd
                print cmd
                # TODO: Add timeout
                timeout = 120
                localos = moncommands.CMD()

                ret = localos.system(cmd, timeout)
                print ret
                if ret != 0:
                        # First failure is usually a stale/missing host key:
                        # refresh known_hosts and retry once.
                        print "\tUNKNOWN SSH KEY FOR %s; making an exception" % self.node
                        #print "MAKE EXPLICIT EXCEPTION FOR %s" % self.node
                        k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k
                        ret = localos.system(cmd, timeout)
                        print ret
                        if ret != 0:
                                print "\tFAILED TWICE"
                                #sys.exit(1)
                                raise ExceptionDoubleSSHError("Failed twice trying to login with updated ssh host key")

                # t1/t2 measure server startup so we know how long to wait below.
                t1 = time.time()
                # KILL any already running servers.
                ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port)
                (ov,ev) = ssh.run_noexcept2("""<<\EOF
            rm -f out.log
            echo "kill server" >> out.log
            ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ; 
            echo "export" >> out.log
            export PYTHONPATH=$HOME  ;
            echo "start server" >> out.log
            python Rpyc/Servers/forking_server.py &> server.log &
            echo "done" >> out.log
EOF""")
                #cmd = """ssh %(user)s@%(hostname)s """ + \
                #        """'ps ax | grep Rpyc | grep -v grep | awk "{print \$1}" | xargs kill 2> /dev/null' """
                #cmd = cmd % args
                #if self.verbose: print cmd
                ## TODO: Add timeout
                #print localos.system(cmd,timeout)

                ## START a new rpyc server.
                #cmd = """ssh -n %(user)s@%(hostname)s "export PYTHONPATH=\$HOME; """ + \
                #        """python Rpyc/Servers/forking_server.py &> server.log < /dev/null &" """ 
                #cmd = cmd % args
                #if self.verbose: print cmd
                #print localos.system(cmd,timeout)
                print "setup rpyc server over ssh"
                print ssh.ret

                # TODO: Add timeout
                # This was tricky to make synchronous.  The combination of ssh-clients-4.7p1, 
                # and the following options seems to work well.
                # LocalCommand prints "READY" on our side once the forward is up;
                # ExitOnForwardFailure makes ssh die if the forward can't be made.
                cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \
                          """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \
                          """-o ConnectTimeout=120 """ + \
                          """-n -N -L %(port)s:localhost:18812 """ + \
                          """%(user)s@%(hostname)s"""
                cmd = cmd % args
                if self.verbose: print cmd
                print cmd
                self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
                # TODO: the read() here may block indefinitely.  Need a better
                # approach therefore, that includes a timeout.
                #ret = self.command.stdout.read(5)
                ret = moncommands.read_t(self.command.stdout, 5)

                t2 = time.time()
                if 'READY' in ret:
                        # NOTE: There is still a slight race for machines that are slow...
                        # Wait twice as long as the server took to start.
                        self.timeout = 2*(t2-t1)
                        print "Sleeping for %s sec" % self.timeout
                        time.sleep(self.timeout)
                        return

                if self.command.returncode is not None:
                        print "Failed to establish tunnel!"
                        raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode))

                raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'")

        def __del__(self):
                # Tear down the tunnel process when the session is collected.
                if self.command:
                        if self.verbose: print "Killing SSH session %s" % self.port
                        print "Killing SSH session %s" % self.port
                        self.command.kill()
320
321         
def steps_to_list(steps, index=1):
        """Project one column out of a list of step tuples.

        With the default index=1 this yields the pattern column of
        (id, pattern) pairs, suitable for feeding to pexpect."""
        return [step[index] for step in steps]
324
def index_to_id(steps,index):
        """Map a pexpect match index back to its step id.

        An index past the end of the steps table (e.g. the appended
        pexpect.EOF entry) maps to the sentinel id "done"."""
        if index >= len(steps):
                return "done"
        return steps[index][0]
330
331 class DebugInterface:
332         def __init__(self, hostname):
333                 self.hostname = hostname
334                 self.session = None
335
336         def getConnection(self):
337                 print "Creating session for %s" % self.hostname
338                 # update known_hosts file (in case the node has rebooted since last run)
339                 try:
340                         k = SSHKnownHosts(); k.update(self.hostname); k.write(); del k
341                 except:
342                         email_exception()
343                         print traceback.print_exc()
344                         return False
345
346                 try:
347                         if config == None:
348                                 self.session = PlanetLabSession(self.hostname, False, True)
349                         else:
350                                 self.session = PlanetLabSession(self.hostname, config.nosetup, config.verbose)
351                 except ExceptionDoubleSSHError, e:
352                         msg = "ERROR setting up session for %s" % self.hostname
353                         print msg
354                         return False
355                 except Exception, e:
356                         traceback.print_exc()
357                         email_exception(msg)
358                         return False
359
360                 try:
361                         conn = self.session.get_connection(config)
362                 except EOFError:
363                         # NOTE: sometimes the wait in setup_host() is not long enough.  
364                         # So, here we try to wait a little longer before giving up entirely.
365                         try:
366                                 time.sleep(self.session.timeout*5)
367                                 conn = self.session.get_connection(config)
368                         except EOFError:
369                                 # failed twice... no need to report this really, it's just in a
370                                 # weird state...
371                                 return False
372                         except:
373                                 traceback.print_exc()
374                                 email_exception(self.hostname)
375                                 return False
376                 #print "trying to use conn before returning it."
377                 #print conn.c.modules.sys.path
378                 #print conn.c.modules.os.path.exists('/tmp/source')
379                 #time.sleep(1)
380
381                 #print "conn: %s" % conn
382                 return conn
383
        def getSequences(self):
                """Return the static table mapping an observed BootManager step
                sequence (the step ids from the bm.log scan, joined by '-') to
                the name of the repair action to take for that pattern."""

                # TODO: This can be replaced with a DB definition at a future time.
                #               This would make it possible for an admin to introduce new
                #               patterns without touching code.
                
                sequences = {}
                # restart_bootmanager_boot
                for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
                                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
                                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",

                                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done",

                                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
                                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
                                "bminit-cfg-auth-getplc-update-debug-done",
                                "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
                                "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
                                "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
                                "bminit-cfg-auth-protoerror-exception-update-debug-done",
                                "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
                                "bminit-cfg-auth-getplc-implementerror-update-debug-done",
                                ]:
                        sequences.update({n : "restart_bootmanager_boot"})

                #       conn.restart_bootmanager('reinstall')
                for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
                                "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
                                "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
                                "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
                                "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
                                "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done",
                                "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
                                "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
                                "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
                                "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
                                "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
                                "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
                                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
                                "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done",
                                "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
                                "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
                                "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
                                # actual solution appears to involve removing the bad files, and
                                # continually trying to boot the node.
                                "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
                                "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
                                "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
                                "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done",
                                "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-validate-done",
                                ]:
                        sequences.update({n : "restart_bootmanager_rins"})

                # repair_node_keys
                for n in ["bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-validate-exception-done",
                                        "bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done",
                                ]:
                        sequences.update({n: "repair_node_keys"})

                #   conn.restart_node('reinstall')
                for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
                                "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
                                "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
                                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done",
                                "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
                                "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
                                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done",
                                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done",
                                "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
                                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
                                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
                                "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
                                ]:
                        sequences.update({n : "restart_node_rins"})

                #       restart_node_boot
                for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done",
                                 "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
                                 "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
                                 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
                                 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
                                 "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
                                 "bminit-cfg-auth-getplc-exception-update-bootupdatefail-debug-done",
                                 ]:
                        sequences.update({n: "restart_node_boot"})

                # update_node_config_email
                for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
                                  "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
                                  "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
                                ]:
                        sequences.update({n : "update_node_config_email"})

                for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", 
                                   "bminit-cfg-update-exception-nodehostname-update-debug-done", 
                                ]:
                        sequences.update({n : "nodenetwork_email"})

                # update_bootcd_email
                for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
                                "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
                                "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
                                "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
                                "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
                                ]:
                        sequences.update({n : "update_bootcd_email"})

                for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
                                ]:
                        sequences.update({n: "suspect_error_email"})

                # update_hardware_email
                sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
                sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"})

                # broken_hardware_email
                sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})

                # bad_dns_email
                for n in [ 
                 "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
                        "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
                        ]:
                        sequences.update( { n : "bad_dns_email"})

                return sequences
511
512         def getDiskSteps(self):
513                 steps = [
514                         ('scsierror'  , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
515                         ('ioerror'    , 'end_request: I/O error, dev sd\w+, sector \d+'),
516                         ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION  byte \w+ = \w+'),
517
518                         ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
519
520                         ('hdaseekerror', 'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'),
521                         ('hdacorrecterror', 'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'),
522
523                         ('atareadyerror'   , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
524                         ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),
525
526                         ('sdXerror'   , 'sd\w: Current: sense key: Medium Error'),
527                         ('ext3error'   , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),
528
529                         ('floppytimeout','floppy0: floppy timeout called'),
530                         ('floppyerror',  'end_request: I/O error, dev fd\w+, sector \d+'),
531
532                         # hda: dma_intr: status=0x51 { DriveReady SeekComplete Error }
533                         # hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263
534
535                         # floppy0: floppy timeout called
536                         # end_request: I/O error, dev fd0, sector 0
537
538                         # Buffer I/O error on device dm-2, logical block 8888896
539                         # ata1: status=0x51 { DriveReady SeekComplete Error }
540                         # ata1: error=0x40 { UncorrectableError }
541                         # SCSI error : <0 0 0 0> return code = 0x8000002
542                         # sda: Current: sense key: Medium Error
543                         #       Additional sense: Unrecovered read error - auto reallocate failed
544
545                         # SCSI error : <0 2 0 0> return code = 0x40001
546                         # end_request: I/O error, dev sda, sector 572489600
547                 ]
548                 return steps
549
550         def getDiskSequence(self, steps, child):
551                 sequence = []
552                 while True:
553                         id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
554                         sequence.append(id)
555
556                         if id == "done":
557                                 break
558                 return sequence
559
560         def getBootManagerStepPatterns(self):
561                 steps = [
562                         ('bminit'               , 'Initializing the BootManager.'),
563                         ('cfg'                  , 'Reading node configuration file.'),
564                         ('auth'                 , 'Authenticating node with PLC.'),
565                         ('getplc'               , 'Retrieving details of node from PLC.'),
566                         ('update'               , 'Updating node boot state at PLC.'),
567                         ('hardware'             , 'Checking if hardware requirements met.'),
568                         ('installinit'  , 'Install: Initializing.'),
569                         ('installdisk'  , 'Install: partitioning disks.'),
570                         ('installbootfs', 'Install: bootstrapfs tarball.'),
571                         ('installcfg'   , 'Install: Writing configuration files.'),
572                         ('installstop'  , 'Install: Shutting down installer.'),
573                         ('update2'              , 'Updating node boot state at PLC.'),
574                         ('installinit2' , 'Install: Initializing.'),
575                         ('validate'             , 'Validating node installation.'),
576                         ('rebuildinitrd', 'Rebuilding initrd'),
577                         ('netcfg'               , 'Install: Writing Network Configuration files.'),
578                         ('update3'              , 'Updating node configuration.'),
579                         ('disk'                 , 'Checking for unused disks to add to LVM.'),
580                         ('update4'              , 'Sending hardware configuration to PLC.'),
581                         ('debug'                , 'Starting debug mode'),
582                         ('bmexceptmount', 'BootManagerException during mount'),
583                         ('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'),
584                         ('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
585                         ('exception'    , 'Exception'),
586                         ('nocfg'        , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
587                         ('protoerror'   , 'XML RPC protocol error'),
588                         ('nodehostname' , 'Configured node hostname does not resolve'),
589                         ('implementerror', 'Implementation Error'),
590                         ('readonlyfs'   , '[Errno 30] Read-only file system'),
591                         ('baddisk'      , "IOError: [Errno 13] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"),
592                         ('noinstall'    , 'notinstalled'),
593                         ('bziperror'    , 'bzip2: Data integrity error when decompressing.'),
594                         ('noblockdev'   , "No block devices detected."),
595                         ('dnserror'     , 'Name or service not known'),
596                         ('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
597                         ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
598                         ('hardwarerequirefail' , 'Hardware requirements not met'),
599                         ('mkfsfail'         , 'while running: Running mkfs.ext2 -q  -m 0 -j /dev/planetlab/vservers failed'),
600                         ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
601                         ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
602                         ('chrootfail'   , 'Running chroot /tmp/mnt/sysimg'),
603                         ('modulefail'   , 'Unable to get list of system modules'),
604                         ('writeerror'   , 'write error: No space left on device'),
605                         ('nospace'      , "No space left on device"),
606                         ('nonode'       , 'Failed to authenticate call: No such node'),
607                         ('authfail'     , 'Failed to authenticate call: Call could not be authenticated'),
608                         ('bootcheckfail'     , 'BootCheckAuthentication'),
609                         ('bootupdatefail'   , 'BootUpdateNode'),
610                 ]
611                 return steps
612
613         def getBootManagerSequenceFromLog(self, steps, child):
614                 sequence = []
615                 while True:
616                         
617                         index = child.expect( steps_to_list(steps) + [ pexpect.EOF ])
618                         id = index_to_id(steps,index)
619                         sequence.append(id)
620
621                         if id == "exception":
622                                 print "...Found An Exception!!!"
623                         elif id == "done": #index == len(steps_to_list(steps)):
624                                 #print "Reached EOF"
625                                 break
626
627                 return sequence
628                 
629
630 def restore(sitehist, hostname, config=None, forced_action=None):
631
632         # NOTE: Nothing works if the bootcd is REALLY old.
633         #       So, this is the first step.
634
635         fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
636         recent_actions = sitehist.getRecentActions(hostname=hostname)
637
638         if fbnode['observed_category'] == "OLDBOOTCD":
639                 print "\t...Notify owner to update BootImage!!!"
640
641                 if not found_within(recent_actions, 'newbootcd_notice', 3):
642                         sitehist.sendMessage('newbootcd_notice', hostname=hostname)
643
644                         print "\tDisabling %s due to out-of-date BootImage" % hostname
645                         api.UpdateNode(hostname, {'boot_state' : 'disable'})
646
647                 # NOTE: nothing else is possible.
648                 return True
649
650         debugnode = DebugInterface(hostname)
651         conn = debugnode.getConnection()
652         #print "conn: %s" % conn
653         #print "trying to use conn after returning it."
654         #print conn.c.modules.sys.path
655         #print conn.c.modules.os.path.exists('/tmp/source')
656         if type(conn) == type(False): return False
657
658         #if forced_action == "reboot":
659         #       conn.restart_node('reinstall')
660         #       return True
661
662         boot_state = conn.get_boot_state()
663         if boot_state != "debug":
664                 print "... %s in %s state: skipping..." % (hostname , boot_state)
665                 return boot_state == "boot"
666
667         if conn.bootmanager_running():
668                 print "...BootManager is currently running.  Skipping host %s" %hostname 
669                 return True
670
671         # Read persistent flags, tagged on one week intervals.
672
673         if config and not config.quiet: print "...downloading dmesg from %s" %hostname 
674         dmesg = conn.get_dmesg()
675         child = fdpexpect.fdspawn(dmesg)
676
677         steps = debugnode.getDiskSteps()
678         sequence = debugnode.getDiskSequence(steps, child)
679
680         s = Set(sequence)
681         if config and not config.quiet: print "\tSET: ", s
682
683         if len(s) > 1:
684                 print "...Potential drive errors on %s" % hostname 
685                 if len(s) == 2 and 'floppyerror' in s:
686                         print "...Should investigate.  Continuing with node."
687                 else:
688                         print "...Should investigate.  Skipping node."
689                         # TODO: send message related to these errors.
690
691                         if not found_within(recent_actions, 'newbootcd_notice', 3):
692
693                                 log=conn.get_dmesg().read()
694                                 sitehist.sendMessage('baddisk_notice', hostname=hostname, log=log)
695                                 conn.set_nodestate('disable')
696
697                         return False
698
699         print "...Downloading bm.log from %s" %hostname 
700         log = conn.get_bootmanager_log()
701         child = fdpexpect.fdspawn(log)
702
703         if hasattr(config, 'collect') and config.collect: return True
704
705         if config and not config.quiet: print "...Scanning bm.log for errors"
706
707         time.sleep(1)
708
709         steps = debugnode.getBootManagerStepPatterns()
710         sequence = debugnode.getBootManagerSequenceFromLog(steps, child)
711                 
712         s = "-".join(sequence)
713         print "   FOUND SEQUENCE: ", s
714
715         # NOTE: We get or set the flag based on the current sequence identifier.
716         #  By using the sequence identifier, we guarantee that there will be no
717         #  frequent loops.  I'm guessing there is a better way to track loops,
718         #  though.
719
720         sequences = debugnode.getSequences()
721         flag_set = True
722         
723         if s not in sequences:
724                 print "   HOST %s" % hostname
725                 print "   UNKNOWN SEQUENCE: %s" % s
726
727                 args = {}
728                 args['hostname'] = hostname
729                 args['sequence'] = s
730                 args['bmlog'] = conn.get_bootmanager_log().read()
731                 args['viart'] = False
732
733                 sitehist.sendMessage('unknownsequence_notice', **args)
734
735                 conn.restart_bootmanager('boot')
736
737                 # NOTE: Do not set the pflags value for this sequence if it's unknown.
738                 # This way, we can check it again after we've fixed it.
739                 flag_set = False
740
741         else:
742
743                 if   sequences[s] == "restart_bootmanager_boot":
744                         print "...Restarting BootManager.py on %s "%hostname 
745                         conn.restart_bootmanager('boot')
746                 elif sequences[s] == "restart_bootmanager_rins":
747                         print "...Restarting BootManager.py on %s "%hostname 
748                         conn.restart_bootmanager('reinstall')
749                 elif sequences[s] == "restart_node_rins":
750                         conn.restart_node('reinstall')
751                 elif sequences[s] == "restart_node_boot":
752                         conn.restart_node('boot')
753                 elif sequences[s] == "repair_node_keys":
754                         if conn.compare_and_repair_nodekeys():
755                                 # the keys either are in sync or were forced in sync.
756                                 # so try to reboot the node again.
757                                 # TODO: why was this originally 'reinstall' instead of 'boot'??
758                                 conn.restart_bootmanager('boot')
759                                 pass
760                         else:
761                                 # there was some failure to synchronize the keys.
762                                 print "...Unable to repair node keys on %s" %hostname 
763
764                 elif sequences[s] == "suspect_error_email":
765                         args = {}
766                         args['hostname'] = hostname
767                         args['sequence'] = s
768                         args['bmlog'] = conn.get_bootmanager_log().read()
769                         args['viart'] = False
770
771                         sitehist.sendMessage('unknownsequence_notice', **args)
772                         conn.restart_bootmanager('boot')
773
774                 # TODO: differentiate this and the 'nodenetwork_email' actions.
775                 elif sequences[s] == "update_node_config_email":
776
777                         if not found_within(recent_actions, 'nodeconfig_notice', 3):
778                                 args = {}
779                                 args['hostname'] = hostname
780                                 sitehist.sendMessage('nodeconfig_notice', **args)
781                                 conn.dump_plconf_file()
782
783                 elif sequences[s] == "nodenetwork_email":
784
785                         if not found_within(recent_actions, 'nodeconfig_notice', 3):
786                                 args = {}
787                                 args['hostname'] = hostname
788                                 args['bmlog'] = conn.get_bootmanager_log().read()
789                                 sitehist.sendMessage('nodeconfig_notice', **args)
790                                 conn.dump_plconf_file()
791
792                 elif sequences[s] == "update_bootcd_email":
793
794                         if not found_within(recent_actions, 'newalphacd_notice', 3):
795                                 args = {}
796                                 args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
797                                 args['hostname'] = hostname
798                         
799                                 sitehist.sendMessage('newalphacd_notice', **args)
800
801                                 print "\tDisabling %s due to out-of-date BOOTCD" % hostname
802
803                 elif sequences[s] == "broken_hardware_email":
804                         # MAKE An ACTION record that this host has failed hardware.  May
805                         # require either an exception "/minhw" or other manual intervention.
806                         # Definitely need to send out some more EMAIL.
807                         # TODO: email notice of broken hardware
808                         if not found_within(recent_actions, 'baddisk_notice', 1):
809                                 print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
810                                 args = {}
811                                 args['hostname'] = hostname
812                                 args['log'] = conn.get_dmesg().read()
813
814                                 sitehist.sendMessage('baddisk_notice', **args)
815                                 conn.set_nodestate('disable')
816
817                 elif sequences[s] == "update_hardware_email":
818                         if not found_within(recent_actions, 'minimalhardware_notice', 1):
819                                 print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
820                                 args = {}
821                                 args['hostname'] = hostname
822                                 args['bmlog'] = conn.get_bootmanager_log().read()
823                                 sitehist.sendMessage('minimalhardware_notice', **args)
824
825                 elif sequences[s] == "bad_dns_email":
826                         if not found_within(recent_actions, 'baddns_notice', 1):
827                                 print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
828                                 args = {}
829                                 try:
830                                         node = plccache.GetNodeByName(hostname)
831                                         net = api.GetInterfaces(node['interface_ids'])[0]
832                                 except:
833                                         email_exception()
834                                         print traceback.print_exc()
835                                         # TODO: api error. skip email, b/c all info is not available,
836                                         # flag_set will not be recorded.
837                                         return False
838                                 nodenet_str = network_config_to_str(net)
839
840                                 args['hostname'] = hostname
841                                 args['network_config'] = nodenet_str
842                                 args['interface_id'] = net['interface_id']
843
844                                 sitehist.sendMessage('baddns_notice', **args)
845
846         return True
847         
848
849 # MAIN -------------------------------------------------------------------
850
851 def main():
852         from monitor import parser as parsermodule
853         parser = parsermodule.getParser()
854
855         parser.set_defaults(child=False, collect=False, nosetup=False, verbose=False, 
856                                                 force=None, quiet=False)
857         parser.add_option("", "--child", dest="child", action="store_true", 
858                                                 help="This is the child mode of this process.")
859         parser.add_option("", "--force", dest="force", metavar="boot_state",
860                                                 help="Force a boot state passed to BootManager.py.")
861         parser.add_option("", "--quiet", dest="quiet", action="store_true", 
862                                                 help="Extra quiet output messages.")
863         parser.add_option("", "--verbose", dest="verbose", action="store_true", 
864                                                 help="Extra debug output messages.")
865         parser.add_option("", "--nonet", dest="nonet", action="store_true", 
866                                                 help="Do not setup the network, use existing log files to re-run a test pass.")
867         parser.add_option("", "--collect", dest="collect", action="store_true", 
868                                                 help="No action, just collect dmesg, and bm.log")
869         parser.add_option("", "--nosetup", dest="nosetup", action="store_true", 
870                                                 help="Do not perform the orginary setup phase.")
871
872         parser = parsermodule.getParser(['nodesets', 'defaults'], parser)
873         config = parsermodule.parse_args(parser)
874
875         if config.nodelist:
876                 nodes = config.getListFromFile(config.nodelist)
877         elif config.node:
878                 nodes = [ config.node ]
879         else:
880                 parser.print_help()
881                 sys.exit(1)
882
883         for node in nodes:
884                 # get sitehist
885                 lb = plccache.plcdb_hn2lb[node]
886                 sitehist = SiteInterface.get_or_make(loginbase=lb)
887                 #reboot(node, config)
888                 restore(sitehist, node, config=None, forced_action=None)
889
890 if __name__ == "__main__":
891         main()