correct message
[monitor.git] / monitor / bootman.py
1 #!/usr/bin/python
2
3 # Attempt to reboot a node in debug state.
4
5 import os
6 import sys
7 import time
8 import random
9 import signal
10 import traceback
11 import subprocess
12 from sets import Set
13
14 from monitor.getsshkeys import SSHKnownHosts
15 from monitor.Rpyc import SocketConnection, Async
16 from monitor.Rpyc.Utils import *
17
18 from monitor import getconf
19 from monitor import config
20 from monitor import const
21 from monitor.model import *
22 from monitor.common import email_exception, found_within
23 from monitor.database.info.model import *
24 from monitor.database.info.interface import *
25 from monitor.wrapper import plc
26 from monitor.wrapper import plccache
27 from monitor.wrapper.emailTxt import mailtxt
28 from monitor.nodeconfig import network_config_to_str
29
30 from pcucontrol.util import command as moncommands
31 from pcucontrol.util.command import Sopen
32 from pcucontrol.transports.ssh import pxssh as pxssh
33 from pcucontrol.transports.ssh import fdpexpect as fdpexpect
34 from pcucontrol.transports.ssh import pexpect as pexpect
35
# PLC API handle returned by plc.getAuthAPI(); created once at import time and
# used below for the UpdateNode / GetNodes calls.
api = plc.getAuthAPI()
# Module-level placeholder initialised to None; it is not referenced in the
# visible part of this file (NOTE(review): possibly set by other code -- confirm).
fb = None
38
def bootmanager_log_name(hostname):
    """Return the relative path, under 'history', for a BootManager log.

    The file name has the form "<timestamp>-bm.<hostname>.log", where the
    timestamp is the current local time formatted as YYYY-MM-DD-HH:MM.
    """
    stamp = time.strftime("%Y-%m-%d-%H:%M")
    return os.path.join('history', "%s-bm.%s.log" % (stamp, hostname))
44
def bootmanager_log_action(hostname, short_log_path, logtype="bm.log"):
    """Create a 'log' ActionRecord that points at a collected log file.

    hostname       -- node the log belongs to.
    short_log_path -- path of the log, stored as the record's log_path.
    logtype        -- stored as the record's action_type.

    The site's login_base is looked up through the latest FindbadNodeRecord
    for the host.  If that lookup fails for any reason the record is still
    created, with loginbase "unknown" and the traceback in error_string.
    Returns None.
    """
    error_text = ""
    try:
        record = FindbadNodeRecord.get_latest_by(hostname=hostname)
        site_id = record.plc_node_stats['site_id']
        loginbase = PlcSite.query.get(site_id).plc_site_stats['login_base']
    except:
        # Best effort: any lookup failure is recorded on the action itself.
        loginbase = "unknown"
        error_text = traceback.format_exc()

    ActionRecord(loginbase=loginbase,
                 hostname=hostname,
                 action='log',
                 action_type=logtype,
                 log_path=short_log_path,
                 error_string=error_text)
61         
62
class ExceptionDoubleSSHError(Exception):
    """Raised when the ssh/rsync login to a node has failed twice in a row."""
64
class NodeConnection:
    """Operations on a single node, carried out through a remote connection.

    ``self.c`` is the connection object; remote modules are reached as
    ``self.c.modules.<name>`` (os, sys, BootManager, ...).  ``self.node`` is
    the node's hostname.
    """

    def __init__(self, connection, node, config):
        """Remember the connection, the node hostname and the config object."""
        print("init nodeconnection")
        self.c = connection
        self.node = node
        self.config = config

    def _load_bootmanager(self):
        """Set up /tmp/source on the node and build a remote BootManager.

        Returns (bm, InitializeBootManager, ReadNodeConfiguration), the last
        two being the remote step modules looked up on BootManager.
        """
        remote = self.c
        self.c.modules.sys.path.append("/tmp/source/")
        self.c.modules.os.chdir('/tmp/source')

        bm_log = remote.modules.BootManager.log('/tmp/new.log')
        bm = remote.modules.BootManager.BootManager(bm_log, 'boot')

        # Looked up but not used afterwards; the lookup itself is kept because
        # it goes through the remote connection.
        remote.modules.Exceptions.BootManagerException
        init_step = remote.modules.BootManager.InitializeBootManager
        read_step = remote.modules.BootManager.ReadNodeConfiguration
        return bm, init_step, read_step

    def get_boot_state(self):
        """Return "debug", "boot" or "unknown" based on paths present on the node.

        Any error while asking the node is reported (printed, and e-mailed
        for everything except EOFError) and yields "unknown".
        """
        print("get_boot_state(self)")
        try:
            for marker, state in (('/tmp/source', "debug"), ('/vservers', "boot")):
                if self.c.modules.os.path.exists(marker):
                    return state
            return "unknown"
        except EOFError:
            traceback.print_exc()
            print(self.c.modules.sys.path)
        except:
            email_exception()
            traceback.print_exc()

        return "unknown"

    def get_dmesg(self):
        """Dump dmesg on the node, download it, and return it as an open file.

        The download is stored under <MONITOR_BOOTMANAGER_LOG>/history with a
        timestamp, then copied to <MONITOR_BOOTMANAGER_LOG>/dmesg.<node>.log,
        which is the file that is opened for reading and returned.
        """
        stamp = time.strftime("%Y-%m-%d-%H:%M")
        self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")

        log_dir = config.MONITOR_BOOTMANAGER_LOG
        archived = "%s/history/%s-dmesg.%s.log" % (log_dir, stamp, self.node)
        latest = "%s/dmesg.%s.log" % (log_dir, self.node)

        download(self.c, "/var/log/dmesg.bm.log", archived)
        os.system("cp %s %s" % (archived, latest))
        return open(latest, 'r')

    def get_bootmanager_log(self):
        """Download /tmp/bm.log from the node and return it as an open file.

        The download is stored under its history name, a "collected_bm.log"
        action is recorded, and the file is copied to
        <MONITOR_BOOTMANAGER_LOG>/bm.<node>.log, which is opened and returned.
        """
        bm_name = bootmanager_log_name(self.node)
        log_dir = config.MONITOR_BOOTMANAGER_LOG
        archived = "%s/%s" % (log_dir, bm_name)
        latest = "%s/bm.%s.log" % (log_dir, self.node)

        download(self.c, "/tmp/bm.log", archived)
        #email_exception(self.node, "collected BM log for %s" % self.node)
        bootmanager_log_action(self.node, bm_name, "collected_bm.log")
        os.system("cp %s %s" % (archived, latest))
        return open(latest, 'r')

    def dump_plconf_file(self):
        """Print every BootManager variable after reading the node configuration.

        If reading the configuration raises, the error is printed instead.
        """
        bm, init_step, read_step = self._load_bootmanager()

        init_step.Run(bm.VARS, bm.LOG)
        try:
            read_step.Run(bm.VARS, bm.LOG)
        except Exception:
            # sys.exc_info() keeps this handler valid on old and new Pythons.
            x = sys.exc_info()[1]
            print("   ERROR: %s" % (x,))
            print("   Possibly, unable to find valid configuration file")
            print("   Unable to read Node Configuration")
            return

        for key in bm.VARS.keys():
            print("%s  ==  %s" % (key, bm.VARS[key]))

    def fsck_repair_node(self):
        """Start a background fsck of the node's volumes followed by BootManager.

        Nothing is started when /tmp/BM_RUNNING already exists on the node.
        BootManager is launched with the state returned by get_nodestate().
        """
        self.c.modules.sys.path.append("/tmp/source/")
        self.c.modules.os.chdir('/tmp/source')
        # TODO: restart
        # TODO: set boot state to node's actually boot state.
        # could be 'boot' or 'safeboot'
        self.c.modules.os.chdir('/tmp/source')
        if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
            print("Running MANUAL FSCK already... try again soon.")
            return

        print("Running MANUAL fsck on %s" % self.node)
        template = "".join([
            "( touch /tmp/BM_RUNNING ;  ",
            "  fsck -v -f -y /dev/planetlab/root &> out.fsck ; ",
            "  fsck -v -f -y /dev/planetlab/vservers >> out.fsck 2>&1 ; ",
            "  python ./BootManager.py %s &> server.log < /dev/null ; ",
            "  rm -f /tmp/BM_RUNNING ",
            ") &",
        ])
        self.c.modules.os.system(template % self.get_nodestate())
        #self.restart_bootmanager('boot')

    def compare_and_repair_nodekeys(self):
        """Make PLC's node key agree with the NODE_KEY found on the node.

        Returns True when the keys already match or PLC was updated, False
        when the update call reports failure, and None when the node
        configuration could not be read.
        """
        bm, init_step, read_step = self._load_bootmanager()

        plcnode = plccache.GetNodeByName(self.node)

        init_step.Run(bm.VARS, bm.LOG)
        try:
            read_step.Run(bm.VARS, bm.LOG)
        except Exception:
            x = sys.exc_info()[1]
            print("exception")
            print(x)
            print("   Possibly, unable to find valid configuration file")
            print("   Unable to retrieve NODE_KEY")
            return None

        print("   NODE: %s" % bm.VARS['NODE_KEY'])
        print("   PLC : %s" % plcnode['key'])

        if bm.VARS['NODE_KEY'] == plcnode['key']:
            return True
        if api.UpdateNode(self.node, {'key': bm.VARS['NODE_KEY']}):
            print("   Successfully updated NODE_KEY with PLC")
            return True
        return False

    def bootmanager_running(self):
        """Return True when /tmp/BM_RUNNING exists on the node, else False."""
        return bool(self.c.modules.os.path.exists('/tmp/BM_RUNNING'))

    def set_nodestate(self, state='boot'):
        """Set the node's boot_state at PLC; returns the UpdateNode result."""
        return api.UpdateNode(self.node, {'boot_state' : state})

    def get_nodestate(self):
        """Return the node's boot_state from PLC, or the cached one on failure."""
        try:
            nodes = api.GetNodes(self.node, ['boot_state'])
            return nodes[0]['boot_state']
        except:
            traceback.print_exc()

        # NOTE: use last cached value from plc
        cached = FindbadNodeRecord.get_latest_by(hostname=self.node).to_dict()
        return cached['plc_node_stats']['boot_state']

    def restart_node(self, state='boot'):
        """Set the boot state at PLC and reboot the node.

        The first restart within the persist-flag window ('gentlekill' not
        recently set) kills slice processes and schedules 'shutdown -r +1';
        later restarts go through /proc/sysrq-trigger.
        """
        api.UpdateNode(self.node, {'boot_state' : state})

        pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags')
        if pflags.getRecentFlag('gentlekill'):
            print("   Restarting with sysrq 'sub' %s" % self.node)
            cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """
            self.c.modules.os.system(cmd)
            return

        print("   Killing all slice processes... : %s" %  self.node)
        cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
        self.c.modules.os.system(cmd_slicekill)
        cmd = """ shutdown -r +1 & """
        print("   Restarting %s : %s" % ( self.node, cmd))
        self.c.modules.os.system(cmd)

        pflags.setRecentFlag('gentlekill')
        pflags.save()

    def restart_bootmanager(self, forceState):
        """Start 'BootManager.py <forceState>' in the background on the node.

        Nothing is started when /tmp/BM_RUNNING already exists on the node.
        """
        self.c.modules.os.chdir('/tmp/source')
        if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
            print("   BootManager is already running: try again soon...")
            return

        print("   Starting 'BootManager.py %s' on %s " % (forceState, self.node))
        template = "".join([
            "( touch /tmp/BM_RUNNING ;  ",
            "  python ./BootManager.py %s &> server.log < /dev/null ; ",
            "  rm -f /tmp/BM_RUNNING ",
            ") &",
        ])
        self.c.modules.os.system(template % forceState)
252
253
class PlanetLabSession:
    """An ssh tunnel from a local port to the Rpyc server on a node.

    Each instance takes the next local port from the class-wide counter
    ``globalport``.  Unless ``nosetup`` is true, constructing a session
    rsyncs the Rpyc files to the node, (re)starts the Rpyc server there and
    opens an ssh port forward to it (see setup_host).

    Attributes:
      node, verbose, nosetup -- constructor arguments.
      port     -- local port of the tunnel.
      command  -- the running ssh tunnel process, or None.
      timeout  -- seconds slept after the tunnel reported READY; 0 until
                  then, so that callers can always read it.
    """
    globalport = 22000 + int(random.random()*1000)

    def __init__(self, node, nosetup, verbose):
        self.verbose = verbose
        self.node = node
        self.port = None
        self.nosetup = nosetup
        self.command = None
        # Always defined: setup_host() only assigns it on the READY path, but
        # callers read session.timeout when retrying a connection.
        self.timeout = 0
        self.setup_host()

    def get_connection(self, config):
        """Return a NodeConnection over the tunnel's local port.

        The connection is attempted twice; an error from the second attempt
        propagates to the caller.
        """
        try:
            print("SocketConnection(localhost, %s" % self.port)
            sc = SocketConnection("localhost", self.port)
            print("NodeConnection(%s, %s)" % (sc, self.node))
            conn = NodeConnection(sc, self.node, config)
        except:
            # NOTE: try twice since this can sometimes fail the first time. If
            #       it fails again, let it go.
            conn = NodeConnection(SocketConnection("localhost", self.port), self.node, config)
        return conn

    def setup_host(self):
        """Allocate a local port and, unless nosetup, bring up the tunnel.

        Raises ExceptionDoubleSSHError when the rsync fails twice, and
        Exception when the ssh tunnel does not report READY.
        """
        self.port = PlanetLabSession.globalport
        PlanetLabSession.globalport = PlanetLabSession.globalport + 1

        if self.nosetup:
            print("Skipping setup")
            return

        args = {
            'port': self.port,
            'user': 'root',
            'hostname': self.node,
            'monitordir': config.MONITOR_SCRIPT_ROOT,
        }
        ssh_port = 22

        # COPY Rpyc files to host
        #cmd = "rsync -vvv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
        cmd = """rsync -vvv -az -e "ssh -o BatchMode=yes" %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc""" % args
        print(cmd)
        # TODO: Add timeout
        timeout = 120
        localos = moncommands.CMD()

        ret = localos.system(cmd, timeout)
        print(ret)
        if ret != 0:
            print("\tUNKNOWN SSH KEY FOR %s; making an exception" % self.node)
            #print "MAKE EXPLICIT EXCEPTION FOR %s" % self.node
            # Refresh the known_hosts entry for this node, then retry once.
            k = SSHKnownHosts()
            k.updateDirect(self.node)
            k.write()
            del k
            print("trying:  %s" % cmd)
            print(["%s=%s" % (a, os.environ[a]) for a in os.environ.keys() if 'SSH' in a])
            ret = localos.system(cmd, timeout)
            print(ret)
            if ret != 0:
                print("\tFAILED TWICE")
                #email_exception("%s rsync failed twice" % self.node)
                raise ExceptionDoubleSSHError("Failed twice trying to login with updated ssh host key")

        t1 = time.time()
        # KILL any already running servers.
        ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port)
        (ov, ev) = ssh.run_noexcept2("""<<\\EOF
            rm -f out.log
            echo "kill server" >> out.log
                        netstat -ap | grep python | grep 18812 | awk '{print $7}' | awk -F / '{print $1}' | xargs kill
            ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ; 
            echo "export" >> out.log
            export PYTHONPATH=$HOME  ;
            echo "start server" >> out.log
            python Rpyc/Servers/forking_server.py &> server.log &
            echo "done" >> out.log
EOF""")
        print("setup rpyc server over ssh")
        print(ssh.ret)

        # TODO: Add timeout
        # This was tricky to make synchronous.  The combination of ssh-clients-4.7p1,
        # and the following options seems to work well.
        cmd = ("""ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """
               """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """
               """-o ConnectTimeout=120 """
               """-n -N -L %(port)s:localhost:18812 """
               """%(user)s@%(hostname)s""") % args
        print(cmd)
        self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
        # TODO: the read() here may block indefinitely.  Need a better
        # approach therefore, that includes a timeout.
        #ret = self.command.stdout.read(5)
        ret = moncommands.read_t(self.command.stdout, 5)

        t2 = time.time()
        if 'READY' in ret:
            # NOTE: There is still a slight race for machines that are slow...
            self.timeout = 2*(t2-t1)
            print("Sleeping for %s sec" % self.timeout)
            time.sleep(self.timeout)
            return

        # NOTE(review): returncode is read without a preceding poll()/wait();
        # if Sopen behaves like subprocess.Popen this stays None and the
        # branch below is never taken -- confirm against Sopen.
        if self.command.returncode is not None:
            print("Failed to establish tunnel!")
            raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode))

        raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'")

    def __del__(self):
        # getattr: the instance may be only partially initialised.
        if getattr(self, 'command', None):
            print("Killing SSH session %s" % self.port)
            self.command.kill()
369
370         
def steps_to_list(steps, index=1):
    """Return a list holding element *index* of every entry in *steps*.

    With the default index of 1 this yields the pattern of each
    (id, pattern) pair.
    """
    return [entry[index] for entry in steps]
373
def index_to_id(steps,index):
    """Return the id (first element) of steps[index], or "done" past the end."""
    if index >= len(steps):
        return "done"
    return steps[index][0]
379
class DebugInterface:
    """Helpers for talking to, and reading logs from, a node in debug state.

    ``hostname`` is the node; ``session`` holds the PlanetLabSession once
    getConnection() has created one.
    """

    def __init__(self, hostname):
        self.hostname = hostname
        self.session = None

    def getConnection(self):
        """Open a session to the node and return a connection to it.

        Returns the connection object produced by the session's
        get_connection(), or False when any step fails.  Failures are
        printed and, except for a double ssh login failure, e-mailed.
        """
        print("Creating session for %s" % self.hostname)
        # update known_hosts file (in case the node has rebooted since last run)
        try:
            k = SSHKnownHosts()
            k.update(self.hostname)
            k.write()
            del k
        except:
            email_exception()
            # print_exc() writes the traceback itself and returns None, so
            # its result must not be printed.
            traceback.print_exc()
            return False

        msg = "ERROR setting up session for %s" % self.hostname
        try:
            if config is None:
                self.session = PlanetLabSession(self.hostname, False, True)
            else:
                self.session = PlanetLabSession(self.hostname, config.nosetup, config.verbose)
        except ExceptionDoubleSSHError:
            print(msg)
            return False
        except Exception:
            traceback.print_exc()
            email_exception(msg)
            return False

        print("Getting connection: 1st try")
        try:
            conn = self.session.get_connection(config)
        except EOFError:
            # NOTE: sometimes the wait in setup_host() is not long enough.
            # So, here we try to wait a little longer before giving up entirely.
            try:
                print("Getting connection: 2nd try")
                # The session may not have recorded a timeout (e.g. when its
                # setup was skipped); fall back to no extra wait.
                time.sleep(getattr(self.session, 'timeout', 0)*5)
                conn = self.session.get_connection(config)
            except EOFError:
                # failed twice... no need to report this really, it's just in a
                # weird state...
                print("Getting connection: failed")
                email_exception(self.hostname, "failed twice to get connection")
                return False
            except:
                traceback.print_exc()
                email_exception(self.hostname)
                return False
        print("Getting connection: ok")
        #print "trying to use conn before returning it."
        #print conn.c.modules.sys.path
        #print conn.c.modules.os.path.exists('/tmp/source')
        #time.sleep(1)

        #print "conn: %s" % conn
        return conn

    def getSequences(self):
        """Return a dict mapping each stored BootManager sequence to its action."""
        # NOTE: The DB is now the authoritative record for all BM sequences.
        #       An admin can introduce new patterns and actions without touching code.
        return dict((s.sequence, s.action) for s in BootmanSequenceRecord.query.all())

    def getDiskSteps(self):
        """Return (id, regex) pairs for disk-related error lines in dmesg."""
        steps = [
            ('scsierror'  , r'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
            ('ioerror'    , r'end_request: I/O error, dev sd\w+, sector \d+'),
            ('ccisserror' , r'cciss: cmd \w+ has CHECK CONDITION'),

            # dm-\d+ so that multi-digit device numbers match as well.
            ('buffererror', r'Buffer I/O error on device dm-\d+, logical block \d+'),

            ('hdaseekerror', r'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'),
            ('hdacorrecterror', r'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'),

            ('atareadyerror'   , r'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
            ('atacorrecterror' , r'ata\d+: error=0x\d+ { UncorrectableError }'),

            ('sdXerror'   , r'sd\w: Current: sense key: Medium Error'),
            # The parentheses are literal text in the log line and must be
            # escaped; unescaped they form a regex group and never match.
            ('ext3error'   , r'EXT3-fs error \(device dm-\d+\): ext3_find_entry: reading directory #\d+ offset \d+'),

            ('floppytimeout','floppy0: floppy timeout called'),
            ('floppyerror',  r'end_request: I/O error, dev fd\w+, sector \d+'),

            # hda: dma_intr: status=0x51 { DriveReady SeekComplete Error }
            # hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263

            # floppy0: floppy timeout called
            # end_request: I/O error, dev fd0, sector 0

            # Buffer I/O error on device dm-2, logical block 8888896
            # ata1: status=0x51 { DriveReady SeekComplete Error }
            # ata1: error=0x40 { UncorrectableError }
            # SCSI error : <0 0 0 0> return code = 0x8000002
            # sda: Current: sense key: Medium Error
            #       Additional sense: Unrecovered read error - auto reallocate failed

            # SCSI error : <0 2 0 0> return code = 0x40001
            # end_request: I/O error, dev sda, sector 572489600
        ]
        return steps

    def getDiskSequence(self, steps, child):
        """Return the ids of the *steps* patterns found in *child*, in order.

        *child* must offer expect(patterns) returning the index of the
        pattern that matched.  The list always ends with "done", appended
        when the EOF pattern matches.
        """
        # Loop-invariant: every step pattern, with EOF as the final entry.
        patterns = steps_to_list(steps) + [ pexpect.EOF ]
        sequence = []
        while True:
            step_id = index_to_id(steps, child.expect(patterns))
            sequence.append(step_id)

            if step_id == "done":
                break
        return sequence

    def getBootManagerStepPatterns(self):
        """Return (id, regex) pairs for the lines looked for in a BootManager log."""
        steps = [
            ('bminit'               , 'Initializing the BootManager.'),
            ('cfg'                  , 'Reading node configuration file.'),
            ('auth'                 , 'Authenticating node with PLC.'),
            ('getplc'               , 'Retrieving details of node from PLC.'),
            ('update'               , 'Updating node boot state at PLC.'),
            ('hardware'             , 'Checking if hardware requirements met.'),
            ('installinit'  , 'Install: Initializing.'),
            ('installdisk'  , 'Install: partitioning disks.'),
            ('installbootfs', 'Install: bootstrapfs tarball.'),
            ('installcfg'   , 'Install: Writing configuration files.'),
            ('installstop'  , 'Install: Shutting down installer.'),
            ('update2'              , 'Updating node boot state at PLC.'),
            ('installinit2' , 'Install: Initializing.'),
            ('validate'             , 'Validating node installation.'),
            ('rebuildinitrd', 'Rebuilding initrd'),
            ('netcfg'               , 'Install: Writing Network Configuration files.'),
            ('update3'              , 'Updating node configuration.'),
            ('disk'                 , 'Checking for unused disks to add to LVM.'),
            ('update4'              , 'Sending hardware configuration to PLC.'),
            ('debug'                , 'Starting debug mode'),
            ('bmexceptmount', 'BootManagerException during mount'),
            ('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'),
            ('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
            ('exception'    , 'Exception'),
            ('nocfg'        , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
            ('protoerror2'  , '500 Internal Server Error'),
            ('protoerror'   , 'XML RPC protocol error'),
            ('nodehostname' , 'Configured node hostname does not resolve'),
            ('implementerror', 'Implementation Error'),
            ('fsckabort'    , 'is mounted.  e2fsck: Cannot continue, aborting'),
            ('fsckfail'             , 'Running e2fsck -v -p /dev/planetlab/root failed'),
            ('fsckfail2'    , 'Running e2fsck -v -p /dev/planetlab/vservers failed'),
            ('readonlyfs'   , r'\[Errno 30\] Read-only file system'),
            ('baddisk'      , r"IOError: \[Errno 13\] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"),
            ('noinstall'    , 'notinstalled'),
            ('bziperror'    , 'bzip2: Data integrity error when decompressing.'),
            ('noblockdev'   , "No block devices detected."),
            ('missingkernel', "missingkernel"),
            ('dnserror'     , 'Name or service not known'),
            ('noparseconfig', "Found configuration file plnode.txt on floppy, but was unable to parse it"),
            ('noconfig'             , "Unable to find and read a node configuration file"),
            ('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
            ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
            ('hardwarerequirefail' , 'Hardware requirements not met'),
            ('mkfsfail'         , 'while running: Running mkfs.ext2 -q  -m 0 -j /dev/planetlab/vservers failed'),
            ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
            ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
            ('chrootfail'   , 'Running chroot /tmp/mnt/sysimg'),
            ('modulefail'   , 'Unable to get list of system modules'),
            ('writeerror'   , 'write error: No space left on device'),
            ('nospace'      , "No space left on device"),
            ('nonode'       , 'Failed to authenticate call: No such node'),
            ('authfail'     , 'Failed to authenticate call: Call could not be authenticated'),
            ('authfail2'    , 'Authentication Failed'),
            ('bootcheckfail'  , 'BootCheckAuthentication'),
            ('bootupdatefail' , 'BootUpdateNode'),
        ]
        return steps

    def getBootManagerSequenceFromLog(self, steps, child):
        """Return the ids of the *steps* patterns found in *child*, in order.

        Same contract as getDiskSequence; additionally announces each
        "exception" step on stdout.  The list always ends with "done".
        """
        patterns = steps_to_list(steps) + [ pexpect.EOF ]
        sequence = []
        while True:
            step_id = index_to_id(steps, child.expect(patterns))
            sequence.append(step_id)

            if step_id == "exception":
                print("...Found An Exception!!!")
            elif step_id == "done":
                # EOF reached.
                break

        return sequence
574                 
def restore(sitehist, hostname, config=None, forced_action=None):
    """Run restore_basic() for *hostname*, flush the session, return its result."""
    outcome = restore_basic(sitehist, hostname, config, forced_action)
    session.flush()
    return outcome
579
580 def restore_basic(sitehist, hostname, config=None, forced_action=None):
581
582         # NOTE: Nothing works if the bootcd is REALLY old.
583         #       So, this is the first step.
584
585         bootman_action = "unknown"
586
587         fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
588         recent_actions = sitehist.getRecentActions(hostname=hostname)
589
590         if fbnode['observed_category'] == "OLDBOOTCD":
591                 print "\t...Notify owner to update BootImage!!!"
592
593                 if not found_within(recent_actions, 'newbootcd_notice', 3.5):
594                         sitehist.sendMessage('newbootcd_notice', hostname=hostname)
595
596                         print "\tDisabling %s due to out-of-date BootImage" % hostname
597                         api.UpdateNode(hostname, {'boot_state' : 'disabled'})
598
599                 # NOTE: nothing else is possible.
600                 return "disabled"
601
602         debugnode = DebugInterface(hostname)
603         conn = debugnode.getConnection()
604         if type(conn) == type(False): return "connect_failed"
605
606         boot_state = conn.get_boot_state()
607         if boot_state != "debug":
608                 print "... %s in %s state: skipping..." % (hostname , boot_state)
609                 return "skipped" #boot_state == "boot"
610
611         if conn.bootmanager_running():
612                 print "...BootManager is currently running.  Skipping host %s" %hostname 
613                 return "skipped" # True
614
615         # Read persistent flags, tagged on one week intervals.
616
617         if config and not config.quiet: print "...downloading dmesg from %s" %hostname 
618         dmesg = conn.get_dmesg()
619         child = fdpexpect.fdspawn(dmesg)
620
621         steps = debugnode.getDiskSteps()
622         sequence = debugnode.getDiskSequence(steps, child)
623
624         s = Set(sequence)
625         if config and not config.quiet: print "\tSET: ", s
626
627         if len(s) > 1:
628                 print "...Potential drive errors on %s" % hostname 
629                 if len(s) == 2 and 'floppyerror' in s:
630                         print "...Should investigate.  Continuing with node."
631                 else:
632                         print "...Should investigate.  Skipping node."
633                         # TODO: send message related to these errors.
634
635                         if not found_within(recent_actions, 'baddisk_notice', 7):
636                                 print "baddisk_notice not found recently"
637
638                                 log=conn.get_dmesg().read()
639                                 sitehist.sendMessage('baddisk_notice', hostname=hostname, log=log)
640                                 return "skipping_baddisk"
641                         else:
642                                 # NOTE: "" does not add a new action record
643                                 return ""
644
645
646         print "...Downloading bm.log from %s" %hostname 
647         log = conn.get_bootmanager_log()
648         bm_log_data = log.read() # get data
649         log.seek(0)     # reset fd pointer for fdspawn
650         child = fdpexpect.fdspawn(log)
651
652         if hasattr(config, 'collect') and config.collect: return "collect"
653
654         if config and not config.quiet: print "...Scanning bm.log for errors"
655
656         time.sleep(1)
657
658         steps = debugnode.getBootManagerStepPatterns()
659         sequence = debugnode.getBootManagerSequenceFromLog(steps, child)
660                 
661         s = "-".join(sequence)
662         print "   FOUND SEQUENCE: ", s
663
664         # NOTE: We get or set the flag based on the current sequence identifier.
665         #  By using the sequence identifier, we guarantee that there will be no
666         #  frequent loops.  I'm guessing there is a better way to track loops,
667         #  though.
668
669         sequences = debugnode.getSequences()
670         flag_set = True
671         
672         if s not in sequences:
673                 print "   HOST %s" % hostname
674                 print "   UNKNOWN SEQUENCE: %s" % s
675
676                 args = {}
677                 args['hostname'] = hostname
678                 args['sequence'] = s
679                 args['bmlog'] = bm_log_data
680                 args['viart'] = False
681                 args['saveact'] = True
682                 args['ccemail'] = True
683
684                 sitehist.sendMessage('unknownsequence_notice', **args)
685
686                 conn.restart_bootmanager('boot')
687
688                 bootman_action = "restart_bootmanager"
689
690                 # NOTE: Do not set the pflags value for this sequence if it's unknown.
691                 # This way, we can check it again after we've fixed it.
692                 flag_set = False
693
694         else:
695                 bootman_action = sequences[s]
696
697                 if   sequences[s] == "restart_bootmanager_boot":
698                         print "...Restarting BootManager.py on %s "%hostname 
699                         conn.restart_bootmanager('boot')
700                 elif sequences[s] == "restart_bootmanager_rins":
701                         print "...Restarting BootManager.py on %s "%hostname 
702                         conn.restart_bootmanager('reinstall')
703                 elif sequences[s] == "restart_node_rins":
704                         conn.restart_node('reinstall')
705                 elif sequences[s] == "restart_node_boot":
706                         conn.restart_node('boot')
707                 elif sequences[s] == "fsck_repair":
708                         conn.fsck_repair_node()
709                 elif sequences[s] == "repair_node_keys":
710                         if conn.compare_and_repair_nodekeys():
711                                 # the keys either are in sync or were forced in sync.
712                                 # so try to start BM again.
713                                 conn.restart_bootmanager(conn.get_nodestate())
714                         else:
715                                 # there was some failure to synchronize the keys.
716                                 print "...Unable to repair node keys on %s" %hostname 
717                                 if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
718                                         args = {}
719                                         args['hostname'] = hostname
720                                         sitehist.sendMessage('nodeconfig_notice', **args)
721                                         conn.dump_plconf_file()
722                                 else:
723                                         # NOTE: do not add a new action record
724                                         return ""
725
726                 elif sequences[s] == "unknownsequence_notice":
727                         args = {}
728                         args['hostname'] = hostname
729                         args['sequence'] = s
730                         args['bmlog'] = bm_log_data
731                         args['viart'] = False
732                         args['saveact'] = True
733                         args['ccemail'] = True
734
735                         sitehist.sendMessage('unknownsequence_notice', **args)
736                         conn.restart_bootmanager('boot')
737
738                 elif sequences[s] == "nodeconfig_notice":
739
740                         if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
741                                 args = {}
742                                 args['hostname'] = hostname
743                                 sitehist.sendMessage('nodeconfig_notice', **args)
744                                 conn.dump_plconf_file()
745                         else:
746                                 # NOTE: do not add a new action record
747                                 return ""
748
749                 elif sequences[s] == "nodenetwork_email":
750
751                         if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
752                                 args = {}
753                                 args['hostname'] = hostname
754                                 args['bmlog'] = bm_log_data
755                                 sitehist.sendMessage('nodeconfig_notice', **args)
756                                 conn.dump_plconf_file()
757                         else:
758                                 # NOTE: do not add a new action record
759                                 return ""
760
761                 elif sequences[s] == "noblockdevice_notice":
762
763                         if not found_within(recent_actions, 'noblockdevice_notice', 3.5):
764                                 args = {}
765                                 #args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
766                                 args['hostname'] = hostname
767                         
768                                 sitehist.sendMessage('noblockdevice_notice', **args)
769                         else:
770                                 # NOTE: do not add a new action record
771                                 return ""
772
773                 elif sequences[s] == "baddisk_notice":
774                         # MAKE An ACTION record that this host has failed hardware.  May
775                         # require either an exception "/minhw" or other manual intervention.
776                         # Definitely need to send out some more EMAIL.
777                         # TODO: email notice of broken hardware
778                         if not found_within(recent_actions, 'baddisk_notice', 7):
779                                 print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
780                                 args = {}
781                                 args['hostname'] = hostname
782                                 args['log'] = conn.get_dmesg().read()
783
784                                 sitehist.sendMessage('baddisk_notice', **args)
785                                 #conn.set_nodestate('disabled')
786                         else:
787                                 # NOTE: do not add a new action record
788                                 return ""
789
790                 elif sequences[s] == "minimalhardware_notice":
791                         if not found_within(recent_actions, 'minimalhardware_notice', 7):
792                                 print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
793                                 args = {}
794                                 args['hostname'] = hostname
795                                 args['bmlog'] = bm_log_data
796                                 sitehist.sendMessage('minimalhardware_notice', **args)
797                         else:
798                                 # NOTE: do not add a new action record
799                                 return ""
800
801                 elif sequences[s] == "baddns_notice":
802                         if not found_within(recent_actions, 'baddns_notice', 1):
803                                 print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
804                                 args = {}
805                                 try:
806                                         node = plccache.GetNodeByName(hostname)
807                                         net = api.GetInterfaces(node['interface_ids'])[0]
808                                 except:
809                                         email_exception()
810                                         print traceback.print_exc()
811                                         # TODO: api error. skip email, b/c all info is not available,
812                                         # flag_set will not be recorded.
813                                         return "exception"
814                                 nodenet_str = network_config_to_str(net)
815
816                                 args['hostname'] = hostname
817                                 args['network_config'] = nodenet_str
818                                 args['interface_id'] = net['interface_id']
819
820                                 sitehist.sendMessage('baddns_notice', **args)
821                         else:
822                                 # NOTE: do not add a new action record
823                                 return ""
824
825         return bootman_action
826         
827
828 # MAIN -------------------------------------------------------------------
829
def main():
        """Command-line entry point: run restore() on each requested node.

        Registers this script's options on the shared monitor parser, parses
        the command line, and builds the node list from config.nodelist (read
        via config.getListFromFile) or from the single config.node.  With
        neither given, the help text is printed and the process exits with
        status 1.  For every node the site's login base is looked up in
        plccache.plcdb_hn2lb, a SiteInterface is obtained for it, and restore()
        is called with the parsed options.
        """
        from monitor import parser as parsermodule
        parser = parsermodule.getParser()

        parser.set_defaults(child=False, collect=False, nosetup=False, verbose=False,
                                                force=None, quiet=False)
        parser.add_option("", "--child", dest="child", action="store_true",
                                                help="This is the child mode of this process.")
        parser.add_option("", "--force", dest="force", metavar="boot_state",
                                                help="Force a boot state passed to BootManager.py.")
        parser.add_option("", "--quiet", dest="quiet", action="store_true",
                                                help="Extra quiet output messages.")
        parser.add_option("", "--verbose", dest="verbose", action="store_true",
                                                help="Extra debug output messages.")
        parser.add_option("", "--nonet", dest="nonet", action="store_true",
                                                help="Do not setup the network, use existing log files to re-run a test pass.")
        parser.add_option("", "--collect", dest="collect", action="store_true",
                                                help="No action, just collect dmesg, and bm.log")
        parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
                                                help="Do not perform the ordinary setup phase.")

        parser = parsermodule.getParser(['nodesets', 'defaults'], parser)
        config = parsermodule.parse_args(parser)

        if config.nodelist:
                nodes = config.getListFromFile(config.nodelist)
        elif config.node:
                nodes = [ config.node ]
        else:
                parser.print_help()
                sys.exit(1)

        for node in nodes:
                # get sitehist
                lb = plccache.plcdb_hn2lb[node]
                sitehist = SiteInterface.get_or_make(loginbase=lb)
                #reboot(node, config)
                # Hand the parsed options through; passing config=None here made
                # --quiet, --collect and --force silently ineffective.
                restore(sitehist, node, config=config, forced_action=config.force)

# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
        main()