Many small updates and fixes:
[monitor.git] / monitor / bootman.py
1 #!/usr/bin/python
2
3 # Attempt to reboot a node in debug state.
4
5 import os
6 import sys
7 import time
8 import random
9 import signal
10 import traceback
11 import subprocess
12 from sets import Set
13
14 from monitor.util.sshknownhosts import SSHKnownHosts
15 from monitor.Rpyc import SocketConnection, Async
16 from monitor.Rpyc.Utils import *
17
18 from monitor import getconf
19 from monitor import config
20 from monitor import const
21 from monitor.model import *
22 from monitor.common import email_exception, found_within
23 from monitor.database.info.model import *
24 from monitor.database.info.interface import *
25 from monitor.wrapper import plc
26 from monitor.wrapper import plccache
27 from monitor.wrapper.emailTxt import mailtxt
28 from monitor.nodeconfig import network_config_to_str
29
30 from pcucontrol.util import command as moncommands
31 from pcucontrol.util.command import Sopen
32 from pcucontrol.transports.ssh import pxssh as pxssh
33 from pcucontrol.transports.ssh import fdpexpect as fdpexpect
34 from pcucontrol.transports.ssh import pexpect as pexpect
35
# Authenticated PLC API handle, created once at import time and used by the
# helpers in this module for UpdateNode/GetNodes calls.
api = plc.getAuthAPI()
# NOTE(review): `fb` is never referenced in the visible part of this file;
# presumably a leftover cache placeholder -- confirm before removing.
fb = None
38
def bootmanager_log_name(hostname):
    """Return the relative path of a timestamped BootManager log for *hostname*.

    The result has the form ``history/<YYYY-MM-DD-HH:MM>-bm.<hostname>.log``;
    the timestamp is the local time at the moment of the call.
    """
    stamp = time.strftime("%Y-%m-%d-%H:%M")
    return os.path.join('history', "%s-bm.%s.log" % (stamp, hostname))
44
def bootmanager_log_action(hostname, short_log_path, logtype="bm.log"):
    """Create an ActionRecord noting that a log was collected for *hostname*.

    hostname       -- node the log belongs to.
    short_log_path -- path of the stored log, saved as the record's log_path.
    logtype        -- value stored as the record's action_type.

    The site's login_base is looked up through the latest FindbadNodeRecord.
    When that lookup fails for any reason the record is still created, with
    loginbase "unknown" and the traceback kept in error_string.
    Returns None.
    """
    try:
        node = FindbadNodeRecord.get_latest_by(hostname=hostname)
        site = PlcSite.query.get(node.plc_node_stats['site_id'])
        loginbase = site.plc_site_stats['login_base']
        err = ""
    except Exception:
        # Best effort: a failed lookup must not prevent logging the action,
        # but KeyboardInterrupt/SystemExit are allowed to propagate.
        loginbase = "unknown"
        err = traceback.format_exc()

    # Only the construction is visible here; presumably the ORM session
    # registers the new record by itself -- confirm against the model.
    ActionRecord(loginbase=loginbase,
                 hostname=hostname,
                 action='log',
                 action_type=logtype,
                 log_path=short_log_path,
                 error_string=err)
    return
61         
62
class ExceptionDoubleSSHError(Exception):
    """Raised when copying files to a node over ssh fails twice in a row,
    the second time after the node's known-hosts entry was refreshed."""
64
class NodeConnection:
    """Operations on one node, carried out through an Rpyc connection.

    ``self.c`` is the connection object; ``self.c.modules.<name>`` is used to
    reach modules on the far side of the connection (presumably the node
    itself -- the connection is built by PlanetLabSession over an ssh tunnel).
    ``self.node`` is the node's hostname and ``self.config`` the caller's
    configuration object (stored, not used by the methods below).
    """

    def __init__(self, connection, node, config):
        print("init nodeconnection")
        self.node = node
        self.c = connection
        self.config = config

    def get_boot_state(self):
        """Classify the node as "debug", "boot" or "unknown".

        "debug" when /tmp/source exists on the node, "boot" when /vservers
        exists, "unknown" otherwise or when the connection fails.
        """
        print("get_boot_state(self)")
        try:
            remote_exists = self.c.modules.os.path.exists
            if remote_exists('/tmp/source'):
                return "debug"
            if remote_exists('/vservers'):
                return "boot"
            return "unknown"
        except EOFError:
            traceback.print_exc()
            print(self.c.modules.sys.path)
        except Exception:
            # Anything else is reported by mail; Ctrl-C and SystemExit are
            # deliberately not swallowed here.
            email_exception()
            traceback.print_exc()

        return "unknown"

    def get_dmesg(self):
        """Fetch the node's dmesg output and return it as an open file.

        The output is stored under <log dir>/history with a timestamp and
        copied to <log dir>/dmesg.<node>.log; the latter is opened for
        reading and returned (the caller owns the file handle).
        """
        t_stamp = time.strftime("%Y-%m-%d-%H:%M")
        log_dir = config.MONITOR_BOOTMANAGER_LOG
        archived = "%s/history/%s-dmesg.%s.log" % (log_dir, t_stamp, self.node)
        latest = "%s/dmesg.%s.log" % (log_dir, self.node)

        self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
        download(self.c, "/var/log/dmesg.bm.log", archived)
        os.system("cp %s %s" % (archived, latest))
        return open(latest, 'r')

    def get_bootmanager_log(self):
        """Fetch /tmp/bm.log from the node and return it as an open file.

        The log is archived under the name given by bootmanager_log_name(),
        the collection is recorded with bootmanager_log_action(), and a copy
        at <log dir>/bm.<node>.log is opened for reading and returned.
        """
        bm_name = bootmanager_log_name(self.node)
        log_dir = config.MONITOR_BOOTMANAGER_LOG
        archived = "%s/%s" % (log_dir, bm_name)
        latest = "%s/bm.%s.log" % (log_dir, self.node)

        download(self.c, "/tmp/bm.log", archived)
        #email_exception(self.node, "collected BM log for %s" % self.node)
        bootmanager_log_action(self.node, bm_name, "collected_bm.log")
        os.system("cp %s %s" % (archived, latest))
        return open(latest, 'r')

    def _load_bootmanager(self):
        """Set up the node's BootManager and try to read its configuration.

        Returns ``(bm, error)``: ``bm`` is the BootManager object, ``error``
        is None on success or the exception raised by
        ReadNodeConfiguration.Run().
        """
        remote = self.c.modules
        remote.sys.path.append("/tmp/source/")
        remote.os.chdir('/tmp/source')

        log = remote.BootManager.log('/tmp/new.log')
        bm = remote.BootManager.BootManager(log, 'boot')

        remote.BootManager.InitializeBootManager.Run(bm.VARS, bm.LOG)
        try:
            remote.BootManager.ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
        except Exception:
            # sys.exc_info() keeps this parseable by old and new interpreters.
            return bm, sys.exc_info()[1]
        return bm, None

    def dump_plconf_file(self):
        """Print every BootManager variable read from the node's config."""
        bm, error = self._load_bootmanager()
        if error is not None:
            print("   ERROR: %s" % (error,))
            print("   Possibly, unable to find valid configuration file")
            print("   Unable to read Node Configuration")
            return

        for key in bm.VARS.keys():
            print("%s  ==  %s" % (key, bm.VARS[key]))

    def fsck_repair_node(self):
        """Start a background fsck of the node's volumes, then BootManager.

        Does nothing (besides printing) when /tmp/BM_RUNNING already exists
        on the node.  BootManager is started with the node's current boot
        state as reported by get_nodestate().
        """
        remote = self.c.modules
        remote.sys.path.append("/tmp/source/")
        remote.os.chdir('/tmp/source')
        # TODO: restart
        # TODO: set boot state to node's actually boot state.
        # could be 'boot' or 'safeboot'
        if remote.os.path.exists('/tmp/BM_RUNNING'):
            print("Running MANUAL FSCK already... try again soon.")
            return

        print("Running MANUAL fsck on %s" % self.node)
        cmd = "( touch /tmp/BM_RUNNING ;  " + \
              "  fsck -v -f -y /dev/planetlab/root &> out.fsck ; " + \
              "  fsck -v -f -y /dev/planetlab/vservers >> out.fsck 2>&1 ; " + \
              "  python ./BootManager.py %s &> server.log < /dev/null ; " + \
              "  rm -f /tmp/BM_RUNNING " + \
              ") &"
        cmd = cmd % self.get_nodestate()
        remote.os.system(cmd)
        #self.restart_bootmanager('boot')

    def compare_and_repair_nodekeys(self):
        """Make PLC's node key agree with the key in the node's config.

        Returns True when the keys already match or PLC was updated with the
        node's key; False when the update failed or the node's configuration
        could not be read.
        """
        plcnode = plccache.GetNodeByName(self.node)
        bm, error = self._load_bootmanager()
        if error is not None:
            print("exception")
            print(error)
            print("   Possibly, unable to find valid configuration file")
            print("   Unable to retrieve NODE_KEY")
            # Explicit False instead of falling off the end with None.
            return False

        node_key = bm.VARS['NODE_KEY']
        print("   NODE: %s" % node_key)
        print("   PLC : %s" % plcnode['key'])

        if node_key == plcnode['key']:
            return True
        if api.UpdateNode(self.node, {'key': node_key}):
            print("   Successfully updated NODE_KEY with PLC")
            return True
        return False

    def bootmanager_running(self):
        """Return True when /tmp/BM_RUNNING exists on the node."""
        return bool(self.c.modules.os.path.exists('/tmp/BM_RUNNING'))

    def set_nodestate(self, state='boot'):
        """Set the node's boot_state at PLC; returns UpdateNode's result."""
        return api.UpdateNode(self.node, {'boot_state' : state})

    def get_nodestate(self):
        """Return the node's boot_state from PLC, or the last cached value
        from the findbad record when the PLC call fails."""
        try:
            return api.GetNodes(self.node, ['boot_state'])[0]['boot_state']
        except Exception:
            traceback.print_exc()
            # NOTE: use last cached value from plc
            fbnode = FindbadNodeRecord.get_latest_by(hostname=self.node).to_dict()
            return fbnode['plc_node_stats']['boot_state']

    def restart_node(self, state='boot'):
        """Set the node's boot state at PLC and reboot it.

        The first attempt kills all slice processes and schedules
        ``shutdown -r +1``, then records a 'gentlekill' flag.  While that
        flag counts as recent (PersistFlags is created with a one-day
        interval) a sysrq sync/unmount/boot sequence is used instead.
        """
        self.set_nodestate(state)

        pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags')
        if not pflags.getRecentFlag('gentlekill'):
            print("   Killing all slice processes... : %s" % self.node)
            cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
            self.c.modules.os.system(cmd_slicekill)
            cmd = """ shutdown -r +1 & """
            print("   Restarting %s : %s" % (self.node, cmd))
            self.c.modules.os.system(cmd)

            pflags.setRecentFlag('gentlekill')
            pflags.save()
        else:
            print("   Restarting with sysrq 'sub' %s" % self.node)
            cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """
            self.c.modules.os.system(cmd)

        return

    def restart_bootmanager(self, forceState):
        """Start ``BootManager.py <forceState>`` in the background on the
        node, unless /tmp/BM_RUNNING shows one is already running."""
        self.c.modules.os.chdir('/tmp/source')
        if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
            print("   BootManager is already running: try again soon...")
        else:
            print("   Starting 'BootManager.py %s' on %s " % (forceState, self.node))
            cmd = "( touch /tmp/BM_RUNNING ;  " + \
                  "  python ./BootManager.py %s &> server.log < /dev/null ; " + \
                  "  rm -f /tmp/BM_RUNNING " + \
                  ") &"
            cmd = cmd % forceState
            self.c.modules.os.system(cmd)

        return
252
253
class PlanetLabSession:
    """An ssh tunnel to an Rpyc server started on one node.

    Creating an instance runs setup_host(): unless ``nosetup`` is true this
    copies the Rpyc files to the node, (re)starts the Rpyc server there and
    opens an ssh port forward from a local port to port 18812 on the node.
    get_connection() then connects through that local port.
    """

    # Next local port to hand out.  Starts at a random value in
    # [22000, 23000) and is incremented for every session created.
    globalport = 22000 + int(random.random()*1000)

    def __init__(self, node, nosetup, verbose):
        self.verbose = verbose
        self.node = node
        self.port = None
        self.nosetup = nosetup
        self.command = None
        # Always defined so callers can read it even when setup was skipped;
        # setup_host() overwrites it once the tunnel reports READY.
        self.timeout = 0
        self.setup_host()

    def get_connection(self, config):
        """Return a NodeConnection through the local tunnel port.

        The connection is attempted twice; an exception raised by the second
        attempt propagates to the caller.
        """
        try:
            print("SocketConnection(localhost, %s" % self.port)
            sc = SocketConnection("localhost", self.port)
            print("NodeConnection(%s, %s)" % (sc, self.node))
            conn = NodeConnection(sc, self.node, config)
        except Exception:
            # NOTE: try twice since this can sometimes fail the first time. If
            #       it fails again, let it go.
            conn = NodeConnection(SocketConnection("localhost", self.port), self.node, config)
        return conn

    def setup_host(self):
        """Allocate a local port and, unless nosetup is set, build the tunnel.

        Raises ExceptionDoubleSSHError when the rsync to the node fails twice
        and Exception when the ssh tunnel does not report "READY".
        """
        self.port = PlanetLabSession.globalport
        PlanetLabSession.globalport = PlanetLabSession.globalport + 1

        if self.nosetup:
            print("Skipping setup")
            return

        args = {}
        args['port'] = self.port
        args['user'] = 'root'
        args['hostname'] = self.node
        args['monitordir'] = config.MONITOR_SCRIPT_ROOT
        ssh_port = 22

        # COPY Rpyc files to host
        #cmd = "rsync -vvv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
        cmd = """rsync -vvv -az -e "ssh -o BatchMode=yes" %(monitordir)s/monitor/Rpyc/ %(user)s@%(hostname)s:Rpyc""" % args
        # NOTE(review): the command is echoed unconditionally, so verbose
        # mode prints it twice -- looks like a debugging leftover.
        if self.verbose: print(cmd)
        print(cmd)
        # TODO: Add timeout
        timeout = 120
        localos = moncommands.CMD()

        ret = localos.system(cmd, timeout)
        print(ret)
        if ret != 0:
            print("\tUNKNOWN SSH KEY FOR %s; making an exception" % self.node)
            #print "MAKE EXPLICIT EXCEPTION FOR %s" % self.node
            # Refresh the known-hosts entry for this node, then retry once.
            k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k
            print("trying:  %s" % cmd)
            print(["%s=%s" % (name, os.environ[name])
                   for name in os.environ.keys() if 'SSH' in name])
            ret = localos.system(cmd, timeout)
            print(ret)
            if ret != 0:
                print("\tFAILED TWICE")
                #email_exception("%s rsync failed twice" % self.node)
                raise ExceptionDoubleSSHError("Failed twice trying to login with updated ssh host key")

        t1 = time.time()
        # KILL any already running servers.
        ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port)
        (ov, ev) = ssh.run_noexcept2("""<<\\EOF
            rm -f out.log
            echo "kill server" >> out.log
                        netstat -ap | grep python | grep 18812 | awk '{print $7}' | awk -F / '{print $1}' | xargs kill
            ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ; 
            echo "export" >> out.log
            export PYTHONPATH=$HOME  ;
            echo "start server" >> out.log
            python Rpyc/Servers/forking_server.py &> server.log &
            echo "done" >> out.log
EOF""")
        print("setup rpyc server over ssh")
        print(ssh.ret)

        # TODO: Add timeout
        # This was tricky to make synchronous.  The combination of ssh-clients-4.7p1,
        # and the following options seems to work well.
        cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \
              """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \
              """-o ConnectTimeout=120 """ + \
              """-n -N -L %(port)s:localhost:18812 """ + \
              """%(user)s@%(hostname)s"""
        cmd = cmd % args
        if self.verbose: print(cmd)
        print(cmd)
        self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
        # TODO: the read() here may block indefinitely.  Need a better
        # approach therefore, that includes a timeout.
        #ret = self.command.stdout.read(5)
        ret = moncommands.read_t(self.command.stdout, 5)

        t2 = time.time()
        if 'READY' in ret:
            # NOTE: There is still a slight race for machines that are slow...
            # Wait twice as long as the server start + tunnel setup took.
            self.timeout = 2*(t2-t1)
            print("Sleeping for %s sec" % self.timeout)
            time.sleep(self.timeout)
            return

        if self.command.returncode is not None:
            print("Failed to establish tunnel!")
            raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode))

        raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'")

    def __del__(self):
        # getattr: __del__ may run on an instance whose __init__ never got as
        # far as assigning these attributes.
        command = getattr(self, 'command', None)
        if command:
            if getattr(self, 'verbose', False): print("Killing SSH session %s" % self.port)
            print("Killing SSH session %s" % self.port)
            command.kill()
369
370         
def steps_to_list(steps, index=1):
    """Return a list holding element *index* of every entry in *steps*.

    With the default index of 1 this extracts the pattern from each
    (id, pattern) pair.
    """
    return [entry[index] for entry in steps]
373
def index_to_id(steps, index):
    """Return the id (first element) of ``steps[index]``.

    An index at or past the end of *steps* yields "done".
    """
    if index >= len(steps):
        return "done"
    return steps[index][0]
379
class DebugInterface:
    """Connects to one node and classifies what its logs contain.

    getConnection() builds the ssh session and returns a NodeConnection.
    The remaining methods supply (id, regex) tables and turn a pexpect-style
    ``child`` reading a log into the list of ids matched, in order.
    """

    def __init__(self, hostname):
        self.hostname = hostname
        self.session = None

    def getConnection(self):
        """Return a NodeConnection for this host, or False on failure.

        Failures (known-hosts update, session setup, two EOFErrors while
        connecting) are printed and/or mailed; callers tell them apart from
        a connection by checking for the boolean False.  An exception other
        than EOFError on the first connection attempt propagates.
        """
        print("Creating session for %s" % self.hostname)
        # update known_hosts file (in case the node has rebooted since last run)
        try:
            k = SSHKnownHosts(); k.update(self.hostname); k.write(); del k
        except Exception:
            email_exception()
            # print_exc() writes the traceback itself and returns None, so
            # its result must not be printed.
            traceback.print_exc()
            return False

        msg = "ERROR setting up session for %s" % self.hostname
        try:
            if config is None:
                self.session = PlanetLabSession(self.hostname, False, True)
            else:
                self.session = PlanetLabSession(self.hostname, config.nosetup, config.verbose)
        except ExceptionDoubleSSHError:
            print(msg)
            return False
        except Exception:
            traceback.print_exc()
            email_exception(msg)
            return False

        print("Getting connection: 1st try")
        try:
            conn = self.session.get_connection(config)
        except EOFError:
            # NOTE: sometimes the wait in setup_host() is not long enough.
            # So, here we try to wait a little longer before giving up entirely.
            try:
                print("Getting connection: 2nd try")
                # The session only records a timeout once its tunnel came
                # up; fall back to no extra wait when it is absent.
                time.sleep(getattr(self.session, 'timeout', 0)*5)
                conn = self.session.get_connection(config)
            except EOFError:
                # failed twice... no need to report this really, it's just in a
                # weird state...
                print("Getting connection: failed")
                email_exception(self.hostname, "failed twice to get connection")
                return False
            except Exception:
                traceback.print_exc()
                email_exception(self.hostname)
                return False
        print("Getting connection: ok")
        #print "trying to use conn before returning it."
        #print conn.c.modules.sys.path
        #print conn.c.modules.os.path.exists('/tmp/source')
        #time.sleep(1)

        #print "conn: %s" % conn
        return conn

    def getSequences(self):
        """Return {sequence: action} for every BootmanSequenceRecord."""
        # NOTE: The DB is now the authoritative record for all BM sequences.
        #       An admin can introduce new patterns and actions without touching code.
        sequences = {}

        for record in BootmanSequenceRecord.query.all():
            sequences[record.sequence] = record.action

        return sequences

    def getDiskSteps(self):
        """Return the (id, regex) table of disk-error lines looked for in dmesg."""
        steps = [
            ('scsierror2' , r'sd \d:\d:\d:\d: ioctl_internal_command return code = \d+'),
            ('scsierror'  , r'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
            ('ioerror'    , r'end_request: I/O error, dev sd\w+, sector \d+'),
            ('ccisserror' , r'cciss: cmd \w+ has CHECK CONDITION'),

            ('buffererror', r'Buffer I/O error on device dm-\d, logical block \d+'),

            ('hdaseekerror', r'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'),
            ('hdacorrecterror', r'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'),

            ('atareadyerror'   , r'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
            ('atacorrecterror' , r'ata\d+: error=0x\d+ { UncorrectableError }'),

            ('sdXerror'   , r'sd\w: Current: sense key: Medium Error'),
            # The parentheses are escaped: unescaped they form a regex group
            # and the pattern cannot match the literal "(device dm-N)".
            ('ext3error'   , r'EXT3-fs error \(device dm-\d+\): ext3_find_entry: reading directory #\d+ offset \d+'),

            ('floppytimeout', r'floppy0: floppy timeout called'),
            ('floppyerror',  r'end_request: I/O error, dev fd\w+, sector \d+'),

            # hda: dma_intr: status=0x51 { DriveReady SeekComplete Error }
            # hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263

            # floppy0: floppy timeout called
            # end_request: I/O error, dev fd0, sector 0

            # Buffer I/O error on device dm-2, logical block 8888896
            # ata1: status=0x51 { DriveReady SeekComplete Error }
            # ata1: error=0x40 { UncorrectableError }
            # SCSI error : <0 0 0 0> return code = 0x8000002
            # sda: Current: sense key: Medium Error
            #       Additional sense: Unrecovered read error - auto reallocate failed

            # SCSI error : <0 2 0 0> return code = 0x40001
            # end_request: I/O error, dev sda, sector 572489600
        ]
        return steps

    def getDiskSequence(self, steps, child):
        """Return the ids of *steps* matched by *child*, in order.

        ``child.expect()`` is called until it reports EOF; the list always
        ends with "done".
        """
        patterns = steps_to_list(steps) + [pexpect.EOF]
        sequence = []
        while True:
            step_id = index_to_id(steps, child.expect(patterns))
            sequence.append(step_id)

            if step_id == "done":
                break
        return sequence

    def getBootManagerStepPatterns(self):
        """Return the (id, regex) table of lines looked for in a BootManager log."""
        steps = [
            ('bminit'       , 'Initializing the BootManager.'),
            ('cfg'          , 'Reading node configuration file.'),
            ('auth'         , 'Authenticating node with PLC.'),
            ('getplc'       , 'Retrieving details of node from PLC.'),
            ('update'       , 'Updating node boot state at PLC.'),
            ('hardware'     , 'Checking if hardware requirements met.'),
            ('installinit'  , 'Install: Initializing.'),
            ('installdisk'  , 'Install: partitioning disks.'),
            ('installbootfs', 'Install: bootstrapfs tarball.'),
            ('installcfg'   , 'Install: Writing configuration files.'),
            ('installstop'  , 'Install: Shutting down installer.'),
            ('update2'      , 'Updating node boot state at PLC.'),
            ('installinit2' , 'Install: Initializing.'),
            ('validate'     , 'Validating node installation.'),
            ('rebuildinitrd', 'Rebuilding initrd'),
            ('netcfg'       , 'Install: Writing Network Configuration files.'),
            ('update3'      , 'Updating node configuration.'),
            ('disk'         , 'Checking for unused disks to add to LVM.'),
            ('update4'      , 'Sending hardware configuration to PLC.'),
            ('debug'        , 'Starting debug mode'),
            ('bmexceptmount', 'BootManagerException during mount'),
            ('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'),
            ('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
            ('exception'    , 'Exception'),
            ('nocfg'        , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
            ('protoerror2'  , '500 Internal Server Error'),
            ('protoerror'   , 'XML RPC protocol error'),
            ('nodehostname' , 'Configured node hostname does not resolve'),
            ('implementerror', 'Implementation Error'),
            ('fsckabort'    , 'is mounted.  e2fsck: Cannot continue, aborting'),
            ('fsckfail'     , 'Running e2fsck -v -p /dev/planetlab/root failed'),
            ('fsckfail2'    , 'Running e2fsck -v -p /dev/planetlab/vservers failed'),
            ('readonlyfs'   , r'\[Errno 30\] Read-only file system'),
            ('baddisk'      , r"IOError: \[Errno 13\] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"),
            ('noinstall'    , 'notinstalled'),
            ('bziperror'    , 'bzip2: Data integrity error when decompressing.'),
            ('noblockdev'   , "No block devices detected."),
            ('missingkernel', "missingkernel"),
            ('dnserror'     , 'Name or service not known'),
            ('noparseconfig', "Found configuration file plnode.txt on floppy, but was unable to parse it"),
            ('noconfig'     , "Unable to find and read a node configuration file"),
            ('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
            ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
            ('hardwarerequirefail' , 'Hardware requirements not met'),
            ('mkfsfail'     , 'while running: Running mkfs.ext2 -q  -m 0 -j /dev/planetlab/vservers failed'),
            ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
            ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
            ('chrootfail'   , 'Running chroot /tmp/mnt/sysimg'),
            ('modulefail'   , 'Unable to get list of system modules'),
            ('writeerror'   , 'write error: No space left on device'),
            ('nospace'      , "No space left on device"),
            ('nonode'       , 'Failed to authenticate call: No such node'),
            ('authfail'     , 'Failed to authenticate call: Call could not be authenticated'),
            ('authfail2'    , 'Authentication Failed'),
            ('bootcheckfail'  , 'BootCheckAuthentication'),
            ('bootupdatefail' , 'BootUpdateNode'),
        ]
        return steps

    def getBootManagerSequenceFromLog(self, steps, child):
        """Return the ids of *steps* matched by *child*, in order.

        Same loop as getDiskSequence(), with an extra message printed every
        time the "exception" step matches.  The list always ends with "done".
        """
        patterns = steps_to_list(steps) + [pexpect.EOF]
        sequence = []
        while True:
            step_id = index_to_id(steps, child.expect(patterns))
            sequence.append(step_id)

            if step_id == "exception":
                print("...Found An Exception!!!")
            elif step_id == "done":
                # EOF was reached.
                break

        return sequence
575                 
def restore(sitehist, hostname, config=None, forced_action=None):
    """Run restore_basic() for *hostname*, flush the session, and return
    whatever restore_basic() returned."""
    outcome = restore_basic(sitehist, hostname, config, forced_action)
    session.flush()
    return outcome
580
581 def restore_basic(sitehist, hostname, config=None, forced_action=None):
582
583         # NOTE: Nothing works if the bootcd is REALLY old.
584         #       So, this is the first step.
585
586         bootman_action = "unknown"
587
588         fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
589         recent_actions = sitehist.getRecentActions(hostname=hostname)
590
591         if fbnode['observed_category'] == "OLDBOOTCD":
592                 print "\t...Notify owner to update BootImage!!!"
593
594                 if not found_within(recent_actions, 'newbootcd_notice', 3.5):
595                         sitehist.sendMessage('newbootcd_notice', hostname=hostname)
596
597                         print "\tDisabling %s due to out-of-date BootImage" % hostname
598                         api.UpdateNode(hostname, {'boot_state' : 'disabled'})
599
600                 # NOTE: nothing else is possible.
601                 return "disabled"
602
603         debugnode = DebugInterface(hostname)
604         conn = debugnode.getConnection()
605         if type(conn) == type(False): return "connect_failed"
606
607         boot_state = conn.get_boot_state()
608         if boot_state != "debug":
609                 print "... %s in %s state: skipping..." % (hostname , boot_state)
610                 return "skipped" #boot_state == "boot"
611
612         if conn.bootmanager_running():
613                 print "...BootManager is currently running.  Skipping host %s" %hostname 
614                 return "skipped" # True
615
616         # Read persistent flags, tagged on one week intervals.
617
618         if config and not config.quiet: print "...downloading dmesg from %s" %hostname 
619         dmesg = conn.get_dmesg()
620         child = fdpexpect.fdspawn(dmesg)
621
622         steps = debugnode.getDiskSteps()
623         sequence = debugnode.getDiskSequence(steps, child)
624
625         s = Set(sequence)
626         if config and not config.quiet: print "\tSET: ", s
627
628         if len(s) > 1:
629                 print "...Potential drive errors on %s" % hostname 
630                 if len(s) == 2 and 'floppyerror' in s:
631                         print "...Should investigate.  Continuing with node."
632                 else:
633                         print "...Should investigate.  Skipping node."
634                         # TODO: send message related to these errors.
635
636                         if not found_within(recent_actions, 'baddisk_notice', 7):
637                                 print "baddisk_notice not found recently"
638
639                                 log=conn.get_dmesg().read()
640                                 sitehist.sendMessage('baddisk_notice', hostname=hostname, log=log)
641                                 return "skipping_baddisk"
642                         else:
643                                 # NOTE: "" does not add a new action record
644                                 return ""
645
646
647         print "...Downloading bm.log from %s" %hostname 
648         log = conn.get_bootmanager_log()
649         bm_log_data = log.read() # get data
650         log.seek(0)     # reset fd pointer for fdspawn
651         child = fdpexpect.fdspawn(log)
652
653         if hasattr(config, 'collect') and config.collect: return "collect"
654
655         if config and not config.quiet: print "...Scanning bm.log for errors"
656
657         time.sleep(1)
658
659         steps = debugnode.getBootManagerStepPatterns()
660         sequence = debugnode.getBootManagerSequenceFromLog(steps, child)
661                 
662         s = "-".join(sequence)
663         print "   FOUND SEQUENCE: ", s
664
665         # NOTE: We get or set the flag based on the current sequence identifier.
666         #  By using the sequence identifier, we guarantee that there will be no
667         #  frequent loops.  I'm guessing there is a better way to track loops,
668         #  though.
669
670         sequences = debugnode.getSequences()
671         flag_set = True
672         
673         if s not in sequences:
674                 print "   HOST %s" % hostname
675                 print "   UNKNOWN SEQUENCE: %s" % s
676
677                 args = {}
678                 args['hostname'] = hostname
679                 args['sequence'] = s
680                 args['bmlog'] = bm_log_data
681                 args['viart'] = False
682                 args['saveact'] = True
683                 args['ccemail'] = True
684
685                 sitehist.sendMessage('unknownsequence_notice', **args)
686
687                 conn.restart_bootmanager('boot')
688
689                 bootman_action = "restart_bootmanager"
690
691                 # NOTE: Do not set the pflags value for this sequence if it's unknown.
692                 # This way, we can check it again after we've fixed it.
693                 flag_set = False
694
695         else:
696                 bootman_action = sequences[s]
697
698                 if   sequences[s] == "restart_bootmanager_boot":
699                         print "...Restarting BootManager.py on %s "%hostname 
700                         conn.restart_bootmanager('boot')
701                 elif sequences[s] == "restart_bootmanager_rins":
702                         print "...Restarting BootManager.py on %s "%hostname 
703                         conn.restart_bootmanager('reinstall')
704                 elif sequences[s] == "restart_node_rins":
705                         conn.restart_node('reinstall')
706                 elif sequences[s] == "restart_node_boot":
707                         conn.restart_node('boot')
708                 elif sequences[s] == "fsck_repair":
709                         conn.fsck_repair_node()
710                 elif sequences[s] == "repair_node_keys":
711                         if conn.compare_and_repair_nodekeys():
712                                 # the keys either are in sync or were forced in sync.
713                                 # so try to start BM again.
714                                 conn.restart_bootmanager(conn.get_nodestate())
715                         else:
716                                 # there was some failure to synchronize the keys.
717                                 print "...Unable to repair node keys on %s" %hostname 
718                                 if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
719                                         args = {}
720                                         args['hostname'] = hostname
721                                         sitehist.sendMessage('nodeconfig_notice', **args)
722                                         conn.dump_plconf_file()
723                                 else:
724                                         # NOTE: do not add a new action record
725                                         return ""
726
727                 elif sequences[s] == "unknownsequence_notice":
728                         args = {}
729                         args['hostname'] = hostname
730                         args['sequence'] = s
731                         args['bmlog'] = bm_log_data
732                         args['viart'] = False
733                         args['saveact'] = True
734                         args['ccemail'] = True
735
736                         sitehist.sendMessage('unknownsequence_notice', **args)
737                         conn.restart_bootmanager('boot')
738
739                 elif sequences[s] == "nodeconfig_notice":
740
741                         if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
742                                 args = {}
743                                 args['hostname'] = hostname
744                                 sitehist.sendMessage('nodeconfig_notice', **args)
745                                 conn.dump_plconf_file()
746                         else:
747                                 # NOTE: do not add a new action record
748                                 return ""
749
750                 elif sequences[s] == "nodenetwork_email":
751
752                         if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
753                                 args = {}
754                                 args['hostname'] = hostname
755                                 args['bmlog'] = bm_log_data
756                                 sitehist.sendMessage('nodeconfig_notice', **args)
757                                 conn.dump_plconf_file()
758                         else:
759                                 # NOTE: do not add a new action record
760                                 return ""
761
762                 elif sequences[s] == "noblockdevice_notice":
763
764                         if not found_within(recent_actions, 'noblockdevice_notice', 3.5):
765                                 args = {}
766                                 #args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
767                                 args['hostname'] = hostname
768                         
769                                 sitehist.sendMessage('noblockdevice_notice', **args)
770                         else:
771                                 # NOTE: do not add a new action record
772                                 return ""
773
774                 elif sequences[s] == "baddisk_notice":
775                         # MAKE An ACTION record that this host has failed hardware.  May
776                         # require either an exception "/minhw" or other manual intervention.
777                         # Definitely need to send out some more EMAIL.
778                         # TODO: email notice of broken hardware
779                         if not found_within(recent_actions, 'baddisk_notice', 7):
780                                 print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
781                                 args = {}
782                                 args['hostname'] = hostname
783                                 args['log'] = conn.get_dmesg().read()
784
785                                 sitehist.sendMessage('baddisk_notice', **args)
786                                 #conn.set_nodestate('disabled')
787                         else:
788                                 # NOTE: do not add a new action record
789                                 return ""
790
791                 elif sequences[s] == "minimalhardware_notice":
792                         if not found_within(recent_actions, 'minimalhardware_notice', 7):
793                                 print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
794                                 args = {}
795                                 args['hostname'] = hostname
796                                 args['bmlog'] = bm_log_data
797                                 sitehist.sendMessage('minimalhardware_notice', **args)
798                         else:
799                                 # NOTE: do not add a new action record
800                                 return ""
801
802                 elif sequences[s] == "baddns_notice":
803                         if not found_within(recent_actions, 'baddns_notice', 1):
804                                 print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
805                                 args = {}
806                                 try:
807                                         node = plccache.GetNodeByName(hostname)
808                                         net = api.GetInterfaces(node['interface_ids'])[0]
809                                 except:
810                                         email_exception()
811                                         print traceback.print_exc()
812                                         # TODO: api error. skip email, b/c all info is not available,
813                                         # flag_set will not be recorded.
814                                         return "exception"
815                                 nodenet_str = network_config_to_str(net)
816
817                                 args['hostname'] = hostname
818                                 args['network_config'] = nodenet_str
819                                 args['interface_id'] = net['interface_id']
820
821                                 sitehist.sendMessage('baddns_notice', **args)
822                         else:
823                                 # NOTE: do not add a new action record
824                                 return ""
825
826         return bootman_action
827         
828
829 if __name__ == "__main__":
830         print "ERROR: Can not execute module as a command! Please use commands/%s.py" % os.path.splitext(__file__)[0]