clearer names for actions, and infer actions better
[monitor.git] / monitor / bootman.py
1 #!/usr/bin/python
2
3 # Attempt to reboot a node in debug state.
4
5 import os
6 import sys
7 import time
8 import random
9 import signal
10 import traceback
11 import subprocess
12 from sets import Set
13
14 from monitor.util.sshknownhosts import SSHKnownHosts
15 from monitor.Rpyc import SocketConnection, Async
16 from monitor.Rpyc.Utils import *
17
18 from monitor import getconf
19 from monitor import config
20 from monitor import const
21 from monitor.model import *
22 from monitor.common import email_exception, found_within
23 from monitor.database.info.model import *
24 from monitor.database.info.interface import *
25 from monitor.wrapper import plc
26 from monitor.wrapper import plccache
27 from monitor.wrapper.emailTxt import mailtxt
28 from monitor.nodeconfig import network_config_to_str
29
30 from pcucontrol.util import command as moncommands
31 from pcucontrol.util.command import Sopen
32 from pcucontrol.transports.ssh import pxssh as pxssh
33 from pcucontrol.transports.ssh import fdpexpect as fdpexpect
34 from pcucontrol.transports.ssh import pexpect as pexpect
35
# PLC API handle obtained once, at import time, from plc.getAuthAPI(); the
# code below uses it for GetNodes/UpdateNode calls.
api = plc.getAuthAPI()
# NOTE(review): not referenced in the visible part of this file -- confirm
# whether anything still relies on it before removing.
fb = None
38
def bootmanager_log_name(hostname):
    """Return the relative path under which a node's BootManager log is kept.

    The result has the form 'history/<YYYY-MM-DD-HH:MM>-bm.<hostname>.log',
    where the timestamp is the current local time.
    """
    stamp = time.strftime("%Y-%m-%d-%H:%M")
    return os.path.join('history', "%s-bm.%s.log" % (stamp, hostname))
44
def bootmanager_log_action(hostname, short_log_path, logtype="bm.log"):
    """Create a 'log' ActionRecord for hostname that points at a saved log.

    hostname       -- node the log was collected from.
    short_log_path -- path of the saved log, stored as the record's log_path.
    logtype        -- stored as the record's action_type.

    The site's login_base is looked up through the latest findbad record for
    the node.  The lookup is best-effort: if anything goes wrong the record
    is still created, with loginbase "unknown" and the formatted traceback
    in error_string.  Nothing is returned.
    """
    err = ""
    try:
        node = FindbadNodeRecord.get_latest_by(hostname=hostname)
        site = PlcSite.query.get(node.plc_node_stats['site_id'])
        loginbase = site.plc_site_stats['login_base']
    except:
        # Deliberately catch-all: a failed lookup must not stop the action
        # from being recorded.
        loginbase = "unknown"
        err = traceback.format_exc()

    # Only the construction matters here; the object itself is not used.
    ActionRecord(loginbase=loginbase,
                 hostname=hostname,
                 action='log',
                 action_type=logtype,
                 log_path=short_log_path,
                 error_string=err)
61         
62
class ExceptionDoubleSSHError(Exception):
    """Raised when copying files to a node over ssh fails a second time,
    after the node's ssh host key has been refreshed."""
64
class NodeConnection:
    """Operations on a single node, performed through an Rpyc connection.

    connection -- Rpyc connection; its ``modules`` attribute is used to reach
                  modules on the node (``self.c.modules.os`` and so on).
    node       -- hostname of the node.
    config     -- stored on the instance; not otherwise used by this class.
    """

    def __init__(self, connection, node, config):
        print("init nodeconnection")
        self.node = node
        self.c = connection
        self.config = config

    def _load_bootmanager_vars(self):
        """Run BootManager's initialise and read-configuration steps on the node.

        Returns a pair (bm, error): bm is the BootManager object created on
        the node, and error is the exception raised by
        ReadNodeConfiguration.Run, or None when that step succeeded.
        """
        c = self.c
        c.modules.sys.path.append("/tmp/source/")
        c.modules.os.chdir('/tmp/source')

        log = c.modules.BootManager.log('/tmp/new.log')
        bm = c.modules.BootManager.BootManager(log, 'boot')

        c.modules.BootManager.InitializeBootManager.Run(bm.VARS, bm.LOG)
        try:
            c.modules.BootManager.ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
        except Exception:
            # sys.exc_info() rather than "except ..., x" so the same source
            # is accepted by every Python version.
            return bm, sys.exc_info()[1]
        return bm, None

    def get_boot_state(self):
        """Classify the node's state from paths that exist on it.

        Returns "debug" if /tmp/source exists, "boot" if /vservers exists,
        and "unknown" otherwise, including when the connection fails.
        """
        print("get_boot_state(self)")
        try:
            remote_exists = self.c.modules.os.path.exists
            if remote_exists('/tmp/source'):
                return "debug"
            if remote_exists('/vservers'):
                return "boot"
            return "unknown"
        except EOFError:
            traceback.print_exc()
            # Extra diagnostics only.  This goes over the same connection
            # that just failed, so it must not be allowed to raise again.
            try:
                print(self.c.modules.sys.path)
            except Exception:
                pass
        except Exception:
            email_exception()
            traceback.print_exc()

        return "unknown"

    def get_dmesg(self):
        """Fetch the node's dmesg output and return it as an open file.

        The output is downloaded to a timestamped file under
        <MONITOR_BOOTMANAGER_LOG>/history/ and copied to
        <MONITOR_BOOTMANAGER_LOG>/dmesg.<node>.log; the latter is opened for
        reading and returned.  The caller owns (and should close) the file.
        """
        t_stamp = time.strftime("%Y-%m-%d-%H:%M")
        log_dir = config.MONITOR_BOOTMANAGER_LOG
        archived = "%s/history/%s-dmesg.%s.log" % (log_dir, t_stamp, self.node)
        latest = "%s/dmesg.%s.log" % (log_dir, self.node)

        self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
        download(self.c, "/var/log/dmesg.bm.log", archived)
        os.system("cp %s %s" % (archived, latest))
        return open(latest, 'r')

    def get_bootmanager_log(self):
        """Fetch /tmp/bm.log from the node and return it as an open file.

        The log is downloaded under the name given by bootmanager_log_name(),
        a 'collected_bm.log' action is recorded, and the log is copied to
        <MONITOR_BOOTMANAGER_LOG>/bm.<node>.log, which is opened for reading
        and returned.  The caller owns (and should close) the file.
        """
        bm_name = bootmanager_log_name(self.node)
        log_dir = config.MONITOR_BOOTMANAGER_LOG
        archived = "%s/%s" % (log_dir, bm_name)
        latest = "%s/bm.%s.log" % (log_dir, self.node)

        download(self.c, "/tmp/bm.log", archived)
        bootmanager_log_action(self.node, bm_name, "collected_bm.log")
        os.system("cp %s %s" % (archived, latest))
        return open(latest, 'r')

    def dump_plconf_file(self):
        """Print every BootManager variable read from the node's configuration."""
        bm, error = self._load_bootmanager_vars()
        if error is not None:
            print("   ERROR: %s" % (error,))
            print("   Possibly, unable to find valid configuration file")
            print("   Unable to read Node Configuration")
            return

        for key in bm.VARS.keys():
            print("%s  ==  %s" % (key, bm.VARS[key]))

    def fprobe_repair_node(self):
        """Delete roughly the older half of the fprobe files on the node.

        When fprobe data gets too much, it fills the root partition and the
        node fails to boot.
        """
        self.c.modules.sys.path.append("/tmp/source/")

        # NOTE: assume that the root fs is already mounted...
        if self.c.modules.os.path.exists('/tmp/mnt/sysimg/var/local/fprobe'):
            print("CLEARING FPROBE DATA on %s" % self.node)
            self.c.modules.os.chdir('/tmp/mnt/sysimg/var/local/fprobe')
            # Oldest files are listed first (ls -rt); remove the first half.
            cmd = """ ls -lrt . | awk '{if (i<NR/2 && $9) {print "rm "$9;i=i+1;}}' | sh """
            self.c.modules.os.system(cmd)
        else:
            print("COULD NOT CLEAR FPROBE DATA on %s" % self.node)

    def fsck_repair_node(self):
        """Start a background fsck of the node's volumes, then BootManager.

        Does nothing but print a notice if /tmp/BM_RUNNING already exists.
        BootManager is started with the boot state reported by
        get_nodestate().
        """
        self.c.modules.sys.path.append("/tmp/source/")
        self.c.modules.os.chdir('/tmp/source')
        # TODO: restart
        # TODO: set boot state to node's actually boot state.
        # could be 'boot' or 'safeboot'
        if self.bootmanager_running():
            print("Running MANUAL FSCK already... try again soon.")
            return

        print("Running MANUAL fsck on %s" % self.node)
        # /tmp/BM_RUNNING marks the repair as in progress for the whole
        # background job and is removed when it ends.
        cmd = ("( touch /tmp/BM_RUNNING ;  "
               "  fsck -v -f -y /dev/planetlab/root &> out.fsck ; "
               "  fsck -v -f -y /dev/planetlab/vservers >> out.fsck 2>&1 ; "
               "  python ./BootManager.py %s &> server.log < /dev/null ; "
               "  rm -f /tmp/BM_RUNNING "
               ") &")
        self.c.modules.os.system(cmd % self.get_nodestate())

    def compare_and_repair_nodekeys(self):
        """Make PLC's key for this node match the key configured on the node.

        Returns True when the keys already match or PLC was updated, and
        False when the update failed or the node's configuration could not
        be read.
        """
        plcnode = plccache.GetNodeByName(self.node)

        bm, error = self._load_bootmanager_vars()
        if error is not None:
            print("exception")
            print(error)
            print("   Possibly, unable to find valid configuration file")
            print("   Unable to retrieve NODE_KEY")
            return False

        node_key = bm.VARS['NODE_KEY']
        print("   NODE: %s" % node_key)
        print("   PLC : %s" % plcnode['key'])

        if node_key == plcnode['key']:
            return True

        if api.UpdateNode(self.node, {'key': node_key}):
            print("   Successfully updated NODE_KEY with PLC")
            return True
        return False

    def bootmanager_running(self):
        """Return True if /tmp/BM_RUNNING exists on the node, else False."""
        return bool(self.c.modules.os.path.exists('/tmp/BM_RUNNING'))

    def set_nodestate(self, state='boot'):
        """Set the node's boot_state at PLC; returns api.UpdateNode's result."""
        return api.UpdateNode(self.node, {'boot_state' : state})

    def get_nodestate(self):
        """Return the node's boot_state from PLC.

        If the PLC call fails, the traceback is printed and the value from
        the latest findbad record is returned instead.
        """
        try:
            return api.GetNodes(self.node, ['boot_state'])[0]['boot_state']
        except Exception:
            traceback.print_exc()
            # NOTE: use last cached value from plc
            fbnode = FindbadNodeRecord.get_latest_by(hostname=self.node).to_dict()
            return fbnode['plc_node_stats']['boot_state']

    def restart_node(self, state='boot'):
        """Set the node's boot_state at PLC and reboot the node.

        If no recent 'gentlekill' flag is recorded for the node, slice
        processes are killed, a 'shutdown -r +1' is issued and the flag is
        set.  If the flag is already set, the node is rebooted through
        /proc/sysrq-trigger ('s', 'u', 'b') instead.
        """
        self.set_nodestate(state)

        # presumably the second argument is a lifetime in seconds (one day)
        # -- TODO confirm against PersistFlags.
        pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags')
        if not pflags.getRecentFlag('gentlekill'):
            print("   Killing all slice processes... : %s" % self.node)
            cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
            self.c.modules.os.system(cmd_slicekill)
            cmd = """ shutdown -r +1 & """
            print("   Restarting %s : %s" % (self.node, cmd))
            self.c.modules.os.system(cmd)

            pflags.setRecentFlag('gentlekill')
            pflags.save()
        else:
            print("   Restarting with sysrq 'sub' %s" % self.node)
            cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """
            self.c.modules.os.system(cmd)

    def restart_bootmanager(self, forceState):
        """Start 'BootManager.py <forceState>' in the background on the node.

        Does nothing but print a notice if /tmp/BM_RUNNING already exists.
        """
        self.c.modules.os.chdir('/tmp/source')
        if self.bootmanager_running():
            print("   BootManager is already running: try again soon...")
            return

        print("   Starting 'BootManager.py %s' on %s " % (forceState, self.node))
        cmd = ("( touch /tmp/BM_RUNNING ;  "
               "  python ./BootManager.py %s &> server.log < /dev/null ; "
               "  rm -f /tmp/BM_RUNNING "
               ") &")
        self.c.modules.os.system(cmd % forceState)
267
268
class PlanetLabSession:
    """An ssh-tunnelled Rpyc session to one node.

    Construction allocates a local port and, unless ``nosetup`` is true,
    copies the Rpyc files to the node, starts the Rpyc server there and
    opens an ssh tunnel from the local port to port 18812 on the node.

    Attributes:
      port    -- local port allocated for this session.
      command -- the running ssh tunnel process, or None.
      timeout -- seconds slept after the tunnel reported READY (twice the
                 measured setup time); 0 if the tunnel was never set up.
    """

    # Next local port to hand out; starts at a random offset and grows by
    # one per session.
    globalport = 22000 + int(random.random()*1000)

    def __init__(self, node, nosetup, verbose):
        self.verbose = verbose
        self.node = node
        self.port = None
        self.nosetup = nosetup
        self.command = None
        # Defined up front so that code reading session.timeout works even
        # when setup was skipped.
        self.timeout = 0
        self.setup_host()

    def get_connection(self, config):
        """Return a NodeConnection over this session's local port."""
        try:
            print("SocketConnection(localhost, %s" % self.port)
            sc = SocketConnection("localhost", self.port)
            print("NodeConnection(%s, %s)" % (sc, self.node))
            conn = NodeConnection(sc, self.node, config)
        except Exception:
            # NOTE: try twice since this can sometimes fail the first time. If
            #       it fails again, let it go.
            conn = NodeConnection(SocketConnection("localhost", self.port), self.node, config)
        return conn

    def setup_host(self):
        """Allocate a port and bring up the Rpyc server and ssh tunnel.

        Raises ExceptionDoubleSSHError if the rsync to the node fails twice,
        and Exception if the tunnel does not report READY.
        """
        self.port = PlanetLabSession.globalport
        PlanetLabSession.globalport = PlanetLabSession.globalport + 1

        # Checked before anything else is looked up: with nosetup only the
        # port allocation above is wanted.
        if self.nosetup:
            print("Skipping setup")
            return

        args = {
            'port': self.port,
            'user': 'root',
            'hostname': self.node,
            'monitordir': config.MONITOR_SCRIPT_ROOT,
        }
        ssh_port = 22

        # COPY Rpyc files to host
        cmd = """rsync -vvv -az -e "ssh -o BatchMode=yes" %(monitordir)s/monitor/Rpyc/ %(user)s@%(hostname)s:Rpyc""" % args
        print(cmd)
        timeout = 120
        localos = moncommands.CMD()

        ret = localos.system(cmd, timeout)
        print(ret)
        if ret != 0:
            # Assume the failure is a changed host key: refresh it and retry
            # once.
            print("\tUNKNOWN SSH KEY FOR %s; making an exception" % self.node)
            k = SSHKnownHosts()
            k.updateDirect(self.node)
            k.write()
            del k
            print("trying:  %s" % (cmd,))
            print(["%s=%s" % (a, os.environ[a]) for a in os.environ.keys() if 'SSH' in a])
            ret = localos.system(cmd, timeout)
            print(ret)
            if ret != 0:
                print("\tFAILED TWICE")
                raise ExceptionDoubleSSHError("Failed twice trying to login with updated ssh host key")

        t1 = time.time()
        # KILL any already running servers, then start a fresh one.
        # ("\\EOF" is the same string as the historical "\EOF", without the
        # invalid-escape warning.  The closing EOF must stay in column 0.)
        ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port)
        (ov, ev) = ssh.run_noexcept2("""<<\\EOF
            rm -f out.log
            echo "kill server" >> out.log
                        netstat -ap | grep python | grep 18812 | awk '{print $7}' | awk -F / '{print $1}' | xargs kill
            ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ; 
            echo "export" >> out.log
            export PYTHONPATH=$HOME  ;
            echo "start server" >> out.log
            python Rpyc/Servers/forking_server.py &> server.log &
            echo "done" >> out.log
EOF""")
        print("setup rpyc server over ssh")
        print(ssh.ret)

        # This was tricky to make synchronous.  The combination of ssh-clients-4.7p1,
        # and the following options seems to work well.
        cmd = ("""ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """
               """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """
               """-o ConnectTimeout=120 """
               """-n -N -L %(port)s:localhost:18812 """
               """%(user)s@%(hostname)s""") % args
        print(cmd)
        self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
        # read_t is used instead of a plain read(), which may block
        # indefinitely; presumably the second argument bounds the wait --
        # TODO confirm against moncommands.read_t.
        ret = moncommands.read_t(self.command.stdout, 5)

        t2 = time.time()
        if 'READY' in ret:
            # NOTE: There is still a slight race for machines that are slow...
            self.timeout = 2*(t2-t1)
            print("Sleeping for %s sec" % self.timeout)
            time.sleep(self.timeout)
            return

        # NOTE(review): returncode is read without an explicit poll(); check
        # that Sopen keeps it up to date.
        if self.command.returncode is not None:
            print("Failed to establish tunnel!")
            raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode))

        raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'")

    def __del__(self):
        # Tear the tunnel down together with the session object.
        if self.command:
            print("Killing SSH session %s" % self.port)
            self.command.kill()
384
385         
def steps_to_list(steps, index=1):
    """Return element `index` of each entry in steps, in order.

    With the default index of 1 this is the pattern column of a list of
    (id, pattern) pairs.
    """
    return [step[index] for step in steps]
388
def index_to_id(steps, index):
    """Return the id (first element) of steps[index].

    An index at or past the end of steps yields "done".
    """
    if index >= len(steps):
        return "done"
    return steps[index][0]
394
class DebugInterface:
    """Connects to a node and turns its logs into sequences of step ids.

    hostname -- the node to work on.
    session  -- the PlanetLabSession created by getConnection(), or None.
    """

    def __init__(self, hostname):
        self.hostname = hostname
        self.session = None

    def getConnection(self):
        """Open a session to the node and return a NodeConnection.

        Returns False (not None) on any failure, so callers must compare the
        result against a connection object rather than test it for None.
        """
        print("Creating session for %s" % self.hostname)
        # update known_hosts file (in case the node has rebooted since last run)
        try:
            k = SSHKnownHosts()
            k.update(self.hostname)
            k.write()
            del k
        except Exception:
            email_exception()
            traceback.print_exc()
            return False

        msg = "ERROR setting up session for %s" % self.hostname
        try:
            if config is None:
                self.session = PlanetLabSession(self.hostname, False, True)
            else:
                self.session = PlanetLabSession(self.hostname, config.nosetup, config.verbose)
        except ExceptionDoubleSSHError:
            print(msg)
            return False
        except Exception:
            traceback.print_exc()
            email_exception(msg)
            return False

        print("Getting connection: 1st try")
        try:
            conn = self.session.get_connection(config)
        except EOFError:
            # NOTE: sometimes the wait in setup_host() is not long enough.
            # So, here we try to wait a little longer before giving up entirely.
            try:
                print("Getting connection: 2nd try")
                # The session only has a measured timeout when it actually
                # set up the tunnel; fall back to no extra wait otherwise.
                time.sleep(getattr(self.session, 'timeout', 0)*5)
                conn = self.session.get_connection(config)
            except EOFError:
                # failed twice... no need to report this really, it's just in a
                # weird state...
                print("Getting connection: failed")
                email_exception(self.hostname, "failed twice to get connection")
                return False
            except Exception:
                traceback.print_exc()
                email_exception(self.hostname)
                return False
        print("Getting connection: ok")
        return conn

    def getSequences(self):
        """Return a dict mapping every stored BM sequence to its action.

        NOTE: The DB is now the authoritative record for all BM sequences.
              An admin can introduce new patterns and actions without
              touching code.
        """
        sequences = {}
        for record in BootmanSequenceRecord.query.all():
            sequences[record.sequence] = record.action
        return sequences

    def getDiskSteps(self):
        """Return (id, regex) pairs describing disk errors seen in dmesg.

        The patterns are regular expressions.  Sample lines they are meant
        to match:

          hda: dma_intr: status=0x51 { DriveReady SeekComplete Error }
          hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263
          floppy0: floppy timeout called
          end_request: I/O error, dev fd0, sector 0
          Buffer I/O error on device dm-2, logical block 8888896
          ata1: status=0x51 { DriveReady SeekComplete Error }
          ata1: error=0x40 { UncorrectableError }
          SCSI error : <0 0 0 0> return code = 0x8000002
          sda: Current: sense key: Medium Error
              Additional sense: Unrecovered read error - auto reallocate failed
          SCSI error : <0 2 0 0> return code = 0x40001
          end_request: I/O error, dev sda, sector 572489600
        """
        steps = [
            ('scsierror2' , r'sd \d:\d:\d:\d: ioctl_internal_command return code = \d+'),
            ('scsierror'  , r'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
            ('ioerror'    , r'end_request: I/O error, dev sd\w+, sector \d+'),
            ('ccisserror' , r'cciss: cmd \w+ has CHECK CONDITION'),

            ('buffererror', r'Buffer I/O error on device dm-\d, logical block \d+'),

            ('hdaseekerror', r'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'),
            ('hdacorrecterror', r'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'),

            ('atareadyerror'   , r'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
            ('atacorrecterror' , r'ata\d+: error=0x\d+ { UncorrectableError }'),

            ('sdXerror'   , r'sd\w: Current: sense key: Medium Error'),
            # The parentheses are literal text in the kernel message, so
            # they must be escaped to keep them from forming a regex group.
            ('ext3error'   , r'EXT3-fs error \(device dm-\d+\): ext3_find_entry: reading directory #\d+ offset \d+'),

            ('floppytimeout', r'floppy0: floppy timeout called'),
            ('floppyerror',  r'end_request: I/O error, dev fd\w+, sector \d+'),
        ]
        return steps

    def getDiskSequence(self, steps, child):
        """Return the ids of the steps matched in child's output, in order.

        child is a pexpect-style object; it is read until EOF, which appends
        the final id "done".
        """
        # EOF is appended last, so its index is len(steps) and maps to "done".
        patterns = steps_to_list(steps) + [ pexpect.EOF ]
        sequence = []
        while True:
            step_id = index_to_id(steps, child.expect(patterns))
            sequence.append(step_id)

            if step_id == "done":
                break
        return sequence

    def getBootManagerStepPatterns(self):
        """Return (id, regex) pairs for messages found in a BootManager log."""
        steps = [
            ('bminit'       , 'Initializing the BootManager.'),
            ('cfg'          , 'Reading node configuration file.'),
            ('auth'         , 'Authenticating node with PLC.'),
            ('getplc'       , 'Retrieving details of node from PLC.'),
            ('update'       , 'Updating node boot state at PLC.'),
            ('hardware'     , 'Checking if hardware requirements met.'),
            ('installinit'  , 'Install: Initializing.'),
            ('installdisk'  , 'Install: partitioning disks.'),
            ('installbootfs', 'Install: bootstrapfs tarball.'),
            ('installcfg'   , 'Install: Writing configuration files.'),
            ('installstop'  , 'Install: Shutting down installer.'),
            ('update2'      , 'Updating node boot state at PLC.'),
            ('installinit2' , 'Install: Initializing.'),
            ('validate'     , 'Validating node installation.'),
            ('rebuildinitrd', 'Rebuilding initrd'),
            ('netcfg'       , 'Install: Writing Network Configuration files.'),
            ('update3'      , 'Updating node configuration.'),
            ('disk'         , 'Checking for unused disks to add to LVM.'),
            ('update4'      , 'Sending hardware configuration to PLC.'),
            ('debug'        , 'Starting debug mode'),
            ('bmexceptmount', 'BootManagerException during mount'),
            ('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'),
            ('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
            ('exception'    , 'Exception'),
            ('nocfg'        , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
            ('protoerror2'  , '500 Internal Server Error'),
            ('protoerror'   , 'XML RPC protocol error'),
            ('nodehostname' , 'Configured node hostname does not resolve'),
            ('implementerror', 'Implementation Error'),
            ('fsckabort'    , 'is mounted.  e2fsck: Cannot continue, aborting'),
            ('fsckfail'     , 'Running e2fsck -v -p /dev/planetlab/root failed'),
            ('fsckfail2'    , 'Running e2fsck -v -p /dev/planetlab/vservers failed'),
            ('readonlyfs'   , r'\[Errno 30\] Read-only file system'),
            ('baddisk'      , r"IOError: \[Errno 13\] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"),
            ('noinstall'    , 'notinstalled'),
            ('bziperror'    , 'bzip2: Data integrity error when decompressing.'),
            ('noblockdev'   , "No block devices detected."),
            ('missingkernel', "missingkernel"),
            ('dnserror'     , 'Name or service not known'),
            ('noparseconfig', "Found configuration file plnode.txt on floppy, but was unable to parse it"),
            ('noconfig'     , "Unable to find and read a node configuration file"),
            ('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
            ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
            ('hardwarerequirefail' , 'Hardware requirements not met'),
            ('mkfsfail'     , 'while running: Running mkfs.ext2 -q  -m 0 -j /dev/planetlab/vservers failed'),
            ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
            ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
            ('chrootfail'   , 'Running chroot /tmp/mnt/sysimg'),
            ('modulefail'   , 'Unable to get list of system modules'),
            ('writeerror'   , 'write error: No space left on device'),
            ('nospace'      , "No space left on device"),
            ('nonode'       , 'Failed to authenticate call: No such node'),
            ('authfail'     , 'Failed to authenticate call: Call could not be authenticated'),
            ('authfail2'    , 'Authentication Failed'),
            ('bootcheckfail'  , 'BootCheckAuthentication'),
            ('bootupdatefail' , 'BootUpdateNode'),
        ]
        return steps

    def getBootManagerSequenceFromLog(self, steps, child):
        """Return the ids of the steps matched in a BootManager log, in order.

        child is a pexpect-style object; it is read until EOF, which appends
        the final id "done".  A notice is printed for every "exception" step.
        """
        # EOF is appended last, so its index is len(steps) and maps to "done".
        patterns = steps_to_list(steps) + [ pexpect.EOF ]
        sequence = []
        while True:
            step_id = index_to_id(steps, child.expect(patterns))
            sequence.append(step_id)

            if step_id == "exception":
                print("...Found An Exception!!!")
            elif step_id == "done":
                break

        return sequence
590                 
def restore(sitehist, hostname, config=None, forced_action=None):
    """Run restore_basic() for hostname, flush the DB session, and return
    whatever restore_basic() returned."""
    outcome = restore_basic(sitehist, hostname, config, forced_action)
    session.flush()
    return outcome
595
596 def restore_basic(sitehist, hostname, config=None, forced_action=None):
597
598         # NOTE: Nothing works if the bootcd is REALLY old.
599         #       So, this is the first step.
600
601         bootman_action = "unknown"
602
603         fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
604         recent_actions = sitehist.getRecentActions(hostname=hostname)
605
606         if fbnode['observed_category'] == "OLDBOOTCD":
607                 print "\t...Notify owner to update BootImage!!!"
608
609                 if not found_within(recent_actions, 'newbootcd_notice', 3.5):
610                         sitehist.sendMessage('newbootcd_notice', hostname=hostname)
611
612                         print "\tDisabling %s due to out-of-date BootImage" % hostname
613                         api.UpdateNode(hostname, {'boot_state' : 'disabled'})
614
615                 # NOTE: nothing else is possible.
616                 return "disabled"
617
618         debugnode = DebugInterface(hostname)
619         conn = debugnode.getConnection()
620         if type(conn) == type(False): return "connect_failed"
621
622         boot_state = conn.get_boot_state()
623         if boot_state != "debug":
624                 print "... %s in %s state: skipping..." % (hostname , boot_state)
625                 return "skipped" #boot_state == "boot"
626
627         if conn.bootmanager_running():
628                 print "...BootManager is currently running.  Skipping host %s" %hostname 
629                 return "skipped" # True
630
631         # Read persistent flags, tagged on one week intervals.
632
633         if config and not config.quiet: print "...downloading dmesg from %s" %hostname 
634         dmesg = conn.get_dmesg()
635         child = fdpexpect.fdspawn(dmesg)
636
637         steps = debugnode.getDiskSteps()
638         sequence = debugnode.getDiskSequence(steps, child)
639
640         s = Set(sequence)
641         if config and not config.quiet: print "\tSET: ", s
642
643         if len(s) > 1:
644                 print "...Potential drive errors on %s" % hostname 
645                 if len(s) == 2 and 'floppyerror' in s:
646                         print "...Should investigate.  Continuing with node."
647                 else:
648                         print "...Should investigate.  Skipping node."
649                         # TODO: send message related to these errors.
650
651                         if not found_within(recent_actions, 'baddisk_notice', 7):
652                                 print "baddisk_notice not found recently"
653
654                                 log=conn.get_dmesg().read()
655                                 sitehist.sendMessage('baddisk_notice', hostname=hostname, log=log)
656                                 return "skipping_baddisk"
657                         else:
658                                 # NOTE: "" does not add a new action record
659                                 return ""
660
661
662         print "...Downloading bm.log from %s" %hostname 
663         log = conn.get_bootmanager_log()
664         bm_log_data = log.read() # get data
665         log.seek(0)     # reset fd pointer for fdspawn
666         child = fdpexpect.fdspawn(log)
667
668         if hasattr(config, 'collect') and config.collect: return "collect"
669
670         if config and not config.quiet: print "...Scanning bm.log for errors"
671
672         time.sleep(1)
673
674         steps = debugnode.getBootManagerStepPatterns()
675         sequence = debugnode.getBootManagerSequenceFromLog(steps, child)
676                 
677         s = "-".join(sequence)
678         print "   FOUND SEQUENCE: ", s
679
680         # NOTE: We get or set the flag based on the current sequence identifier.
681         #  By using the sequence identifier, we guarantee that there will be no
682         #  frequent loops.  I'm guessing there is a better way to track loops,
683         #  though.
684
685         sequences = debugnode.getSequences()
686         flag_set = True
687         
688         if s not in sequences:
689                 print "   HOST %s" % hostname
690                 print "   UNKNOWN SEQUENCE: %s" % s
691
692                 args = {}
693                 args['hostname'] = hostname
694                 args['sequence'] = s
695                 args['bmlog'] = bm_log_data
696                 args['viart'] = False
697                 args['saveact'] = True
698                 args['ccemail'] = True
699
700                 if 'nospace' in s:
701                         # NOTE: sequence is unknown and contains nospace, so try the
702                         # fprobe repair trick first.
703                         conn.fprobe_repair_node()
704
705                 sitehist.sendMessage('unknownsequence_notice', **args)
706                 conn.restart_bootmanager('boot')
707                 bootman_action = "restart_bootmanager"
708
709                 # NOTE: Do not set the pflags value for this sequence if it's unknown.
710                 # This way, we can check it again after we've fixed it.
711                 flag_set = False
712
713         else:
714                 bootman_action = sequences[s]
715
716                 if   sequences[s] == "restart_bootmanager_boot":
717                         print "...Restarting BootManager.py on %s "%hostname 
718                         conn.restart_bootmanager('boot')
719                 elif sequences[s] == "restart_bootmanager_rins":
720                         print "...Restarting BootManager.py on %s "%hostname 
721                         conn.restart_bootmanager('reinstall')
722                 elif sequences[s] == "restart_node_rins":
723                         conn.restart_node('reinstall')
724                 elif sequences[s] == "restart_node_boot":
725                         conn.restart_node('boot')
726                 elif sequences[s] == "fsck_repair":
727                         conn.fsck_repair_node()
728                 elif sequences[s] == "repair_node_keys":
729                         if conn.compare_and_repair_nodekeys():
730                                 # the keys either are in sync or were forced in sync.
731                                 # so try to start BM again.
732                                 conn.restart_bootmanager(conn.get_nodestate())
733                         else:
734                                 # there was some failure to synchronize the keys.
735                                 print "...Unable to repair node keys on %s" %hostname 
736                                 if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
737                                         args = {}
738                                         args['hostname'] = hostname
739                                         sitehist.sendMessage('nodeconfig_notice', **args)
740                                         conn.dump_plconf_file()
741                                 else:
742                                         # NOTE: do not add a new action record
743                                         return ""
744
745                 elif sequences[s] == "unknownsequence_notice":
746                         args = {}
747                         args['hostname'] = hostname
748                         args['sequence'] = s
749                         args['bmlog'] = bm_log_data
750                         args['viart'] = False
751                         args['saveact'] = True
752                         args['ccemail'] = True
753
754                         sitehist.sendMessage('unknownsequence_notice', **args)
755                         conn.restart_bootmanager('boot')
756
757                 elif sequences[s] == "nodeconfig_notice":
758
759                         if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
760                                 args = {}
761                                 args['hostname'] = hostname
762                                 sitehist.sendMessage('nodeconfig_notice', **args)
763                                 conn.dump_plconf_file()
764                         else:
765                                 # NOTE: do not add a new action record
766                                 return ""
767
768                 elif sequences[s] == "nodenetwork_email":
769
770                         if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
771                                 args = {}
772                                 args['hostname'] = hostname
773                                 args['bmlog'] = bm_log_data
774                                 sitehist.sendMessage('nodeconfig_notice', **args)
775                                 conn.dump_plconf_file()
776                         else:
777                                 # NOTE: do not add a new action record
778                                 return ""
779
780                 elif sequences[s] == "noblockdevice_notice":
781
782                         if not found_within(recent_actions, 'noblockdevice_notice', 3.5):
783                                 args = {}
784                                 #args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
785                                 args['hostname'] = hostname
786                         
787                                 sitehist.sendMessage('noblockdevice_notice', **args)
788                         else:
789                                 # NOTE: do not add a new action record
790                                 return ""
791
792                 elif sequences[s] == "baddisk_notice":
793                         # MAKE An ACTION record that this host has failed hardware.  May
794                         # require either an exception "/minhw" or other manual intervention.
795                         # Definitely need to send out some more EMAIL.
796                         # TODO: email notice of broken hardware
797                         if not found_within(recent_actions, 'baddisk_notice', 7):
798                                 print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
799                                 args = {}
800                                 args['hostname'] = hostname
801                                 args['log'] = conn.get_dmesg().read()
802
803                                 sitehist.sendMessage('baddisk_notice', **args)
804                                 #conn.set_nodestate('disabled')
805                         else:
806                                 # NOTE: do not add a new action record
807                                 return ""
808
809                 elif sequences[s] == "minimalhardware_notice":
810                         if not found_within(recent_actions, 'minimalhardware_notice', 7):
811                                 print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
812                                 args = {}
813                                 args['hostname'] = hostname
814                                 args['bmlog'] = bm_log_data
815                                 sitehist.sendMessage('minimalhardware_notice', **args)
816                         else:
817                                 # NOTE: do not add a new action record
818                                 return ""
819
820                 elif sequences[s] == "baddns_notice":
821                         if not found_within(recent_actions, 'baddns_notice', 1):
822                                 print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
823                                 args = {}
824                                 try:
825                                         node = plccache.GetNodeByName(hostname)
826                                         net = api.GetInterfaces(node['interface_ids'])[0]
827                                 except:
828                                         email_exception()
829                                         print traceback.print_exc()
830                                         # TODO: api error. skip email, b/c all info is not available,
831                                         # flag_set will not be recorded.
832                                         return "exception"
833                                 nodenet_str = network_config_to_str(net)
834
835                                 args['hostname'] = hostname
836                                 args['network_config'] = nodenet_str
837                                 args['interface_id'] = net['interface_id']
838
839                                 sitehist.sendMessage('baddns_notice', **args)
840                         else:
841                                 # NOTE: do not add a new action record
842                                 return ""
843
844         return bootman_action
845         
846
if __name__ == "__main__":
        # This module is not meant to be run directly; point the user at the
        # wrapper script instead.  Only the base name of this file is used, so
        # the hint stays "commands/<name>.py" even when the module was started
        # through a relative or absolute directory path.
        module_name = os.path.splitext(os.path.basename(__file__))[0]
        print("ERROR: Can not execute module as a command! Please use commands/%s.py" % module_name)