#!/usr/bin/python

# Attempt to reboot a node in debug state.

import plc
api = plc.getAuthAPI()

import sys
import os
import const

from getsshkeys import SSHKnownHosts

import subprocess
import time
import database
import moncommands
from sets import Set

import ssh.pxssh as pxssh
import ssh.fdpexpect as fdpexpect
import ssh.pexpect as pexpect
from unified_model import *
from emailTxt import mailtxt
from nodeconfig import network_config_to_str
import traceback
import monitorconfig

import signal

# subprocess.Popen subclass that adds a kill() method; Popen.kill() is not
# available in the older Python versions this script targets.
class Sopen(subprocess.Popen):
        def kill(self, signal = signal.SIGTERM):
                os.kill(self.pid, signal)

from Rpyc import SocketConnection, Async
from Rpyc.Utils import *

def get_fbnode(node):
        fb = database.dbLoad("findbad")
        fbnode = fb['nodes'][node]['values']
        return fbnode

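# NodeConnection wraps an Rpyc connection into a node that has booted into the
# BootManager/debug environment.  It inspects the boot state, pulls dmesg and
# BootManager logs, compares the node key against PLC, and can restart either
# the node itself or the BootManager.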
class NodeConnection:
        def __init__(self, connection, node, config):
                self.node = node
                self.c = connection
                self.config = config

        def get_boot_state(self):
                if self.c.modules.os.path.exists('/tmp/source'):
                        return "dbg"
                elif self.c.modules.os.path.exists('/vservers'):
                        return "boot"
                else:
                        return "unknown"

        def get_dmesg(self):
                self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
                download(self.c, "/var/log/dmesg.bm.log", "log/dmesg.%s.log" % self.node)
                log = open("log/dmesg.%s.log" % self.node, 'r')
                return log

        def get_bootmanager_log(self):
                download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node)
                os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
                log = open("log/bm.%s.log" % self.node, 'r')
                return log

        def dump_plconf_file(self):
                c = self.c
                self.c.modules.sys.path.append("/tmp/source/")
                self.c.modules.os.chdir('/tmp/source')

                log = c.modules.BootManager.log('/tmp/new.log')
                bm = c.modules.BootManager.BootManager(log, 'boot')

                BootManagerException = c.modules.Exceptions.BootManagerException
                InitializeBootManager = c.modules.BootManager.InitializeBootManager
                ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
                bm_continue = True

                InitializeBootManager.Run(bm.VARS, bm.LOG)
                try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
                except Exception, x:
                        bm_continue = False
                        print "   ERROR:", x
                        print "   Possibly unable to find a valid configuration file"

                if bm_continue and self.config and not self.config.quiet:
                        for key in bm.VARS.keys():
                                print key, " == ", bm.VARS[key]
                else:
                        if self.config and not self.config.quiet: print "   Unable to read Node Configuration"

        def compare_and_repair_nodekeys(self):
                c = self.c
                self.c.modules.sys.path.append("/tmp/source/")
                self.c.modules.os.chdir('/tmp/source')

                log = c.modules.BootManager.log('/tmp/new.log')
                bm = c.modules.BootManager.BootManager(log, 'boot')

                BootManagerException = c.modules.Exceptions.BootManagerException
                InitializeBootManager = c.modules.BootManager.InitializeBootManager
                ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
                bm_continue = True

                plcnode = api.GetNodes({'hostname': self.node}, None)[0]

                InitializeBootManager.Run(bm.VARS, bm.LOG)
                try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
                except Exception, x:
                        bm_continue = False
                        print "exception"
                        print x
                        print "   Possibly unable to find a valid configuration file"

                if bm_continue:
                        print "   NODE: %s" % bm.VARS['NODE_KEY']
                        print "   PLC : %s" % plcnode['key']

                        if bm.VARS['NODE_KEY'] == plcnode['key']:
                                return True
                        else:
                                if api.UpdateNode(self.node, {'key': bm.VARS['NODE_KEY']}):
                                        print "   Successfully updated NODE_KEY with PLC"
                                        return True
                                else:
                                        return False

                        #for key in bm.VARS.keys():
                        #       print key, " == ", bm.VARS[key]
                else:
                        print "   Unable to retrieve NODE_KEY"

        def bootmanager_running(self):
                return self.c.modules.os.path.exists('/tmp/BM_RUNNING')

        def set_nodestate(self, state='boot'):
                return api.UpdateNode(self.node, {'boot_state' : state})

        def restart_node(self, state='boot'):
                api.UpdateNode(self.node, {'boot_state' : state})

                pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags')
                if not pflags.getRecentFlag('gentlekill'):
                        print "   Killing all slice processes... : %s" % self.node
                        cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
                        self.c.modules.os.system(cmd_slicekill)
                        cmd = """ shutdown -r +1 & """
                        print "   Restarting %s : %s" % (self.node, cmd)
                        self.c.modules.os.system(cmd)

                        pflags.setRecentFlag('gentlekill')
                        pflags.save()
                else:
                        print "   Restarting with sysrq 'sub' %s" % self.node
                        cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """
                        self.c.modules.os.system(cmd)

                return

        def restart_bootmanager(self, forceState):

                self.c.modules.os.chdir('/tmp/source')
                if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
                        print "   BootManager is already running: try again soon..."
                else:
                        print "   Starting 'BootManager.py %s' on %s " % (forceState, self.node)
                        cmd = "( touch /tmp/BM_RUNNING ;  " + \
                              "  python ./BootManager.py %s &> server.log < /dev/null ; " + \
                              "  rm -f /tmp/BM_RUNNING " + \
                              ") &"
                        cmd = cmd % forceState
                        self.c.modules.os.system(cmd)

                return


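# PlanetLabSession pushes the Rpyc library to the node over rsync/ssh, starts a
# forking Rpyc server there, and opens an ssh tunnel from a local port to that
# server so NodeConnection can drive the node remotely.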
import random
class PlanetLabSession:
        globalport = 22000 + int(random.random()*1000)

        def __init__(self, node, nosetup, verbose):
                self.verbose = verbose
                self.node = node
                self.port = None
                self.nosetup = nosetup
                self.command = None
                self.setup_host()

        def get_connection(self, config):
                return NodeConnection(SocketConnection("localhost", self.port), self.node, config)

        def setup_host(self):
                self.port = PlanetLabSession.globalport
                PlanetLabSession.globalport += 1

                args = {}
                args['port'] = self.port
                args['user'] = 'root'
                args['hostname'] = self.node
                args['monitordir'] = monitorconfig.MONITOR_SCRIPT_ROOT
                ssh_port = 22

                if self.nosetup:
                        print "Skipping setup"
                        return

                # COPY Rpyc files to host
                cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
                if self.verbose: print cmd
                # TODO: Add timeout
                timeout = 120
                localos = moncommands.CMD()

                ret = localos.system(cmd, timeout)
                print ret
                if ret != 0:
                        print "\tUNKNOWN SSH KEY FOR %s; making an exception" % self.node
                        #print "MAKE EXPLICIT EXCEPTION FOR %s" % self.node
                        k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k
                        ret = localos.system(cmd, timeout)
                        print ret
                        if ret != 0:
                                print "\tFAILED TWICE"
                                #sys.exit(1)
                                raise Exception("Failed twice trying to login with updated ssh host key")

                t1 = time.time()
                # KILL any already running servers.
                ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port)
                (ov, ev) = ssh.run_noexcept2("""<<\EOF
            rm -f out.log
            echo "kill server" >> out.log
            ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ;
            echo "export" >> out.log
            export PYTHONPATH=$HOME ;
            echo "start server" >> out.log
            python Rpyc/Servers/forking_server.py &> server.log &
            echo "done" >> out.log
EOF""")
                #cmd = """ssh %(user)s@%(hostname)s """ + \
                #        """'ps ax | grep Rpyc | grep -v grep | awk "{print \$1}" | xargs kill 2> /dev/null' """
                #cmd = cmd % args
                #if self.verbose: print cmd
                ## TODO: Add timeout
                #print localos.system(cmd,timeout)

                ## START a new rpyc server.
                #cmd = """ssh -n %(user)s@%(hostname)s "export PYTHONPATH=\$HOME; """ + \
                #        """python Rpyc/Servers/forking_server.py &> server.log < /dev/null &" """
                #cmd = cmd % args
                #if self.verbose: print cmd
                #print localos.system(cmd,timeout)
                print ssh.ret

                # TODO: Add timeout
                # This was tricky to make synchronous.  The combination of ssh-clients-4.7p1
                # and the following options seems to work well.
                cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \
                      """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \
                      """-o ConnectTimeout=120 """ + \
                      """-n -N -L %(port)s:localhost:18812 """ + \
                      """%(user)s@%(hostname)s"""
                cmd = cmd % args
                if self.verbose: print cmd
                self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
                # TODO: the read() here may block indefinitely.  We need a better
                # approach that includes a timeout.
                #ret = self.command.stdout.read(5)
                ret = moncommands.read_t(self.command.stdout, 5)

                t2 = time.time()
                if 'READY' in ret:
                        # NOTE: There is still a slight race for machines that are slow...
                        self.timeout = 2*(t2-t1)
                        print "Sleeping for %s sec" % self.timeout
                        time.sleep(self.timeout)
                        return

                if self.command.returncode is not None:
                        print "Failed to establish tunnel!"
                        raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode))

                raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'")

        def __del__(self):
                if self.command:
                        if self.verbose: print "Killing SSH session %s" % self.port
                        self.command.kill()


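# Helpers for the pexpect scanning loops below: steps_to_list() extracts the
# pattern strings handed to expect(), and index_to_id() maps the matched index
# back to its step identifier ("done" once EOF is reached).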
def steps_to_list(steps):
        ret_list = []
        for (id, label) in steps:
                ret_list.append(label)
        return ret_list

def index_to_id(steps, index):
        if index < len(steps):
                return steps[index][0]
        else:
                return "done"

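# reboot() drives one debug-state node end to end: refuse nodes with an
# out-of-date BootCD, build a PlanetLabSession/NodeConnection, scan dmesg for
# disk errors, scan bm.log into a dash-separated sequence of step identifiers,
# and dispatch on that sequence (restart BootManager, reinstall, repair keys,
# or notify the site), recording persistent flags to avoid repeating actions.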
def reboot(hostname, config=None, forced_action=None):

        # NOTE: Nothing works if the bootcd is REALLY old.
        #       So, this is the first step.
        fbnode = get_fbnode(hostname)
        if fbnode['category'] == "OLDBOOTCD":
                print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
                args = {}
                args['hostname_list'] = "    %s" % hostname

                m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname,
                                   mailtxt.newbootcd_one[1] % args, True, db='bootcd_persistmessages')

                loginbase = plc.siteId(hostname)
                m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])

                print "\tDisabling %s due to out-of-date BOOTCD" % hostname
                api.UpdateNode(hostname, {'boot_state' : 'disable'})
                return True

        node = hostname
        print "Creating session for %s" % node
        # update known_hosts file (in case the node has rebooted since last run)
        if config and not config.quiet: print "...updating known_hosts ssh-rsa key for %s" % node
        try:
                k = SSHKnownHosts(); k.update(node); k.write(); del k
        except:
                traceback.print_exc()
                return False

        try:
                if config is None:
                        session = PlanetLabSession(node, False, True)
                else:
                        session = PlanetLabSession(node, config.nosetup, config.verbose)
        except Exception, e:
                print "ERROR setting up session for %s" % hostname
                traceback.print_exc()
                print e
                return False

        try:
                conn = session.get_connection(config)
        except EOFError:
                # NOTE: sometimes the wait in setup_host() is not long enough.
                # So, here we try to wait a little longer before giving up entirely.
                try:
                        time.sleep(session.timeout*4)
                        conn = session.get_connection(config)
                except:
                        traceback.print_exc()
                        return False

        if forced_action == "reboot":
                conn.restart_node('rins')
                return True

        boot_state = conn.get_boot_state()
        if boot_state == "boot":
                print "...Boot state of %s already completed : skipping..." % node
                return True
        elif boot_state == "unknown":
                print "...Unknown bootstate for %s : skipping..." % node
                return False
        else:
                pass

        if conn.bootmanager_running():
                print "...BootManager is currently running.  Skipping host %s" % node
                return True

        #if config != None:
        #       if config.force:
        #               conn.restart_bootmanager(config.force)
        #               return True

        # Read persistent flags, tagged on three-day intervals.
        pflags = PersistFlags(hostname, 3*60*60*24, db='debug_persistflags')

        if config and not config.quiet: print "...downloading dmesg from %s" % node
        dmesg = conn.get_dmesg()
        child = fdpexpect.fdspawn(dmesg)

        sequence = []
        while True:
                steps = [
                        ('scsierror'      , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
                        ('ioerror'        , 'end_request: I/O error, dev sd\w+, sector \d+'),
                        ('ccisserror'     , 'cciss: cmd \w+ has CHECK CONDITION  byte \w+ = \w+'),

                        ('buffererror'    , 'Buffer I/O error on device dm-\d, logical block \d+'),

                        ('hdaseekerror'   , 'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'),
                        ('hdacorrecterror', 'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'),

                        ('atareadyerror'  , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
                        ('atacorrecterror', 'ata\d+: error=0x\d+ { UncorrectableError }'),

                        ('sdXerror'       , 'sd\w: Current: sense key: Medium Error'),
                        ('ext3error'      , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),

                        ('floppytimeout'  , 'floppy0: floppy timeout called'),
                        ('floppyerror'    , 'end_request: I/O error, dev fd\w+, sector \d+'),

                        # hda: dma_intr: status=0x51 { DriveReady SeekComplete Error }
                        # hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263

                        # floppy0: floppy timeout called
                        # end_request: I/O error, dev fd0, sector 0

                        # Buffer I/O error on device dm-2, logical block 8888896
                        # ata1: status=0x51 { DriveReady SeekComplete Error }
                        # ata1: error=0x40 { UncorrectableError }
                        # SCSI error : <0 0 0 0> return code = 0x8000002
                        # sda: Current: sense key: Medium Error
                        #       Additional sense: Unrecovered read error - auto reallocate failed

                        # SCSI error : <0 2 0 0> return code = 0x40001
                        # end_request: I/O error, dev sda, sector 572489600
                ]
                id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
                sequence.append(id)

                if id == "done":
                        break

        s = Set(sequence)
        if config and not config.quiet: print "\tSET: ", s

        if len(s) > 1:
                print "...Potential drive errors on %s" % node
                if len(s) == 2 and 'floppyerror' in s:
                        print "...Should investigate.  Continuing with node."
                else:
                        print "...Should investigate.  Skipping node."
                        # TODO: send message related to these errors.
                        args = {}
                        args['hostname'] = hostname
                        args['log'] = conn.get_dmesg().read()

                        m = PersistMessage(hostname, mailtxt.baddisk[0] % args,
                                           mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')

                        loginbase = plc.siteId(hostname)
                        m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
                        conn.set_nodestate('disable')
                        return False

        print "...Downloading bm.log from %s" % node
        log = conn.get_bootmanager_log()
        child = fdpexpect.fdspawn(log)

        try:
                if config.collect: return True
        except:
                pass

        time.sleep(1)

        if config and not config.quiet: print "...Scanning bm.log for errors"
        action_id = "dbg"
        sequence = []
        while True:

                steps = [
                        ('bminit'         , 'Initializing the BootManager.'),
                        ('cfg'            , 'Reading node configuration file.'),
                        ('auth'           , 'Authenticating node with PLC.'),
                        ('getplc'         , 'Retrieving details of node from PLC.'),
                        ('update'         , 'Updating node boot state at PLC.'),
                        ('hardware'       , 'Checking if hardware requirements met.'),
                        ('installinit'    , 'Install: Initializing.'),
                        ('installdisk'    , 'Install: partitioning disks.'),
                        ('installbootfs'  , 'Install: bootstrapfs tarball.'),
                        ('installcfg'     , 'Install: Writing configuration files.'),
                        ('installstop'    , 'Install: Shutting down installer.'),
                        ('update2'        , 'Updating node boot state at PLC.'),
                        ('installinit2'   , 'Install: Initializing.'),
                        ('validate'       , 'Validating node installation.'),
                        ('rebuildinitrd'  , 'Rebuilding initrd'),
                        ('netcfg'         , 'Install: Writing Network Configuration files.'),
                        ('update3'        , 'Updating node configuration.'),
                        ('disk'           , 'Checking for unused disks to add to LVM.'),
                        ('update4'        , 'Sending hardware configuration to PLC.'),
                        ('debug'          , 'Starting debug mode'),
                        ('bmexceptmount'  , 'BootManagerException during mount'),
                        ('bmexceptvgscan' , 'BootManagerException during vgscan/vgchange'),
                        ('bmexceptrmfail' , 'Unable to remove directory tree: /tmp/mnt'),
                        ('exception'      , 'Exception'),
                        ('nocfg'          , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
                        ('protoerror'     , 'XML RPC protocol error'),
                        ('nodehostname'   , 'Configured node hostname does not resolve'),
                        ('implementerror' , 'Implementation Error'),
                        ('readonlyfs'     , '[Errno 30] Read-only file system'),
                        ('noinstall'      , 'notinstalled'),
                        ('bziperror'      , 'bzip2: Data integrity error when decompressing.'),
                        ('noblockdev'     , "No block devices detected."),
                        ('dnserror'       , 'Name or service not known'),
                        ('downloadfail'   , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
                        ('disktoosmall'   , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
                        ('hardwarerequirefail' , 'Hardware requirements not met'),
                        ('mkfsfail'       , 'while running: Running mkfs.ext2 -q  -m 0 -j /dev/planetlab/vservers failed'),
                        ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
                        ('chrootfail'     , 'Running chroot /tmp/mnt/sysimg'),
                        ('modulefail'     , 'Unable to get list of system modules'),
                        ('writeerror'     , 'write error: No space left on device'),
                        ('nospace'        , "No space left on device"),
                        ('nonode'         , 'Failed to authenticate call: No such node'),
                        ('authfail'       , 'Failed to authenticate call: Call could not be authenticated'),
                        ('bootcheckfail'  , 'BootCheckAuthentication'),
                        ('bootupdatefail' , 'BootUpdateNode'),
                ]
                step_list = steps_to_list(steps)
                index = child.expect( step_list + [ pexpect.EOF ])
                id = index_to_id(steps, index)
                sequence.append(id)

                if id == "exception":
                        if config and not config.quiet: print "...Found An Exception!!!"
                elif index == len(step_list):
                        #print "Reached EOF"
                        break

        s = "-".join(sequence)
        print "   FOUND SEQUENCE: ", s

        # NOTE: We get or set the flag based on the current sequence identifier.
        #  By using the sequence identifier, we guarantee that there will be no
        #  frequent loops.  I'm guessing there is a better way to track loops,
        #  though.
        if not (config and config.force) and pflags.getRecentFlag(s):
                pflags.setRecentFlag(s)
                pflags.save()
                print "... flag is set or it has already run recently. Skipping %s" % node
                return True

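        # Map known bm.log step sequences to the corrective action to take.
        # Sequences not listed here fall through to the "unknown sequence"
        # handler below.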
        sequences = {}

        # restart_bootmanager_boot
        for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
                  "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
                  "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",
                  "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done",
                  "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
                  "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
                  "bminit-cfg-auth-getplc-update-debug-done",
                  "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
                  "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
                  "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
                  "bminit-cfg-auth-protoerror-exception-update-debug-done",
                  "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
                  "bminit-cfg-auth-getplc-implementerror-update-debug-done",
                  ]:
                sequences.update({n : "restart_bootmanager_boot"})

        #       conn.restart_bootmanager('rins')
        for n in ["bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
                  "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
                  "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
                  "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
                  "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
                  "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done",
                  "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
                  "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
                  "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
                  "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
                  "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
                  "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
                  "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
                  "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
                  ]:
                sequences.update({n : "restart_bootmanager_rins"})

        # repair_node_keys
        sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"})

        #   conn.restart_node('rins')
        for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
                  "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
                  "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
                  "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done",
                  "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
                  "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
                  "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done",
                  "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done",
                  "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
                  "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
                  "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
                  "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
                  ]:
                sequences.update({n : "restart_node_rins"})

        #       restart_node_boot
        for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done",
                  "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
                  "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
                  "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
                  "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
                  ]:
                sequences.update({n: "restart_node_boot"})

        # update_node_config_email
        for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
                  "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
                  "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
                  ]:
                sequences.update({n : "update_node_config_email"})

        for n in ["bminit-cfg-exception-nodehostname-update-debug-done",
                  "bminit-cfg-update-exception-nodehostname-update-debug-done",
                  ]:
                sequences.update({n : "nodenetwork_email"})

        # update_bootcd_email
        for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
                  "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
                  "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
                  "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
                  "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
                  ]:
                sequences.update({n : "update_bootcd_email"})

        for n in ["bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
                  ]:
                sequences.update({n: "suspect_error_email"})

        # update_hardware_email
        sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
        sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"})

        # broken_hardware_email
        sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})

        # bad_dns_email
        for n in ["bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
                  "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
                  ]:
                sequences.update( { n : "bad_dns_email"})

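        # Dispatch on the observed sequence.  Unknown sequences are reported to
        # monitor-list and restarted into 'boot' without setting the persist
        # flag, so the node is re-examined on the next pass.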
        flag_set = True

        if s not in sequences:
                print "   HOST %s" % hostname
                print "   UNKNOWN SEQUENCE: %s" % s

                args = {}
                args['hostname'] = hostname
                args['sequence'] = s
                args['bmlog'] = conn.get_bootmanager_log().read()
                m = PersistMessage(hostname, mailtxt.unknownsequence[0] % args,
                                   mailtxt.unknownsequence[1] % args, False, db='unknown_persistmessages')
                m.reset()
                m.send(['monitor-list@lists.planet-lab.org'])

                conn.restart_bootmanager('boot')

                # NOTE: Do not set the pflags value for this sequence if it's unknown.
                # This way, we can check it again after we've fixed it.
                flag_set = False

        else:

                if   sequences[s] == "restart_bootmanager_boot":
                        if config and not config.quiet: print "...Restarting BootManager.py on %s " % node
                        conn.restart_bootmanager('boot')
                elif sequences[s] == "restart_bootmanager_rins":
                        if config and not config.quiet: print "...Restarting BootManager.py on %s " % node
                        conn.restart_bootmanager('rins')
                elif sequences[s] == "restart_node_rins":
                        conn.restart_node('rins')
                elif sequences[s] == "restart_node_boot":
                        conn.restart_node('boot')
                elif sequences[s] == "repair_node_keys":
                        if conn.compare_and_repair_nodekeys():
                                # the keys either are in sync or were forced into sync,
                                # so try to reboot the node again.
                                conn.restart_bootmanager('rins')
                        else:
                                # there was some failure to synchronize the keys.
                                print "...Unable to repair node keys on %s" % node

                elif sequences[s] == "suspect_error_email":
                        args = {}
                        args['hostname'] = hostname
                        args['sequence'] = s
                        args['bmlog'] = conn.get_bootmanager_log().read()
                        m = PersistMessage(hostname, "Suspicious error from BootManager on %(hostname)s" % args,
                                           mailtxt.unknownsequence[1] % args, False, db='suspect_persistmessages')
                        m.reset()
                        m.send(['monitor-list@lists.planet-lab.org'])

                        conn.restart_bootmanager('boot')

                elif sequences[s] == "update_node_config_email":
                        print "...Sending message to UPDATE NODE CONFIG"
                        args = {}
                        args['hostname'] = hostname
                        m = PersistMessage(hostname, mailtxt.plnode_cfg[0] % args, mailtxt.plnode_cfg[1] % args,
                                           True, db='nodeid_persistmessages')
                        loginbase = plc.siteId(hostname)
                        m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
                        conn.dump_plconf_file()
                        conn.set_nodestate('disable')

                elif sequences[s] == "nodenetwork_email":
                        print "...Sending message to LOOK AT NODE NETWORK"
                        args = {}
                        args['hostname'] = hostname
                        args['bmlog'] = conn.get_bootmanager_log().read()
                        m = PersistMessage(hostname, mailtxt.plnode_network[0] % args, mailtxt.plnode_cfg[1] % args,
                                           True, db='nodenet_persistmessages')
                        loginbase = plc.siteId(hostname)
                        m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
                        conn.dump_plconf_file()
                        conn.set_nodestate('disable')

                elif sequences[s] == "update_bootcd_email":
                        print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
                        import getconf
                        args = {}
                        args.update(getconf.getconf(hostname)) # NOTE: generates boot images for the user
                        args['hostname_list'] = "%s" % hostname

                        m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname,
                                           mailtxt.newalphacd_one[1] % args, True, db='bootcd_persistmessages')

                        loginbase = plc.siteId(hostname)
                        m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])

                        print "\tDisabling %s due to out-of-date BOOTCD" % hostname
                        conn.set_nodestate('disable')

                elif sequences[s] == "broken_hardware_email":
                        # MAKE an ACTION record that this host has failed hardware.  May
                        # require either an exception "/minhw" or other manual intervention.
                        # Definitely need to send out some more EMAIL.
                        print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
                        # TODO: email notice of broken hardware
                        args = {}
                        args['hostname'] = hostname
                        args['log'] = conn.get_dmesg().read()
                        m = PersistMessage(hostname, mailtxt.baddisk[0] % args,
                                           mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')

                        loginbase = plc.siteId(hostname)
                        m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
                        conn.set_nodestate('disable')

                elif sequences[s] == "update_hardware_email":
                        print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
                        args = {}
                        args['hostname'] = hostname
                        args['bmlog'] = conn.get_bootmanager_log().read()
                        m = PersistMessage(hostname, mailtxt.minimalhardware[0] % args,
                                           mailtxt.minimalhardware[1] % args, True, db='minhardware_persistmessages')

                        loginbase = plc.siteId(hostname)
                        m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
                        conn.set_nodestate('disable')

                elif sequences[s] == "bad_dns_email":
                        print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
                        args = {}
                        try:
                                node = api.GetNodes(hostname)[0]
                                net = api.GetNodeNetworks(node['nodenetwork_ids'])[0]
                        except:
                                traceback.print_exc()
                                # TODO: api error. Skip the email, because the needed
                                # information is not available; flag_set will not be recorded.
                                return False
                        nodenet_str = network_config_to_str(net)

                        args['hostname'] = hostname
                        args['network_config'] = nodenet_str
                        args['nodenetwork_id'] = net['nodenetwork_id']
                        m = PersistMessage(hostname, mailtxt.baddns[0] % args,
                                           mailtxt.baddns[1] % args, True, db='baddns_persistmessages')

                        loginbase = plc.siteId(hostname)
                        m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
                        conn.set_nodestate('disable')

        if flag_set:
                pflags.setRecentFlag(s)
                pflags.save()

        return True


# MAIN -------------------------------------------------------------------

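# Example invocation (the hostname is illustrative, and the node-selection
# option names come from the shared 'nodesets' parser, so they are shown here
# only as an assumption):
#   ./bootman.py --node planetlab-1.example.org --force rins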
def main():
        import parser as parsermodule
        parser = parsermodule.getParser()

        parser.set_defaults(child=False, collect=False, nosetup=False, verbose=False,
                            force=None, quiet=False)
        parser.add_option("", "--child", dest="child", action="store_true",
                            help="This is the child mode of this process.")
        parser.add_option("", "--force", dest="force", metavar="boot_state",
                            help="Force a boot state passed to BootManager.py.")
        parser.add_option("", "--quiet", dest="quiet", action="store_true",
                            help="Suppress extra output messages.")
        parser.add_option("", "--verbose", dest="verbose", action="store_true",
                            help="Extra debug output messages.")
        parser.add_option("", "--nonet", dest="nonet", action="store_true",
                            help="Do not set up the network; use existing log files to re-run a test pass.")
        parser.add_option("", "--collect", dest="collect", action="store_true",
                            help="No action, just collect dmesg and bm.log.")
        parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
                            help="Do not perform the ordinary setup phase.")

        parser = parsermodule.getParser(['nodesets', 'defaults'], parser)
        config = parsermodule.parse_args(parser)

        if config.nodelist:
                nodes = config.getListFromFile(config.nodelist)
        elif config.node:
                nodes = [ config.node ]
        else:
                parser.print_help()
                sys.exit(1)

        for node in nodes:
                reboot(node, config)

if __name__ == "__main__":
        main()