solve the ordering problem which causes the boot failure
[bootmanager.git] / source / BootManager.py
1 #!/usr/bin/python -u
2 #
3 # $Id$
4 # $URL$
5 #
6 # Copyright (c) 2003 Intel Corporation
7 # All rights reserved.
8 #
9 # Copyright (c) 2004-2006 The Trustees of Princeton University
10 # All rights reserved.
11
12 import string
13 import sys, os, traceback
14 import time
15 import gzip
16
17 from steps import *
18 from Exceptions import *
19 import notify_messages
20 import BootServerRequest
21
# log file that receives everything the boot manager outputs
BM_NODE_LOG = "/tmp/bm.log"

# name of the configuration file read at startup (opened as given,
# i.e. relative to the current working directory)
VARS_FILE = "configuration"

# directories that make up PATH while the boot manager is running;
# they are joined with ':' in this order
BIN_PATH = (
    '/usr/local/bin', '/usr/local/sbin',
    '/usr/bin', '/usr/sbin',
    '/bin', '/sbin',
)
33
def read_configuration_file(filename):
    """
    Read a name=value configuration file into a dictionary.

    Each meaningful line has the form name=val. Whitespace around the
    name and the value is removed, and every single or double quote
    character is dropped from the value. Only the first = separates
    the name from the value, so the value itself may contain = signs.
    Blank lines and lines whose first non-blank character is # are
    ignored.

    The returned dictionary additionally holds BM_SOURCE_DIR, set to
    the current working directory, so that later steps can locate
    files shipped in the bootmanager directory.

    Raises Exception if a line has no = sign; errors from opening or
    reading the file propagate unchanged.
    """
    config = {}
    vars_file= open(filename,'r')
    try:
        for line in vars_file:
            stripped= line.strip()

            # if its a comment or a whitespace line, ignore
            if stripped == "" or stripped[:1] == "#":
                continue

            # split on the first = only: everything after it, up to
            # the end of the line, is the value
            parts= line.split("=", 1)
            if len(parts) != 2:
                raise Exception( "Invalid line in vars file: %s" % line )

            name= parts[0].strip()
            value= parts[1].strip()
            value= value.replace("'", "")   # remove quotes
            value= value.replace('"', "")   # remove quotes
            config[name]= value
    finally:
        # release the file even when a bad line aborts the parse
        vars_file.close()

    # find out which directory we are running in, and set a variable
    # for that. future steps may need to get files out of the
    # bootmanager directory
    config['BM_SOURCE_DIR']= os.getcwd()

    return config
69
70 ##############################
class log:
    """
    Boot manager log sink.

    Every entry is prefixed with a timestamp and written to the output
    file (when one could be opened) and, optionally, to stdout. The
    instance also loads the configuration file named by VARS_FILE and
    exposes the result as self.VARS (None when it could not be read).
    """

    # strftime() pattern of the timestamp put in front of every entry
    format="%H:%M:%S(%Z) "

    def __init__( self, OutputFilePath= None ):
        """
        Open OutputFilePath for writing and read the configuration.

        Neither failure is fatal: without an output file entries only
        go to the screen, and without a configuration self.VARS stays
        None (the reason is logged).
        """
        # set unconditionally so the attribute exists even when the
        # file cannot be opened
        self.OutputFilePath= OutputFilePath
        try:
            self.OutputFile= open( OutputFilePath, "w")
        except Exception:
            print( "bootmanager log : Unable to open output file %r, continuing"%OutputFilePath )
            self.OutputFile= None

        self.VARS = None
        try:
            self.VARS = read_configuration_file(VARS_FILE)
        except Exception:
            # sys.exc_info() yields the active exception without
            # relying on version specific except syntax
            self.LogEntry( str(sys.exc_info()[1]) )

    def LogEntry( self, str, inc_newline= 1, display_screen= 1 ):
        """
        Write one timestamped entry.

        str            -- text of the entry
        inc_newline    -- when true, terminate the entry with a newline
        display_screen -- when true, also write the entry to stdout
        """
        entry= time.strftime(log.format, time.localtime()) + str
        if inc_newline:
            entry= entry + "\n"

        if display_screen:
            sys.stdout.write( entry )

        if self.OutputFile:
            self.OutputFile.write( entry )
            # flush each entry so the file is current if we stop early
            self.OutputFile.flush()

    def write( self, str ):
        """
        make log behave like a writable file object (for traceback
        prints)
        """
        self.LogEntry( str, 0, 1 )

    # bm log uploading is available back again, as of nodeconfig-5.0-2
    def Upload( self ):
        """
        upload the contents of the log to the server

        Does nothing when there is no open log file. When the upload
        settings (UPLOAD_LOG_SCRIPT, INTERFACE_SETTINGS hostname and
        domainname) are not available in self.VARS, the problem is
        logged and the log file is left open instead of raising.
        Otherwise the log file is closed and posted; errors from the
        final upload attempt propagate to the caller.
        """
        if self.OutputFile is None:
            return

        # gather everything the request needs before closing the log,
        # so a missing configuration cannot abort the caller half way
        try:
            upload_script= self.VARS['UPLOAD_LOG_SCRIPT']
            settings= self.VARS['INTERFACE_SETTINGS']
            hostname= settings['hostname'] + "." + settings['domainname']
        except (KeyError, TypeError):
            # TypeError covers self.VARS being None
            self.LogEntry( "Unable to upload logs, upload settings unavailable: %s" %
                           sys.exc_info()[1] )
            return

        self.LogEntry( "Uploading logs to %s" % upload_script )

        self.OutputFile.close()
        self.OutputFile= None

        bs_request = BootServerRequest.BootServerRequest(self.VARS)
        try:
            # this was working until f10
            bs_request.MakeRequest(PartialPath = upload_script,
                                   GetVars = None, PostVars = None,
                                   DoSSL = True, DoCertCheck = True,
                                   FormData = ["log=@" + self.OutputFilePath,
                                               "hostname=" + hostname,
                                               "type=bm.log"])
        except Exception:
            # new pycurl: retry with the tuple based form description
            import pycurl
            bs_request.MakeRequest(PartialPath = upload_script,
                                   GetVars = None, PostVars = None,
                                   DoSSL = True, DoCertCheck = True,
                                   FormData = [('log',(pycurl.FORM_FILE, self.OutputFilePath)),
                                               ("hostname",hostname),
                                               ("type","bm.log")])
147
148
149 ##############################
class BootManager:
    """
    Runs the boot manager steps for a node.

    Run() looks up the handler matching the node's BOOT_STATE and
    executes it; when anything fails it falls back to the debug
    handler.
    """

    # the set of valid node run states. the values start out as None
    # and are replaced by the handler functions each time Run() is
    # called; main() only uses the keys to validate a state forced
    # from the command line
    NodeRunStates = {'reinstall':None,
                     'boot':None,
                     'safeboot':None,
                     'disabled':None,
                     }

    def __init__(self, log, forceState):
        """
        log        -- main logging object; its VARS attribute supplies
                      the variables shared by all steps
        forceState -- run state forced from the command line, or None

        CAN_RUN is left at 0 (and self.VARS is not set) when log.VARS
        is empty or None.
        """
        # override machine's current state from the command line
        self.forceState = forceState

        # the main logging point
        self.LOG= log

        # set to 1 if we can run after initialization
        self.CAN_RUN = 0

        if log.VARS:
            # this contains a set of information used and updated by each step
            self.VARS= log.VARS
        else:
            return

        # not sure what the current PATH is set to, replace it with what
        # we know will work with all the boot cds
        os.environ['PATH']= string.join(BIN_PATH,":")

        self.CAN_RUN= 1

    def Run(self):
        """
        core boot manager logic.

        the way errors are handled is as such: if any particular step
        cannot continue or unexpectedly fails, an exception is thrown.
        in this case, the boot manager cannot continue running.

        these step functions can also return a 0/1 depending on whether
        or not it succeeded. In the case of steps like ConfirmInstallWithUser,
        a 0 is returned and no exception is thrown if the user chose not
        to confirm the install. The same goes with the CheckHardwareRequirements.
        If requirements not met, but tests were successful, return 0.

        for steps that run within the installer, they are expected to either
        complete successfully and return 1, or throw an exception.

        For exact return values and expected operations, see the comments
        at the top of each of the individual step functions.

        Returns 1 when the handler for the node's state ran to
        completion, 0 otherwise.
        """

        def _nodeNotInstalled(message='MSG_NODE_NOT_INSTALLED'):
            # called by the _xxxState() functions below upon failure.
            # message is the name of an attribute of notify_messages;
            # always raises BootManagerException
            self.VARS['RUN_LEVEL']= 'failboot'
            notify = getattr(notify_messages, message)
            self.VARS['STATE_CHANGE_NOTIFY']= 1
            self.VARS['STATE_CHANGE_NOTIFY_MESSAGE']= notify
            raise BootManagerException, notify

        def _bootRun():
            # implements the boot logic, which consists of first
            # double checking that the node was properly installed,
            # checking whether someone added or changed disks, and
            # then finally chain boots.

            # starting the fallback/debug ssh daemon for safety:
            # if the node install somehow hangs, or if it simply takes ages,
            # we can still enter and investigate
            try:
                StartDebug.Run(self.VARS, self.LOG, last_resort = False)
            except:
                # best effort only: a failure here must not stop the boot
                pass

            InstallInit.Run( self.VARS, self.LOG )
            ret = ValidateNodeInstall.Run( self.VARS, self.LOG )
            # 1 means the install validated; the negative codes select
            # the notification message describing the failure
            if ret == 1:
                WriteModprobeConfig.Run( self.VARS, self.LOG )
                WriteNetworkConfig.Run( self.VARS, self.LOG )
                CheckForNewDisks.Run( self.VARS, self.LOG )
                SendHardwareConfigToPLC.Run( self.VARS, self.LOG )
                ChainBootNode.Run( self.VARS, self.LOG )
            elif ret == -1:
                _nodeNotInstalled('MSG_NODE_FILESYSTEM_CORRUPT')
            elif ret == -2:
                _nodeNotInstalled('MSG_NODE_MOUNT_FAILED')
            elif ret == -3:
                _nodeNotInstalled('MSG_NODE_MISSING_KERNEL')
            else:
                _nodeNotInstalled()

        def _reinstallRun():

            # starting the fallback/debug ssh daemon for safety:
            # if the node install somehow hangs, or if it simply takes ages,
            # we can still enter and investigate
            try:
                StartDebug.Run(self.VARS, self.LOG, last_resort = False)
            except:
                # best effort only: a failure here must not stop the install
                pass

            # implements the reinstall logic, which will check whether
            # the min. hardware requirements are met, install the
            # software, and upon correct installation will switch to
            # 'boot' state and chainboot into the production system
            if not CheckHardwareRequirements.Run( self.VARS, self.LOG ):
                self.VARS['RUN_LEVEL']= 'failboot'
                raise BootManagerException, "Hardware requirements not met."

            # runinstaller
            InstallInit.Run( self.VARS, self.LOG )
            InstallPartitionDisks.Run( self.VARS, self.LOG )
            InstallBootstrapFS.Run( self.VARS, self.LOG )
            InstallWriteConfig.Run( self.VARS, self.LOG )
            InstallUninitHardware.Run( self.VARS, self.LOG )
            # installation done: record the new state, then boot
            self.VARS['BOOT_STATE']= 'boot'
            self.VARS['STATE_CHANGE_NOTIFY']= 1
            self.VARS['STATE_CHANGE_NOTIFY_MESSAGE']= \
                 notify_messages.MSG_INSTALL_FINISHED
            UpdateBootStateWithPLC.Run( self.VARS, self.LOG )
            _bootRun()

        def _installRun():
            # implements the new install logic, which will first check
            # with the user whether it is ok to install on this
            # machine, switch to 'reinstall' state and then invoke the reinstall
            # logic.  See reinstallState logic comments for further
            # details.
            # NOTE(review): not registered in the state table below and
            # not called anywhere in the visible code
            if not ConfirmInstallWithUser.Run( self.VARS, self.LOG ):
                return 0
            self.VARS['BOOT_STATE']= 'reinstall'
            UpdateRunLevelWithPLC.Run( self.VARS, self.LOG )
            _reinstallRun()

        def _debugRun(state='failboot'):
            # implements debug logic, which starts the sshd and just waits around
            self.VARS['RUN_LEVEL']=state
            UpdateRunLevelWithPLC.Run( self.VARS, self.LOG )
            StartDebug.Run( self.VARS, self.LOG )
            # fsck/mount fs if present, and ignore return value if it's not.
            ValidateNodeInstall.Run( self.VARS, self.LOG )

        def _badstateRun():
            # should never happen; log event
            self.LOG.write( "\nInvalid BOOT_STATE = %s\n" % self.VARS['BOOT_STATE'])
            _debugRun()

        # setup state -> function hash table
        # (this updates the class level dictionary, not a per instance copy)
        BootManager.NodeRunStates['reinstall']  = _reinstallRun
        BootManager.NodeRunStates['boot']       = _bootRun
        BootManager.NodeRunStates['safeboot']   = lambda : _debugRun('safeboot')
        BootManager.NodeRunStates['disabled']   = lambda : _debugRun('disabled')

        success = 0
        try:
            InitializeBootManager.Run( self.VARS, self.LOG )
            ReadNodeConfiguration.Run( self.VARS, self.LOG )
            AuthenticateWithPLC.Run( self.VARS, self.LOG )
            StartRunlevelAgent.Run( self.VARS, self.LOG )
            GetAndUpdateNodeDetails.Run( self.VARS, self.LOG )

            # override machine's current state from the command line
            if self.forceState is not None:
                self.VARS['BOOT_STATE']= self.forceState
                UpdateBootStateWithPLC.Run( self.VARS, self.LOG )
                UpdateRunLevelWithPLC.Run( self.VARS, self.LOG )

            # unknown states fall through to _badstateRun
            stateRun = BootManager.NodeRunStates.get(self.VARS['BOOT_STATE'],_badstateRun)
            stateRun()
            success = 1

        except KeyError, e:
            self.LOG.write( "\n\nKeyError while running: %s\n" % str(e) )
        except BootManagerException, e:
            self.LOG.write( "\n\nException while running: %s\n" % str(e) )
        # NOTE(review): if BootManagerAuthenticationException derives
        # from BootManagerException the clause above catches it first and
        # this handler never runs -- confirm against the Exceptions module
        except BootManagerAuthenticationException, e:
            self.LOG.write( "\n\nFailed to Authenticate Node: %s\n" % str(e) )
            # sets /tmp/CANCEL_BOOT flag
            StartDebug.Run(self.VARS, self.LOG )
            # Return immediately b/c any other calls to API will fail
            return success
        except:
            self.LOG.write( "\n\nImplementation Error\n")
            # once into the log file, once onto the console
            traceback.print_exc(file=self.LOG.OutputFile)
            traceback.print_exc()

        if not success:
            # something went wrong above: drop into debug mode
            try:
                _debugRun()
            except BootManagerException, e:
                self.LOG.write( "\n\nException while running: %s\n" % str(e) )
            except:
                self.LOG.write( "\n\nImplementation Error\n")
                # once into the log file, once onto the console
                traceback.print_exc(file=self.LOG.OutputFile)
                traceback.print_exc()

        return success
349             
350             
def main(argv):
    """
    Entry point of the boot manager.

    argv -- command line; an optional single argument (argv[1]) forces
            the node run state and must be one of the keys of
            BootManager.NodeRunStates

    Returns 0 when the boot manager ran successfully and 1 otherwise
    (invalid forced state, failed initialization, failed run, or an
    unexpected exception). The log is uploaded before returning.
    """

    import utils
    utils.prompt_for_breakpoint_mode()

    utils.breakpoint ("Entering BootManager::main")

    stamp_format= "%a, %d %b %Y %H:%M:%S +0000"

    # set to 1 if error occurred
    error= 0

    # all output goes through this class so we can save it and post
    # the data back to PlanetLab central
    LOG= log( BM_NODE_LOG )

    LOG.LogEntry( "BootManager started at: %s" %
                  time.strftime(stamp_format, time.gmtime()) )

    forceState = None
    try:
        if len(argv) == 2:
            fState = argv[1]
            if fState in BootManager.NodeRunStates:
                forceState = fState
            else:
                LOG.LogEntry("FATAL: cannot force node run state to=%s" % fState)
                error = 1
    except:
        traceback.print_exc(file=LOG.OutputFile)
        traceback.print_exc()

    if not error:
        try:
            bm= BootManager(LOG,forceState)
            if bm.CAN_RUN == 0:
                LOG.LogEntry( "Unable to initialize BootManager." )
                # nothing ran, so do not report success to the caller
                error = 1
            else:
                LOG.LogEntry( "Running version %s of BootManager." % bm.VARS['VERSION'] )
                success= bm.Run()
                if success:
                    LOG.LogEntry( "\nDone!" )
                else:
                    LOG.LogEntry( "\nError occurred!" )
                    error = 1
        except:
            # deliberately catches everything (including an interrupt)
            # so that the log below is still finished and uploaded
            traceback.print_exc(file=LOG.OutputFile)
            traceback.print_exc()
            error = 1

    LOG.LogEntry( "BootManager finished at: %s" %
                  time.strftime(stamp_format, time.gmtime()) )
    LOG.Upload()

    return error
408
409     
if __name__ == "__main__":
    # hand main()'s result (0 on success, 1 on error) to the caller
    # as the process exit status
    sys.exit( main(sys.argv) )