714aa15cbb15b1f5860f7a8d24dd5fde0b38677c
[bootmanager.git] / source / BootManager.py
1 #!/usr/bin/python -u
2
3 # Copyright (c) 2003 Intel Corporation
4 # All rights reserved.
5 #
6 # Copyright (c) 2004-2006 The Trustees of Princeton University
7 # All rights reserved.
8
9 import string
10 import sys, os, traceback
11 import time
12 import gzip
13
14 from steps import *
15 from Exceptions import *
16 import notify_messages
17 import BootServerRequest
18
# local file that receives every line of boot manager output
BM_NODE_LOG = "/tmp/bm.log"
# name of the name=value file holding the initial variables; it is a
# relative path, so it is looked up in the current working directory
VARS_FILE = "configuration"

# directories that make up PATH while the boot manager is running
BIN_PATH = (
    '/usr/local/bin', '/usr/local/sbin',
    '/usr/bin', '/usr/sbin',
    '/bin', '/sbin',
)
30
def read_configuration_file(filename):
    """
    Parse a name=value configuration file into a dictionary.

    Blank lines and lines whose first non-blank character is '#' are
    skipped.  Every other line must contain an '=': the text before the
    first '=' is the variable name and everything after it, up to the
    end of the line, is the value (so a value may itself contain '=').
    Whitespace around the name and around the value is stripped, and
    every single or double quote character is removed from the value.

    Besides the variables found in the file, the returned dictionary
    always contains BM_SOURCE_DIR, set to the current working directory,
    so that later steps can locate files in the bootmanager directory.

    filename -- path of the configuration file to read

    Returns a dictionary mapping variable names to string values.

    Raises Exception if a non-comment, non-blank line has no '='; errors
    raised by open() for a missing or unreadable file are propagated.
    """
    config = {}

    # the with block closes the file even when a malformed line aborts
    # the parse below
    with open(filename, 'r') as vars_file:
        for line in vars_file:
            stripped = line.strip()

            # if its a comment or a whitespace line, ignore
            if not stripped or stripped.startswith("#"):
                continue

            # split on the first '=' only: the rest of the line is the value
            name, separator, value = stripped.partition("=")
            if not separator:
                raise Exception( "Invalid line in vars file: %s" % line )

            # remove quotes
            value = value.strip().replace("'", "").replace('"', "")
            config[name.strip()] = value

    # find out which directory we are running in, and set a variable
    # for that. future steps may need to get files out of the bootmanager
    # directory
    config['BM_SOURCE_DIR'] = os.getcwd()

    return config
66
67 ##############################
class log:
    """
    Boot manager logger.

    Every entry is prefixed with a timestamp and written to a local
    output file and, optionally, to stdout.  The object also reads the
    boot manager configuration file at construction time and exposes
    the result as self.VARS (None if the file could not be read), and it
    can upload the finished log file to the boot server.

    The object has a write() method, so it can be handed to code that
    expects a writable file object.
    """

    # strftime() format of the prefix put in front of every entry
    format="%H:%M:%S(%Z) "

    def __init__( self, OutputFilePath= None ):
        """
        Open the log file and load the configuration variables.

        OutputFilePath -- path of the log file, opened for writing (and
                          therefore truncated).  If it cannot be opened
                          a message is printed and logging continues on
                          stdout only.
        """
        # always defined, even when the file could not be opened
        self.OutputFilePath= OutputFilePath
        try:
            self.OutputFile= open( OutputFilePath, "w")
        except Exception:
            # best effort: a missing log file must not stop the boot manager
            print( "bootmanager log : Unable to open output file %r, continuing"%OutputFilePath )
            self.OutputFile= None

        self.VARS = None
        try:
            self.VARS = read_configuration_file(VARS_FILE)
        except Exception as e:
            # leave VARS at None; BootManager refuses to run in that case
            self.LogEntry( str(e) )

    def LogEntry( self, str, inc_newline= 1, display_screen= 1 ):
        """
        Write one timestamped entry.

        str            -- text of the entry
        inc_newline    -- when true, terminate the entry with a newline
        display_screen -- when true, also write the entry to stdout

        The output file, if any, is flushed after every entry.
        """
        entry= time.strftime(log.format, time.localtime()) + str
        if inc_newline:
            entry= entry + "\n"

        if self.OutputFile:
            self.OutputFile.write( entry )
            self.OutputFile.flush()
        if display_screen:
            sys.stdout.write( entry )

    def write( self, str ):
        """
        make log behave like a writable file object (for traceback
        prints)
        """
        self.LogEntry( str, 0, 1 )

    # bm log uploading is available back again, as of nodeconfig-5.0-2
    def Upload( self ):
        """
        upload the contents of the log to the server

        Nothing happens if there is no open output file.  If the
        configuration needed for the upload (UPLOAD_LOG_SCRIPT and the
        hostname / domainname entries of INTERFACE_SETTINGS) is not
        available, the reason is logged and the log file is left open;
        no exception is raised for that case.

        Otherwise the log file is closed, self.OutputFile is reset to
        None and the file is posted to the boot server.  Exceptions
        raised by the request itself are propagated.
        """
        if self.OutputFile is None:
            return

        # VARS is None when the configuration file could not be read
        if not self.VARS:
            self.LogEntry( "Unable to upload logs: no configuration available" )
            return

        # gather everything the request needs before closing the log, so
        # that a problem here can still be recorded in it
        try:
            upload_script= self.VARS['UPLOAD_LOG_SCRIPT']
            settings= self.VARS['INTERFACE_SETTINGS']
            hostname= settings['hostname'] + "." + settings['domainname']
        except (KeyError, TypeError) as e:
            self.LogEntry( "Unable to upload logs: missing configuration %s" % e )
            return

        self.LogEntry( "Uploading logs to %s" % upload_script )

        # the file must be complete on disk before it is posted
        self.OutputFile.close()
        self.OutputFile= None

        bs_request = BootServerRequest.BootServerRequest(self.VARS)
        bs_request.MakeRequest(PartialPath = upload_script,
                               GetVars = None, PostVars = None,
                               FormData = ["log=@" + self.OutputFilePath,
                               "hostname=" + hostname, "type=bm.log"],
                               DoSSL = True, DoCertCheck = True)
132
133 ##############################
class BootManager:
    """
    Drives a node through the sequence of steps that matches its boot
    state (reinstall, boot, safeboot or disabled).

    After construction, CAN_RUN is 1 when the configuration variables
    were available from the log object and Run() may be called, and 0
    otherwise.
    """

    # the set of valid node run states; Run() stores the function that
    # implements each state as the value
    NodeRunStates = {'reinstall':None,
                     'boot':None,
                     'safeboot':None,
                     'disabled':None,
                     }

    def __init__(self, log, forceState):
        """
        log        -- logging object; its VARS attribute supplies the
                      configuration variables
        forceState -- boot state to use instead of the node's current
                      one, or None to leave it alone
        """
        # override machine's current state from the command line
        self.forceState = forceState

        # the main logging point
        self.LOG= log

        # set to 1 if we can run after initialization
        self.CAN_RUN = 0

        # this contains a set of information used and updated by each
        # step; it stays None when no configuration could be read
        self.VARS = None
        if not log.VARS:
            return
        self.VARS= log.VARS

        # not sure what the current PATH is set to, replace it with what
        # we know will work with all the boot cds
        os.environ['PATH']= ":".join(BIN_PATH)

        self.CAN_RUN= 1

    def Run(self):
        """
        core boot manager logic.

        the way errors are handled is as such: if any particular step
        cannot continue or unexpectedly fails, an exception is thrown.
        in this case, the boot manager cannot continue running.

        these step functions can also return a 0/1 depending on whether
        or not it succeeded. In the case of steps like ConfirmInstallWithUser,
        a 0 is returned and no exception is thrown if the user chose not
        to confirm the install. The same goes with the CheckHardwareRequirements.
        If requirements not met, but tests were successful, return 0.

        for steps that run within the installer, they are expected to either
        complete successfully and return 1, or throw an exception.

        For exact return values and expected operations, see the comments
        at the top of each of the individual step functions.

        Returns 1 if the steps for the node's boot state completed
        without raising, 0 otherwise.  Unless authentication failed, a
        failure also puts the node in debug mode before returning.
        """

        def _nodeNotInstalled(message='MSG_NODE_NOT_INSTALLED'):
            # called by the _xxxState() functions below upon failure;
            # message is the name of an attribute of notify_messages
            self.VARS['RUN_LEVEL']= 'failboot'
            notify = getattr(notify_messages, message)
            self.VARS['STATE_CHANGE_NOTIFY']= 1
            self.VARS['STATE_CHANGE_NOTIFY_MESSAGE']= notify
            raise BootManagerException(notify)

        def _startFallbackDebug():
            # starting the fallback/debug ssh daemon for safety:
            # if the node install somehow hangs, or if it simply takes ages,
            # we can still enter and investigate.
            # this is best effort: a failure is logged and otherwise ignored
            try:
                StartDebug.Run(self.VARS, self.LOG, last_resort = False)
            except Exception as e:
                self.LOG.write( "\nUnable to start fallback debug sshd: %s\n" % e )

        def _logImplementationError():
            # report the exception currently being handled, both in the
            # log file and on the console
            self.LOG.write( "\n\nImplementation Error\n")
            traceback.print_exc(file=self.LOG.OutputFile)
            traceback.print_exc()

        def _bootRun():
            # implements the boot logic, which consists of first
            # double checking that the node was properly installed,
            # checking whether someone added or changed disks, and
            # then finally chain boots.
            _startFallbackDebug()

            InstallInit.Run( self.VARS, self.LOG )
            ret = ValidateNodeInstall.Run( self.VARS, self.LOG )
            if ret == 1:
                WriteModprobeConfig.Run( self.VARS, self.LOG )
                MakeInitrd.Run( self.VARS, self.LOG )
                WriteNetworkConfig.Run( self.VARS, self.LOG )
                CheckForNewDisks.Run( self.VARS, self.LOG )
                SendHardwareConfigToPLC.Run( self.VARS, self.LOG )
                ChainBootNode.Run( self.VARS, self.LOG )
                return

            # any other result means the install is unusable; the known
            # negative codes select a more specific notification
            failure_messages = {-1: 'MSG_NODE_FILESYSTEM_CORRUPT',
                                -2: 'MSG_NODE_MOUNT_FAILED',
                                -3: 'MSG_NODE_MISSING_KERNEL'}
            _nodeNotInstalled(failure_messages.get(ret, 'MSG_NODE_NOT_INSTALLED'))

        def _reinstallRun():
            _startFallbackDebug()

            # implements the reinstall logic, which will check whether
            # the min. hardware requirements are met, install the
            # software, and upon correct installation will switch to
            # 'boot' state and chainboot into the production system
            if not CheckHardwareRequirements.Run( self.VARS, self.LOG ):
                self.VARS['RUN_LEVEL']= 'failboot'
                raise BootManagerException("Hardware requirements not met.")

            # runinstaller
            InstallInit.Run( self.VARS, self.LOG )
            InstallPartitionDisks.Run( self.VARS, self.LOG )
            InstallBootstrapFS.Run( self.VARS, self.LOG )
            InstallWriteConfig.Run( self.VARS, self.LOG )
            InstallUninitHardware.Run( self.VARS, self.LOG )
            self.VARS['BOOT_STATE']= 'boot'
            self.VARS['STATE_CHANGE_NOTIFY']= 1
            self.VARS['STATE_CHANGE_NOTIFY_MESSAGE']= \
                 notify_messages.MSG_INSTALL_FINISHED
            UpdateBootStateWithPLC.Run( self.VARS, self.LOG )
            _bootRun()

        def _installRun():
            # implements the new install logic, which will first check
            # with the user whether it is ok to install on this
            # machine, switch to 'reinstall' state and then invoke the reinstall
            # logic.  See reinstallState logic comments for further
            # details.
            # NOTE: this function is not registered for any boot state below
            if not ConfirmInstallWithUser.Run( self.VARS, self.LOG ):
                return 0
            self.VARS['BOOT_STATE']= 'reinstall'
            UpdateRunLevelWithPLC.Run( self.VARS, self.LOG )
            _reinstallRun()

        def _debugRun(state='failboot'):
            # implements debug logic, which starts the sshd and just waits around
            self.VARS['RUN_LEVEL']=state
            UpdateRunLevelWithPLC.Run( self.VARS, self.LOG )
            StartDebug.Run( self.VARS, self.LOG )
            # fsck/mount fs if present, and ignore return value if it's not.
            ValidateNodeInstall.Run( self.VARS, self.LOG )

        def _badstateRun():
            # should never happen; log event
            self.LOG.write( "\nInvalid BOOT_STATE = %s\n" % self.VARS['BOOT_STATE'])
            _debugRun()

        # setup state -> function hash table
        BootManager.NodeRunStates['reinstall']  = _reinstallRun
        BootManager.NodeRunStates['boot']       = _bootRun
        BootManager.NodeRunStates['safeboot']   = lambda : _debugRun('safeboot')
        BootManager.NodeRunStates['disabled']   = lambda : _debugRun('disabled')

        success = 0
        try:
            InitializeBootManager.Run( self.VARS, self.LOG )
            ReadNodeConfiguration.Run( self.VARS, self.LOG )
            AuthenticateWithPLC.Run( self.VARS, self.LOG )
            StartRunlevelAgent.Run( self.VARS, self.LOG )
            GetAndUpdateNodeDetails.Run( self.VARS, self.LOG )

            # override machine's current state from the command line
            if self.forceState is not None:
                self.VARS['BOOT_STATE']= self.forceState
                UpdateBootStateWithPLC.Run( self.VARS, self.LOG )
                UpdateRunLevelWithPLC.Run( self.VARS, self.LOG )

            stateRun = BootManager.NodeRunStates.get(self.VARS['BOOT_STATE'],_badstateRun)
            stateRun()
            success = 1

        except KeyError as e:
            self.LOG.write( "\n\nKeyError while running: %s\n" % str(e) )
        # NOTE(review): tested before BootManagerException so that this
        # handler is not shadowed should the authentication exception
        # derive from it -- confirm the hierarchy in Exceptions
        except BootManagerAuthenticationException as e:
            self.LOG.write( "\n\nFailed to Authenticate Node: %s\n" % str(e) )
            # sets /tmp/CANCEL_BOOT flag
            StartDebug.Run(self.VARS, self.LOG )
            # Return immediately b/c any other calls to API will fail
            return success
        except BootManagerException as e:
            self.LOG.write( "\n\nException while running: %s\n" % str(e) )
        except:
            # deliberately catches everything: whatever went wrong, the
            # node must still end up in debug mode below
            _logImplementationError()

        if not success:
            try:
                _debugRun()
            except BootManagerException as e:
                self.LOG.write( "\n\nException while running: %s\n" % str(e) )
            except:
                _logImplementationError()

        return success
334             
335             
def main(argv):
    """
    Entry point of the boot manager.

    argv -- the command line, program name included.  When exactly one
            argument follows the program name it is the boot state to
            force; it must be one of the keys of
            BootManager.NodeRunStates.

    Returns 0 if the boot manager ran successfully and 1 otherwise: an
    invalid forced state, a boot manager that could not be initialized,
    a failed or aborted run, or a failed log upload.
    """

    import utils
    utils.prompt_for_breakpoint_mode()

    utils.breakpoint ("Entering BootManager::main")

    # set to 1 if error occurred
    error= 0

    # all output goes through this class so we can save it and post
    # the data back to PlanetLab central
    LOG= log( BM_NODE_LOG )

    def _timestamp():
        # current UTC time, as used in the started / finished entries
        return time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime())

    def _finish(status):
        # write the closing entry, upload the log and return the exit
        # status; a failing upload is reported and turns the status into 1
        LOG.LogEntry( "BootManager finished at: %s" % _timestamp() )
        try:
            LOG.Upload()
        except Exception:
            traceback.print_exc(file=LOG.OutputFile)
            traceback.print_exc()
            return 1
        return status

    LOG.LogEntry( "BootManager started at: %s" % _timestamp() )

    forceState = None
    try:
        if len(argv) == 2:
            fState = argv[1]
            if fState in BootManager.NodeRunStates:
                forceState = fState
            else:
                LOG.LogEntry("FATAL: cannot force node run state to=%s" % fState)
                error = 1
    except Exception:
        traceback.print_exc(file=LOG.OutputFile)
        traceback.print_exc()

    if error:
        return _finish(error)

    try:
        bm= BootManager(LOG,forceState)
        if bm.CAN_RUN == 0:
            LOG.LogEntry( "Unable to initialize BootManager." )
            error = 1
        else:
            LOG.LogEntry( "Running version %s of BootManager." % bm.VARS['VERSION'] )
            success= bm.Run()
            if success:
                LOG.LogEntry( "\nDone!" )
            else:
                LOG.LogEntry( "\nError occurred!" )
                error = 1
    except:
        # deliberately catches everything so that the log is always
        # finished and uploaded; the run still counts as failed
        traceback.print_exc(file=LOG.OutputFile)
        traceback.print_exc()
        error = 1

    return _finish(error)
393
394     
if __name__ == "__main__":
    # the value returned by main() becomes the exit status of the process
    sys.exit(main(sys.argv))