merge initrd changes from trunk
[bootmanager.git] / source / BootManager.py
1 #!/usr/bin/python -u
2
3 # Copyright (c) 2003 Intel Corporation
4 # All rights reserved.
5 #
6 # Copyright (c) 2004-2006 The Trustees of Princeton University
7 # All rights reserved.
8
9 import string
10 import sys, os, traceback
11 import time
12 import gzip
13
14 from steps import *
15 from Exceptions import *
16 import notify_messages
17 import BootServerRequest
18
# every message the boot manager logs is also saved to this file
BM_NODE_LOG = "/tmp/bm.log"
# name of the name=value settings file; it is opened as given, so it is
# looked up relative to the current working directory
VARS_FILE = "configuration"

# directories that make up PATH while the boot manager is running,
# listed in the order they are joined
BIN_PATH = (
    '/usr/local/bin',
    '/usr/local/sbin',
    '/usr/bin',
    '/usr/sbin',
    '/bin',
    '/sbin',
)
30
def read_configuration_file(filename):
    """
    Parse a name=value settings file into a dictionary.

    Lines whose first character is '#' and lines holding only whitespace
    are skipped.  Every other line is split at its first '=': the text
    before it is the name, and everything after it up to the end of the
    line is the value (so a value may itself contain '=').  Whitespace
    around the name and the value is trimmed, and every single or double
    quote character is removed from the value.

    The returned dictionary additionally maps 'BM_SOURCE_DIR' to the
    current working directory, so later steps can locate files that live
    in the boot manager directory.

    filename -- path of the settings file to read

    Raises Exception when a non-comment, non-blank line has no '='; any
    error from opening the file propagates unchanged.
    """
    settings = {}
    vars_file = open(filename, 'r')
    # try/finally so the handle is released even when a bad line aborts
    # the parse half way through the file
    try:
        for line in vars_file:
            # if its a comment or a whitespace line, ignore
            if line[:1] == "#" or line.strip() == "":
                continue

            # split once only: '=' characters inside the value belong to it
            parts = line.split("=", 1)
            if len(parts) != 2:
                raise Exception( "Invalid line in vars file: %s" % line )

            name = parts[0].strip()
            # remove quotes of either kind, wherever they appear
            value = parts[1].strip().replace("'", "").replace('"', "")
            settings[name] = value
    finally:
        vars_file.close()

    # record which directory we are running in
    settings['BM_SOURCE_DIR'] = os.getcwd()

    return settings
66
67 ##############################
class log:
    """
    Log sink of the boot manager.

    Each entry is prefixed with a timestamp and written to the screen
    and/or to an output file.  Creating an instance also reads the
    configuration file named by VARS_FILE and keeps the result in
    self.VARS (None when it could not be read).  Upload() posts the
    output file to the boot server.
    """

    # strftime pattern of the timestamp placed in front of every entry
    format="%H:%M:%S(%Z) "

    def __init__( self, OutputFilePath= None ):
        """
        Open OutputFilePath for writing and load the configuration.

        If the file cannot be opened a message is printed and logging
        carries on without an output file.  If the configuration cannot
        be read the reason is logged and self.VARS stays None.
        """
        self.OutputFilePath= OutputFilePath
        try:
            self.OutputFile= open( OutputFilePath, "w")
        except Exception:
            print( "bootmanager log : Unable to open output file %r, continuing"%OutputFilePath )
            self.OutputFile= None

        self.VARS = None
        try:
            self.VARS = read_configuration_file(VARS_FILE)
        except Exception:
            # sys.exc_info() is used instead of binding the exception in
            # the except clause because it parses on every Python version
            self.LogEntry( str(sys.exc_info()[1]) )

    def LogEntry( self, str, inc_newline= 1, display_screen= 1 ):
        """
        Write one timestamped entry.

        str            -- text of the entry
        inc_newline    -- when true, terminate the entry with a newline
        display_screen -- when true, also write the entry to stdout

        The output file, if there is one, always receives the entry and
        is flushed afterwards.
        """
        entry= time.strftime(log.format, time.localtime()) + str
        if inc_newline:
            entry= entry + "\n"

        if display_screen:
            sys.stdout.write( entry )

        if self.OutputFile:
            self.OutputFile.write( entry )
            self.OutputFile.flush()

    def write( self, str ):
        """
        make log behave like a writable file object (for traceback
        prints): the text is logged without a trailing newline and is
        shown on screen
        """
        self.LogEntry( str, 0, 1 )

    # bm log uploading is available back again, as of nodeconfig-5.0-2
    def Upload( self ):
        """
        upload the contents of the log to the server

        Does nothing when there is no output file.  When the settings
        needed for the upload (UPLOAD_LOG_SCRIPT and the hostname and
        domainname under INTERFACE_SETTINGS) are not available, this is
        logged and the output file is left open.  Otherwise the output
        file is closed, so nothing is written to it afterwards, and its
        path is posted to the boot server.
        """
        if self.OutputFile is None:
            return
        self.OutputFile.flush()

        # self.VARS is None if the configuration could not be read, and
        # INTERFACE_SETTINGS is absent if we get here before it was set
        try:
            upload_script= self.VARS['UPLOAD_LOG_SCRIPT']
            interface= self.VARS['INTERFACE_SETTINGS']
            hostname= interface['hostname'] + "." + interface['domainname']
        except (KeyError, TypeError):
            self.LogEntry( "Unable to upload logs: upload settings are not available" )
            return

        self.LogEntry( "Uploading logs to %s" % upload_script )

        self.OutputFile.close()
        self.OutputFile= None

        bs_request = BootServerRequest.BootServerRequest(self.VARS)

        def _post( form_data ):
            # both attempts differ only in how the form data is encoded
            return bs_request.MakeRequest(PartialPath = upload_script,
                                          GetVars = None, PostVars = None,
                                          DoSSL = True, DoCertCheck = True,
                                          FormData = form_data)

        try:
            # this was working until f10
            _post( ["log=@" + self.OutputFilePath,
                    "hostname=" + hostname,
                    "type=bm.log"] )
        except Exception:
            # new pycurl
            import pycurl
            _post( [('log',(pycurl.FORM_FILE, self.OutputFilePath)),
                    ("hostname",hostname),
                    ("type","bm.log")] )
144
145
146 ##############################
class BootManager:
    """
    Runs the sequence of boot manager steps that matches the node's
    boot state, using the variables loaded by the log object.
    """

    # the set of valid node run states; Run() replaces each None with
    # the function that implements the state
    NodeRunStates = {'reinstall':None,
                     'boot':None,
                     'safeboot':None,
                     'disabled':None,
                     }

    def __init__(self, log, forceState):
        """
        log        -- the log object; its VARS attribute supplies the
                      variables shared by all steps
        forceState -- boot state that overrides the node's current one,
                      or None

        CAN_RUN is left at 0, and VARS is not set, when log.VARS is
        empty or None.
        """
        # override machine's current state from the command line
        self.forceState = forceState

        # the main logging point
        self.LOG= log

        # set to 1 if we can run after initialization
        self.CAN_RUN = 0

        if not log.VARS:
            return

        # this contains a set of information used and updated by each step
        self.VARS= log.VARS

        # not sure what the current PATH is set to, replace it with what
        # we know will work with all the boot cds
        os.environ['PATH']= ":".join(BIN_PATH)

        self.CAN_RUN= 1

    def Run(self):
        """
        core boot manager logic.

        the way errors are handled is as such: if any particular step
        cannot continue or unexpectedly fails, an exception is thrown.
        in this case, the boot manager cannot continue running.

        these step functions can also return a 0/1 depending on whether
        or not it succeeded. In the case of steps like ConfirmInstallWithUser,
        a 0 is returned and no exception is thrown if the user chose not
        to confirm the install. The same goes with the CheckHardwareRequirements.
        If requirements not met, but tests were successful, return 0.

        for steps that run within the installer, they are expected to either
        complete successfully and return 1, or throw an exception.

        For exact return values and expected operations, see the comments
        at the top of each of the individual step functions.

        Returns 1 when the function for the boot state ran to completion
        and 0 otherwise.  After a failure other than an authentication
        failure the debug logic is run before returning.
        """

        # notify_messages attribute reported for each failure code of
        # ValidateNodeInstall; any other code uses the default message
        validate_failures = {-1: 'MSG_NODE_FILESYSTEM_CORRUPT',
                             -2: 'MSG_NODE_MOUNT_FAILED',
                             -3: 'MSG_NODE_MISSING_KERNEL'}

        def _nodeNotInstalled(message='MSG_NODE_NOT_INSTALLED'):
            # called by the _xxxRun() functions below upon failure
            self.VARS['RUN_LEVEL']= 'failboot'
            notify = getattr(notify_messages, message)
            self.VARS['STATE_CHANGE_NOTIFY']= 1
            self.VARS['STATE_CHANGE_NOTIFY_MESSAGE']= notify
            raise BootManagerException(notify)

        def _startFallbackDebug():
            # starting the fallback/debug ssh daemon for safety:
            # if the node install somehow hangs, or if it simply takes ages,
            # we can still enter and investigate.
            # this is best effort: a failure is logged and otherwise ignored
            try:
                StartDebug.Run(self.VARS, self.LOG, last_resort = False)
            except Exception:
                self.LOG.write( "\nUnable to start fallback debug: %s\n" % str(sys.exc_info()[1]) )

        def _reportImplementationError():
            # log the exception currently being handled, to the log file
            # and to the default traceback output
            self.LOG.write( "\n\nImplementation Error\n")
            traceback.print_exc(file=self.LOG.OutputFile)
            traceback.print_exc()

        def _bootRun():
            # implements the boot logic, which consists of first
            # double checking that the node was properly installed,
            # checking whether someone added or changed disks, and
            # then finally chain boots.
            _startFallbackDebug()

            InstallInit.Run( self.VARS, self.LOG )
            ret = ValidateNodeInstall.Run( self.VARS, self.LOG )
            if ret == 1:
                WriteModprobeConfig.Run( self.VARS, self.LOG )
                WriteNetworkConfig.Run( self.VARS, self.LOG )
                CheckForNewDisks.Run( self.VARS, self.LOG )
                SendHardwareConfigToPLC.Run( self.VARS, self.LOG )
                ChainBootNode.Run( self.VARS, self.LOG )
            elif ret in validate_failures:
                _nodeNotInstalled(validate_failures[ret])
            else:
                _nodeNotInstalled()

        def _reinstallRun():
            _startFallbackDebug()

            # implements the reinstall logic, which will check whether
            # the min. hardware requirements are met, install the
            # software, and upon correct installation will switch to
            # 'boot' state and chainboot into the production system
            if not CheckHardwareRequirements.Run( self.VARS, self.LOG ):
                self.VARS['RUN_LEVEL']= 'failboot'
                raise BootManagerException("Hardware requirements not met.")

            # runinstaller
            InstallInit.Run( self.VARS, self.LOG )
            InstallPartitionDisks.Run( self.VARS, self.LOG )
            InstallBootstrapFS.Run( self.VARS, self.LOG )
            InstallWriteConfig.Run( self.VARS, self.LOG )
            InstallUninitHardware.Run( self.VARS, self.LOG )
            self.VARS['BOOT_STATE']= 'boot'
            self.VARS['STATE_CHANGE_NOTIFY']= 1
            self.VARS['STATE_CHANGE_NOTIFY_MESSAGE']= \
                 notify_messages.MSG_INSTALL_FINISHED
            UpdateBootStateWithPLC.Run( self.VARS, self.LOG )
            _bootRun()

        def _installRun():
            # implements the new install logic, which will first check
            # with the user whether it is ok to install on this
            # machine, switch to 'reinstall' state and then invoke the reinstall
            # logic.  See the _reinstallRun comments for further details.
            # (no entry of the state table below refers to this function)
            if not ConfirmInstallWithUser.Run( self.VARS, self.LOG ):
                return 0
            self.VARS['BOOT_STATE']= 'reinstall'
            UpdateRunLevelWithPLC.Run( self.VARS, self.LOG )
            _reinstallRun()

        def _debugRun(state='failboot'):
            # implements debug logic, which starts the sshd and just waits around
            self.VARS['RUN_LEVEL']=state
            UpdateRunLevelWithPLC.Run( self.VARS, self.LOG )
            StartDebug.Run( self.VARS, self.LOG )
            # fsck/mount fs if present, and ignore return value if it's not.
            ValidateNodeInstall.Run( self.VARS, self.LOG )

        def _badstateRun():
            # should never happen; log event
            self.LOG.write( "\nInvalid BOOT_STATE = %s\n" % self.VARS['BOOT_STATE'])
            _debugRun()

        # setup state -> function hash table
        BootManager.NodeRunStates['reinstall']  = _reinstallRun
        BootManager.NodeRunStates['boot']       = _bootRun
        BootManager.NodeRunStates['safeboot']   = lambda : _debugRun('safeboot')
        BootManager.NodeRunStates['disabled']   = lambda : _debugRun('disabled')

        success = 0
        try:
            InitializeBootManager.Run( self.VARS, self.LOG )
            ReadNodeConfiguration.Run( self.VARS, self.LOG )
            AuthenticateWithPLC.Run( self.VARS, self.LOG )
            StartRunlevelAgent.Run( self.VARS, self.LOG )
            GetAndUpdateNodeDetails.Run( self.VARS, self.LOG )

            # override machine's current state from the command line
            if self.forceState is not None:
                self.VARS['BOOT_STATE']= self.forceState
                UpdateBootStateWithPLC.Run( self.VARS, self.LOG )
                UpdateRunLevelWithPLC.Run( self.VARS, self.LOG )

            stateRun = BootManager.NodeRunStates.get(self.VARS['BOOT_STATE'],_badstateRun)
            stateRun()
            success = 1

        except KeyError:
            self.LOG.write( "\n\nKeyError while running: %s\n" % str(sys.exc_info()[1]) )
        except BootManagerAuthenticationException:
            # listed ahead of the BootManagerException handler so that it
            # is still reached should this exception derive from it
            self.LOG.write( "\n\nFailed to Authenticate Node: %s\n" % str(sys.exc_info()[1]) )
            # sets /tmp/CANCEL_BOOT flag
            StartDebug.Run(self.VARS, self.LOG )
            # Return immediately b/c any other calls to API will fail
            return success
        except BootManagerException:
            self.LOG.write( "\n\nException while running: %s\n" % str(sys.exc_info()[1]) )
        except:
            # deliberate catch-all: whatever went wrong, fall through to
            # the debug logic below
            _reportImplementationError()

        if not success:
            try:
                _debugRun()
            except BootManagerException:
                self.LOG.write( "\n\nException while running: %s\n" % str(sys.exc_info()[1]) )
            except:
                _reportImplementationError()

        return success
346             
347             
def main(argv):
    """
    Entry point of the boot manager.

    argv -- the command line; when it has exactly two items the second
            one is the boot state to force, which must be a key of
            BootManager.NodeRunStates

    Returns 1 when the requested state is not valid or when
    BootManager.Run() reports failure, and 0 otherwise.  Exceptions
    raised while building or running the BootManager are printed and do
    not change the returned value.
    """

    import utils
    utils.prompt_for_breakpoint_mode()

    utils.breakpoint ("Entering BootManager::main")

    stamp_format = "%a, %d %b %Y %H:%M:%S +0000"

    def _finish(status):
        # log the closing timestamp, upload the log, hand back the status
        LOG.LogEntry( "BootManager finished at: %s" % \
                      time.strftime(stamp_format, time.gmtime()) )
        LOG.Upload()
        return status

    # set to 1 if error occurred
    error = 0

    # all output goes through this class so we can save it and post
    # the data back to PlanetLab central
    LOG = log( BM_NODE_LOG )

    LOG.LogEntry( "BootManager started at: %s" % \
                  time.strftime(stamp_format, time.gmtime()) )

    try:
        forceState = None
        if len(argv) == 2:
            requested = argv[1]
            if requested in BootManager.NodeRunStates:
                forceState = requested
            else:
                LOG.LogEntry("FATAL: cannot force node run state to=%s" % requested)
                error = 1
    except:
        traceback.print_exc(file=LOG.OutputFile)
        traceback.print_exc()

    if error:
        return _finish(error)

    try:
        bm = BootManager(LOG, forceState)
        if bm.CAN_RUN != 0:
            LOG.LogEntry( "Running version %s of BootManager." % bm.VARS['VERSION'] )
            if bm.Run():
                LOG.LogEntry( "\nDone!" )
            else:
                LOG.LogEntry( "\nError occurred!" )
                error = 1
        else:
            # NOTE(review): the status stays 0 on this path - confirm
            # that this is what callers expect
            LOG.LogEntry( "Unable to initialize BootManager." )
    except:
        traceback.print_exc(file=LOG.OutputFile)
        traceback.print_exc()

    return _finish(error)
405
406     
if __name__ == "__main__":
    # run the boot manager and exit with the status that main() returns
    sys.exit(main(sys.argv))