* improve availability - reliability : start a fallback sshd very early in the bm...
[bootmanager.git] / source / BootManager.py
1 #!/usr/bin/python -u
2
3 # Copyright (c) 2003 Intel Corporation
4 # All rights reserved.
5 #
6 # Copyright (c) 2004-2006 The Trustees of Princeton University
7 # All rights reserved.
8
9 import string
10 import sys, os, traceback
11 import time
12 import gzip
13
14 from steps import *
15 from Exceptions import *
16 import notify_messages
17 import BootServerRequest
18
# every line of boot manager output ends up in this file on the node
BM_NODE_LOG = "/tmp/bm.log"
# partial path of the boot server request the log file is posted to
UPLOAD_LOG_SCRIPT = "/boot/upload-bmlog.php"

# directories making up PATH for as long as the boot manager runs
BIN_PATH = (
    '/usr/local/bin',
    '/usr/local/sbin',
    '/usr/bin',
    '/usr/sbin',
    '/bin',
    '/sbin',
)
30            
31 ##############################
class log:
    """
    timestamped logger for the boot manager.

    every entry is written to a file on the node (when that file could
    be opened) and, optionally, to stdout. the file can afterwards be
    uploaded to the boot server with Upload().
    """

    # strftime pattern prepended to every entry
    format="%H:%M:%S(%Z) "

    def __init__( self, OutputFilePath= None ):
        """
        open OutputFilePath for writing, truncating any previous content.

        if the file cannot be opened a message is printed and logging
        carries on without a file; OutputFile and OutputFilePath are
        then both None.
        """
        # define both attributes up front so that they always exist,
        # even when the file cannot be opened
        self.OutputFile= None
        self.OutputFilePath= None
        try:
            self.OutputFile= open( OutputFilePath, "w")
            self.OutputFilePath= OutputFilePath
        except (IOError, OSError, TypeError, ValueError):
            # TypeError/ValueError cover a missing (None) or malformed path
            print( "bootmanager log : Unable to open output file %r, continuing"%OutputFilePath )

    def LogEntry( self, str, inc_newline= 1, display_screen= 1 ):
        """
        write one timestamped entry.

        str            -- the text of the entry
        inc_newline    -- when true, terminate the entry with a newline
        display_screen -- when true, echo the entry on stdout as well
        """
        entry= time.strftime(log.format, time.localtime()) + str
        if inc_newline:
            entry= entry + "\n"

        if display_screen:
            sys.stdout.write( entry )

        if self.OutputFile:
            self.OutputFile.write( entry )
            # flush right away so the file is complete if we stop abruptly
            self.OutputFile.flush()

    def write( self, str ):
        """
        make log behave like a writable file object (for traceback
        prints)
        """
        self.LogEntry( str, 0, 1 )

    # bm log uploading is available back again, as of nodeconfig-5.0-2
    def Upload( self ):
        """
        upload the contents of the log to the server.

        does nothing when there is no open log file. the file is closed
        before it is sent, so later entries only reach stdout.
        """
        if self.OutputFile is not None:
            self.OutputFile.flush()

            self.LogEntry( "Uploading logs to %s" % UPLOAD_LOG_SCRIPT )

            self.OutputFile.close()
            self.OutputFile= None

            bs_request = BootServerRequest.BootServerRequest()
            bs_request.MakeRequest(PartialPath = UPLOAD_LOG_SCRIPT,
                                   GetVars = None, PostVars = None,
                                   FormData = ["log=@" + self.OutputFilePath],
                                   DoSSL = True, DoCertCheck = True)
85
86 ##############################
class BootManager:
    """
    reads the initial configuration variables and then runs the
    sequence of steps that corresponds to the node's boot state.
    """

    # file containing initial variables/constants
    VARS_FILE = "configuration"

    # the set of valid node run states; Run() fills in the function
    # that implements each of them
    NodeRunStates = {'install':None,
                     'reinstall':None,
                     'boot':None,
                     'failboot':None,
                     'safeboot':None,
                     'disabled':None,
                     }

    def __init__(self, log, forceState):
        """
        load the variables from VARS_FILE and prepare the environment.

        log        -- logging object; its LogEntry() method is used to
                      report problems
        forceState -- boot state that overrides the node's current one
                      in Run(), or None

        CAN_RUN is left at 0, and VARS is not set, when the vars file
        contains an invalid line. a vars file that cannot be opened
        raises the error from open().
        """
        # override machine's current state from the command line
        self.forceState = forceState

        # the main logging point
        self.LOG= log

        # set to 1 if we can run after initialization
        self.CAN_RUN = 0

        # read in and store all variables in VARS_FILE. each line
        # is in the format name=val (any whitespace around the = is
        # removed). everything after the first = to the end of the
        # line is the value
        config_vars = {}
        validConfFile = True
        vars_file= open(self.VARS_FILE,'r')
        try:
            for line in vars_file:
                # if its a comment or a whitespace line, ignore
                if line[:1] == "#" or line.strip() == "":
                    continue

                # split on the first = only: a value may itself contain =
                parts= line.split("=",1)
                if len(parts) != 2:
                    self.LOG.LogEntry( "Invalid line in vars file: %s" % line )
                    validConfFile = False
                    break

                name= parts[0].strip()
                value= parts[1].strip()
                config_vars[name]= value
        finally:
            # close the file even if reading it raised
            vars_file.close()

        if not validConfFile:
            self.LOG.LogEntry( "Unable to read configuration vars." )
            return

        # find out which directory we are running it, and set a variable
        # for that. future steps may need to get files out of the bootmanager
        # directory
        current_dir= os.getcwd()
        config_vars['BM_SOURCE_DIR']= current_dir

        # not sure what the current PATH is set to, replace it with what
        # we know will work with all the boot cds
        os.environ['PATH']= ":".join(BIN_PATH)

        # this contains a set of information used and updated by each step
        self.VARS= config_vars

        self.CAN_RUN= 1

    def Run(self):
        """
        core boot manager logic. returns 1 on success, 0 otherwise.

        the way errors are handled is as such: if any particular step
        cannot continue or unexpectedly fails, an exception is thrown.
        in this case, the boot manager cannot continue running.

        these step functions can also return a 0/1 depending on whether
        or not it succeeded. In the case of steps like ConfirmInstallWithUser,
        a 0 is returned and no exception is thrown if the user chose not
        to confirm the install. The same goes with the CheckHardwareRequirements.
        If requirements not met, but tests were successful, return 0.

        for steps that run within the installer, they are expected to either
        complete successfully and return 1, or throw an exception.

        For exact return values and expected operations, see the comments
        at the top of each of the individual step functions.
        """

        def _nodeNotInstalled():
            # called by the _xxxState() functions below upon failure
            self.VARS['BOOT_STATE']= 'failboot'
            self.VARS['STATE_CHANGE_NOTIFY']= 1
            self.VARS['STATE_CHANGE_NOTIFY_MESSAGE']= \
                      notify_messages.MSG_NODE_NOT_INSTALLED
            raise BootManagerException(notify_messages.MSG_NODE_NOT_INSTALLED)

        def _startFallbackSshd():
            # starting the fallback/debug ssh daemon for safety:
            # if the node install somehow hangs, or if it simply takes ages,
            # we can still enter and investigate.
            # this is best effort only, hence the deliberate catch-all:
            # a failure here must never prevent the node from booting
            try:
                StartDebug.Run(self.VARS, self.LOG, last_resort = False)
            except:
                self.LOG.LogEntry( "Unable to start the fallback sshd, continuing" )

        def _bootRun():
            # implements the boot logic, which consists of first
            # double checking that the node was properly installed,
            # checking whether someone added or changed disks, and
            # then finally chain boots.
            _startFallbackSshd()

            InstallInit.Run( self.VARS, self.LOG )
            if ValidateNodeInstall.Run( self.VARS, self.LOG ):
                WriteModprobeConfig.Run( self.VARS, self.LOG )
                MakeInitrd.Run( self.VARS, self.LOG )
                WriteNetworkConfig.Run( self.VARS, self.LOG )
                CheckForNewDisks.Run( self.VARS, self.LOG )
                SendHardwareConfigToPLC.Run( self.VARS, self.LOG )
                ChainBootNode.Run( self.VARS, self.LOG )
            else:
                _nodeNotInstalled()

        def _reinstallRun():
            # implements the reinstall logic, which will check whether
            # the min. hardware requirements are met, install the
            # software, and upon correct installation will switch to
            # 'boot' state and chainboot into the production system
            _startFallbackSshd()

            if not CheckHardwareRequirements.Run( self.VARS, self.LOG ):
                self.VARS['BOOT_STATE']= 'failboot'
                raise BootManagerException("Hardware requirements not met.")

            # runinstaller
            InstallInit.Run( self.VARS, self.LOG )
            InstallPartitionDisks.Run( self.VARS, self.LOG )
            InstallBootstrapFS.Run( self.VARS, self.LOG )
            InstallWriteConfig.Run( self.VARS, self.LOG )
            InstallUninitHardware.Run( self.VARS, self.LOG )
            self.VARS['BOOT_STATE']= 'boot'
            self.VARS['STATE_CHANGE_NOTIFY']= 1
            self.VARS['STATE_CHANGE_NOTIFY_MESSAGE']= \
                 notify_messages.MSG_INSTALL_FINISHED
            UpdateBootStateWithPLC.Run( self.VARS, self.LOG )
            _bootRun()

        def _installRun():
            # implements the new install logic, which will first check
            # with the user whether it is ok to install on this
            # machine, switch to 'reinstall' state and then invoke the reinstall
            # logic.  See reinstallState logic comments for further
            # details.
            if not ConfirmInstallWithUser.Run( self.VARS, self.LOG ):
                return 0
            self.VARS['BOOT_STATE']= 'reinstall'
            UpdateBootStateWithPLC.Run( self.VARS, self.LOG )
            _reinstallRun()

        def _debugRun(state='failboot'):
            # implements debug logic, which starts the sshd and just waits around
            self.VARS['BOOT_STATE']=state
            UpdateBootStateWithPLC.Run( self.VARS, self.LOG )
            StartDebug.Run( self.VARS, self.LOG )

        def _badstateRun():
            # should never happen; log event
            self.LOG.write( "\nInvalid BOOT_STATE = %s\n" % self.VARS['BOOT_STATE'])
            _debugRun()

        def _reportFailure():
            # log the exception being handled. it is fetched through
            # sys.exc_info() rather than bound in the except clause, so
            # that this file parses whatever the python version
            error = sys.exc_info()[1]
            if isinstance(error, KeyError):
                self.LOG.write( "\n\nKeyError while running: %s\n" % str(error) )
            elif isinstance(error, BootManagerException):
                self.LOG.write( "\n\nException while running: %s\n" % str(error) )
            else:
                self.LOG.write( "\n\nImplementation Error\n")
                traceback.print_exc(file=self.LOG.OutputFile)
                traceback.print_exc()

        # setup state -> function hash table
        BootManager.NodeRunStates['install']    = _installRun
        BootManager.NodeRunStates['reinstall']  = _reinstallRun
        BootManager.NodeRunStates['boot']       = _bootRun
        BootManager.NodeRunStates['failboot']   = _bootRun   # should always try to boot.
        BootManager.NodeRunStates['safeboot']   = lambda : _debugRun('safeboot')
        BootManager.NodeRunStates['disabled']   = lambda : _debugRun('disabled')

        success = 0
        try:
            InitializeBootManager.Run( self.VARS, self.LOG )
            ReadNodeConfiguration.Run( self.VARS, self.LOG )
            AuthenticateWithPLC.Run( self.VARS, self.LOG )
            GetAndUpdateNodeDetails.Run( self.VARS, self.LOG )

            # override machine's current state from the command line
            if self.forceState is not None:
                self.VARS['BOOT_STATE']= self.forceState
                UpdateBootStateWithPLC.Run( self.VARS, self.LOG )

            stateRun = BootManager.NodeRunStates.get(self.VARS['BOOT_STATE'],_badstateRun)
            stateRun()
            success = 1

        except:
            # catch-all on purpose: whatever went wrong, report it and
            # fall back to debug mode below
            _reportFailure()

        if not success:
            try:
                _debugRun()
            except KeyError:
                # not handled specially at this stage, as before
                self.LOG.write( "\n\nImplementation Error\n")
                traceback.print_exc(file=self.LOG.OutputFile)
                traceback.print_exc()
            except:
                _reportFailure()

        return success
306             
307             
def main(argv):
    """
    entry point of the boot manager.

    argv -- the command line; when it holds exactly one argument, that
            argument is the node run state to force, and must be one
            of the keys of BootManager.NodeRunStates

    returns 0 on success, and 1 when the forced state is invalid, when
    the boot manager cannot be initialized, or when running it fails.
    """

    import utils
    utils.prompt_for_breakpoint_mode()

    utils.breakpoint ("Entering BootManager::main")

    # set to 1 if error occurred
    error= 0

    # all output goes through this class so we can save it and post
    # the data back to PlanetLab central
    LOG= log( BM_NODE_LOG )

    def _finish():
        # log the time we are done at, then send the log to the server
        LOG.LogEntry( "BootManager finished at: %s" % \
                      time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime()) )
        LOG.Upload()

    LOG.LogEntry( "BootManager started at: %s" % \
                  time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime()) )

    forceState = None
    try:
        if len(argv) == 2:
            fState = argv[1]
            if fState in BootManager.NodeRunStates:
                forceState = fState
            else:
                LOG.LogEntry("FATAL: cannot force node run state to=%s" % fState)
                error = 1
    except:
        traceback.print_exc(file=LOG.OutputFile)
        traceback.print_exc()

    if error:
        _finish()
        return error

    try:
        bm= BootManager(LOG,forceState)
        if bm.CAN_RUN == 0:
            LOG.LogEntry( "Unable to initialize BootManager." )
            # not being able to initialize is a failure as well
            error = 1
        else:
            LOG.LogEntry( "Running version %s of BootManager." % bm.VARS['VERSION'] )
            success= bm.Run()
            if success:
                LOG.LogEntry( "\nDone!" )
            else:
                LOG.LogEntry( "\nError occurred!" )
                error = 1
    except:
        # last line of defense: keep a trace of whatever went wrong in
        # the log and on the screen, and report the failure to the caller
        traceback.print_exc(file=LOG.OutputFile)
        traceback.print_exc()
        error = 1

    _finish()

    return error
365
366     
if __name__ == "__main__":
    # run the boot manager on the command line, and exit with the
    # error status that main() returns
    sys.exit(main(sys.argv))