run ValidateNodeInstall in debug/disabled/diagnose mode to fsck/mount the fs
[bootmanager.git] / source / BootManager.py
1 #!/usr/bin/python -u
2
3 # Copyright (c) 2003 Intel Corporation
4 # All rights reserved.
5 #
6 # Copyright (c) 2004-2006 The Trustees of Princeton University
7 # All rights reserved.
8
9 import string
10 import sys, os, traceback
11 import time
12 import gzip
13
14 from steps import *
15 from Exceptions import *
16 import notify_messages
17 import BootServerRequest
18
# node-local file that receives every log entry
BM_NODE_LOG = "/tmp/bm.log"
# boot server path the log file is posted to by log.Upload()
UPLOAD_LOG_SCRIPT = "/boot/upload-bmlog.php"

# directories that make up PATH while the boot manager runs, in
# lookup order (local overrides first, then /usr, then the root dirs)
BIN_PATH = (
    '/usr/local/bin', '/usr/local/sbin',
    '/usr/bin', '/usr/sbin',
    '/bin', '/sbin',
)
30            
31 ##############################
class log:
    """
    Timestamped logger that mirrors entries to stdout and to a file,
    and can post that file to the boot server.

    If the output file cannot be opened the logger keeps working in
    screen-only mode (OutputFile is None).
    """

    # strftime pattern prepended to every entry
    format="%H:%M:%S(%Z) "

    def __init__( self, OutputFilePath= None ):
        """
        open OutputFilePath for writing (truncating it); on failure
        print a notice and continue without a file.
        """
        # record the path unconditionally so the attribute always exists,
        # even when the open below fails
        self.OutputFilePath= OutputFilePath
        try:
            self.OutputFile= open( OutputFilePath, "w")
        except Exception:
            # best effort: a missing/unwritable log file must not stop
            # the boot manager, but do not swallow KeyboardInterrupt or
            # SystemExit the way a bare except would
            print( "bootmanager log : Unable to open output file %r, continuing"%OutputFilePath )
            self.OutputFile= None

    def LogEntry( self, str, inc_newline= 1, display_screen= 1 ):
        """
        write one timestamped entry.

        str            -- text of the entry (name kept for keyword callers,
                          it shadows the builtin inside this method)
        inc_newline    -- append a newline when true
        display_screen -- also echo the entry to stdout when true
        """
        entry= time.strftime(log.format, time.localtime()) + str
        if inc_newline:
            entry= entry + "\n"

        if display_screen:
            sys.stdout.write( entry )

        if self.OutputFile:
            self.OutputFile.write( entry )
            # flush after every entry so the file is current if we crash
            self.OutputFile.flush()

    def write( self, str ):
        """
        make log behave like a writable file object (for traceback
        prints): the text is logged without an added newline and is
        echoed to the screen.
        """
        self.LogEntry( str, 0, 1 )

    # bm log uploading is available back again, as of nodeconfig-5.0-2
    def Upload( self ):
        """
        upload the contents of the log to the server.

        does nothing when no log file is open. the file is closed before
        the request is made, so entries logged afterwards only reach the
        screen.
        """
        if self.OutputFile is None:
            return

        self.OutputFile.flush()

        self.LogEntry( "Uploading logs to %s" % UPLOAD_LOG_SCRIPT )

        self.OutputFile.close()
        self.OutputFile= None

        bs_request = BootServerRequest.BootServerRequest()
        bs_request.MakeRequest(PartialPath = UPLOAD_LOG_SCRIPT,
                               GetVars = None, PostVars = None,
                               FormData = ["log=@" + self.OutputFilePath],
                               DoSSL = True, DoCertCheck = True)
85
86 ##############################
class BootManager:
    """
    Reads the boot manager configuration file and runs the sequence of
    steps that matches the node's BOOT_STATE.

    After construction, CAN_RUN tells whether the configuration was read
    successfully; VARS (only set in that case) is the dictionary shared
    with, and updated by, every step.
    """

    # file containing initial variables/constants
    VARS_FILE = "configuration"

    # the set of valid node run states; Run() fills in the handler
    # function for each state
    NodeRunStates = {'install':None,
                     'reinstall':None,
                     'boot':None,
                     'failboot':None,
                     'safeboot':None,
                     'disabled':None,
                     }

    def __init__(self, log, forceState):
        """
        log        -- logger object; LogEntry() is used here, write() and
                      OutputFile are used by Run()
        forceState -- run state forced from the command line, or None to
                      use the state obtained while running
        """
        # override machine's current state from the command line
        self.forceState = forceState

        # the main logging point
        self.LOG= log

        # set to 1 if we can run after initialization
        self.CAN_RUN = 0

        # read in and store all variables in VARS_FILE. each line
        # is in the format name=val (any whitespace around the = is
        # removed). everything after the first = to the end of the line
        # is the value, so values may themselves contain '='
        conf_vars = {}
        validConfFile = True
        vars_file= open(self.VARS_FILE,'r')
        try:
            for line in vars_file:
                # if its a comment or a whitespace line, ignore
                if line[:1] == "#" or line.strip() == "":
                    continue

                # split on the first '=' only
                parts= line.split("=",1)
                if len(parts) != 2:
                    self.LOG.LogEntry( "Invalid line in vars file: %s" % line )
                    validConfFile = False
                    break

                conf_vars[parts[0].strip()]= parts[1].strip()
        finally:
            # close the file even if reading or logging raised
            vars_file.close()

        if not validConfFile:
            self.LOG.LogEntry( "Unable to read configuration vars." )
            return

        # find out which directory we are running it, and set a variable
        # for that. future steps may need to get files out of the bootmanager
        # directory
        conf_vars['BM_SOURCE_DIR']= os.getcwd()

        # not sure what the current PATH is set to, replace it with what
        # we know will work with all the boot cds
        os.environ['PATH']= ":".join(BIN_PATH)

        # this contains a set of information used and updated by each step
        self.VARS= conf_vars

        self.CAN_RUN= 1

    def Run(self):
        """
        core boot manager logic.

        the way errors are handled is as such: if any particular step
        cannot continue or unexpectedly fails, an exception is thrown.
        in this case, the boot manager cannot continue running.

        these step functions can also return a 0/1 depending on whether
        or not it succeeded. In the case of steps like ConfirmInstallWithUser,
        a 0 is returned and no exception is thrown if the user chose not
        to confirm the install. The same goes with the CheckHardwareRequirements.
        If requirements not met, but tests were successful, return 0.

        for steps that run within the installer, they are expected to either
        complete successfully and return 1, or throw an exception.

        For exact return values and expected operations, see the comments
        at the top of each of the individual step functions.

        returns 1 if the state handler ran to completion without raising,
        0 otherwise. on failure the debug logic is run before returning.
        the state handler's own return value is not looked at.
        """

        def _nodeNotInstalled():
            # called by the _xxxState() functions below upon failure
            self.VARS['BOOT_STATE']= 'failboot'
            self.VARS['STATE_CHANGE_NOTIFY']= 1
            self.VARS['STATE_CHANGE_NOTIFY_MESSAGE']= \
                      notify_messages.MSG_NODE_NOT_INSTALLED
            raise BootManagerException(notify_messages.MSG_NODE_NOT_INSTALLED)

        def _startFallbackDebug():
            # starting the fallback/debug ssh daemon for safety:
            # if the node install somehow hangs, or if it simply takes ages,
            # we can still enter and investigate.
            # deliberately best effort: a failure here must not prevent
            # the actual boot/install from being attempted
            try:
                StartDebug.Run(self.VARS, self.LOG, last_resort = False)
            except:
                pass

        def _bootRun():
            # implements the boot logic, which consists of first
            # double checking that the node was properly installed,
            # checking whether someone added or changed disks, and
            # then finally chain boots.
            _startFallbackDebug()

            InstallInit.Run( self.VARS, self.LOG )
            if not ValidateNodeInstall.Run( self.VARS, self.LOG ):
                # raises BootManagerException
                _nodeNotInstalled()

            WriteModprobeConfig.Run( self.VARS, self.LOG )
            MakeInitrd.Run( self.VARS, self.LOG )
            WriteNetworkConfig.Run( self.VARS, self.LOG )
            CheckForNewDisks.Run( self.VARS, self.LOG )
            SendHardwareConfigToPLC.Run( self.VARS, self.LOG )
            ChainBootNode.Run( self.VARS, self.LOG )

        def _reinstallRun():
            # implements the reinstall logic, which will check whether
            # the min. hardware requirements are met, install the
            # software, and upon correct installation will switch to
            # 'boot' state and chainboot into the production system
            _startFallbackDebug()

            if not CheckHardwareRequirements.Run( self.VARS, self.LOG ):
                self.VARS['BOOT_STATE']= 'failboot'
                raise BootManagerException("Hardware requirements not met.")

            # runinstaller
            InstallInit.Run( self.VARS, self.LOG )
            InstallPartitionDisks.Run( self.VARS, self.LOG )
            InstallBootstrapFS.Run( self.VARS, self.LOG )
            InstallWriteConfig.Run( self.VARS, self.LOG )
            InstallUninitHardware.Run( self.VARS, self.LOG )
            self.VARS['BOOT_STATE']= 'boot'
            self.VARS['STATE_CHANGE_NOTIFY']= 1
            self.VARS['STATE_CHANGE_NOTIFY_MESSAGE']= \
                 notify_messages.MSG_INSTALL_FINISHED
            UpdateBootStateWithPLC.Run( self.VARS, self.LOG )
            _bootRun()

        def _installRun():
            # implements the new install logic, which will first check
            # with the user whether it is ok to install on this
            # machine, switch to 'reinstall' state and then invoke the reinstall
            # logic.  See reinstallState logic comments for further
            # details.
            if not ConfirmInstallWithUser.Run( self.VARS, self.LOG ):
                return 0
            self.VARS['BOOT_STATE']= 'reinstall'
            UpdateBootStateWithPLC.Run( self.VARS, self.LOG )
            _reinstallRun()

        def _debugRun(state='failboot'):
            # implements debug logic, which starts the sshd and just waits around
            self.VARS['BOOT_STATE']=state
            UpdateBootStateWithPLC.Run( self.VARS, self.LOG )
            StartDebug.Run( self.VARS, self.LOG )
            # fsck/mount fs if present, and ignore return value if it's not.
            ValidateNodeInstall.Run( self.VARS, self.LOG )

        def _badstateRun():
            # should never happen; log event
            self.LOG.write( "\nInvalid BOOT_STATE = %s\n" % self.VARS['BOOT_STATE'])
            _debugRun()

        # setup state -> function hash table
        BootManager.NodeRunStates['install']    = _installRun
        BootManager.NodeRunStates['reinstall']  = _reinstallRun
        BootManager.NodeRunStates['boot']       = _bootRun
        BootManager.NodeRunStates['failboot']   = _bootRun   # should always try to boot.
        BootManager.NodeRunStates['safeboot']   = lambda : _debugRun('safeboot')
        BootManager.NodeRunStates['disabled']   = lambda : _debugRun('disabled')

        # note on the handlers below: the exception instance is fetched
        # with sys.exc_info() rather than "except X, e", which behaves the
        # same but also parses on interpreters without the comma form

        success = 0
        try:
            InitializeBootManager.Run( self.VARS, self.LOG )
            ReadNodeConfiguration.Run( self.VARS, self.LOG )
            AuthenticateWithPLC.Run( self.VARS, self.LOG )
            GetAndUpdateNodeDetails.Run( self.VARS, self.LOG )

            # override machine's current state from the command line
            if self.forceState is not None:
                self.VARS['BOOT_STATE']= self.forceState
                UpdateBootStateWithPLC.Run( self.VARS, self.LOG )

            # unknown states fall back to _badstateRun
            stateRun = BootManager.NodeRunStates.get(self.VARS['BOOT_STATE'],_badstateRun)
            stateRun()
            success = 1

        except KeyError:
            self.LOG.write( "\n\nKeyError while running: %s\n" % str(sys.exc_info()[1]) )
        except BootManagerException:
            self.LOG.write( "\n\nException while running: %s\n" % str(sys.exc_info()[1]) )
        except:
            # last line of defence: anything else is a programming error,
            # record the traceback both in the log file and on the console
            self.LOG.write( "\n\nImplementation Error\n")
            traceback.print_exc(file=self.LOG.OutputFile)
            traceback.print_exc()

        if not success:
            # whatever went wrong, leave the node reachable in debug mode
            try:
                _debugRun()
            except BootManagerException:
                self.LOG.write( "\n\nException while running: %s\n" % str(sys.exc_info()[1]) )
            except:
                self.LOG.write( "\n\nImplementation Error\n")
                traceback.print_exc(file=self.LOG.OutputFile)
                traceback.print_exc()

        return success
308             
309             
def main(argv):
    """
    command line entry point.

    argv -- argument list as in sys.argv; when it has exactly two items,
            argv[1] is a run state to force and must be one of the keys
            of BootManager.NodeRunStates

    creates the log, runs the BootManager, then logs the finish time and
    uploads the log.

    returns 0, or 1 if the forced state is invalid or BootManager.Run()
    reported failure.
    """

    import utils
    utils.prompt_for_breakpoint_mode()

    utils.breakpoint ("Entering BootManager::main")

    # set to 1 if error occurred
    error= 0

    # all output goes through this class so we can save it and post
    # the data back to PlanetLab central
    LOG= log( BM_NODE_LOG )

    # start/finish stamps are taken from gmtime(), hence the fixed +0000
    stamp_format= "%a, %d %b %Y %H:%M:%S +0000"

    def _finish(status):
        # common epilogue: stamp the end of the run, ship the log
        LOG.LogEntry( "BootManager finished at: %s" % \
                      time.strftime(stamp_format, time.gmtime()) )
        LOG.Upload()
        return status

    LOG.LogEntry( "BootManager started at: %s" % \
                  time.strftime(stamp_format, time.gmtime()) )

    forceState = None
    try:
        if len(argv) == 2:
            fState = argv[1]
            if fState in BootManager.NodeRunStates:
                forceState = fState
            else:
                LOG.LogEntry("FATAL: cannot force node run state to=%s" % fState)
                error = 1
    except:
        traceback.print_exc(file=LOG.OutputFile)
        traceback.print_exc()

    if error:
        return _finish(error)

    try:
        bm= BootManager(LOG,forceState)
        if bm.CAN_RUN == 0:
            # NOTE(review): error stays 0 on this path, as it always did -
            # confirm whether the caller expects a non-zero status here
            LOG.LogEntry( "Unable to initialize BootManager." )
        else:
            LOG.LogEntry( "Running version %s of BootManager." % bm.VARS['VERSION'] )
            success= bm.Run()
            if success:
                LOG.LogEntry( "\nDone!" )
            else:
                LOG.LogEntry( "\nError occurred!" )
                error = 1
    except:
        # NOTE(review): the traceback is recorded but error is left
        # untouched, so an unexpected exception still returns 0 - confirm
        # that this is intended before changing it
        traceback.print_exc(file=LOG.OutputFile)
        traceback.print_exc()

    return _finish(error)
367
368     
if __name__ == "__main__":
    # run the boot manager and hand its status back as the exit code
    sys.exit( main(sys.argv) )