for debugging - print stack in case of exceptions caught at toplevel
[bootmanager.git] / source / BootManager.py
1 #!/usr/bin/python -u
2 #
3 # Copyright (c) 2003 Intel Corporation
4 # All rights reserved.
5 #
6 # Copyright (c) 2004-2006 The Trustees of Princeton University
7 # All rights reserved.
8
9 import sys, os
10 import traceback
11 import string
12 import time
13 import gzip
14
15 from steps import *
16 from Exceptions import *
17 import notify_messages
18 import BootServerRequest
19 import utils
20
# path of the file that receives everything the boot manager logs
BM_NODE_LOG = "/tmp/bm.log"
# name of the file holding the initial name=val variables; it has no
# directory part, so it is looked up in the current working directory
VARS_FILE = "configuration"

# directories that make up PATH while the boot manager is running,
# in lookup order
BIN_PATH = (
    '/usr/local/bin',
    '/usr/local/sbin',
    '/usr/bin',
    '/usr/sbin',
    '/bin',
    '/sbin',
)
32
def read_configuration_file(filename):
    """
    parse a boot manager variables file into a dictionary.

    each meaningful line is in the format name=val: whitespace around
    the name and around the value is removed, everything after the
    first = up to the end of the line is the value (so a value may
    itself contain = characters), and all single and double quote
    characters are dropped from the value. lines starting with # and
    whitespace-only lines are ignored.

    the current working directory is stored under 'BM_SOURCE_DIR',
    since future steps may need to get files out of the bootmanager
    directory.

    filename: path of the variables file to read
    returns:  dict mapping each variable name to its string value
    raises:   Exception for a line that has no = at all, and whatever
              open() raises when the file cannot be read
    """
    config = {}

    # the with block closes the file even when an invalid line raises
    with open(filename, 'r') as vars_file:
        for line in vars_file:
            # if its a comment or a whitespace line, ignore
            if line.startswith("#") or not line.strip():
                continue

            # cut at the first = only, the remainder is the value
            name, separator, value = line.partition("=")
            if not separator:
                raise Exception( "Invalid line in vars file: %s" % line )

            value = value.strip()
            value = value.replace("'", "").replace('"', "")   # remove quotes
            config[name.strip()] = value

    # find out which directory we are running in, and set a variable
    # for that
    config['BM_SOURCE_DIR'] = os.getcwd()

    return config
68
69 ##############################
class log:
    """
    log sink of the boot manager.

    every entry is prefixed with a timestamp and written both to the
    output file (when it could be opened) and to stdout. the object
    also offers write(), so it can be handed to code that expects a
    writable file object, and it carries the variables read from
    VARS_FILE in self.VARS (None when they could not be read).
    """

    # strftime pattern of the timestamp put in front of every entry
    format="%H:%M:%S(%Z) "

    def __init__( self, OutputFilePath= None ):
        """
        open OutputFilePath for writing and load the variables.

        neither failure is fatal: when the file cannot be opened a
        message is printed and logging goes to stdout only; when the
        variables cannot be read the reason is logged and self.VARS
        stays None.
        """
        # remembered in every case, so the attribute always exists
        self.OutputFilePath= OutputFilePath
        try:
            self.OutputFile= open( OutputFilePath, "w")
        except Exception:
            # this also covers the default OutputFilePath of None
            print( "bootmanager log : Unable to open output file %r, continuing"%OutputFilePath )
            self.OutputFile= None

        self.VARS = None
        try:
            self.VARS = read_configuration_file(VARS_FILE)
        except Exception as e:
            self.LogEntry( str(e) )

    def LogEntry( self, str, inc_newline= 1, display_screen= 1 ):
        """
        write one timestamped entry.

        str:            the text to log
        inc_newline:    when true, a newline is appended to the entry
        display_screen: when true, the entry also goes to stdout

        the output file, when open, is flushed after every entry.
        """
        entry= time.strftime(log.format, time.localtime()) + str
        if inc_newline:
            entry= entry + "\n"

        if self.OutputFile is not None:
            self.OutputFile.write( entry )
            self.OutputFile.flush()
        if display_screen:
            sys.stdout.write( entry )

    def write( self, str ):
        """
        make log behave like a writable file object (for traceback
        prints). each call is logged as one entry without an added
        newline, so every chunk written gets its own timestamp.
        """
        self.LogEntry( str, 0, 1 )

    def print_stack (self):
        """
        dump the traceback of the exception being handled in the log
        """
        self.write ( traceback.format_exc() )

    # bm log uploading is available back again, as of nodeconfig-5.0-2
    def Upload( self, extra_file=None ):
        """
        upload the contents of the log to the server

        when the output file is open and the variables hold the upload
        script and the interface settings, the file is closed and
        posted together with the node's hostname. when the variables
        are missing or incomplete the upload is skipped with a log
        entry and the file is left open.

        extra_file: optional path handed to the bash function
                    'upload_logs'; this part never fails.
        """
        if self.OutputFile is not None:
            self.OutputFile.flush()

            # collect everything the request needs before closing the
            # file: VARS is None when the configuration was unreadable,
            # and keys may be absent when an early step failed
            try:
                upload_script= self.VARS['UPLOAD_LOG_SCRIPT']
                interface= self.VARS['INTERFACE_SETTINGS']
                hostname= interface['hostname'] + "." + interface['domainname']
            except (TypeError, KeyError):
                self.LogEntry( "Unable to upload logs: configuration is missing or incomplete" )
            else:
                self.LogEntry( "Uploading logs to %s" % upload_script )

                self.OutputFile.close()
                self.OutputFile= None

                bs_request = BootServerRequest.BootServerRequest(self.VARS)
                try:
                    # this was working until f10
                    bs_request.MakeRequest(PartialPath = upload_script,
                                           GetVars = None, PostVars = None,
                                           DoSSL = True, DoCertCheck = True,
                                           FormData = ["log=@" + self.OutputFilePath,
                                                       "hostname=" + hostname,
                                                       "type=bm.log"])
                except Exception:
                    # new pycurl: retry with the form data as tuples
                    import pycurl
                    bs_request.MakeRequest(PartialPath = upload_script,
                                           GetVars = None, PostVars = None,
                                           DoSSL = True, DoCertCheck = True,
                                           FormData = [('log',(pycurl.FORM_FILE, self.OutputFilePath)),
                                                       ("hostname",hostname),
                                                       ("type","bm.log")])
        if extra_file is not None:
            # NOTE: for code-reuse, evoke the bash function 'upload_logs';
            # by adding --login, bash reads .bash_profile before execution.
            # Also, never fail, since this is an optional feature.
            utils.sysexec_noerr( """bash --login -c "upload_logs %s" """ % extra_file, self)
157
158
159 ##############################
class BootManager:
    """
    runs a node through the boot manager steps.

    Run() performs the common initialization steps and then calls the
    function registered for the node's BOOT_STATE; any failure ends in
    the debug logic.
    """

    # the set of valid node run states; Run() stores the function
    # implementing each state as the value
    NodeRunStates = {'reinstall':None,
                     'boot':None,
                     'safeboot':None,
                     'disabled':None,
                     }

    def __init__(self, log, forceState):
        """
        log:        the log object; its VARS attribute supplies the
                    variables used and updated by each step
        forceState: boot state overriding the machine's current state
                    (from the command line), or None

        CAN_RUN is left at 0, and VARS is not set, when log.VARS is
        empty or None.
        """
        # override machine's current state from the command line
        self.forceState = forceState

        # the main logging point
        self.LOG = log

        # set to 1 if we can run after initialization
        self.CAN_RUN = 0

        if not log.VARS:
            return

        # this contains a set of information used and updated by each step
        self.VARS = log.VARS

        # not sure what the current PATH is set to, replace it with what
        # we know will work with all the boot cds
        os.environ['PATH'] = ":".join(BIN_PATH)

        self.CAN_RUN = 1

    def Run(self):
        """
        core boot manager logic.

        the way errors are handled is as such: if any particular step
        cannot continue or unexpectedly fails, an exception is thrown.
        in this case, the boot manager cannot continue running.

        these step functions can also return a 0/1 depending on whether
        or not it succeeded. In the case of steps like ConfirmInstallWithUser,
        a 0 is returned and no exception is thrown if the user chose not
        to confirm the install. The same goes with the CheckHardwareRequirements.
        If requirements not met, but tests were successful, return 0.

        for steps that run within the installer, they are expected to either
        complete successfully and return 1, or throw an exception.

        For exact return values and expected operations, see the comments
        at the top of each of the individual step functions.

        returns 1 when the selected state function completed, 0 otherwise.
        """

        def _nodeNotInstalled(message='MSG_NODE_NOT_INSTALLED'):
            # called by the _xxxRun() functions below upon failure;
            # message is the name of an attribute of notify_messages
            self.VARS['RUN_LEVEL'] = 'failboot'
            notify = getattr(notify_messages, message)
            self.VARS['STATE_CHANGE_NOTIFY'] = 1
            self.VARS['STATE_CHANGE_NOTIFY_MESSAGE'] = notify
            raise BootManagerException(notify)

        def _startFallbackDebug():
            # starting the fallback/debug ssh daemon for safety:
            # if the node install somehow hangs, or if it simply takes ages,
            # we can still enter and investigate.
            # this is best effort: a failure is logged and otherwise ignored
            try:
                StartDebug.Run(self.VARS, self.LOG, last_resort = False)
            except Exception as e:
                self.LOG.write( "\nCould not start the fallback debug sshd, continuing: %s\n" % str(e) )

        def _bootRun():
            # implements the boot logic, which consists of first
            # double checking that the node was properly installed,
            # checking whether someone added or changed disks, and
            # then finally chain boots.
            _startFallbackDebug()

            InstallInit.Run( self.VARS, self.LOG )
            ret = ValidateNodeInstall.Run( self.VARS, self.LOG )
            if ret == 1:
                # Thierry - feb. 2013 turning off WriteModprobeConfig for now on lxc
                # for one thing this won't work at all with f18, as modules.pcimap
                # has disappeared (Daniel suggested modules.aliases could be used instead)
                # and second, in any case it's been years now that modprobe.conf was deprecated
                # so most likely this code has no actual effect
                if self.VARS['virt'] == 'vs':
                    WriteModprobeConfig.Run( self.VARS, self.LOG )
                WriteNetworkConfig.Run( self.VARS, self.LOG )
                CheckForNewDisks.Run( self.VARS, self.LOG )
                SendHardwareConfigToPLC.Run( self.VARS, self.LOG )
                ChainBootNode.Run( self.VARS, self.LOG )
            elif ret == -1:
                _nodeNotInstalled('MSG_NODE_FILESYSTEM_CORRUPT')
            elif ret == -2:
                _nodeNotInstalled('MSG_NODE_MOUNT_FAILED')
            elif ret == -3:
                _nodeNotInstalled('MSG_NODE_MISSING_KERNEL')
            else:
                _nodeNotInstalled()

        def _reinstallRun():
            # implements the reinstall logic, which will check whether
            # the min. hardware requirements are met, install the
            # software, and upon correct installation will switch to
            # 'boot' state and chainboot into the production system
            _startFallbackDebug()

            if not CheckHardwareRequirements.Run( self.VARS, self.LOG ):
                self.VARS['RUN_LEVEL'] = 'failboot'
                raise BootManagerException("Hardware requirements not met.")

            # runinstaller
            InstallInit.Run( self.VARS, self.LOG )
            InstallPartitionDisks.Run( self.VARS, self.LOG )
            InstallBootstrapFS.Run( self.VARS, self.LOG )
            InstallWriteConfig.Run( self.VARS, self.LOG )
            InstallUninitHardware.Run( self.VARS, self.LOG )
            self.VARS['BOOT_STATE'] = 'boot'
            self.VARS['STATE_CHANGE_NOTIFY'] = 1
            self.VARS['STATE_CHANGE_NOTIFY_MESSAGE'] = \
                 notify_messages.MSG_INSTALL_FINISHED
            AnsibleHook.Run( self.VARS, self.LOG )
            UpdateBootStateWithPLC.Run( self.VARS, self.LOG )
            _bootRun()

        def _installRun():
            # implements the new install logic, which will first check
            # with the user whether it is ok to install on this
            # machine, switch to 'reinstall' state and then invoke the reinstall
            # logic.  See reinstallState logic comments for further
            # details.
            # NOTE: no entry of NodeRunStates refers to this function
            # and nothing in this method calls it
            if not ConfirmInstallWithUser.Run( self.VARS, self.LOG ):
                return 0
            self.VARS['BOOT_STATE'] = 'reinstall'

            AnsibleHook.Run( self.VARS, self.LOG )
            _reinstallRun()

        def _debugRun(state='failboot'):
            # implements debug logic, which starts the sshd and just waits around
            self.VARS['RUN_LEVEL'] = state
            StartDebug.Run( self.VARS, self.LOG )
            # fsck/mount fs if present, and ignore return value if it's not.
            ValidateNodeInstall.Run( self.VARS, self.LOG )

        def _badstateRun():
            # should never happen; log event
            self.LOG.write( "\nInvalid BOOT_STATE = %s\n" % self.VARS['BOOT_STATE'])
            _debugRun()

        # setup state -> function hash table
        BootManager.NodeRunStates['reinstall']  = _reinstallRun
        BootManager.NodeRunStates['boot']       = _bootRun
        BootManager.NodeRunStates['safeboot']   = lambda : _debugRun('safeboot')
        BootManager.NodeRunStates['disabled']   = lambda : _debugRun('disabled')

        success = 0
        try:
            InitializeBootManager.Run( self.VARS, self.LOG )
            ReadNodeConfiguration.Run( self.VARS, self.LOG )
            AuthenticateWithPLC.Run( self.VARS, self.LOG )
            UpdateLastBootOnce.Run( self.VARS, self.LOG )
            StartRunlevelAgent.Run( self.VARS, self.LOG )
            GetAndUpdateNodeDetails.Run( self.VARS, self.LOG )

            # override machine's current state from the command line
            if self.forceState is not None:
                self.VARS['BOOT_STATE'] = self.forceState
                UpdateBootStateWithPLC.Run( self.VARS, self.LOG )

            stateRun = BootManager.NodeRunStates.get(self.VARS['BOOT_STATE'],_badstateRun)
            stateRun()
            success = 1

        except KeyError as e:
            self.LOG.write( "\n\nKeyError while running: %s\n" % str(e) )
            self.LOG.print_stack ()
        # NOTE(review): checked before BootManagerException so that this
        # handler stays reachable should the authentication exception be
        # a subclass of it - the Exceptions module is not visible here
        except BootManagerAuthenticationException as e:
            self.LOG.write( "\n\nFailed to Authenticate Node: %s\n" % str(e) )
            self.LOG.print_stack ()
            # sets /tmp/CANCEL_BOOT flag
            StartDebug.Run(self.VARS, self.LOG )
            # Return immediately b/c any other calls to API will fail
            return success
        except BootManagerException as e:
            self.LOG.write( "\n\nException while running: %s\n" % str(e) )
            self.LOG.print_stack ()
        except:
            # deliberately catches everything: whatever went wrong, the
            # debug logic below still has to run
            self.LOG.write( "\n\nImplementation Error\n")
            self.LOG.print_stack ()

        if not success:
            try:
                _debugRun()
            except BootManagerException as e:
                self.LOG.write( "\n\nException while running: %s\n" % str(e) )
            except:
                # last line of defence, nothing may escape from here
                self.LOG.write( "\n\nImplementation Error\n")
                traceback.print_exc(file=self.LOG.OutputFile)
                traceback.print_exc()

        return success
368             
369             
def main(argv):
    """
    entry point of the boot manager.

    argv: the command line; when it has exactly two items, the second
          one is the node run state to force and must be one of the
          keys of BootManager.NodeRunStates.

    returns 0 on success, and 1 when the forced state is invalid, when
    the boot manager cannot be initialized, when its Run() reports a
    failure or when an unexpected exception is caught while running it.
    the log is uploaded in every case before returning.
    """

    import utils
    utils.prompt_for_breakpoint_mode()

#    utils.breakpoint ("Entering BootManager::main")

    def _print_exception():
        # print the exception being handled to the log file, when there
        # is one, and to stderr
        if LOG.OutputFile is not None:
            traceback.print_exc(file=LOG.OutputFile)
        traceback.print_exc()

    # set to 1 if error occurred
    error = 0

    # all output goes through this class so we can save it and post
    # the data back to PlanetLab central
    LOG = log( BM_NODE_LOG )

    # NOTE: assume CWD is BM's source directory, but never fail
    utils.sysexec_noerr("./setup_bash_history_scripts.sh", LOG)

    LOG.LogEntry( "BootManager started at: %s" % \
                  time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime()) )

    forceState = None
    try:
        if len(argv) == 2:
            fState = argv[1]
            if fState in BootManager.NodeRunStates:
                forceState = fState
            else:
                LOG.LogEntry("FATAL: cannot force node run state to=%s" % fState)
                error = 1
    except Exception:
        _print_exception()

    # an invalid forced state skips the run, but not the epilogue below
    if not error:
        try:
            bm = BootManager(LOG,forceState)
            if bm.CAN_RUN == 0:
                LOG.LogEntry( "Unable to initialize BootManager." )
                error = 1
            else:
                LOG.LogEntry( "Running version %s of BootManager." % bm.VARS['VERSION'] )
                if bm.Run():
                    LOG.LogEntry( "\nDone!" )
                else:
                    LOG.LogEntry( "\nError occurred!" )
                    error = 1
        except:
            # catch everything so that the log still gets uploaded, but
            # do not report success to the caller
            _print_exception()
            error = 1

    LOG.LogEntry( "BootManager finished at: %s" % \
                  time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime()) )
    LOG.Upload()

    return error
430
431     
if __name__ == "__main__":
    # run with the command line and use main()'s result as exit status
    sys.exit(main(sys.argv))