never fail when uploading logs
[bootmanager.git] / source / BootManager.py
1 #!/usr/bin/python -u
2 #
3 # $Id$
4 # $URL$
5 #
6 # Copyright (c) 2003 Intel Corporation
7 # All rights reserved.
8 #
9 # Copyright (c) 2004-2006 The Trustees of Princeton University
10 # All rights reserved.
11
12 import string
13 import sys, os, traceback
14 import time
15 import gzip
16
17 from steps import *
18 from Exceptions import *
19 import notify_messages
20 import BootServerRequest
21 import utils
22
# path of the file that receives a copy of all boot manager output
BM_NODE_LOG = "/tmp/bm.log"

# name of the name=value configuration file loaded when the log object
# is created
VARS_FILE = "configuration"

# directories that make up PATH while the boot manager is running
BIN_PATH = (
    "/usr/local/bin",
    "/usr/local/sbin",
    "/usr/bin",
    "/usr/sbin",
    "/bin",
    "/sbin",
)
34
def read_configuration_file(filename):
    """
    Read a name=value configuration file into a dictionary.

    Lines that are empty (after stripping whitespace) or that start with
    '#' are ignored.  Every other line must contain an '=': the text
    before the first '=' is the variable name and everything after it,
    up to the end of the line, is the value.  Whitespace around the name
    and the value is removed, and all single and double quote characters
    are dropped from the value.

    The returned dictionary also contains 'BM_SOURCE_DIR', set to the
    current working directory, so that later steps can find files that
    live next to the boot manager sources.

    Raises Exception if a line has no '='.  Errors from opening or
    reading the file propagate unchanged.
    """
    config = {}
    vars_file = open(filename, 'r')
    try:
        for line in vars_file:
            # if its a comment or a whitespace line, ignore
            if line[:1] == "#" or line.strip() == "":
                continue

            # split on the first '=' only, so that values may themselves
            # contain '=' (e.g. URLs with a query string)
            parts = line.split("=", 1)
            if len(parts) != 2:
                raise Exception( "Invalid line in vars file: %s" % line )

            name = parts[0].strip()
            value = parts[1].strip()
            # remove quotes
            value = value.replace("'", "").replace('"', "")
            config[name] = value
    finally:
        # close the file even when an invalid line aborts the parse
        vars_file.close()

    # find out which directory we are running in, and set a variable
    # for that. future steps may need to get files out of the bootmanager
    # directory
    config['BM_SOURCE_DIR'] = os.getcwd()

    return config
70
71 ##############################
class log:
    """
    Boot manager log.

    Every entry is prefixed with a timestamp and written to the output
    file (when one could be opened) and, optionally, to stdout.  On
    construction the log also loads the boot manager configuration
    (VARS_FILE) and keeps it in self.VARS, or None when it could not be
    read.  Instances expose write() so they can be used where a writable
    file object is expected.
    """

    # strftime() pattern of the timestamp that prefixes every entry
    format="%H:%M:%S(%Z) "

    def __init__( self, OutputFilePath= None ):
        """
        Open OutputFilePath for writing and read the configuration file.

        Neither failure is fatal: without an output file, entries only
        go to the screen; without a configuration, self.VARS stays None
        and the reason is logged.
        """
        try:
            self.OutputFile= open( OutputFilePath, "w")
            self.OutputFilePath= OutputFilePath
        except Exception:
            print( "bootmanager log : Unable to open output file %r, continuing"%OutputFilePath )
            self.OutputFile= None
            # keep the attribute defined even when there is no log file
            self.OutputFilePath= None

        self.VARS = None
        try:
            self.VARS = read_configuration_file(VARS_FILE)
        except Exception:
            self.LogEntry( str(sys.exc_info()[1]) )

    def LogEntry( self, str, inc_newline= 1, display_screen= 1 ):
        """
        Write one timestamped entry.

        str            -- text of the entry
        inc_newline    -- when true, terminate the entry with a newline
        display_screen -- when true, also write the entry to stdout

        The output file, when present, is flushed after every entry.
        """
        entry= time.strftime(log.format, time.localtime()) + str
        if inc_newline:
            entry= entry + "\n"

        if self.OutputFile:
            self.OutputFile.write( entry )
            self.OutputFile.flush()
        if display_screen:
            sys.stdout.write( entry )

    def write( self, str ):
        """
        make log behave like a writable file object (for traceback
        prints)
        """
        self.LogEntry( str, 0, 1 )

    # bm log uploading is available back again, as of nodeconfig-5.0-2
    def Upload( self, extra_file=None ):
        """
        upload the contents of the log to the server

        When extra_file is given it is additionally handed to the bash
        function 'upload_logs'.  Uploading is best effort: any failure
        is logged and never propagated to the caller.
        """
        if self.OutputFile is not None:
            try:
                self._upload_log_file()
            except Exception:
                self.LogEntry( "Unable to upload logs: %s" % sys.exc_info()[1] )

        if extra_file is not None:
            # NOTE: for code-reuse, evoke the bash function 'upload_logs';
            # by adding --login, bash reads .bash_profile before execution.
            # Also, never fail, since this is an optional feature.
            # NOTE(review): extra_file is interpolated unquoted into the
            # shell command line; callers must pass a trusted path.
            try:
                utils.sysexec( """bash --login -c "upload_logs %s || /bin/true" """ % extra_file, self)
            except Exception:
                self.LogEntry( "Unable to upload %s: %s" % (extra_file, sys.exc_info()[1]) )

    def _upload_log_file( self ):
        # post the (closed) log file to the upload script on the boot server

        # resolve everything the request needs first: when the
        # configuration is missing or incomplete nothing can be uploaded,
        # and the log file is left open and usable
        try:
            script= self.VARS['UPLOAD_LOG_SCRIPT']
            settings= self.VARS['INTERFACE_SETTINGS']
            hostname= settings['hostname'] + "." + settings['domainname']
        except (KeyError, TypeError):
            self.LogEntry( "Not uploading logs: incomplete configuration (%s)"
                           % sys.exc_info()[1] )
            return

        self.LogEntry( "Uploading logs to %s" % script )

        # close the file before it is posted; from here on entries only
        # go to the screen
        self.OutputFile.close()
        self.OutputFile= None

        bs_request = BootServerRequest.BootServerRequest(self.VARS)
        try:
            # this was working until f10
            bs_request.MakeRequest(PartialPath = script,
                                   GetVars = None, PostVars = None,
                                   DoSSL = True, DoCertCheck = True,
                                   FormData = ["log=@" + self.OutputFilePath,
                                               "hostname=" + hostname,
                                               "type=bm.log"])
        except Exception:
            # new pycurl
            import pycurl
            bs_request.MakeRequest(PartialPath = script,
                                   GetVars = None, PostVars = None,
                                   DoSSL = True, DoCertCheck = True,
                                   FormData = [('log',(pycurl.FORM_FILE, self.OutputFilePath)),
                                               ("hostname",hostname),
                                               ("type","bm.log")])
153
154
155 ##############################
class BootManager:
    """
    Runs the boot manager steps for a node according to its BOOT_STATE.

    The instance shares the configuration dictionary held by the log
    object (log.VARS); the steps read and update it through self.VARS.
    CAN_RUN is 1 only when that configuration was available at
    construction time, and Run() must not be called otherwise since
    self.VARS is then never set.
    """

    # the set of valid node run states; the values are placeholders until
    # Run() replaces them with the function implementing each state
    NodeRunStates = {'reinstall':None,
                     'boot':None,
                     'safeboot':None,
                     'disabled':None,
                     }
    
    def __init__(self, log, forceState):
        """
        log        -- the log object; its VARS attribute supplies the
                      configuration
        forceState -- boot state that overrides the node's current one,
                      or None to keep the current state
        """
        # override machine's current state from the command line
        self.forceState = forceState

        # the main logging point
        self.LOG= log

        # set to 1 if we can run after initialization
        self.CAN_RUN = 0

        if log.VARS:
            # this contains a set of information used and updated by each step
            self.VARS= log.VARS
        else:
            # no configuration: leave CAN_RUN at 0 and self.VARS unset
            return
             
        # not sure what the current PATH is set to, replace it with what
        # we know will work with all the boot cds
        os.environ['PATH']= string.join(BIN_PATH,":")

        self.CAN_RUN= 1

    def Run(self):
        """
        core boot manager logic.

        the way errors are handled is as such: if any particular step
        cannot continue or unexpectedly fails, an exception is thrown.
        in this case, the boot manager cannot continue running.

        these step functions can also return a 0/1 depending on whether
        or not it succeeded. In the case of steps like ConfirmInstallWithUser,
        a 0 is returned and no exception is thrown if the user chose not
        to confirm the install. The same goes with the CheckHardwareRequirements.
        If requirements not met, but tests were successful, return 0.

        for steps that run within the installer, they are expected to either
        complete successfully and return 1, or throw an exception.

        For exact return values and expected operations, see the comments
        at the top of each of the individual step functions.

        Returns 1 when the function selected for the node's BOOT_STATE ran
        to completion, 0 otherwise.  On failure the debug logic
        (_debugRun) is started before returning, except after an
        authentication failure, which returns immediately.
        """

        def _nodeNotInstalled(message='MSG_NODE_NOT_INSTALLED'):
            # called by the _xxxState() functions below upon failure
            # message is the name of an attribute of notify_messages; the
            # run level is set to 'failboot', the notification is recorded
            # in VARS, and BootManagerException is always raised
            self.VARS['RUN_LEVEL']= 'failboot'
            notify = getattr(notify_messages, message)
            self.VARS['STATE_CHANGE_NOTIFY']= 1
            self.VARS['STATE_CHANGE_NOTIFY_MESSAGE']= notify
            raise BootManagerException, notify

        def _bootRun():
            # implements the boot logic, which consists of first
            # double checking that the node was properly installed,
            # checking whether someone added or changed disks, and
            # then finally chain boots.

            # starting the fallback/debug ssh daemon for safety:
            # if the node install somehow hangs, or if it simply takes ages, 
            # we can still enter and investigate
            try:
                StartDebug.Run(self.VARS, self.LOG, last_resort = False)
            except:
                # deliberately best effort: a failure here must not stop the boot
                pass

            InstallInit.Run( self.VARS, self.LOG )                    
            ret = ValidateNodeInstall.Run( self.VARS, self.LOG )
            # 1 means the install validated; each negative code selects a
            # specific notification message, anything else the default one
            if ret == 1:
                WriteModprobeConfig.Run( self.VARS, self.LOG )
                WriteNetworkConfig.Run( self.VARS, self.LOG )
                CheckForNewDisks.Run( self.VARS, self.LOG )
                SendHardwareConfigToPLC.Run( self.VARS, self.LOG )
                ChainBootNode.Run( self.VARS, self.LOG )
            elif ret == -1:
                _nodeNotInstalled('MSG_NODE_FILESYSTEM_CORRUPT')
            elif ret == -2:
                _nodeNotInstalled('MSG_NODE_MOUNT_FAILED')
            elif ret == -3:
                _nodeNotInstalled('MSG_NODE_MISSING_KERNEL')
            else:
                _nodeNotInstalled()

        def _reinstallRun():

            # starting the fallback/debug ssh daemon for safety:
            # if the node install somehow hangs, or if it simply takes ages, 
            # we can still enter and investigate
            try:
                StartDebug.Run(self.VARS, self.LOG, last_resort = False)
            except:
                # deliberately best effort: a failure here must not stop the install
                pass

            # implements the reinstall logic, which will check whether
            # the min. hardware requirements are met, install the
            # software, and upon correct installation will switch to
            # 'boot' state and chainboot into the production system
            if not CheckHardwareRequirements.Run( self.VARS, self.LOG ):
                self.VARS['RUN_LEVEL']= 'failboot'
                raise BootManagerException, "Hardware requirements not met."

            # runinstaller
            InstallInit.Run( self.VARS, self.LOG )                    
            InstallPartitionDisks.Run( self.VARS, self.LOG )            
            InstallBootstrapFS.Run( self.VARS, self.LOG )            
            InstallWriteConfig.Run( self.VARS, self.LOG )
            InstallUninitHardware.Run( self.VARS, self.LOG )
            # installation finished: record the new state and the
            # notification, report the state, then boot the node
            self.VARS['BOOT_STATE']= 'boot'
            self.VARS['STATE_CHANGE_NOTIFY']= 1
            self.VARS['STATE_CHANGE_NOTIFY_MESSAGE']= \
                 notify_messages.MSG_INSTALL_FINISHED
            UpdateBootStateWithPLC.Run( self.VARS, self.LOG )
            _bootRun()
            
        def _installRun():
            # implements the new install logic, which will first check
            # with the user whether it is ok to install on this
            # machine, switch to 'reinstall' state and then invoke the reinstall
            # logic.  See reinstallState logic comments for further
            # details.
            # NOTE: this function is not registered in the state table
            # set up below, so nothing in this class calls it.
            if not ConfirmInstallWithUser.Run( self.VARS, self.LOG ):
                return 0
            self.VARS['BOOT_STATE']= 'reinstall'
            UpdateRunLevelWithPLC.Run( self.VARS, self.LOG )
            _reinstallRun()

        def _debugRun(state='failboot'):
            # implements debug logic, which starts the sshd and just waits around
            # state is the value stored in RUN_LEVEL before it is reported
            self.VARS['RUN_LEVEL']=state
            UpdateRunLevelWithPLC.Run( self.VARS, self.LOG )
            StartDebug.Run( self.VARS, self.LOG )
            # fsck/mount fs if present, and ignore return value if it's not.
            ValidateNodeInstall.Run( self.VARS, self.LOG )

        def _badstateRun():
            # should never happen; log event
            # used when BOOT_STATE is not a key of the state table
            self.LOG.write( "\nInvalid BOOT_STATE = %s\n" % self.VARS['BOOT_STATE'])
            _debugRun()

        # setup state -> function hash table
        # the entries are closures over this instance, stored on the class
        BootManager.NodeRunStates['reinstall']  = _reinstallRun
        BootManager.NodeRunStates['boot']       = _bootRun
        BootManager.NodeRunStates['safeboot']   = lambda : _debugRun('safeboot')
        BootManager.NodeRunStates['disabled']   = lambda : _debugRun('disabled')

        success = 0
        try:
            InitializeBootManager.Run( self.VARS, self.LOG )
            ReadNodeConfiguration.Run( self.VARS, self.LOG )
            AuthenticateWithPLC.Run( self.VARS, self.LOG )
            StartRunlevelAgent.Run( self.VARS, self.LOG )
            GetAndUpdateNodeDetails.Run( self.VARS, self.LOG )

            # override machine's current state from the command line
            if self.forceState is not None:
                self.VARS['BOOT_STATE']= self.forceState
                UpdateBootStateWithPLC.Run( self.VARS, self.LOG )
                UpdateRunLevelWithPLC.Run( self.VARS, self.LOG )

            # dispatch on the boot state; unknown states fall back to _badstateRun
            stateRun = BootManager.NodeRunStates.get(self.VARS['BOOT_STATE'],_badstateRun)
            stateRun()
            success = 1

        except KeyError, e:
            self.LOG.write( "\n\nKeyError while running: %s\n" % str(e) )
        except BootManagerException, e:
            self.LOG.write( "\n\nException while running: %s\n" % str(e) )
        # NOTE(review): if BootManagerAuthenticationException derives from
        # BootManagerException, the handler above catches it first and this
        # one is unreachable -- confirm against the Exceptions module
        except BootManagerAuthenticationException, e:
            self.LOG.write( "\n\nFailed to Authenticate Node: %s\n" % str(e) )
            # sets /tmp/CANCEL_BOOT flag
            StartDebug.Run(self.VARS, self.LOG )
            # Return immediately b/c any other calls to API will fail
            return success
        except:
            # anything else is treated as a programming error: the
            # traceback goes to the log file and to stderr
            self.LOG.write( "\n\nImplementation Error\n")
            traceback.print_exc(file=self.LOG.OutputFile)
            traceback.print_exc()

        if not success:
            # the selected state did not complete: start the debug logic
            # (run level 'failboot'); its own failures are only logged
            try:
                _debugRun()
            except BootManagerException, e:
                self.LOG.write( "\n\nException while running: %s\n" % str(e) )
            except:
                self.LOG.write( "\n\nImplementation Error\n")
                traceback.print_exc(file=self.LOG.OutputFile)
                traceback.print_exc()

        return success
355             
356             
def main(argv):
    """
    Boot manager entry point.

    argv -- the command line: argv[0] is the program name and an optional
            argv[1] names the node run state to force; it must be one of
            the keys of BootManager.NodeRunStates.

    Returns 0 when the boot manager ran to completion, and 1 when the
    forced state is invalid, the boot manager could not be initialized,
    Run() reported a failure, or an unexpected exception occurred.  The
    log is uploaded before returning in every case; a failing upload
    never changes the result.
    """

    import utils
    utils.prompt_for_breakpoint_mode()

    utils.breakpoint ("Entering BootManager::main")

    # set to 1 if error occurred
    error= 0

    # all output goes through this class so we can save it and post
    # the data back to PlanetLab central
    LOG= log( BM_NODE_LOG )

    def _finish():
        # log the end time and upload the log; uploading is best effort
        # and must not prevent main() from returning its status
        LOG.LogEntry( "BootManager finished at: %s" % \
                      time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime()) )
        try:
            LOG.Upload()
        except Exception:
            traceback.print_exc()

    # NOTE: assume CWD is BM's source directory, but never fail
    utils.sysexec("./setup_bash_history_scripts.sh || /bin/true", LOG)

    LOG.LogEntry( "BootManager started at: %s" % \
                  time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime()) )

    forceState = None
    try:
        if len(argv) == 2:
            fState = argv[1]
            if fState in BootManager.NodeRunStates:
                forceState = fState
            else:
                LOG.LogEntry("FATAL: cannot force node run state to=%s" % fState)
                error = 1
    except Exception:
        traceback.print_exc(file=LOG.OutputFile)
        traceback.print_exc()

    if error:
        _finish()
        return error

    try:
        bm= BootManager(LOG,forceState)
        if bm.CAN_RUN == 0:
            LOG.LogEntry( "Unable to initialize BootManager." )
            # not being able to start is a failure, report it as one
            error = 1
        else:
            LOG.LogEntry( "Running version %s of BootManager." % bm.VARS['VERSION'] )
            success= bm.Run()
            if success:
                LOG.LogEntry( "\nDone!" )
            else:
                LOG.LogEntry( "\nError occurred!" )
                error = 1
    except:
        # top-level boundary: catch everything so the log still gets
        # uploaded, but make sure the failure shows in the exit status
        error = 1
        traceback.print_exc(file=LOG.OutputFile)
        traceback.print_exc()

    _finish()

    return error
417
418     
if __name__ == "__main__":
    # use the error flag returned by main() as the process exit status
    sys.exit(main(sys.argv))