purely cosmetic: white spaces, .format rather than %, this kind of things only
[bootmanager.git] / source / BootManager.py
1 #!/usr/bin/python -u
2 #
3 # Copyright (c) 2003 Intel Corporation
4 # All rights reserved.
5 #
6 # Copyright (c) 2004-2006 The Trustees of Princeton University
7 # All rights reserved.
8
9 import sys, os
10 import traceback
11 import string
12 import time
13 import gzip
14
15 from steps import *
16 from Exceptions import *
17 import notify_messages
18 import BootServerRequest
19 import utils
20
# all output is written to this file (main() passes it to the log class)
BM_NODE_LOG = "/tmp/bm.log"
# name of the name=value file parsed by read_configuration_file(); it is
# given without a directory, so it is looked up in the current directory
VARS_FILE = "configuration"

# the new contents of PATH when the boot manager is running
# (BootManager.__init__ joins these with ':' into os.environ['PATH'])
BIN_PATH= ('/usr/local/bin',
           '/usr/local/sbin',
           '/usr/bin',
           '/usr/sbin',
           '/bin',
           '/sbin')
32
def read_configuration_file(filename):
    """
    Parse a simple name=value configuration file into a dictionary.

    Lines whose first character is '#' and lines containing only
    whitespace are skipped.  Every other line is split on its first '=':
    the text before it is the variable name and everything after it, up
    to the end of the line, is the value.  Whitespace around both parts
    is stripped, then every single and double quote character is removed
    from the value.

    The returned dictionary additionally holds 'BM_SOURCE_DIR', set to
    the current working directory: future steps may need to get files
    out of the bootmanager directory.

    Raises whatever open() raises if the file cannot be opened, and
    Exception if a non-comment line contains no '=' at all.
    """
    config = {}

    # the with-statement closes the file even when a bad line makes us raise
    with open(filename, 'r') as vars_file:
        for line in vars_file:
            # if it's a comment or a whitespace line, ignore
            if line[:1] == "#" or line.strip() == "":
                continue

            # split on the first '=' only, so that a value may itself
            # contain '=' characters
            name, separator, value = line.partition("=")
            if not separator:
                raise Exception("Invalid line in vars file: {}".format(line))

            value = value.strip()
            value = value.replace("'", "").replace('"', "")   # remove quotes
            config[name.strip()] = value

    # find out which directory we are running in, and set a variable
    # for that
    config['BM_SOURCE_DIR'] = os.getcwd()

    return config
68
69 ##############################
class log:
    """
    Timestamped logger writing both to a file and to stdout.

    The object also reads the boot manager configuration file when it is
    created and keeps the resulting dictionary in self.VARS (None if the
    file could not be read).  It offers a write() method, so it can be
    used wherever a writable file-like object is expected.
    """

    # time.strftime() pattern of the prefix put in front of every entry
    format = "%H:%M:%S(%Z) "

    def __init__(self, OutputFilePath=None):
        """
        Open OutputFilePath for writing and load the configuration.

        Neither failure is fatal: if the file cannot be opened a message
        is printed and only the screen is used (self.OutputFile is None);
        if the configuration cannot be read the reason is logged and
        self.VARS stays None.
        """
        # always defined, whether or not the file could be opened
        self.OutputFilePath = OutputFilePath
        try:
            self.OutputFile = open(OutputFilePath, "w")
        except Exception:
            print("bootmanager log : Unable to open output file {}, continuing"\
                  .format(OutputFilePath))
            self.OutputFile = None

        self.VARS = None
        try:
            self.VARS = read_configuration_file(VARS_FILE)
        except Exception as e:
            self.LogEntry(str(e))

    def LogEntry(self, str, inc_newline = 1, display_screen = 1):
        """
        Log one entry, prefixed with the current local time.

        str            -- the text to log
        inc_newline    -- when true, terminate the entry with a newline
        display_screen -- when true, also write the entry to stdout

        The output file, if any, is flushed after each entry.
        """
        entry = time.strftime(log.format, time.localtime()) + str
        if inc_newline:
            entry += "\n"

        if self.OutputFile:
            self.OutputFile.write(entry)
            self.OutputFile.flush()
        if display_screen:
            sys.stdout.write(entry)

    def write(self, str):
        """
        make log behave like a writable file object (for traceback
        prints)
        """
        self.LogEntry(str, 0, 1)

    def print_stack(self):
        """
        dump current stack in log
        """
        self.write(traceback.format_exc())

    # bm log uploading is available back again, as of nodeconfig-5.0-2
    def Upload(self, extra_file=None):
        """
        upload the contents of the log to the server

        The log file is closed before being sent, so later entries only
        reach the screen.  If the settings needed for the upload are not
        available, the upload is skipped with a log entry and the log
        file is left open.

        extra_file -- when not None, it is additionally handed to the
                      bash function 'upload_logs'; this never fails.
        """
        if self.OutputFile is not None:
            self.OutputFile.flush()

            try:
                # check everything the upload needs BEFORE closing the log.
                # VARS is None when the configuration file could not be
                # read; INTERFACE_SETTINGS is not set anywhere in this file
                # (presumably by a later step - to be confirmed), yet main()
                # can call Upload() before any step has run.
                upload_script = self.VARS['UPLOAD_LOG_SCRIPT']
                settings = self.VARS['INTERFACE_SETTINGS']
                hostname = settings['hostname'] + "." + settings['domainname']
            except (TypeError, KeyError) as e:
                self.LogEntry("Unable to upload logs, missing settings ({}: {})"
                              .format(type(e).__name__, e))
            else:
                self.LogEntry("Uploading logs to {}".format(upload_script))

                self.OutputFile.close()
                self.OutputFile = None

                bs_request = BootServerRequest.BootServerRequest(self.VARS)
                try:
                    # this was working until f10
                    bs_request.MakeRequest(PartialPath = upload_script,
                                           GetVars = None, PostVars = None,
                                           DoSSL = True, DoCertCheck = True,
                                           FormData = ["log=@" + self.OutputFilePath,
                                                       "hostname=" + hostname,
                                                       "type=bm.log"])
                except Exception:
                    # new pycurl
                    import pycurl
                    bs_request.MakeRequest(PartialPath = upload_script,
                                           GetVars = None, PostVars = None,
                                           DoSSL = True, DoCertCheck = True,
                                           FormData = [('log',(pycurl.FORM_FILE, self.OutputFilePath)),
                                                       ("hostname",hostname),
                                                       ("type","bm.log")])
        if extra_file is not None:
            # NOTE: for code-reuse, evoke the bash function 'upload_logs';
            # by adding --login, bash reads .bash_profile before execution.
            # Also, never fail, since this is an optional feature.
            utils.sysexec_noerr("""bash --login -c "upload_logs {}" """.format(extra_file), self)
158
159
160 ##############################
class BootManager:
    """
    Runs the sequence of boot manager steps matching the node's boot state.

    Attributes set by __init__:
      forceState -- boot state forced from the command line, or None
      LOG        -- the log object every step reports to
      VARS       -- the variables dictionary taken from the log object
      CAN_RUN    -- 1 once initialization succeeded, 0 otherwise
    """

    # the set of valid node run states; the keys are what a forced state
    # is checked against, the values are replaced by the handler
    # functions each time Run() is called
    NodeRunStates = {'reinstall':None,
                     'boot':None,
                     'safeboot':None,
                     'disabled':None,
                     }

    def __init__(self, log, forceState):
        """
        log        -- log object; its VARS attribute provides the variables
        forceState -- boot state to force, or None to use the node's own

        CAN_RUN is left at 0 when the log object has no variables.
        """
        # override machine's current state from the command line
        self.forceState = forceState

        # the main logging point
        self.LOG = log

        # set to 1 if we can run after initialization
        self.CAN_RUN = 0

        # this contains a set of information used and updated by each step
        self.VARS = log.VARS
        if not self.VARS:
            return

        # not sure what the current PATH is set to, replace it with what
        # we know will work with all the boot cds
        os.environ['PATH'] = ":".join(BIN_PATH)

        self.CAN_RUN = 1

    def Run(self):
        """
        core boot manager logic.

        the way errors are handled is as such: if any particular step
        cannot continue or unexpectedly fails, an exception is thrown.
        in this case, the boot manager cannot continue running.

        these step functions can also return a 0/1 depending on whether
        or not it succeeded. In the case of steps like ConfirmInstallWithUser,
        a 0 is returned and no exception is thrown if the user chose not
        to confirm the install. The same goes with the CheckHardwareRequirements.
        If requirements not met, but tests were successful, return 0.

        for steps that run within the installer, they are expected to either
        complete successfully and return 1, or throw an exception.

        For exact return values and expected operations, see the comments
        at the top of each of the individual step functions.

        Returns 1 if the handler of the node's boot state ran to
        completion, 0 otherwise.
        """

        def _startDebugNoFail():
            # starting the fallback/debug ssh daemon for safety:
            # if the node install somehow hangs, or if it simply takes ages,
            # we can still enter and investigate
            try:
                StartDebug.Run(self.VARS, self.LOG, last_resort = False)
            except Exception as e:
                # best effort only: a failure here must not abort the run,
                # but leave a trace of it in the log
                self.LOG.LogEntry("Could not start the debug ssh daemon: {}".format(e))

        def _nodeNotInstalled(message='MSG_NODE_NOT_INSTALLED'):
            # called by the _xxxRun() functions below upon failure;
            # message is the name of an attribute of notify_messages
            self.VARS['RUN_LEVEL'] = 'failboot'
            notify = getattr(notify_messages, message)
            self.VARS['STATE_CHANGE_NOTIFY'] = 1
            self.VARS['STATE_CHANGE_NOTIFY_MESSAGE'] = notify
            raise BootManagerException(notify)

        def _bootRun():
            # implements the boot logic, which consists of first
            # double checking that the node was properly installed,
            # checking whether someone added or changed disks, and
            # then finally chain boots.
            _startDebugNoFail()

            InstallInit.Run(self.VARS, self.LOG)
            ret = ValidateNodeInstall.Run(self.VARS, self.LOG)
            if ret == 1:
                # Thierry - feb. 2013 turning off WriteModprobeConfig for now on lxc
                # for one thing this won't work at all with f18, as modules.pcimap
                # has disappeared (Daniel suggested modules.aliases could be used instead)
                # and second, in any case it's been years now that modprobe.conf was deprecated
                # so most likely this code has no actual effect
                if self.VARS['virt'] == 'vs':
                    WriteModprobeConfig.Run(self.VARS, self.LOG)
                WriteNetworkConfig.Run(self.VARS, self.LOG)
                CheckForNewDisks.Run(self.VARS, self.LOG)
                SendHardwareConfigToPLC.Run(self.VARS, self.LOG)
                ChainBootNode.Run(self.VARS, self.LOG)
            elif ret == -1:
                _nodeNotInstalled('MSG_NODE_FILESYSTEM_CORRUPT')
            elif ret == -2:
                _nodeNotInstalled('MSG_NODE_MOUNT_FAILED')
            elif ret == -3:
                _nodeNotInstalled('MSG_NODE_MISSING_KERNEL')
            else:
                _nodeNotInstalled()

        def _reinstallRun():
            _startDebugNoFail()

            # implements the reinstall logic, which will check whether
            # the min. hardware requirements are met, install the
            # software, and upon correct installation will switch to
            # 'boot' state and chainboot into the production system
            if not CheckHardwareRequirements.Run(self.VARS, self.LOG):
                self.VARS['RUN_LEVEL'] = 'failboot'
                raise BootManagerException("Hardware requirements not met.")

            # runinstaller
            InstallPartitionDisks.Run(self.VARS, self.LOG)
            InstallInit.Run(self.VARS, self.LOG)
            InstallBootstrapFS.Run(self.VARS, self.LOG)
            InstallWriteConfig.Run(self.VARS, self.LOG)
            InstallUninitHardware.Run(self.VARS, self.LOG)
            self.VARS['BOOT_STATE'] = 'boot'
            self.VARS['STATE_CHANGE_NOTIFY'] = 1
            self.VARS['STATE_CHANGE_NOTIFY_MESSAGE'] = \
                 notify_messages.MSG_INSTALL_FINISHED
            AnsibleHook.Run(self.VARS, self.LOG)
            UpdateBootStateWithPLC.Run(self.VARS, self.LOG)
            _bootRun()

        def _installRun():
            # implements the new install logic, which will first check
            # with the user whether it is ok to install on this
            # machine, switch to 'reinstall' state and then invoke the reinstall
            # logic.  See reinstallState logic comments for further
            # details.
            # NOTE: this handler is not registered in NodeRunStates below,
            # so nothing in this method ever calls it
            if not ConfirmInstallWithUser.Run(self.VARS, self.LOG):
                return 0
            self.VARS['BOOT_STATE'] = 'reinstall'

            AnsibleHook.Run(self.VARS, self.LOG)
            _reinstallRun()

        def _debugRun(state='failboot'):
            # implements debug logic, which starts the sshd and just waits around
            self.VARS['RUN_LEVEL'] = state
            StartDebug.Run(self.VARS, self.LOG)
            # fsck/mount fs if present, and ignore return value if it's not.
            ValidateNodeInstall.Run(self.VARS, self.LOG)

        def _badstateRun():
            # should never happen; log event
            self.LOG.write("\nInvalid BOOT_STATE = {}\n".format(self.VARS['BOOT_STATE']))
            _debugRun()

        # setup state -> function hash table
        BootManager.NodeRunStates['reinstall']  = _reinstallRun
        BootManager.NodeRunStates['boot']       = _bootRun
        BootManager.NodeRunStates['safeboot']   = lambda : _debugRun('safeboot')
        BootManager.NodeRunStates['disabled']   = lambda : _debugRun('disabled')

        success = 0
        try:
            InitializeBootManager.Run(self.VARS, self.LOG)
            ReadNodeConfiguration.Run(self.VARS, self.LOG)
            AuthenticateWithPLC.Run(self.VARS, self.LOG)
            UpdateLastBootOnce.Run(self.VARS, self.LOG)
            StartRunlevelAgent.Run(self.VARS, self.LOG)
            GetAndUpdateNodeDetails.Run(self.VARS, self.LOG)

            # override machine's current state from the command line
            if self.forceState is not None:
                self.VARS['BOOT_STATE'] = self.forceState
                UpdateBootStateWithPLC.Run(self.VARS, self.LOG)

            stateRun = BootManager.NodeRunStates.get(self.VARS['BOOT_STATE'], _badstateRun)
            stateRun()
            success = 1

        except KeyError as e:
            self.LOG.write("\n\nKeyError while running: {}\n".format(e))
            self.LOG.print_stack()
        except BootManagerException as e:
            self.LOG.write("\n\nException while running: {}\n".format(e))
            self.LOG.print_stack()
        # NOTE(review): this clause can only be reached if
        # BootManagerAuthenticationException does not derive from
        # BootManagerException - to be confirmed in the Exceptions module
        except BootManagerAuthenticationException as e:
            self.LOG.write("\n\nFailed to Authenticate Node: {}\n".format(e))
            self.LOG.print_stack()
            # sets /tmp/CANCEL_BOOT flag
            StartDebug.Run(self.VARS, self.LOG)
            # Return immediately b/c any other calls to API will fail
            return success
        except:
            # last line of defence: deliberately catches everything, so
            # that the node still ends up in debug mode below
            self.LOG.write("\n\nImplementation Error\n")
            self.LOG.print_stack()

        if not success:
            try:
                _debugRun()
            except BootManagerException as e:
                self.LOG.write("\n\nException while running: {}\n".format(e))
            except:
                self.LOG.write("\n\nImplementation Error\n")
                traceback.print_exc(file=self.LOG.OutputFile)
                traceback.print_exc()

        return success
369             
370             
def main(argv):
    """
    Entry point: run the boot manager, then upload its log.

    argv -- the command line; when it holds exactly two items, the second
            one is a boot state to force and must be one of the keys of
            BootManager.NodeRunStates.

    Returns 0 on success and 1 if an error occurred; the script uses the
    value as its exit status.
    """
    utils.prompt_for_breakpoint_mode()

#    utils.breakpoint ("Entering BootManager::main")

    # set to 1 if error occurred
    error = 0

    # all output goes through this class so we can save it and post
    # the data back to PlanetLab central
    LOG = log(BM_NODE_LOG)

    def _finish():
        # common epilogue: timestamp the end of the run, then upload the log
        LOG.LogEntry("BootManager finished at: {}"\
                     .format(time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime())))
        LOG.Upload()

    # NOTE: assume CWD is BM's source directory, but never fail
    utils.sysexec_noerr("./setup_bash_history_scripts.sh", LOG)

    LOG.LogEntry("BootManager started at: {}"\
                 .format(time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime())))

    # bound before the try block, so it exists whatever happens inside
    forceState = None
    try:
        if len(argv) == 2:
            fState = argv[1]
            if fState in BootManager.NodeRunStates:
                forceState = fState
            else:
                LOG.LogEntry("FATAL: cannot force node run state to={}".format(fState))
                error = 1
    except:
        traceback.print_exc(file=LOG.OutputFile)
        traceback.print_exc()

    if error:
        _finish()
        return error

    try:
        bm = BootManager(LOG, forceState)
        if bm.CAN_RUN == 0:
            LOG.LogEntry("Unable to initialize BootManager.")
            # not being able to run at all is an error too
            error = 1
        else:
            LOG.LogEntry("Running version {} of BootManager.".format(bm.VARS['VERSION']))
            success = bm.Run()
            if success:
                LOG.LogEntry("\nDone!")
            else:
                LOG.LogEntry("\nError occurred!")
                error = 1
    except:
        # deliberately catches everything, so the log still gets uploaded
        traceback.print_exc(file=LOG.OutputFile)
        traceback.print_exc()
        error = 1

    _finish()

    return error
431
432     
if __name__ == "__main__":
    # run the boot manager and hand its error flag over as the exit status
    sys.exit(main(sys.argv))