oops
[bootmanager.git] / source / BootManager.py
1 #!/usr/bin/python -u
2 #
3 # Copyright (c) 2003 Intel Corporation
4 # All rights reserved.
5 #
6 # Copyright (c) 2004-2006 The Trustees of Princeton University
7 # All rights reserved.
8
9 import sys, os
10 import traceback
11 import string
12 import time
13 import gzip
14
15 from steps import *
16 from Exceptions import *
17 import notify_messages
18 import BootServerRequest
19 import utils
20
# file on the node that receives all boot manager output
BM_NODE_LOG = "/tmp/bm.log"
# name of the file holding the initial name=value variables
VARS_FILE = "configuration"

# directories that make up PATH while the boot manager is running,
# in lookup order
BIN_PATH = (
    '/usr/local/bin', '/usr/local/sbin',
    '/usr/bin', '/usr/sbin',
    '/bin', '/sbin',
)
32
def read_configuration_file(filename):
    """
    Parse a name=value configuration file into a dictionary.

    Lines whose first character is '#' and lines containing only
    whitespace are ignored. Every other line is split at its FIRST '=':
    the text before it is the variable name, everything after it up to
    the end of the line is the value (so a value may itself contain '=').
    Whitespace around the name and the value is stripped, and all single
    and double quote characters are removed from the value.

    The returned dictionary also maps 'BM_SOURCE_DIR' to the current
    working directory, so that later steps can locate files shipped in
    the bootmanager directory.

    Raises Exception when a non-comment, non-blank line has no '='.
    Errors from opening or reading the file propagate unchanged.
    """
    config = {}

    # the context manager closes the file even when a bad line aborts parsing
    with open(filename, 'r') as vars_file:
        for line in vars_file:
            # if it's a comment or a whitespace line, ignore
            if line[:1] == "#" or line.strip() == "":
                continue

            name, separator, value = line.partition("=")
            if not separator:
                raise Exception("Invalid line in vars file: {}".format(line))

            # remove surrounding whitespace, then every quote character
            value = value.strip().replace("'", "").replace('"', "")
            config[name.strip()] = value

    # find out which directory we are running in, and set a variable
    # for that. future steps may need to get files out of the bootmanager
    # directory
    config['BM_SOURCE_DIR'] = os.getcwd()

    return config
68
69 ##############################
class log:
    """
    Boot manager log sink.

    Every entry is prefixed with a timestamp and written to the log file
    (when one could be opened) and, optionally, to stdout. The object also
    exposes write() so it can be handed to code expecting a writable file,
    and Upload() to post the log file to the boot server.

    Attributes:
      OutputFilePath -- the path passed to the constructor
      OutputFile     -- the open log file, or None if it could not be
                        opened or has been closed by Upload()
      VARS           -- the dictionary read from VARS_FILE, or None when
                        that file could not be read
    """

    # time.strftime() pattern of the prefix put in front of every entry
    format = "%H:%M:%S(%Z) "

    def __init__(self, OutputFilePath=None):
        """
        Open OutputFilePath for writing and load the variables in VARS_FILE.

        Neither failure is fatal: without a log file entries only reach
        stdout, and without a readable VARS_FILE self.VARS stays None
        (the reason is written to the log).
        """
        # always record the path so the attribute exists even if open() fails
        self.OutputFilePath = OutputFilePath
        try:
            self.OutputFile = open(OutputFilePath, "w")
        except Exception:
            print("bootmanager log : Unable to open output file {}, continuing"\
                  .format(OutputFilePath))
            self.OutputFile = None

        self.VARS = None
        try:
            self.VARS = read_configuration_file(VARS_FILE)
        except Exception as e:
            self.LogEntry(str(e))

    def LogEntry(self, str, inc_newline = 1, display_screen = 1):
        """
        Write one timestamped entry.

        str            -- the text to log (the parameter name shadows the
                          builtin; it is kept for caller compatibility)
        inc_newline    -- when true, terminate the entry with a newline
        display_screen -- when true, echo the entry to stdout as well

        The log file, if open, is flushed after every entry.
        """
        entry = time.strftime(log.format, time.localtime()) + str
        if inc_newline:
            entry += "\n"

        if self.OutputFile:
            self.OutputFile.write(entry)
        if display_screen:
            sys.stdout.write(entry)

        if self.OutputFile:
            self.OutputFile.flush()

    def write(self, str):
        """
        make log behave like a writable file object (for traceback
        prints): the text is logged without a trailing newline
        """
        self.LogEntry(str, 0, 1)

    def print_stack(self):
        """
        dump the traceback of the exception currently being handled
        in the log
        """
        self.write(traceback.format_exc())

    # bm log uploading is available back again, as of nodeconfig-5.0-2
    def Upload(self, extra_file=None):
        """
        upload the contents of the log to the server

        The log file is closed before it is posted, so entries logged
        afterwards only reach stdout. If the upload script or the node's
        hostname/domainname are not available in self.VARS (configuration
        unreadable, or node configuration not read yet), the upload is
        skipped with a log entry and the log file stays open.

        extra_file -- optional additional file, uploaded through the bash
                      function 'upload_logs'; failures there are ignored
        """
        if self.OutputFile is not None:
            self.OutputFile.flush()

            try:
                # self.VARS is None when the configuration could not be
                # read (TypeError); the keys may not be set yet (KeyError)
                upload_script = self.VARS['UPLOAD_LOG_SCRIPT']
                settings = self.VARS['INTERFACE_SETTINGS']
                hostname = settings['hostname'] + "." + settings['domainname']
            except (TypeError, KeyError) as e:
                self.LogEntry("Unable to upload logs, missing configuration: {}"
                              .format(e))
            else:
                self.LogEntry("Uploading logs to {}".format(upload_script))

                self.OutputFile.close()
                self.OutputFile = None

                bs_request = BootServerRequest.BootServerRequest(self.VARS)
                try:
                    # this was working until f10
                    bs_request.MakeRequest(PartialPath = upload_script,
                                           GetVars = None, PostVars = None,
                                           DoSSL = True, DoCertCheck = True,
                                           FormData = ["log=@" + self.OutputFilePath,
                                                       "hostname=" + hostname,
                                                       "type=bm.log"])
                except Exception:
                    # new pycurl: retry with the tuple-based form description
                    import pycurl
                    bs_request.MakeRequest(PartialPath = upload_script,
                                           GetVars = None, PostVars = None,
                                           DoSSL = True, DoCertCheck = True,
                                           FormData = [('log',(pycurl.FORM_FILE, self.OutputFilePath)),
                                                       ("hostname",hostname),
                                                       ("type","bm.log")])
        if extra_file is not None:
            # NOTE: for code-reuse, evoke the bash function 'upload_logs';
            # by adding --login, bash reads .bash_profile before execution.
            # Also, never fail, since this is an optional feature.
            utils.sysexec_noerr("""bash --login -c "upload_logs {}" """.format(extra_file), self)
158
159
160 ##############################
class BootManager:
    """
    Runs the sequence of boot manager steps matching the node's boot state.

    Attributes:
      forceState -- boot state forced from the command line, or None
      LOG        -- the log object all steps write to
      VARS       -- dictionary of variables read and updated by each step
                    (None when the log object carries no variables)
      CAN_RUN    -- 1 once initialization succeeded, 0 otherwise
    """

    # the set of valid node run states; Run() fills in, for each state,
    # the function implementing it
    NodeRunStates = {'reinstall' : None,
                     'upgrade' : None,
                     'boot' : None,
                     'safeboot' : None,
                     'disabled' : None,
                     }

    def __init__(self, log, forceState):
        """
        log        -- the main logging point; its VARS attribute supplies
                      the initial variables
        forceState -- overrides the machine's current state, or None

        CAN_RUN is left at 0 when log.VARS is empty or None.
        """
        # override machine's current state from the command line
        self.forceState = forceState

        # the main logging point
        self.LOG = log

        # set to 1 if we can run after initialization
        self.CAN_RUN = 0

        # defined even when initialization fails, so the attribute always exists
        self.VARS = None

        if not log.VARS:
            return

        # this contains a set of information used and updated by each step
        self.VARS = log.VARS

        # not sure what the current PATH is set to, replace it with what
        # we know will work with all the boot cds
        os.environ['PATH'] = ":".join(BIN_PATH)

        self.CAN_RUN = 1

    def Run(self):
        """
        core boot manager logic. Returns 1 on success, 0 otherwise.

        the way errors are handled is as such: if any particular step
        cannot continue or unexpectedly fails, an exception is thrown.
        in this case, the boot manager cannot continue running.

        these step functions can also return a 0/1 depending on whether
        or not it succeeded. In the case of steps like ConfirmInstallWithUser,
        a 0 is returned and no exception is thrown if the user chose not
        to confirm the install. The same goes with the CheckHardwareRequirements.
        If requirements are not met, but tests were successful, return 0.

        for steps that run within the installer, they are expected to either
        complete successfully and return 1, or throw an exception.

        For exact return values and expected operations, see the comments
        at the top of each of the individual step functions.
        """

        def _nodeNotInstalled(message='MSG_NODE_NOT_INSTALLED'):
            # called by the _xxxState() functions below upon failure;
            # 'message' is the name of an attribute of notify_messages
            self.VARS['RUN_LEVEL'] = 'failboot'
            notify = getattr(notify_messages, message)
            self.VARS['STATE_CHANGE_NOTIFY'] = 1
            self.VARS['STATE_CHANGE_NOTIFY_MESSAGE'] = notify
            raise BootManagerException(notify)

        def _startFallbackDebug():
            # starting the fallback/debug ssh daemon for safety:
            # if the node install somehow hangs, or if it simply takes ages,
            # we can still enter and investigate.
            # a failure here must never stop the boot, so it is only logged
            try:
                StartDebug.Run(self.VARS, self.LOG, last_resort = False)
            except Exception as e:
                self.LOG.LogEntry("Ignoring failure to start fallback debug mode: {}"
                                  .format(e))

        def _bootRun():
            # implements the boot logic, which consists of first
            # double checking that the node was properly installed,
            # checking whether someone added or changed disks, and
            # then finally chain boots.
            _startFallbackDebug()

            InstallInit.Run(self.VARS, self.LOG)
            ret = ValidateNodeInstall.Run(self.VARS, self.LOG)
            if ret == 1:
                # Thierry - feb. 2013 turning off WriteModprobeConfig for now on lxc
                # for one thing this won't work at all with f18, as modules.pcimap
                # has disappeared (Daniel suggested modules.aliases could be used instead)
                # and second, in any case it's been years now that modprobe.conf was deprecated
                # so most likely this code has no actual effect
                if self.VARS['virt'] == 'vs':
                    WriteModprobeConfig.Run(self.VARS, self.LOG)
                WriteNetworkConfig.Run(self.VARS, self.LOG)
                CheckForNewDisks.Run(self.VARS, self.LOG)
                SendHardwareConfigToPLC.Run(self.VARS, self.LOG)
                ChainBootNode.Run(self.VARS, self.LOG)
            elif ret == -1:
                _nodeNotInstalled('MSG_NODE_FILESYSTEM_CORRUPT')
            elif ret == -2:
                _nodeNotInstalled('MSG_NODE_MOUNT_FAILED')
            elif ret == -3:
                _nodeNotInstalled('MSG_NODE_MISSING_KERNEL')
            else:
                _nodeNotInstalled()

        def _reinstallRun(upgrade=False):
            _startFallbackDebug()

            # implements the reinstall logic, which will check whether
            # the min. hardware requirements are met, install the
            # software, and upon correct installation will switch to
            # 'boot' state and chainboot into the production system
            if not CheckHardwareRequirements.Run(self.VARS, self.LOG):
                self.VARS['RUN_LEVEL'] = 'failboot'
                raise BootManagerException("Hardware requirements not met.")

            # runinstaller; an upgrade keeps the existing disk partitioning
            InstallInit.Run(self.VARS, self.LOG)
            if not upgrade:
                InstallPartitionDisks.Run(self.VARS, self.LOG)
            InstallBootstrapFS.Run(self.VARS, self.LOG)
            InstallWriteConfig.Run(self.VARS, self.LOG)
            InstallUninitHardware.Run(self.VARS, self.LOG)
            self.VARS['BOOT_STATE'] = 'boot'
            self.VARS['STATE_CHANGE_NOTIFY'] = 1
            self.VARS['STATE_CHANGE_NOTIFY_MESSAGE'] = \
                 notify_messages.MSG_INSTALL_FINISHED
            AnsibleHook.Run(self.VARS, self.LOG)
            UpdateBootStateWithPLC.Run(self.VARS, self.LOG)
            _bootRun()

        def _installRun():
            # implements the new install logic, which will first check
            # with the user whether it is ok to install on this
            # machine, switch to 'reinstall' state and then invoke the reinstall
            # logic.  See reinstallState logic comments for further
            # details.
            # NOTE(review): not registered in the state table below, so it
            # is never invoked from this method -- confirm whether intended
            if not ConfirmInstallWithUser.Run(self.VARS, self.LOG):
                return 0
            self.VARS['BOOT_STATE'] = 'reinstall'

            AnsibleHook.Run(self.VARS, self.LOG)
            _reinstallRun()

        def _debugRun(state='failboot'):
            # implements debug logic, which starts the sshd and just waits around
            self.VARS['RUN_LEVEL'] = state
            StartDebug.Run(self.VARS, self.LOG)
            # fsck/mount fs if present, and ignore return value if it's not.
            ValidateNodeInstall.Run(self.VARS, self.LOG)

        def _badstateRun():
            # should never happen; log event
            self.LOG.write("\nInvalid BOOT_STATE = {}\n".format(self.VARS['BOOT_STATE']))
            _debugRun()

        # setup state -> function hash table
        BootManager.NodeRunStates['reinstall']  = lambda : _reinstallRun(upgrade=False)
        BootManager.NodeRunStates['upgrade']    = lambda : _reinstallRun(upgrade=True)
        BootManager.NodeRunStates['boot']       = _bootRun
        BootManager.NodeRunStates['safeboot']   = lambda : _debugRun('safeboot')
        BootManager.NodeRunStates['disabled']   = lambda : _debugRun('disabled')

        success = 0
        try:
            InitializeBootManager.Run(self.VARS, self.LOG)
            ReadNodeConfiguration.Run(self.VARS, self.LOG)
            AuthenticateWithPLC.Run(self.VARS, self.LOG)
            UpdateLastBootOnce.Run(self.VARS, self.LOG)
            StartRunlevelAgent.Run(self.VARS, self.LOG)
            GetAndUpdateNodeDetails.Run(self.VARS, self.LOG)

            # override machine's current state from the command line
            if self.forceState is not None:
                self.VARS['BOOT_STATE'] = self.forceState
                UpdateBootStateWithPLC.Run(self.VARS, self.LOG)

            stateRun = BootManager.NodeRunStates.get(self.VARS['BOOT_STATE'], _badstateRun)
            stateRun()
            success = 1

        except KeyError as e:
            self.LOG.write("\n\nKeyError while running: {}\n".format(e))
            self.LOG.print_stack ()
        # handled before BootManagerException so this clause still takes
        # effect should the authentication exception derive from it (the
        # Exceptions module is not visible from here)
        except BootManagerAuthenticationException as e:
            self.LOG.write("\n\nFailed to Authenticate Node: {}\n".format(e))
            self.LOG.print_stack ()
            # sets /tmp/CANCEL_BOOT flag
            StartDebug.Run(self.VARS, self.LOG)
            # Return immediately b/c any other calls to API will fail
            return success
        except BootManagerException as e:
            self.LOG.write("\n\nException while running: {}\n".format(e))
            self.LOG.print_stack ()
        except:
            # deliberately catch-all: whatever went wrong, the node must
            # still be dropped into debug mode below
            self.LOG.write("\n\nImplementation Error\n")
            self.LOG.print_stack ()

        if not success:
            try:
                _debugRun()
            except BootManagerException as e:
                self.LOG.write("\n\nException while running: {}\n".format(e))
            except:
                # deliberately catch-all: report, never propagate
                self.LOG.write("\n\nImplementation Error\n")
                traceback.print_exc(file=self.LOG.OutputFile)
                traceback.print_exc()

        return success
372             
373             
def main(argv):
    """
    Boot manager entry point.

    argv -- the command line; when it has exactly two items, argv[1] is
            the node run state to force and must be one of the keys of
            BootManager.NodeRunStates

    Sets up the log, runs the boot manager, and uploads the log in all
    cases. Returns 0 on success and 1 when an error occurred (invalid
    forced state, boot manager unable to initialize, failed run, or
    unexpected exception).
    """

    import utils
    utils.prompt_for_breakpoint_mode()

#    utils.breakpoint ("Entering BootManager::main")

    def _utc_now():
        # timestamp used in the started/finished log entries
        return time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime())

    def _finish():
        # common epilogue: final log entry, then post the log
        LOG.LogEntry("BootManager finished at: {}".format(_utc_now()))
        LOG.Upload()

    # set to 1 if error occurred
    error = 0

    # all output goes through this class so we can save it and post
    # the data back to PlanetLab central
    LOG = log(BM_NODE_LOG)

    # NOTE: assume CWD is BM's source directory, but never fail
    utils.sysexec_noerr("./setup_bash_history_scripts.sh", LOG)

    LOG.LogEntry("BootManager started at: {}".format(_utc_now()))

    forceState = None
    try:
        if len(argv) == 2:
            fState = argv[1]
            if fState in BootManager.NodeRunStates:
                forceState = fState
            else:
                LOG.LogEntry("FATAL: cannot force node run state to={}".format(fState))
                error = 1
    except Exception:
        traceback.print_exc(file=LOG.OutputFile)
        traceback.print_exc()

    if error:
        _finish()
        return error

    try:
        bm = BootManager(LOG, forceState)
        if bm.CAN_RUN == 0:
            LOG.LogEntry("Unable to initialize BootManager.")
            # failing to initialize is an error and must be reported as such
            error = 1
        else:
            LOG.LogEntry("Running version {} of BootManager.".format(bm.VARS['VERSION']))
            if bm.Run():
                LOG.LogEntry("\nDone!")
            else:
                LOG.LogEntry("\nError occurred!")
                error = 1
    except Exception:
        traceback.print_exc(file=LOG.OutputFile)
        traceback.print_exc()
        # an unexpected exception is an error as well
        error = 1

    _finish()

    return error
434
435     
if __name__ == "__main__":
    # the value returned by main() becomes the process exit status
    sys.exit(main(sys.argv))