c1c2f857178ea6848642556c6dcf02eebb87303e
[bootmanager.git] / source / BootManager.py
1 #!/usr/bin/python -u
2 #
3 # Copyright (c) 2003 Intel Corporation
4 # All rights reserved.
5 #
6 # Copyright (c) 2004-2006 The Trustees of Princeton University
7 # All rights reserved.
8
9 from __future__ import print_function
10
11 import sys, os
12 import traceback
13 import string
14 import time
15 import gzip
16
17 from steps import *
18 from Exceptions import *
19 import notify_messages
20 import BootServerRequest
21 import utils
22
# path of the file that receives a copy of everything the boot manager logs
BM_NODE_LOG = "/tmp/bm.log"

# name of the name=value configuration file; it is opened as a relative
# path, i.e. looked up in the current working directory
VARS_FILE = "configuration"

# directories that make up PATH while the boot manager is running,
# in lookup order
BIN_PATH = (
    '/usr/local/bin', '/usr/local/sbin',
    '/usr/bin', '/usr/sbin',
    '/bin', '/sbin',
)
34
def read_configuration_file(filename):
    """
    read a name=value configuration file and return its content as a dict.

    each significant line is in the format name=val; whitespace around
    the name and around the value is removed, and everything after the
    first = up to the end of the line is the value (so a value may itself
    contain = characters). single and double quotes are dropped from the
    value. lines starting with # and blank lines are ignored.

    the returned dict also gets a BM_SOURCE_DIR entry holding the current
    working directory, since future steps may need to get files out of
    the bootmanager directory.

    raises Exception if a significant line has no = at all, and whatever
    open() raises if the file cannot be read.
    """
    config = {}
    # the with block guarantees the file is closed, including when an
    # invalid line makes us bail out in the middle of the loop
    with open(filename, 'r') as vars_file:
        for line in vars_file:
            # if its a comment or a whitespace line, ignore
            if line[:1] == "#" or line.strip() == "":
                continue

            # split on the first = only, the rest of the line is the value
            name, separator, value = line.partition("=")
            if not separator:
                raise Exception("Invalid line in vars file: {}".format(line))

            value = value.strip()
            value = value.replace("'", "")   # remove quotes
            value = value.replace('"', "")   # remove quotes
            config[name.strip()] = value

    # find out which directory we are running it, and set a variable
    # for that. future steps may need to get files out of the bootmanager
    # directory
    config['BM_SOURCE_DIR'] = os.getcwd()

    return config
70
71 ##############################
class log:
    """
    the single logging point of the boot manager.

    every entry is timestamped and written both to the screen and to an
    output file, so that the file can later be posted back to the server
    with Upload(). creating the log also reads the configuration file
    VARS_FILE; the resulting dict is kept in self.VARS (None if the
    configuration could not be read).
    """

    # strftime pattern of the timestamp put in front of every entry
    format = "%H:%M:%S(%Z) "

    def __init__(self, OutputFilePath=None):
        """
        open OutputFilePath for writing and load the configuration.

        neither failure is fatal: if the file cannot be opened, logging
        goes to the screen only (self.OutputFile is None); if the
        configuration cannot be read, the error is logged and self.VARS
        stays None.
        """
        # always remember the path, even if opening it fails below
        self.OutputFilePath = OutputFilePath
        try:
            self.OutputFile = open(OutputFilePath, "w")
        except Exception:
            print("bootmanager log : Unable to open output file {}, continuing"\
                  .format(OutputFilePath))
            self.OutputFile = None

        self.VARS = None
        try:
            self.VARS = read_configuration_file(VARS_FILE)
        except Exception as e:
            self.LogEntry(str(e))

    def LogEntry(self, str, inc_newline = 1, display_screen = 1):
        """
        write one timestamped entry.

        str is the text of the entry; a newline is appended if
        inc_newline is true. the entry always goes to the output file
        (when there is one), and also to stdout if display_screen is true.
        """
        entry = time.strftime(log.format, time.localtime()) + str
        if inc_newline:
            entry += "\n"

        if display_screen:
            sys.stdout.write(entry)
        if self.OutputFile:
            self.OutputFile.write(entry)
            # flush right away so the file is up to date if we get stuck
            self.OutputFile.flush()

    def write(self, str):
        """
        make log behave like a writable file object
        (for traceback prints)
        """
        self.LogEntry(str, 0, 1)

    def print_stack(self):
        """
        dump the traceback of the exception being handled in log
        """
        self.write(traceback.format_exc())

    # bm log uploading is available back again, as of nodeconfig-5.0-2
    def Upload(self, extra_file=None):
        """
        upload the contents of the log to the server

        the output file is closed before it is posted, so entries logged
        afterwards only reach the screen. if the configuration needed for
        the upload (UPLOAD_LOG_SCRIPT, INTERFACE_SETTINGS) is not
        available, the upload is skipped with a log entry and the output
        file is left open.

        if extra_file is given, it is handed to the bash function
        'upload_logs'; this part never fails.
        """
        if self.OutputFile is not None:
            self.OutputFile.flush()

            # gather everything we need *before* closing the file: VARS is
            # None when the configuration could not be read, and
            # INTERFACE_SETTINGS is not among the keys set in this file
            try:
                upload_script = self.VARS['UPLOAD_LOG_SCRIPT']
                interface = self.VARS['INTERFACE_SETTINGS']
                hostname = interface['hostname'] + "." + interface['domainname']
            except (KeyError, TypeError) as e:
                self.LogEntry("Unable to upload logs, incomplete configuration: {}"
                              .format(e))
            else:
                self.LogEntry("Uploading logs to {}".format(upload_script))

                self.OutputFile.close()
                self.OutputFile = None

                bs_request = BootServerRequest.BootServerRequest(self.VARS)
                try:
                    # this was working until f10
                    bs_request.MakeRequest(PartialPath = upload_script,
                                           GetVars = None, PostVars = None,
                                           DoSSL = True, DoCertCheck = True,
                                           FormData = ["log=@" + self.OutputFilePath,
                                                       "hostname=" + hostname,
                                                       "type=bm.log"])
                except Exception:
                    # new pycurl
                    import pycurl
                    bs_request.MakeRequest(PartialPath = upload_script,
                                           GetVars = None, PostVars = None,
                                           DoSSL = True, DoCertCheck = True,
                                           FormData = [('log',(pycurl.FORM_FILE, self.OutputFilePath)),
                                                       ("hostname",hostname),
                                                       ("type","bm.log")])
        if extra_file is not None:
            # NOTE: for code-reuse, evoke the bash function 'upload_logs';
            # by adding --login, bash reads .bash_profile before execution.
            # Also, never fail, since this is an optional feature.
            utils.sysexec_noerr("""bash --login -c "upload_logs {}" """.format(extra_file), self)
160
161
162 ##############################
class BootManager:
    """
    drives a node through the boot manager steps that match its boot state.

    an instance is created from an initialized log object (which carries
    the configuration in log.VARS) and an optional state that overrides
    the machine's current one; Run() then executes the steps.
    """

    # the set of valid node run states; only the keys are meaningful here,
    # they are used to validate a state forced from the command line
    NodeRunStates = {'reinstall' : None,
                     'upgrade' : None,
                     'boot' : None,
                     'safeboot' : None,
                     'disabled' : None,
                     }

    def __init__(self, log, forceState):
        """
        log is the main logging point; forceState is either None or the
        name of the state to run instead of the machine's current one.

        self.CAN_RUN is set to 1 only if the configuration was read
        successfully (log.VARS is not empty); otherwise self.VARS is
        not set at all.
        """
        # override machine's current state from the command line
        self.forceState = forceState

        # the main logging point
        self.LOG = log

        # set to 1 if we can run after initialization
        self.CAN_RUN = 0

        if log.VARS:
            # this contains a set of information used and updated by each step
            self.VARS = log.VARS
        else:
            return

        # not sure what the current PATH is set to, replace it with what
        # we know will work with all the boot cds
        os.environ['PATH'] = ":".join(BIN_PATH)

        self.CAN_RUN = 1

    def Run(self):
        """
        core boot manager logic; returns 1 on success and 0 otherwise.

        the way errors are handled is as such: if any particular step
        cannot continue or unexpectedly fails, an exception is thrown.
        in this case, the boot manager cannot continue running.

        these step functions can also return a 0/1 depending on whether
        or not it succeeded. In the case of steps like ConfirmInstallWithUser,
        a 0 is returned and no exception is thrown if the user chose not
        to confirm the install. The same goes with the CheckHardwareRequirements.
        If requirements are not met, but tests were successful, return 0.

        for steps that run within the installer, they are expected to either
        complete successfully and return 1, or throw an exception.

        For exact return values and expected operations, see the comments
        at the top of each of the individual step functions.
        """

        def _nodeNotInstalled(message='MSG_NODE_NOT_INSTALLED'):
            # called by the _xxxState() functions below upon failure;
            # message is the name of an attribute of notify_messages
            self.VARS['RUN_LEVEL'] = 'failboot'
            notify = getattr(notify_messages, message)
            self.VARS['STATE_CHANGE_NOTIFY'] = 1
            self.VARS['STATE_CHANGE_NOTIFY_MESSAGE'] = notify
            raise BootManagerException(notify)

        def _startFallbackDebug():
            # starting the fallback/debug ssh daemon for safety:
            # if the node install somehow hangs, or if it simply takes ages,
            # we can still enter and investigate.
            # this is best effort only, so any failure is ignored on purpose
            try:
                StartDebug.Run(self.VARS, self.LOG, last_resort = False)
            except:
                pass

        def _bootRun():
            # implements the boot logic, which consists of first
            # double checking that the node was properly installed,
            # checking whether someone added or changed disks, and
            # then finally chain boots.
            _startFallbackDebug()

            InstallInit.Run(self.VARS, self.LOG)
            ret = ValidateNodeInstall.Run(self.VARS, self.LOG)
            if ret == 1:
                # Thierry - feb. 2013 turning off WriteModprobeConfig for now on lxc
                # for one thing this won't work at all with f18, as modules.pcimap
                # has disappeared (Daniel suggested modules.aliases could be used instead)
                # and second, in any case it's been years now that modprobe.conf was deprecated
                # so most likely this code has no actual effect
                if self.VARS['virt'] == 'vs':
                    WriteModprobeConfig.Run(self.VARS, self.LOG)
                WriteNetworkConfig.Run(self.VARS, self.LOG)
                CheckForNewDisks.Run(self.VARS, self.LOG)
                SendHardwareConfigToPLC.Run(self.VARS, self.LOG)
                ChainBootNode.Run(self.VARS, self.LOG)
            elif ret == -1:
                _nodeNotInstalled('MSG_NODE_FILESYSTEM_CORRUPT')
            elif ret == -2:
                _nodeNotInstalled('MSG_NODE_MOUNT_FAILED')
            elif ret == -3:
                _nodeNotInstalled('MSG_NODE_MISSING_KERNEL')
            else:
                _nodeNotInstalled()

        def _reinstallRun(upgrade=False):
            # implements the reinstall logic, which will check whether
            # the min. hardware requirements are met, install the
            # software, and upon correct installation will switch to
            # 'boot' state and chainboot into the production system.
            # with upgrade=True the disks are not repartitioned.
            _startFallbackDebug()

            if not CheckHardwareRequirements.Run(self.VARS, self.LOG):
                self.VARS['RUN_LEVEL'] = 'failboot'
                raise BootManagerException("Hardware requirements not met.")

            # runinstaller
            InstallInit.Run(self.VARS, self.LOG)
            if not upgrade:
                InstallPartitionDisks.Run(self.VARS, self.LOG)
            InstallBootstrapFS.Run(self.VARS, self.LOG)
            InstallWriteConfig.Run(self.VARS, self.LOG)
            InstallUninitHardware.Run(self.VARS, self.LOG)
            self.VARS['BOOT_STATE'] = 'boot'
            self.VARS['STATE_CHANGE_NOTIFY'] = 1
            self.VARS['STATE_CHANGE_NOTIFY_MESSAGE'] = \
                 notify_messages.MSG_INSTALL_FINISHED
            AnsibleHook.Run(self.VARS, self.LOG)
            UpdateBootStateWithPLC.Run(self.VARS, self.LOG)
            _bootRun()

        def _installRun():
            # implements the new install logic, which will first check
            # with the user whether it is ok to install on this
            # machine, switch to 'reinstall' state and then invoke the reinstall
            # logic.  See reinstallState logic comments for further
            # details.
            # NOTE: no run state is currently mapped to this function
            if not ConfirmInstallWithUser.Run(self.VARS, self.LOG):
                return 0
            self.VARS['BOOT_STATE'] = 'reinstall'

            AnsibleHook.Run(self.VARS, self.LOG)
            _reinstallRun()

        def _debugRun(state='failboot'):
            # implements debug logic, which starts the sshd and just waits around
            self.VARS['RUN_LEVEL'] = state
            StartDebug.Run(self.VARS, self.LOG)
            # fsck/mount fs if present, and ignore return value if it's not.
            ValidateNodeInstall.Run(self.VARS, self.LOG)

        def _badstateRun():
            # should never happen; log event
            self.LOG.write("\nInvalid BOOT_STATE = {}\n".format(self.VARS['BOOT_STATE']))
            _debugRun()

        # setup state -> function hash table.
        # this is kept local on purpose: the functions are closures bound
        # to this instance and do not belong in the class-wide
        # NodeRunStates dictionary, which only advertises the valid names
        runStates = {
            'reinstall' : lambda : _reinstallRun(upgrade=False),
            'upgrade'   : lambda : _reinstallRun(upgrade=True),
            'boot'      : _bootRun,
            'safeboot'  : lambda : _debugRun('safeboot'),
            'disabled'  : lambda : _debugRun('disabled'),
        }

        success = 0
        try:
            InitializeBootManager.Run(self.VARS, self.LOG)
            ReadNodeConfiguration.Run(self.VARS, self.LOG)
            AuthenticateWithPLC.Run(self.VARS, self.LOG)
            UpdateLastBootOnce.Run(self.VARS, self.LOG)
            StartRunlevelAgent.Run(self.VARS, self.LOG)
            GetAndUpdateNodeDetails.Run(self.VARS, self.LOG)

            # override machine's current state from the command line
            if self.forceState is not None:
                self.VARS['BOOT_STATE'] = self.forceState
                UpdateBootStateWithPLC.Run(self.VARS, self.LOG)

            stateRun = runStates.get(self.VARS['BOOT_STATE'], _badstateRun)
            stateRun()
            success = 1

        except KeyError as e:
            self.LOG.write("\n\nKeyError while running: {}\n".format(e))
            self.LOG.print_stack()
        # NOTE(review): listed before BootManagerException so that this
        # handler is still reached if the authentication exception happens
        # to derive from it (the hierarchy is declared in Exceptions,
        # which is not visible from here)
        except BootManagerAuthenticationException as e:
            self.LOG.write("\n\nFailed to Authenticate Node: {}\n".format(e))
            self.LOG.print_stack()
            # sets /tmp/CANCEL_BOOT flag
            StartDebug.Run(self.VARS, self.LOG)
            # Return immediately b/c any other calls to API will fail
            return success
        except BootManagerException as e:
            self.LOG.write("\n\nException while running: {}\n".format(e))
            self.LOG.print_stack()
        except:
            self.LOG.write("\n\nImplementation Error\n")
            self.LOG.print_stack()

        if not success:
            # something went wrong: fall back to debug mode
            try:
                _debugRun()
            except BootManagerException as e:
                self.LOG.write("\n\nException while running: {}\n".format(e))
            except:
                self.LOG.write("\n\nImplementation Error\n")
                traceback.print_exc(file=self.LOG.OutputFile)
                traceback.print_exc()

        return success
374
def _utc_timestamp():
    # current UTC time, formatted for the started/finished log entries
    return time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime())


def main(argv):
    """
    entry point of the boot manager; returns the process exit status.

    argv is the full argument vector; when it holds exactly one argument
    besides the program name, that argument is the node run state to
    force, and it must be one of the keys of BootManager.NodeRunStates.

    returns 1 if the forced state is invalid, if BootManager.Run()
    reports a failure, or if an unexpected exception escapes, and 0
    otherwise. In all cases the log is uploaded before returning.
    """
    utils.prompt_for_breakpoint_mode()

#    utils.breakpoint ("Entering BootManager::main")

    # set to 1 if error occurred
    error = 0

    # all output goes through this class so we can save it and post
    # the data back to PlanetLab central
    LOG = log(BM_NODE_LOG)

    # NOTE: assume CWD is BM's source directory, but never fail
    utils.sysexec_noerr("./setup_bash_history_scripts.sh", LOG)

    LOG.LogEntry("BootManager started at: {}".format(_utc_timestamp()))

    forceState = None
    if len(argv) == 2:
        fState = argv[1]
        if fState in BootManager.NodeRunStates:
            forceState = fState
        else:
            LOG.LogEntry("FATAL: cannot force node run state to={}".format(fState))
            error = 1

    if not error:
        try:
            bm = BootManager(LOG, forceState)
            if bm.CAN_RUN == 0:
                LOG.LogEntry("Unable to initialize BootManager.")
            else:
                LOG.LogEntry("Running version {} of BootManager.".format(bm.VARS['VERSION']))
                success = bm.Run()
                if success:
                    LOG.LogEntry("\nDone!")
                else:
                    LOG.LogEntry("\nError occurred!")
                    error = 1
        except:
            # last line of defense: catch everything, so that the log
            # still gets completed and uploaded below
            traceback.print_exc(file=LOG.OutputFile)
            traceback.print_exc()
            # an unexpected exception is an error too
            error = 1

    LOG.LogEntry("BootManager finished at: {}".format(_utc_timestamp()))
    LOG.Upload()

    return error
435
if __name__ == "__main__":
    # hand main()'s error flag over to the caller as the exit status
    sys.exit(main(sys.argv))