a first stab at upgrading nodes
[bootmanager.git] / source / BootManager.py
1 #!/usr/bin/python -u
2 #
3 # Copyright (c) 2003 Intel Corporation
4 # All rights reserved.
5 #
6 # Copyright (c) 2004-2006 The Trustees of Princeton University
7 # All rights reserved.
8
9 from __future__ import print_function
10
11 import sys, os
12 import traceback
13 import string
14 import time
15 import gzip
16
17 from steps import *
18 from Exceptions import *
19 import notify_messages
20 import BootServerRequest
21 import utils
22
# all output is written to this file
# (main() passes this path to the log class, which opens it for writing)
BM_NODE_LOG = "/tmp/bm.log"
# file containing the initial variables/constants in name=value format;
# the name is relative, so it is resolved against the current directory
VARS_FILE = "configuration"

# the new contents of PATH when the boot manager is running
# (BootManager.__init__ joins these entries with ':' into os.environ['PATH'])
BIN_PATH = ('/usr/local/bin',
            '/usr/local/sbin',
            '/usr/bin',
            '/usr/sbin',
            '/bin',
            '/sbin')
34
def read_configuration_file(filename):
    """
    Parse a simple name=value configuration file into a dictionary.

    Lines whose first character is '#' and whitespace-only lines are
    skipped.  Every other line must contain an '=': the name is the text
    before the first '=', the value is everything after it up to the end
    of the line.  Whitespace around both is stripped, and every single or
    double quote character is removed from the value.

    The current working directory is stored under 'BM_SOURCE_DIR' so that
    later steps can get files out of the bootmanager directory.

    Returns the dictionary of variables.
    Raises Exception when a line has no '=' separator; errors from
    open() propagate unchanged.
    """
    config = {}

    # the with-statement closes the file even when a bad line makes us raise
    with open(filename, 'r') as vars_file:
        for line in vars_file:
            # if its a comment or a whitespace line, ignore
            if line[:1] == "#" or line.strip() == "":
                continue

            # split on the first '=' only, so that a value may itself
            # contain '=' (e.g. a URL with a query string)
            name, separator, value = line.partition("=")
            if not separator:
                raise Exception("Invalid line in vars file: {}".format(line))

            # remove quotes
            value = value.strip().replace("'", "").replace('"', "")
            config[name.strip()] = value

    # find out which directory we are running in, and set a variable
    # for that. future steps may need to get files out of the bootmanager
    # directory
    config['BM_SOURCE_DIR'] = os.getcwd()

    return config
70
71 ##############################
class log:
    """
    Timestamped logger that writes to the screen and to a log file.

    On construction the log file is opened for writing and the
    configuration file named by VARS_FILE is read; the resulting
    dictionary is kept in self.VARS (None when it could not be read).
    Instances also expose write(), so they can be used wherever a
    writable file-like object is expected.
    """

    # time.strftime() pattern prepended to every log entry
    format = "%H:%M:%S(%Z) "

    def __init__(self, OutputFilePath=None):
        """
        Open OutputFilePath for writing and load the configuration.

        Neither failure is fatal: if the file cannot be opened, logging
        continues on the screen only; if the configuration cannot be
        read, the error is logged and self.VARS stays None.
        """
        # remember the path even when opening fails, so the attribute
        # always exists
        self.OutputFilePath = OutputFilePath
        try:
            self.OutputFile = open(OutputFilePath, "w")
        except Exception:
            print("bootmanager log : Unable to open output file {}, continuing"\
                  .format(OutputFilePath))
            self.OutputFile = None

        self.VARS = None
        try:
            self.VARS = read_configuration_file(VARS_FILE)
        except Exception as e:
            self.LogEntry(str(e))

    def LogEntry(self, str, inc_newline = 1, display_screen = 1):
        """
        Write one timestamped entry.

        str            -- text of the entry
        inc_newline    -- when true, terminate the entry with a newline
        display_screen -- when true, also write the entry to stdout

        The log file, when open, always receives the entry and is
        flushed right away.
        """
        entry = time.strftime(log.format, time.localtime()) + str
        if inc_newline:
            entry += "\n"

        if self.OutputFile:
            self.OutputFile.write(entry)
            self.OutputFile.flush()
        if display_screen:
            sys.stdout.write(entry)

    def write(self, str):
        """
        make log behave like a writable file object
        (for traceback prints)
        """
        self.LogEntry(str, 0, 1)

    def print_stack(self):
        """
        dump current stack in log
        """
        self.write(traceback.format_exc())

    # bm log uploading is available back again, as of nodeconfig-5.0-2
    def Upload(self, extra_file=None):
        """
        upload the contents of the log to the server

        The log file is closed before being sent, so entries logged
        afterwards only reach the screen.  When the configuration needed
        for the upload (UPLOAD_LOG_SCRIPT, INTERFACE_SETTINGS) is not
        available, the upload is skipped with a log message and the log
        file stays open.

        extra_file -- optional path handed to the 'upload_logs' bash
                      function; failures of that command are ignored.
        """
        if self.OutputFile is not None:
            self.OutputFile.flush()

            try:
                # self.VARS is None when the configuration could not be
                # read, and INTERFACE_SETTINGS is absent from it until a
                # later step stores it there
                script = self.VARS['UPLOAD_LOG_SCRIPT']
                settings = self.VARS['INTERFACE_SETTINGS']
                hostname = settings['hostname'] + "." + settings['domainname']
            except (TypeError, KeyError) as e:
                self.LogEntry("Not uploading logs, missing configuration: {}".format(e))
            else:
                self.LogEntry("Uploading logs to {}".format(script))

                self.OutputFile.close()
                self.OutputFile = None

                bs_request = BootServerRequest.BootServerRequest(self.VARS)
                try:
                    # this was working until f10
                    bs_request.MakeRequest(PartialPath = script,
                                           GetVars = None, PostVars = None,
                                           DoSSL = True, DoCertCheck = True,
                                           FormData = ["log=@" + self.OutputFilePath,
                                                       "hostname=" + hostname,
                                                       "type=bm.log"])
                except Exception:
                    # new pycurl
                    import pycurl
                    bs_request.MakeRequest(PartialPath = script,
                                           GetVars = None, PostVars = None,
                                           DoSSL = True, DoCertCheck = True,
                                           FormData = [('log',(pycurl.FORM_FILE, self.OutputFilePath)),
                                                       ("hostname",hostname),
                                                       ("type","bm.log")])
        if extra_file is not None:
            # NOTE: for code-reuse, evoke the bash function 'upload_logs';
            # by adding --login, bash reads .bash_profile before execution.
            # Also, never fail, since this is an optional feature.
            utils.sysexec_noerr("""bash --login -c "upload_logs {}" """.format(extra_file), self)
160
161
162 ##############################
class BootManager:
    """
    Drives a node through the steps matching its boot state.

    An instance is built from a log object (which also carries the
    configuration variables) and an optional state forced from the
    command line; Run() then executes the step sequence for the
    node's BOOT_STATE.
    """

    # the set of valid node run states; the values are filled in by
    # Run() with the function that implements each state
    NodeRunStates = {'reinstall' : None,
                     'upgrade' : None,
                     'boot' : None,
                     'safeboot' : None,
                     'disabled' : None,
                     }

    def __init__(self, log, forceState):
        """
        log        -- logging object; its VARS attribute provides the
                      configuration variables
        forceState -- run state overriding the node's current one, or None

        CAN_RUN is set to 1 only when the configuration is available.
        """
        # override machine's current state from the command line
        self.forceState = forceState

        # the main logging point
        self.LOG = log

        # set to 1 if we can run after initialization
        self.CAN_RUN = 0

        # this contains a set of information used and updated by each step;
        # assigned unconditionally so the attribute exists even on failure
        self.VARS = log.VARS
        if not self.VARS:
            return

        # not sure what the current PATH is set to, replace it with what
        # we know will work with all the boot cds
        os.environ['PATH'] = ":".join(BIN_PATH)

        self.CAN_RUN = 1

    def Run(self):
        """
        core boot manager logic.

        the way errors are handled is as such: if any particular step
        cannot continue or unexpectedly fails, an exception is thrown.
        in this case, the boot manager cannot continue running.

        these step functions can also return a 0/1 depending on whether
        or not it succeeded. In the case of steps like ConfirmInstallWithUser,
        a 0 is returned and no exception is thrown if the user chose not
        to confirm the install. The same goes with the CheckHardwareRequirements.
        If requirements not met, but tests were successful, return 0.

        for steps that run within the installer, they are expected to either
        complete successfully and return 1, or throw an exception.

        For exact return values and expected operations, see the comments
        at the top of each of the individual step functions.

        Returns 1 when the state function completed, 0 otherwise.
        """

        def _nodeNotInstalled(message='MSG_NODE_NOT_INSTALLED'):
            # called by the _xxxState() functions below upon failure
            self.VARS['RUN_LEVEL'] = 'failboot'
            notify = getattr(notify_messages, message)
            self.VARS['STATE_CHANGE_NOTIFY'] = 1
            self.VARS['STATE_CHANGE_NOTIFY_MESSAGE'] = notify
            raise BootManagerException(notify)

        def _startFallbackDebug():
            # starting the fallback/debug ssh daemon for safety:
            # if the node install somehow hangs, or if it simply takes ages,
            # we can still enter and investigate.
            # This is best effort only, so a failure is logged and ignored.
            try:
                StartDebug.Run(self.VARS, self.LOG, last_resort = False)
            except Exception as e:
                self.LOG.LogEntry("Could not start fallback debug mode, continuing: {}".format(e))

        def _bootRun():
            # implements the boot logic, which consists of first
            # double checking that the node was properly installed,
            # checking whether someone added or changed disks, and
            # then finally chain boots.
            _startFallbackDebug()

            InstallInit.Run(self.VARS, self.LOG)
            ret = ValidateNodeInstall.Run(self.VARS, self.LOG)
            if ret != 1:
                # map the validation result to the notification to send;
                # any other value gets the default message
                failures = {-1: 'MSG_NODE_FILESYSTEM_CORRUPT',
                            -2: 'MSG_NODE_MOUNT_FAILED',
                            -3: 'MSG_NODE_MISSING_KERNEL',
                            }
                _nodeNotInstalled(failures.get(ret, 'MSG_NODE_NOT_INSTALLED'))

            # Thierry - feb. 2013 turning off WriteModprobeConfig for now on lxc
            # for one thing this won't work at all with f18, as modules.pcimap
            # has disappeared (Daniel suggested modules.aliases could be used instead)
            # and second, in any case it's been years now that modprobe.conf was deprecated
            # so most likely this code has no actual effect
            if self.VARS['virt'] == 'vs':
                WriteModprobeConfig.Run(self.VARS, self.LOG)
            WriteNetworkConfig.Run(self.VARS, self.LOG)
            CheckForNewDisks.Run(self.VARS, self.LOG)
            SendHardwareConfigToPLC.Run(self.VARS, self.LOG)
            ChainBootNode.Run(self.VARS, self.LOG)

        def _reinstallRun(upgrade=False):
            # implements the reinstall logic, which will check whether
            # the min. hardware requirements are met, install the
            # software, and upon correct installation will switch to
            # 'boot' state and chainboot into the production system
            _startFallbackDebug()

            if not CheckHardwareRequirements.Run(self.VARS, self.LOG):
                self.VARS['RUN_LEVEL'] = 'failboot'
                raise BootManagerException("Hardware requirements not met.")

            # runinstaller
            InstallInit.Run(self.VARS, self.LOG)
            # do not erase disks in upgrade mode
            if not upgrade:
                InstallPartitionDisks.Run(self.VARS, self.LOG)
            # pass upgrade boolean to this step so we can do extra cleanup
            InstallBootstrapFS.Run(self.VARS, upgrade, self.LOG)
            InstallWriteConfig.Run(self.VARS, self.LOG)
            InstallUninitHardware.Run(self.VARS, self.LOG)
            self.VARS['BOOT_STATE'] = 'boot'
            self.VARS['STATE_CHANGE_NOTIFY'] = 1
            self.VARS['STATE_CHANGE_NOTIFY_MESSAGE'] = \
                 notify_messages.MSG_INSTALL_FINISHED
            AnsibleHook.Run(self.VARS, self.LOG)
            UpdateBootStateWithPLC.Run(self.VARS, self.LOG)
            _bootRun()

        def _installRun():
            # implements the new install logic, which will first check
            # with the user whether it is ok to install on this
            # machine, switch to 'reinstall' state and then invoke the reinstall
            # logic.  See reinstallState logic comments for further
            # details.
            # NOTE: not registered in the state table below, so no boot
            # state currently reaches this function
            if not ConfirmInstallWithUser.Run(self.VARS, self.LOG):
                return 0
            self.VARS['BOOT_STATE'] = 'reinstall'

            AnsibleHook.Run(self.VARS, self.LOG)
            _reinstallRun()

        def _debugRun(state='failboot'):
            # implements debug logic, which starts the sshd and just waits around
            self.VARS['RUN_LEVEL'] = state
            StartDebug.Run(self.VARS, self.LOG)
            # fsck/mount fs if present, and ignore return value if it's not.
            ValidateNodeInstall.Run(self.VARS, self.LOG)

        def _badstateRun():
            # should never happen; log event
            self.LOG.write("\nInvalid BOOT_STATE = {}\n".format(self.VARS['BOOT_STATE']))
            _debugRun()

        # setup state -> function hash table
        BootManager.NodeRunStates['reinstall']  = lambda : _reinstallRun(upgrade=False)
        BootManager.NodeRunStates['upgrade']    = lambda : _reinstallRun(upgrade=True)
        BootManager.NodeRunStates['boot']       = _bootRun
        BootManager.NodeRunStates['safeboot']   = lambda : _debugRun('safeboot')
        BootManager.NodeRunStates['disabled']   = lambda : _debugRun('disabled')

        success = 0
        try:
            InitializeBootManager.Run(self.VARS, self.LOG)
            ReadNodeConfiguration.Run(self.VARS, self.LOG)
            AuthenticateWithPLC.Run(self.VARS, self.LOG)
            UpdateLastBootOnce.Run(self.VARS, self.LOG)
            StartRunlevelAgent.Run(self.VARS, self.LOG)
            GetAndUpdateNodeDetails.Run(self.VARS, self.LOG)

            # override machine's current state from the command line
            if self.forceState is not None:
                self.VARS['BOOT_STATE'] = self.forceState
                UpdateBootStateWithPLC.Run(self.VARS, self.LOG)

            stateRun = BootManager.NodeRunStates.get(self.VARS['BOOT_STATE'], _badstateRun)
            stateRun()
            success = 1

        except KeyError as e:
            self.LOG.write("\n\nKeyError while running: {}\n".format(e))
            self.LOG.print_stack()
        # NOTE(review): handled before BootManagerException so this branch
        # stays reachable should the authentication exception derive from
        # it; the Exceptions module is not visible here - confirm
        except BootManagerAuthenticationException as e:
            self.LOG.write("\n\nFailed to Authenticate Node: {}\n".format(e))
            self.LOG.print_stack()
            # sets /tmp/CANCEL_BOOT flag
            StartDebug.Run(self.VARS, self.LOG)
            # Return immediately b/c any other calls to API will fail
            return success
        except BootManagerException as e:
            self.LOG.write("\n\nException while running: {}\n".format(e))
            self.LOG.print_stack()
        except:
            # deliberately catches everything: whatever went wrong, the
            # node must still end up in debug mode below
            self.LOG.write("\n\nImplementation Error\n")
            self.LOG.print_stack()

        if not success:
            try:
                _debugRun()
            except BootManagerException as e:
                self.LOG.write("\n\nException while running: {}\n".format(e))
            except:
                self.LOG.write("\n\nImplementation Error\n")
                traceback.print_exc(file=self.LOG.OutputFile)
                traceback.print_exc()

        return success
376
def main(argv):
    """
    Entry point of the boot manager.

    argv -- command line as in sys.argv; when exactly one argument is
            given it is taken as the node run state to force, and must
            be one of the keys of BootManager.NodeRunStates.

    Sets up logging, runs the BootManager and uploads the log.
    Returns 0 on success and 1 when an error occurred, including an
    invalid forced state, a BootManager that cannot be initialized, and
    any unexpected exception.
    """
    utils.prompt_for_breakpoint_mode()

#    utils.breakpoint("Entering BootManager::main")

    # set to 1 if error occurred
    error = 0

    # all output goes through this class so we can save it and post
    # the data back to PlanetLab central
    LOG = log(BM_NODE_LOG)

    def timestamp():
        # current UTC time, as shown in the start/finish log entries
        return time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime())

    def finish(status):
        # common epilogue: log the end time, upload the log, hand back status
        LOG.LogEntry("BootManager finished at: {}".format(timestamp()))
        LOG.Upload()
        return status

    # NOTE: assume CWD is BM's source directory, but never fail
    utils.sysexec_noerr("./setup_bash_history_scripts.sh", LOG)

    LOG.LogEntry("BootManager started at: {}".format(timestamp()))

    forceState = None
    try:
        if len(argv) == 2:
            fState = argv[1]
            if fState in BootManager.NodeRunStates:
                forceState = fState
            else:
                LOG.LogEntry("FATAL: cannot force node run state to={}".format(fState))
                error = 1
    except:
        traceback.print_exc(file=LOG.OutputFile)
        traceback.print_exc()

    if error:
        return finish(error)

    try:
        bm = BootManager(LOG, forceState)
        if bm.CAN_RUN == 0:
            LOG.LogEntry("Unable to initialize BootManager.")
            # not being able to run at all is an error for the caller
            error = 1
        else:
            LOG.LogEntry("Running version {} of BootManager.".format(bm.VARS['VERSION']))
            if bm.Run():
                LOG.LogEntry("\nDone!")
            else:
                LOG.LogEntry("\nError occurred!")
                error = 1
    except:
        # top-level boundary: catch everything so that the log still
        # gets uploaded, but report the failure through the exit status
        traceback.print_exc(file=LOG.OutputFile)
        traceback.print_exc()
        error = 1

    return finish(error)
437
# run the boot manager when executed as a script, and use the value
# returned by main() as the process exit status
if __name__ == "__main__":
    sys.exit(main(sys.argv))