This is supported only if there is a receiving server, such as myops, for the
[bootmanager.git] / source / BootManager.py
1 #!/usr/bin/python -u
2 #
3 # $Id$
4 # $URL$
5 #
6 # Copyright (c) 2003 Intel Corporation
7 # All rights reserved.
8 #
9 # Copyright (c) 2004-2006 The Trustees of Princeton University
10 # All rights reserved.
11
12 import string
13 import sys, os, traceback
14 import time
15 import gzip
16
17 from steps import *
18 from Exceptions import *
19 import notify_messages
20 import BootServerRequest
21 import utils
22
# Path of the file that receives all boot manager output.
BM_NODE_LOG = "/tmp/bm.log"

# Name of the name=value configuration file holding the initial variables.
VARS_FILE = "configuration"

# Directories that make up PATH while the boot manager is running.
BIN_PATH = (
    '/usr/local/bin',
    '/usr/local/sbin',
    '/usr/bin',
    '/usr/sbin',
    '/bin',
    '/sbin',
)
34
def read_configuration_file(filename):
    """
    Parse a name=value configuration file into a dictionary.

    Blank lines and lines whose first non-blank character is '#' are
    skipped. Every other line is split at its FIRST '=': the text before
    it is the name, everything after it up to the end of the line is the
    value (so values may themselves contain '='). Whitespace around both
    parts is stripped and all single and double quote characters are
    removed from the value.

    The returned dictionary additionally holds 'BM_SOURCE_DIR', set to the
    current working directory, so later steps can locate files shipped
    alongside the boot manager.

    Raises Exception if a non-comment line contains no '='. Errors raised
    while opening the file propagate unchanged.
    """
    config = {}

    vars_file = open(filename, 'r')
    try:
        for line in vars_file:
            stripped = line.strip()
            # if it is a comment or a whitespace line, ignore
            if stripped == "" or stripped.startswith("#"):
                continue

            # split once only: the value is allowed to contain '='
            parts = line.split("=", 1)
            if len(parts) != 2:
                raise Exception("Invalid line in vars file: %s" % line)

            name = parts[0].strip()
            value = parts[1].strip()
            value = value.replace("'", "").replace('"', "")   # remove quotes
            config[name] = value
    finally:
        # release the handle even when an invalid line aborts the parse
        vars_file.close()

    # remember which directory we are running in
    config['BM_SOURCE_DIR'] = os.getcwd()

    return config
70
71 ##############################
class log:
    """
    Collects boot manager output in a file and echoes it to the screen.

    Attributes:
      OutputFile      open file object for the log, or None when the file
                      could not be opened or has been closed by Upload()
      OutputFilePath  the path given to the constructor
      VARS            dictionary returned by read_configuration_file for
                      VARS_FILE, or None if reading it failed
    """

    # strftime pattern for the timestamp put in front of every entry
    format = "%H:%M:%S(%Z) "

    def __init__(self, OutputFilePath=None):
        """
        Open OutputFilePath for writing and load the configuration file.

        Neither failure is fatal: without a log file, entries only go to
        the screen; without a configuration, VARS stays None.
        """
        # recorded unconditionally so the attribute always exists
        self.OutputFilePath = OutputFilePath
        try:
            self.OutputFile = open(OutputFilePath, "w")
        except Exception:
            print("bootmanager log : Unable to open output file %r, continuing" % OutputFilePath)
            self.OutputFile = None

        self.VARS = None
        try:
            self.VARS = read_configuration_file(VARS_FILE)
        except Exception:
            # sys.exc_info() rather than "except ..., e" / "except ... as e"
            # so this parses on every Python 2 release as well as Python 3
            self.LogEntry(str(sys.exc_info()[1]))

    def LogEntry(self, str, inc_newline=1, display_screen=1):
        """
        Write one timestamped entry.

        str             text of the entry (the parameter name shadows the
                        builtin; it is kept for keyword-argument callers)
        inc_newline     when true, terminate the entry with a newline
        display_screen  when true, also write the entry to sys.stdout

        The log file, when open, is flushed after every entry.
        """
        now = time.strftime(log.format, time.localtime())
        text = now + str
        if inc_newline:
            text = text + "\n"

        if self.OutputFile:
            self.OutputFile.write(text)
            self.OutputFile.flush()
        if display_screen:
            sys.stdout.write(text)

    def write(self, str):
        """
        make log behave like a writable file object (for traceback
        prints): the text is logged without a trailing newline
        """
        self.LogEntry(str, 0, 1)

    # bm log uploading is available back again, as of nodeconfig-5.0-2
    def Upload(self, extra_file=None):
        """
        upload the contents of the log to the server

        The log file is uploaded only when it is still open and the
        configuration names an UPLOAD_LOG_SCRIPT; otherwise that part is
        skipped with a log entry instead of failing. When extra_file is
        given it is handed to the bash function 'upload_logs'.
        """
        if self.OutputFile is not None:
            self.OutputFile.flush()
            # VARS is None when the configuration file could not be read
            if not self.VARS or 'UPLOAD_LOG_SCRIPT' not in self.VARS:
                self.LogEntry("Not uploading logs: no upload configuration available")
            else:
                self._upload_log_file()

        if extra_file is not None:
            # NOTE: for code-reuse, evoke the bash function 'upload_logs';
            # by adding --login, bash reads .bash_profile before execution.
            utils.sysexec("""bash --login -c "upload_logs %s" """ % extra_file, self)

    def _upload_log_file(self):
        """
        Close the log file and post it to the configured upload script.
        """
        script = self.VARS['UPLOAD_LOG_SCRIPT']
        self.LogEntry("Uploading logs to %s" % script)

        # close before uploading; from here on entries reach the screen only
        self.OutputFile.close()
        self.OutputFile = None

        settings = self.VARS['INTERFACE_SETTINGS']
        hostname = settings['hostname'] + "." + settings['domainname']
        bs_request = BootServerRequest.BootServerRequest(self.VARS)
        try:
            # this was working until f10
            bs_request.MakeRequest(PartialPath=script,
                                   GetVars=None, PostVars=None,
                                   DoSSL=True, DoCertCheck=True,
                                   FormData=["log=@" + self.OutputFilePath,
                                             "hostname=" + hostname,
                                             "type=bm.log"])
        except Exception:
            # new pycurl: form fields have to be passed as tuples
            import pycurl
            bs_request.MakeRequest(PartialPath=script,
                                   GetVars=None, PostVars=None,
                                   DoSSL=True, DoCertCheck=True,
                                   FormData=[('log', (pycurl.FORM_FILE, self.OutputFilePath)),
                                             ("hostname", hostname),
                                             ("type", "bm.log")])
152
153
154 ##############################
class BootManager:
    """
    Runs the sequence of boot manager steps that matches the node's
    boot state.

    Attributes:
      forceState  boot state forced from the command line, or None
      LOG         the log object all output goes through
      VARS        dictionary shared with and updated by every step; it is
                  the log object's VARS (empty/None if there were none)
      CAN_RUN     1 once initialization succeeded, 0 otherwise
    """

    # the set of valid node run states; Run() fills in the function
    # implementing each state
    NodeRunStates = {'reinstall': None,
                     'boot': None,
                     'safeboot': None,
                     'disabled': None,
                     }

    def __init__(self, log, forceState):
        # override machine's current state from the command line
        self.forceState = forceState

        # the main logging point
        self.LOG = log

        # set to 1 if we can run after initialization
        self.CAN_RUN = 0

        # this contains a set of information used and updated by each step;
        # assigned even when empty so the attribute always exists
        self.VARS = log.VARS
        if not self.VARS:
            return

        # not sure what the current PATH is set to, replace it with what
        # we know will work with all the boot cds
        os.environ['PATH'] = ":".join(BIN_PATH)

        self.CAN_RUN = 1

    def Run(self):
        """
        core boot manager logic.

        the way errors are handled is as such: if any particular step
        cannot continue or unexpectedly fails, an exception is thrown.
        in this case, the boot manager cannot continue running.

        these step functions can also return a 0/1 depending on whether
        or not it succeeded. In the case of steps like ConfirmInstallWithUser,
        a 0 is returned and no exception is thrown if the user chose not
        to confirm the install. The same goes with the CheckHardwareRequirements.
        If requirements not met, but tests were successful, return 0.

        for steps that run within the installer, they are expected to either
        complete successfully and return 1, or throw an exception.

        For exact return values and expected operations, see the comments
        at the top of each of the individual step functions.

        Returns 1 if the state function completed, 0 otherwise. On failure
        (other than an authentication failure) the debug logic is run
        before returning.
        """

        # without configuration variables no step can do anything useful
        if not self.CAN_RUN:
            self.LOG.write("\n\nBootManager was not initialized, cannot run\n")
            return 0

        # NOTE: exception values are fetched with sys.exc_info()[1] instead
        # of "except X, e" / "except X as e" so that this code parses on
        # every Python 2 release as well as on Python 3.

        def _startFallbackDebug():
            # starting the fallback/debug ssh daemon for safety:
            # if the node install somehow hangs, or if it simply takes ages,
            # we can still enter and investigate.
            # Best effort only: a failure is logged, never propagated.
            try:
                StartDebug.Run(self.VARS, self.LOG, last_resort=False)
            except:
                self.LOG.write("\nCould not start fallback debug daemon: %s\n"
                               % str(sys.exc_info()[1]))

        def _nodeNotInstalled(message='MSG_NODE_NOT_INSTALLED'):
            # called by the _xxxRun() functions below upon failure;
            # 'message' is the name of an attribute of notify_messages
            self.VARS['RUN_LEVEL'] = 'failboot'
            notify = getattr(notify_messages, message)
            self.VARS['STATE_CHANGE_NOTIFY'] = 1
            self.VARS['STATE_CHANGE_NOTIFY_MESSAGE'] = notify
            raise BootManagerException(notify)

        def _bootRun():
            # implements the boot logic, which consists of first
            # double checking that the node was properly installed,
            # checking whether someone added or changed disks, and
            # then finally chain boots.
            _startFallbackDebug()

            InstallInit.Run(self.VARS, self.LOG)
            ret = ValidateNodeInstall.Run(self.VARS, self.LOG)
            if ret == 1:
                WriteModprobeConfig.Run(self.VARS, self.LOG)
                WriteNetworkConfig.Run(self.VARS, self.LOG)
                CheckForNewDisks.Run(self.VARS, self.LOG)
                SendHardwareConfigToPLC.Run(self.VARS, self.LOG)
                ChainBootNode.Run(self.VARS, self.LOG)
            elif ret == -1:
                _nodeNotInstalled('MSG_NODE_FILESYSTEM_CORRUPT')
            elif ret == -2:
                _nodeNotInstalled('MSG_NODE_MOUNT_FAILED')
            elif ret == -3:
                _nodeNotInstalled('MSG_NODE_MISSING_KERNEL')
            else:
                _nodeNotInstalled()

        def _reinstallRun():
            _startFallbackDebug()

            # implements the reinstall logic, which will check whether
            # the min. hardware requirements are met, install the
            # software, and upon correct installation will switch to
            # 'boot' state and chainboot into the production system
            if not CheckHardwareRequirements.Run(self.VARS, self.LOG):
                self.VARS['RUN_LEVEL'] = 'failboot'
                raise BootManagerException("Hardware requirements not met.")

            # runinstaller
            InstallInit.Run(self.VARS, self.LOG)
            InstallPartitionDisks.Run(self.VARS, self.LOG)
            InstallBootstrapFS.Run(self.VARS, self.LOG)
            InstallWriteConfig.Run(self.VARS, self.LOG)
            InstallUninitHardware.Run(self.VARS, self.LOG)
            self.VARS['BOOT_STATE'] = 'boot'
            self.VARS['STATE_CHANGE_NOTIFY'] = 1
            self.VARS['STATE_CHANGE_NOTIFY_MESSAGE'] = \
                notify_messages.MSG_INSTALL_FINISHED
            UpdateBootStateWithPLC.Run(self.VARS, self.LOG)
            _bootRun()

        def _installRun():
            # implements the new install logic, which will first check
            # with the user whether it is ok to install on this
            # machine, switch to 'reinstall' state and then invoke the reinstall
            # logic.  See the _reinstallRun comments for further details.
            # NOTE(review): no entry of the state table below refers to
            # this function - confirm whether an 'install' state is
            # still meant to exist.
            if not ConfirmInstallWithUser.Run(self.VARS, self.LOG):
                return 0
            self.VARS['BOOT_STATE'] = 'reinstall'
            UpdateRunLevelWithPLC.Run(self.VARS, self.LOG)
            _reinstallRun()

        def _debugRun(state='failboot'):
            # implements debug logic, which starts the sshd and just waits around
            self.VARS['RUN_LEVEL'] = state
            UpdateRunLevelWithPLC.Run(self.VARS, self.LOG)
            StartDebug.Run(self.VARS, self.LOG)
            # fsck/mount fs if present, and ignore return value if it's not.
            ValidateNodeInstall.Run(self.VARS, self.LOG)

        def _badstateRun():
            # should never happen; log event
            self.LOG.write("\nInvalid BOOT_STATE = %s\n" % self.VARS['BOOT_STATE'])
            _debugRun()

        # setup state -> function hash table
        BootManager.NodeRunStates['reinstall'] = _reinstallRun
        BootManager.NodeRunStates['boot'] = _bootRun
        BootManager.NodeRunStates['safeboot'] = lambda: _debugRun('safeboot')
        BootManager.NodeRunStates['disabled'] = lambda: _debugRun('disabled')

        success = 0
        try:
            InitializeBootManager.Run(self.VARS, self.LOG)
            ReadNodeConfiguration.Run(self.VARS, self.LOG)
            AuthenticateWithPLC.Run(self.VARS, self.LOG)
            StartRunlevelAgent.Run(self.VARS, self.LOG)
            GetAndUpdateNodeDetails.Run(self.VARS, self.LOG)

            # override machine's current state from the command line
            if self.forceState is not None:
                self.VARS['BOOT_STATE'] = self.forceState
                UpdateBootStateWithPLC.Run(self.VARS, self.LOG)
                UpdateRunLevelWithPLC.Run(self.VARS, self.LOG)

            stateRun = BootManager.NodeRunStates.get(self.VARS['BOOT_STATE'], _badstateRun)
            stateRun()
            success = 1

        except KeyError:
            self.LOG.write("\n\nKeyError while running: %s\n" % str(sys.exc_info()[1]))
        except BootManagerAuthenticationException:
            # NOTE(review): placed ahead of the BootManagerException handler
            # so it still applies should the two classes be related by
            # inheritance - their definitions are not visible from here.
            self.LOG.write("\n\nFailed to Authenticate Node: %s\n" % str(sys.exc_info()[1]))
            # sets /tmp/CANCEL_BOOT flag
            StartDebug.Run(self.VARS, self.LOG)
            # Return immediately b/c any other calls to API will fail
            return success
        except BootManagerException:
            self.LOG.write("\n\nException while running: %s\n" % str(sys.exc_info()[1]))
        except:
            self.LOG.write("\n\nImplementation Error\n")
            traceback.print_exc(file=self.LOG.OutputFile)
            traceback.print_exc()

        if not success:
            # fall back to debug mode so the node stays reachable
            try:
                _debugRun()
            except BootManagerException:
                self.LOG.write("\n\nException while running: %s\n" % str(sys.exc_info()[1]))
            except:
                self.LOG.write("\n\nImplementation Error\n")
                traceback.print_exc(file=self.LOG.OutputFile)
                traceback.print_exc()

        return success
354             
355             
def main(argv):
    """
    Command line entry point of the boot manager.

    argv is the full argument vector. When it holds exactly one argument
    besides the program name, that argument is taken as a boot state to
    force; it has to be one of the keys of BootManager.NodeRunStates.

    Returns 1 if the forced state is not valid or if BootManager.Run()
    reported failure, and 0 in every other case.
    """

    utils.prompt_for_breakpoint_mode()

    utils.breakpoint("Entering BootManager::main")

    # set to 1 if error occurred
    error = 0

    # all output goes through this class so we can save it and post
    # the data back to PlanetLab central
    LOG = log(BM_NODE_LOG)

    def finish(status):
        # common epilogue: stamp the end of the run, upload the log and
        # hand the status back to the caller
        LOG.LogEntry("BootManager finished at: %s" %
                     time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime()))
        LOG.Upload()
        return status

    # NOTE: assume CWD is BM's source directory, but never fail
    utils.sysexec("./setup_bash_history_scripts.sh || /bin/true", LOG)

    LOG.LogEntry("BootManager started at: %s" %
                 time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime()))

    forceState = None
    try:
        if len(argv) == 2:
            fState = argv[1]
            if fState in BootManager.NodeRunStates:
                forceState = fState
            else:
                LOG.LogEntry("FATAL: cannot force node run state to=%s" % fState)
                error = 1
    except:
        traceback.print_exc(file=LOG.OutputFile)
        traceback.print_exc()

    if error:
        return finish(error)

    # NOTE(review): a failed initialization or an unexpected exception
    # below is logged but leaves the return value at 0 - confirm that
    # this is what the caller of the boot manager expects.
    try:
        bm = BootManager(LOG, forceState)
        if bm.CAN_RUN == 0:
            LOG.LogEntry("Unable to initialize BootManager.")
        else:
            LOG.LogEntry("Running version %s of BootManager." % bm.VARS['VERSION'])
            if bm.Run():
                LOG.LogEntry("\nDone!")
            else:
                LOG.LogEntry("\nError occurred!")
                error = 1
    except:
        traceback.print_exc(file=LOG.OutputFile)
        traceback.print_exc()

    return finish(error)
416
417     
if __name__ == "__main__":
    # the value returned by main() becomes the process exit status
    sys.exit(main(sys.argv))