X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=RunlevelAgent.py;h=04dcfeef447c74019c7637a15d01faeb2de71e31;hb=40588e1f900ba82db3ca69c5cc375805028f2430;hp=49fa631a6a86d79eed47c503c5664c350c3e0762;hpb=2150c5b72779c0e07bbf4831da4ace7529b220ef;p=monitor.git diff --git a/RunlevelAgent.py b/RunlevelAgent.py index 49fa631..04dcfee 100644 --- a/RunlevelAgent.py +++ b/RunlevelAgent.py @@ -1,4 +1,11 @@ #!/usr/bin/python +# +# RunlevelAgent - acts as a heartbeat back to myplc reporting that the node is +# online and whether it is in boot or pre-boot run-level. +# This is useful to identify nodes that are behind a firewall, as well as to +# have the machine report run-time status both in safeboot and boot modes, +# so that it is immediately visible at myplc (gui or api). +# import xml, xmlrpclib import logging @@ -13,7 +20,7 @@ SESSION_FILE="/etc/planetlab/session" def read_config_file(filename): ## NOTE: text copied from BootManager.py - # TODO: unify this code to make it common. i.e. use ConfigParser module + # TODO: unify this code to make it common. i.e. 
use ConfigParser module vars = {} vars_file= file(filename,'r') validConfFile = True @@ -49,60 +56,101 @@ except: class Auth: - def __init__(self, username=None, password=None, **kwargs): - if 'session' in kwargs: - self.auth= { 'AuthMethod' : 'session', - 'session' : kwargs['session'] } - else: - if username==None and password==None: - self.auth = {'AuthMethod': "anonymous"} - else: - self.auth = {'Username' : username, - 'AuthMethod' : 'password', - 'AuthString' : password} + def __init__(self, username=None, password=None, **kwargs): + if 'session' in kwargs: + self.auth= { 'AuthMethod' : 'session', + 'session' : kwargs['session'] } + else: + if username==None and password==None: + self.auth = {'AuthMethod': "anonymous"} + else: + self.auth = {'Username' : username, + 'AuthMethod' : 'password', + 'AuthString' : password} class PLC: - def __init__(self, auth, url): - self.auth = auth - self.url = url - self.api = xmlrpclib.Server(self.url, verbose=False, allow_none=True) + def __init__(self, auth, url): + self.auth = auth + self.url = url + self.api = xmlrpclib.Server(self.url, verbose=False, allow_none=True) - def __getattr__(self, name): - method = getattr(self.api, name) - if method is None: - raise AssertionError("method does not exist") + def __getattr__(self, name): + method = getattr(self.api, name) + if method is None: + raise AssertionError("method does not exist") - return lambda *params : method(self.auth.auth, *params) + return lambda *params : method(self.auth.auth, *params) - def __repr__(self): - return self.api.__repr__() + def __repr__(self): + return self.api.__repr__() + +def extract_from(filename, pattern): + f = os.popen("grep -E %s %s" % (pattern, filename)) + val = f.read().strip() + return val + +def check_running(commandname): + f = os.popen("ps ax | grep -E %s | grep -v grep" % (commandname)) + val = f.read().strip() + return val + def main(): f=open(SESSION_FILE,'r') session_str=f.read().strip() api = PLC(Auth(session=session_str), 
api_server_url) - # NOTE: should we rely on bootmanager for this functionality? + # NOTE: What should we do if this call fails? + # TODO: handle dns failure here. api.AuthCheck() + try: + env = 'production' + if len(sys.argv) > 1: + env = sys.argv[1] + except: + traceback.print_exc() + while True: - print "reporting status: ", os.popen("uptime").read().strip() try: - # NOTE: alternately, check other stuff in the environment to infer - # run_level - # is BootManager running? - # what is the boot_state at PLC? - # does /vservers exist? - # what about /tmp/source? - # is BootManager in /tmp/source? - # is /tmp/mnt/sysimg mounted? - # how long have we been running? if we were in safeboot and - # still running, we're likely in failboot now. - # length of runtime increases the certainty of inferred state. - # - api.ReportRunlevel({'run_level' : 'safeboot'}) + # NOTE: here we are inferring the runlevel by environmental + # observations. We know how this process was started by the + # given command line argument. Then in bootmanager + # runlevel, the bm.log gives information about the current + # activity. + # other options: + # call plc for current boot state? + # how long have we been running? 
+ if env == "bootmanager": + bs_val = extract_from('/tmp/bm.log', 'Current boot state:') + if len(bs_val) > 0: bs_val = bs_val.split()[-1] + ex_val = extract_from('/tmp/bm.log', 'Exception') + fs_val = extract_from('/tmp/bm.log', 'mke2fs') + bm_val = check_running("BootManager.py") + + if bs_val in ['diag', 'diagnose', 'safeboot', 'disabled', 'disable']: + api.ReportRunlevel({'run_level' : 'safeboot'}) + + elif len(ex_val) > len("Exception"): + api.ReportRunlevel({'run_level' : 'failboot'}) + + elif len(fs_val) > 0 and len(bm_val) > 0: + api.ReportRunlevel({'run_level' : 'reinstall'}) + + else: + api.ReportRunlevel({'run_level' : 'failboot'}) + + elif env == "production": + api.ReportRunlevel({'run_level' : 'boot'}) + else: + api.ReportRunlevel({'run_level' : 'failboot'}) + except: + print "reporting error: ", os.popen("uptime").read().strip() traceback.print_exc() - time.sleep(30) + + sys.stdout.flush() + # TODO: change to a configurable value + time.sleep(60*15) if __name__ == "__main__": main()