X-Git-Url: http://git.onelab.eu/?p=monitor.git;a=blobdiff_plain;f=RunlevelAgent.py;h=646b0a7a35b34de8333d53ede072f8ded1c25b71;hp=d959dc9acf60fa12e9bd1d0ca8cd29bd5c2c8cc8;hb=HEAD;hpb=e135f91adda6bd0b5578c502aa270aea6775b5fc diff --git a/RunlevelAgent.py b/RunlevelAgent.py index d959dc9..646b0a7 100644 --- a/RunlevelAgent.py +++ b/RunlevelAgent.py @@ -1,4 +1,11 @@ #!/usr/bin/python +# +# RunlevelAgent - acts as a heartbeat back to myplc reporting that the node is +# online and whether it is in boot or pre-boot run-level. +# This is useful to identify nodes that are behind a firewall, as well as to +# have the machine report run-time status both in safeboot and boot modes, +# so that it is immediately visible at myplc (gui or api). +# import xml, xmlrpclib import logging @@ -77,23 +84,33 @@ class PLC: return self.api.__repr__() def extract_from(filename, pattern): - f = os.popen("grep -E %s %s" % (pattern, filename)) - val = f.read().strip() - return val + f = os.popen("grep -E %s %s" % (pattern, filename)) + val = f.read().strip() + return val def check_running(commandname): - f = os.popen("ps ax | grep -E %s | grep -v grep" % (commandname)) - val = f.read().strip() - return val - + f = os.popen("ps ax | grep -E %s | grep -v grep" % (commandname)) + val = f.read().strip() + return val + def main(): - f=open(SESSION_FILE,'r') - session_str=f.read().strip() - api = PLC(Auth(session=session_str), api_server_url) - # NOTE: What should we do if this call fails? - api.AuthCheck() + # Keep trying to authenticate session, waiting for NM to re-write the + # session file, or DNS to succeed, until AuthCheck succeeds. + while True: + try: + f=open(SESSION_FILE,'r') + session_str=f.read().strip() + api = PLC(Auth(session=session_str), api_server_url) + # NOTE: What should we do if this call fails? + # TODO: handle dns failure here. + api.AuthCheck() + break + except: + print "Retry in 30 seconds: ", os.popen("uptime").read().strip() + traceback.print_exc() + time.sleep(30) try: env = 'production' @@ -103,33 +120,33 @@ def main(): traceback.print_exc() while True: - #print "reporting status: ", os.popen("uptime").read().strip() try: # NOTE: here we are inferring the runlevel by environmental - # observations. We know how this process was started by the - # given command line argument. Then in bootmanager - # runlevle, the bm.log gives information about the current - # activity. - # other options: - # call plc for current boot state? - # how long have we been running? + # observations. We know how this process was started by the + # given command line argument. Then in bootmanager + # runlevel, the bm.log gives information about the current + # activity. + # other options: + # call plc for current boot state? + # how long have we been running? if env == "bootmanager": - bs_val = extract_from('/tmp/bm.log', 'Current boot state:').split()[3] - ex_val = extract_from('/tmp/bm.log', 'Exception') - fs_val = extract_from('/tmp/bm.log', 'mke2fs') - bm_val = check_running("BootManager.py") + bs_val = extract_from('/tmp/bm.log', "'Current boot state:'") + if len(bs_val) > 0: bs_val = bs_val.split()[-1] + ex_val = extract_from('/tmp/bm.log', 'Exception') + fs_val = extract_from('/tmp/bm.log', 'mke2fs') + bm_val = check_running("BootManager.py") - if bs_val in ['diag', 'diagnose', 'safeboot', 'disabled', 'disable']: - api.ReportRunlevel({'run_level' : 'safeboot'}) + if bs_val in ['diag', 'diagnose', 'safeboot', 'disabled', 'disable']: + api.ReportRunlevel({'run_level' : 'safeboot'}) - elif len(ex_val) > len("Exception"): - api.ReportRunlevel({'run_level' : 'failboot'}) + elif len(ex_val) > len("Exception"): + api.ReportRunlevel({'run_level' : 'failboot'}) - elif len(fs_val) > 0 and len(bm_val) > 0: - api.ReportRunlevel({'run_level' : 'reinstall'}) + elif len(fs_val) > 0 and len(bm_val) > 0: + api.ReportRunlevel({'run_level' : 'reinstall'}) - else: - api.ReportRunlevel({'run_level' : 'failboot'}) + else: + api.ReportRunlevel({'run_level' : 'failboot'}) elif env == "production": api.ReportRunlevel({'run_level' : 'boot'}) @@ -137,10 +154,11 @@ def main(): api.ReportRunlevel({'run_level' : 'failboot'}) except: + print "reporting error: ", os.popen("uptime").read().strip() traceback.print_exc() - # TODO: change to a configurable value sys.stdout.flush() + # TODO: change to a configurable value time.sleep(60*15) if __name__ == "__main__":