#!/usr/bin/python
#
# RunlevelAgent - acts as a heartbeat back to myplc, reporting that the node
# is online and whether it is in the boot or pre-boot run-level.
# This is useful for identifying nodes that are behind a firewall, and for
# having the machine report its run-time status in both safeboot and boot
# modes, so that it is immediately visible at myplc (gui or api).
#
import xml, xmlrpclib
import logging
def __repr__(self):
    """Return the representation of the wrapped ``self.api`` object."""
    # Delegate entirely to the wrapped object so this wrapper prints the
    # same way the underlying API handle does.
    return self.api.__repr__()
def extract_from(filename, pattern):
    """Return the output of ``grep -E pattern filename``.

    Both arguments are interpolated into a shell command line without any
    quoting.  Callers must therefore supply their own shell quoting (for
    example pass "'Current boot state:'" for a pattern containing spaces)
    and must never pass untrusted input.

    Returns whatever grep wrote to stdout, as a single string with leading
    and trailing whitespace stripped; the empty string if grep printed
    nothing.
    """
    pipe = os.popen("grep -E %s %s" % (pattern, filename))
    try:
        return pipe.read().strip()
    finally:
        # Always close the pipe: this function is called repeatedly from a
        # long-running loop, and an unclosed pipe leaks a file descriptor.
        pipe.close()

def check_running(commandname):
    """Return the ``ps ax`` lines that match `commandname`.

    The process list is filtered with ``grep -E commandname`` and then with
    ``grep -v grep`` so that the grep processes themselves are dropped.
    `commandname` is interpolated into a shell command line without any
    quoting, so it must not come from untrusted input.

    Returns the matching lines as a single string with leading and trailing
    whitespace stripped; the empty string if the pipeline printed nothing.
    """
    pipe = os.popen("ps ax | grep -E %s | grep -v grep" % (commandname))
    try:
        return pipe.read().strip()
    finally:
        # Always close the pipe: this function is called repeatedly from a
        # long-running loop, and an unclosed pipe leaks a file descriptor.
        pipe.close()


def _uptime():
    """Return the stripped output of the ``uptime`` command."""
    pipe = os.popen("uptime")
    try:
        return pipe.read().strip()
    finally:
        pipe.close()


def _infer_runlevel(env):
    """Return the run level to report for the given start-up environment.

    NOTE: here we are inferring the runlevel by environmental observations.
    We know how this process was started by the given command line argument.
    Then in bootmanager runlevel, the bm.log gives information about the
    current activity.
    Other options:
        call plc for current boot state?
        how long have we been running?

    `env` is "production" (reported as 'boot'), "bootmanager" (inferred from
    /tmp/bm.log and the process list), or anything else (reported as
    'failboot').
    """
    if env == "production":
        return 'boot'
    if env != "bootmanager":
        return 'failboot'

    bs_val = extract_from('/tmp/bm.log', "'Current boot state:'")
    if bs_val:
        # Keep only the last whitespace-separated word of the grep output.
        bs_val = bs_val.split()[-1]
    ex_val = extract_from('/tmp/bm.log', 'Exception')
    fs_val = extract_from('/tmp/bm.log', 'mke2fs')
    bm_val = check_running("BootManager.py")

    if bs_val in ('diag', 'diagnose', 'safeboot', 'disabled', 'disable'):
        return 'safeboot'
    # Only counts when grep returned more text than the bare word
    # "Exception" itself.
    if len(ex_val) > len("Exception"):
        return 'failboot'
    # bm.log mentions mke2fs and BootManager.py is still in the process list.
    if fs_val and bm_val:
        return 'reinstall'
    return 'failboot'


def main():
    """Authenticate with the session in SESSION_FILE, then report forever.

    The first command-line argument, when present, selects the environment
    passed to the run-level inference; it defaults to 'production'.  After
    authentication succeeds the run level is reported once every 15 minutes.
    This function does not return.

    Relies on SESSION_FILE, api_server_url, PLC and Auth being defined
    elsewhere in this file.
    """
    # Keep trying to authenticate session, waiting for NM to re-write the
    # session file, or DNS to succeed, until AuthCheck succeeds.
    while True:
        try:
            f = open(SESSION_FILE, 'r')
            try:
                session_str = f.read().strip()
            finally:
                # Close on every attempt; this loop may retry indefinitely.
                f.close()
            api = PLC(Auth(session=session_str), api_server_url)
            # NOTE: What should we do if this call fails?
            # TODO: handle dns failure here.
            api.AuthCheck()
            break
        except Exception:
            # Exception (not a bare except) so that KeyboardInterrupt and
            # SystemExit can still stop the agent.
            print("Retry in 30 seconds:  %s" % _uptime())
            traceback.print_exc()
            time.sleep(30)

    # A missing argument is the normal case for production, so test for it
    # instead of catching IndexError and printing a traceback.
    env = 'production'
    if len(sys.argv) > 1:
        env = sys.argv[1]

    while True:
        try:
            api.ReportRunlevel({'run_level': _infer_runlevel(env)})
        except Exception:
            # Best effort: log the failure and try again on the next cycle.
            print("reporting error:  %s" % _uptime())
            traceback.print_exc()

        sys.stdout.flush()
        # TODO: change to a configurable value
        time.sleep(60*15)
# Run the agent only when executed as a script, not when imported.
if __name__ == "__main__":
    main()