X-Git-Url: http://git.onelab.eu/?p=monitor.git;a=blobdiff_plain;f=RunlevelAgent.py;h=646b0a7a35b34de8333d53ede072f8ded1c25b71;hp=d959dc9acf60fa12e9bd1d0ca8cd29bd5c2c8cc8;hb=HEAD;hpb=e135f91adda6bd0b5578c502aa270aea6775b5fc

diff --git a/RunlevelAgent.py b/RunlevelAgent.py
index d959dc9..646b0a7 100644
--- a/RunlevelAgent.py
+++ b/RunlevelAgent.py
@@ -1,4 +1,11 @@
 #!/usr/bin/python
+#
+# RunlevelAgent - acts as a heartbeat back to myplc, reporting that the node is
+#     online and whether it is in the boot or pre-boot run-level.
+#     This is useful for identifying nodes that are behind a firewall, and it
+#     lets the machine report its run-time status in both safeboot and boot
+#     modes, so that the status is immediately visible at myplc (GUI or API).
+# 
 
 import xml, xmlrpclib
 import logging
@@ -77,23 +84,33 @@ class PLC:
         return self.api.__repr__()
 
 def extract_from(filename, pattern):
-	f = os.popen("grep -E %s %s" % (pattern, filename))
-	val = f.read().strip()
-	return val
+    f = os.popen("grep -E %s %s" % (pattern, filename))
+    val = f.read().strip()
+    return val
 
 def check_running(commandname):
-	f = os.popen("ps ax | grep -E %s | grep -v grep" % (commandname))
-	val = f.read().strip()
-	return val
-	
+    f = os.popen("ps ax | grep -E %s | grep -v grep" % (commandname))
+    val = f.read().strip()
+    return val
+    
 
 def main():
 
-    f=open(SESSION_FILE,'r')
-    session_str=f.read().strip()
-    api = PLC(Auth(session=session_str), api_server_url)
-	# NOTE: What should we do if this call fails?
-    api.AuthCheck()
+    # Keep trying to authenticate the session, waiting for NM to rewrite the
+    # session file or for DNS to succeed, until AuthCheck succeeds.
+    while True:
+        try:
+            f=open(SESSION_FILE,'r')
+            session_str=f.read().strip()
+            api = PLC(Auth(session=session_str), api_server_url)
+            # NOTE: if this call fails, the except clause below logs it and retries in 30s.
+            # TODO: handle dns failure here.
+            api.AuthCheck()
+            break
+        except:
+            print "Retry in 30 seconds: ", os.popen("uptime").read().strip()
+            traceback.print_exc()
+            time.sleep(30)
 
     try:
         env = 'production'
@@ -103,33 +120,33 @@ def main():
         traceback.print_exc()
 
     while True:
-        #print "reporting status: ", os.popen("uptime").read().strip()
         try:
             # NOTE: here we are inferring the runlevel by environmental
-			# 		observations.  We know how this process was started by the
-			# 		given command line argument.  Then in bootmanager
-			# 		runlevle, the bm.log gives information about the current
-			# 		activity.
-			# other options:
-			#   call plc for current boot state?
-			#   how long have we been running?
+            #         observations.  We know how this process was started by the
+            #         given command line argument.  Then in bootmanager
+            #         runlevel, the bm.log gives information about the current
+            #         activity.
+            # other options:
+            #   call plc for current boot state?
+            #   how long have we been running?
             if env == "bootmanager":
-				bs_val = extract_from('/tmp/bm.log', 'Current boot state:').split()[3]
-				ex_val = extract_from('/tmp/bm.log', 'Exception')
-				fs_val = extract_from('/tmp/bm.log', 'mke2fs')
-				bm_val = check_running("BootManager.py")
+                bs_val = extract_from('/tmp/bm.log', "'Current boot state:'")
+                if len(bs_val) > 0: bs_val = bs_val.split()[-1]
+                ex_val = extract_from('/tmp/bm.log', 'Exception')
+                fs_val = extract_from('/tmp/bm.log', 'mke2fs')
+                bm_val = check_running("BootManager.py")
 
-				if bs_val in ['diag', 'diagnose', 'safeboot', 'disabled', 'disable']:
-                	api.ReportRunlevel({'run_level' : 'safeboot'})
+                if bs_val in ['diag', 'diagnose', 'safeboot', 'disabled', 'disable']:
+                    api.ReportRunlevel({'run_level' : 'safeboot'})
 
-				elif len(ex_val) > len("Exception"):
-                	api.ReportRunlevel({'run_level' : 'failboot'})
+                elif len(ex_val) > len("Exception"):
+                    api.ReportRunlevel({'run_level' : 'failboot'})
 
-				elif len(fs_val) > 0 and len(bm_val) > 0:
-                	api.ReportRunlevel({'run_level' : 'reinstall'})
+                elif len(fs_val) > 0 and len(bm_val) > 0:
+                    api.ReportRunlevel({'run_level' : 'reinstall'})
 
-				else:
-                	api.ReportRunlevel({'run_level' : 'failboot'})
+                else:
+                    api.ReportRunlevel({'run_level' : 'failboot'})
 
             elif env == "production":
                 api.ReportRunlevel({'run_level' : 'boot'})
@@ -137,10 +154,11 @@ def main():
                 api.ReportRunlevel({'run_level' : 'failboot'})
                 
         except:
+            print "reporting error: ", os.popen("uptime").read().strip()
             traceback.print_exc()
 
-        # TODO: change to a configurable value
         sys.stdout.flush()
+        # TODO: change to a configurable value
         time.sleep(60*15)
 
 if __name__ == "__main__":