clearer names for actions, and infer actions better

[monitor.git] / RunlevelAgent.py
diff --git a/RunlevelAgent.py b/RunlevelAgent.py

index 739678d..646b0a7 100644 (file)
--- a/RunlevelAgent.py
+++ b/RunlevelAgent.py
@@ -1,4 +1,11 @@
  #!/usr/bin/python
  #!/usr/bin/python
+#
+# RunlevelAgent - acts as a heartbeat back to myplc, reporting that the node is
+#     online and whether it is in the boot or pre-boot run-level.
+#   This helps identify nodes that are behind a firewall, and lets the machine
+#   report its run-time status in both safeboot and boot modes, so that the
+#   status is immediately visible at myplc (GUI or API).
+# 
  
  import xml, xmlrpclib
  import logging
  
  import xml, xmlrpclib
  import logging
@@ -76,13 +83,34 @@ class PLC:
      def __repr__(self):
          return self.api.__repr__()
  
      def __repr__(self):
          return self.api.__repr__()
  
+def extract_from(filename, pattern):
+    f = os.popen("grep -E %s %s" % (pattern, filename))
+    val = f.read().strip()
+    return val
+
+def check_running(commandname):
+    f = os.popen("ps ax | grep -E %s | grep -v grep" % (commandname))
+    val = f.read().strip()
+    return val
+    
+
  def main():
  
  def main():
  
-    f=open(SESSION_FILE,'r')
-    session_str=f.read().strip()
-    api = PLC(Auth(session=session_str), api_server_url)
-    # NOTE: should we rely on bootmanager for this functionality?
-    api.AuthCheck()
+    # Keep trying to authenticate the session until AuthCheck succeeds; in the
+    # meantime we wait for NM to re-write the session file or for DNS to succeed.
+    while True:
+        try:
+            f=open(SESSION_FILE,'r')
+            session_str=f.read().strip()
+            api = PLC(Auth(session=session_str), api_server_url)
+            # NOTE: What should we do if this call fails?
+            # TODO: handle dns failure here.
+            api.AuthCheck()
+            break
+        except:
+            print "Retry in 30 seconds: ", os.popen("uptime").read().strip()
+            traceback.print_exc()
+            time.sleep(30)
  
      try:
          env = 'production'
  
      try:
          env = 'production'
@@ -90,41 +118,48 @@ def main():
              env = sys.argv[1]
      except:
          traceback.print_exc()
              env = sys.argv[1]
      except:
          traceback.print_exc()
-        pass
  
      while True:
  
      while True:
-        # TODO: remove from output
-        print "reporting status: ", os.popen("uptime").read().strip()
          try:
          try:
-            # NOTE: alternately, check other stuff in the environment to infer
-            # run_level
-            #     is BootManager running?
-            #     what is the boot_state at PLC?
-            #     does /vservers exist?
-            #     what about /tmp/source?
-            #     is BootManager in /tmp/source?
-            #     is /tmp/mnt/sysimg mounted?
-            #     how long have we been running?  if we were in safeboot and
-            #       still running, we're likely in failboot now.
-            #     length of runtime increases the certainty of inferred state.
-            #     
+            # NOTE: the runlevel is inferred here from observations of the
+            #         environment.  The command line argument tells us how this
+            #         process was started.  When it was started in the
+            #         bootmanager runlevel, bm.log gives information about the
+            #         current activity.
+            # Other options to consider:
+            #   call plc for the current boot state?
+            #   how long have we been running?
              if env == "bootmanager":
              if env == "bootmanager":
-                # if bm not running, and plc bootstate = boot, then
-                #api.ReportRunlevel({'run_level' : 'failboot'})
-                #api.ReportRunlevel({'run_level' : 'reinstall'})
-                # if bm not running, and plc bootstate = safeboot, then
-                api.ReportRunlevel({'run_level' : 'safeboot'})
+                bs_val = extract_from('/tmp/bm.log', "'Current boot state:'")
+                if len(bs_val) > 0: bs_val = bs_val.split()[-1]
+                ex_val = extract_from('/tmp/bm.log', 'Exception')
+                fs_val = extract_from('/tmp/bm.log', 'mke2fs')
+                bm_val = check_running("BootManager.py")
+
+                if bs_val in ['diag', 'diagnose', 'safeboot', 'disabled', 'disable']:
+                    api.ReportRunlevel({'run_level' : 'safeboot'})
+
+                elif len(ex_val) > len("Exception"):
+                    api.ReportRunlevel({'run_level' : 'failboot'})
+
+                elif len(fs_val) > 0 and len(bm_val) > 0:
+                    api.ReportRunlevel({'run_level' : 'reinstall'})
+
+                else:
+                    api.ReportRunlevel({'run_level' : 'failboot'})
+
              elif env == "production":
                  api.ReportRunlevel({'run_level' : 'boot'})
              else:
                  api.ReportRunlevel({'run_level' : 'failboot'})
                  
          except:
              elif env == "production":
                  api.ReportRunlevel({'run_level' : 'boot'})
              else:
                  api.ReportRunlevel({'run_level' : 'failboot'})
                  
          except:
+            print "reporting error: ", os.popen("uptime").read().strip()
              traceback.print_exc()
  
              traceback.print_exc()
  
-        # TODO: change to a configurable value
          sys.stdout.flush()
          sys.stdout.flush()
-        time.sleep(60)
+        # TODO: change to a configurable value
+        time.sleep(60*15)
  
  if __name__ == "__main__":
      main()
  
  if __name__ == "__main__":
      main()