X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=RunlevelAgent.py;h=04dcfeef447c74019c7637a15d01faeb2de71e31;hb=40588e1f900ba82db3ca69c5cc375805028f2430;hp=49fa631a6a86d79eed47c503c5664c350c3e0762;hpb=2150c5b72779c0e07bbf4831da4ace7529b220ef;p=monitor.git

diff --git a/RunlevelAgent.py b/RunlevelAgent.py
index 49fa631..04dcfee 100644
--- a/RunlevelAgent.py
+++ b/RunlevelAgent.py
@@ -1,4 +1,11 @@
 #!/usr/bin/python
+#
+# RunlevelAgent - acts as a heartbeat back to myplc, reporting that the node
+#   is online and whether it is in the boot or pre-boot run-level.
+#   This is useful to identify nodes that are behind a firewall, as well as to
+#   have the machine report run-time status both in safeboot and boot modes,
+#   so that it is immediately visible at myplc (GUI or API).
+# 
 
 import xml, xmlrpclib
 import logging
@@ -13,7 +20,7 @@ SESSION_FILE="/etc/planetlab/session"
 
 def read_config_file(filename):
     ## NOTE: text copied from BootManager.py 
-	# TODO: unify this code to make it common. i.e. use ConfigParser module
+    # TODO: unify this code to make it common. i.e. use ConfigParser module
     vars = {}
     vars_file= file(filename,'r')
     validConfFile = True
@@ -49,60 +56,101 @@ except:
 
 
 class Auth:
-	def __init__(self, username=None, password=None, **kwargs):
-		if 'session' in kwargs:
-			self.auth= { 'AuthMethod' : 'session',
-					'session' : kwargs['session'] }
-		else:
-			if username==None and password==None:
-				self.auth = {'AuthMethod': "anonymous"}
-			else:
-				self.auth = {'Username' : username,
-							'AuthMethod' : 'password',
-							'AuthString' : password}
+    def __init__(self, username=None, password=None, **kwargs):  # build self.auth, the auth dict passed with each API call
+        if 'session' in kwargs:  # a session key takes precedence over any username/password
+            self.auth = {'AuthMethod': 'session',
+                         'session': kwargs['session']}
+        else:
+            if username is None and password is None:  # identity test; '== None' can be fooled by a custom __eq__
+                self.auth = {'AuthMethod': "anonymous"}
+            else:
+                self.auth = {'Username': username,
+                             'AuthMethod': 'password',
+                             'AuthString': password}
 class PLC:
-	def __init__(self, auth, url):
-		self.auth = auth
-		self.url = url
-		self.api = xmlrpclib.Server(self.url, verbose=False, allow_none=True)
+    def __init__(self, auth, url):  # auth: object exposing an .auth dict (read in __getattr__); url: XML-RPC endpoint
+        self.auth = auth
+        self.url = url
+        self.api = xmlrpclib.Server(self.url, verbose=False, allow_none=True)  # allow_none permits None values in calls
 
-	def __getattr__(self, name):
-		method = getattr(self.api, name)
-		if method is None:
-			raise AssertionError("method does not exist")
+    def __getattr__(self, name):  # resolve unknown attributes as methods on the XML-RPC proxy
+        method = getattr(self.api, name)
+        if method is None:  # NOTE(review): presumably never true, the proxy appears to return a method object for any name -- confirm
+            raise AssertionError("method does not exist")
 
+        return lambda *params : method(self.auth.auth, *params)  # prepend the auth dict as the first argument of every call
 
-	def __repr__(self):
-		return self.api.__repr__()
+    def __repr__(self):  # delegate to the underlying XML-RPC proxy's representation
+        return self.api.__repr__()
+
+def extract_from(filename, pattern):  # return the stripped output of `grep -E pattern filename` ('' when nothing matches)
+    f = os.popen("grep -E '%s' '%s'" % (pattern, filename))  # quoted: the shell would otherwise split a pattern containing spaces into extra file arguments
+    val = f.read().strip()
+    return val
+
+def check_running(commandname):  # return the `ps ax` lines matching commandname ('' when no such process is listed)
+    f = os.popen("ps ax | grep -E '%s' | grep -v grep" % (commandname))  # quoted so spaces or shell metacharacters in the name stay inside the pattern
+    val = f.read().strip()
+    return val
+    
 
 def main():
 
     f=open(SESSION_FILE,'r')
     session_str=f.read().strip()
     api = PLC(Auth(session=session_str), api_server_url)
-    # NOTE: should we rely on bootmanager for this functionality?
+    # NOTE: What should we do if this call fails?
+    # TODO: handle DNS failure here.
     api.AuthCheck()
 
+    try:
+        env = 'production'
+        if len(sys.argv) > 1:
+            env = sys.argv[1]
+    except:
+        traceback.print_exc()
+
     while True:
-        print "reporting status: ", os.popen("uptime").read().strip()
         try:
-            # NOTE: alternately, check other stuff in the environment to infer
-            # run_level
-            #     is BootManager running?
-            #     what is the boot_state at PLC?
-            #     does /vservers exist?
-            #     what about /tmp/source?
-            #     is BootManager in /tmp/source?
-            #     is /tmp/mnt/sysimg mounted?
-            #     how long have we been running?  if we were in safeboot and
-            #       still running, we're likely in failboot now.
-            #     length of runtime increases the certainty of inferred state.
-            #     
-            api.ReportRunlevel({'run_level' : 'safeboot'})
+            # NOTE: here we are inferring the runlevel by environmental
+            #         observations.  We know how this process was started by the
+            #         given command line argument.  Then in bootmanager
+            #         runlevel, the bm.log gives information about the current
+            #         activity.
+            # other options:
+            #   call plc for current boot state?
+            #   how long have we been running?
+            if env == "bootmanager":
+                bs_val = extract_from('/tmp/bm.log', 'Current boot state:')
+                if len(bs_val) > 0: bs_val = bs_val.split()[-1]
+                ex_val = extract_from('/tmp/bm.log', 'Exception')
+                fs_val = extract_from('/tmp/bm.log', 'mke2fs')
+                bm_val = check_running("BootManager.py")
+
+                if bs_val in ['diag', 'diagnose', 'safeboot', 'disabled', 'disable']:
+                    api.ReportRunlevel({'run_level' : 'safeboot'})
+
+                elif len(ex_val) > len("Exception"):
+                    api.ReportRunlevel({'run_level' : 'failboot'})
+
+                elif len(fs_val) > 0 and len(bm_val) > 0:
+                    api.ReportRunlevel({'run_level' : 'reinstall'})
+
+                else:
+                    api.ReportRunlevel({'run_level' : 'failboot'})
+
+            elif env == "production":
+                api.ReportRunlevel({'run_level' : 'boot'})
+            else:
+                api.ReportRunlevel({'run_level' : 'failboot'})
+                
         except:
+            print "reporting error: ", os.popen("uptime").read().strip()
             traceback.print_exc()
-        time.sleep(30)
+
+        sys.stdout.flush()
+        # TODO: change to a configurable value
+        time.sleep(60*15)
 
 if __name__ == "__main__":
     main()