source/RunlevelAgent.py

   1 #!/usr/bin/python
   2 #
   3 # RunlevelAgent - acts as a heartbeat back to myplc reporting that the node is
   4 #     online and whether it is in boot or pre-boot run-level.
   5 #   This is useful to identify nodes that are behind a firewall, as well as to
   6 #   have the machine report run-time status both in safeboot and boot modes,
   7 #   so that it is immediately visible at myplc (gui or api).
   8 #
   9
  10 import xml, xmlrpclib
  11 import logging
  12 import time
  13 import traceback
  14 import sys
  15 import os
  16 import string
  17
# Fallback configuration file; it is parsed for BOOT_API_SERVER when the
# API URL cannot be built from plc_config.
CONFIG_FILE = "/tmp/source/configuration"
# File whose (stripped) contents are used as the session key for API
# authentication.
SESSION_FILE = "/etc/planetlab/session"
# File in which the running agent records its own PID.
RLA_PID_FILE = "/var/run/rla.pid"
  21
  22 def read_config_file(filename):
  23     ## NOTE: text copied from BootManager.py
  24     # TODO: unify this code to make it common. i.e. use ConfigParser module
  25     vars = {}
  26     vars_file = file(filename,'r')
  27     validConfFile = True
  28     for line in vars_file:
  29         # if its a comment or a whitespace line, ignore
  30         if line[:1] == "#" or string.strip(line) == "":
  31             continue
  32
  33         parts = string.split(line, "=")
  34         if len(parts) != 2:
  35             print "Invalid line in vars file: {}".format(line)
  36             validConfFile = False
  37             break
  38
  39         name = string.strip(parts[0])
  40         value = string.strip(parts[1])
  41         vars[name] = value
  42
  43     vars_file.close()
  44     if not validConfFile:
  45         print "Unable to read configuration vars."
  46
  47     return vars
  48
  49 try:
  50     sys.path = ['/etc/planetlab'] + sys.path
  51     import plc_config
  52     api_server_url = "https://" + plc_config.PLC_API_HOST + plc_config.PLC_API_PATH
  53 except:
  54     filename  = CONFIG_FILE
  55     vars = read_config_file(filename)
  56     api_server_url = vars['BOOT_API_SERVER']
  57
  58
  59 class Auth:
  60     def __init__(self, username=None, password=None, **kwargs):
  61         if 'session' in kwargs:
  62             self.auth = { 'AuthMethod' : 'session',
  63                           'session' : kwargs['session'] }
  64         else:
  65             if username is None and password is None:
  66                 self.auth = {'AuthMethod': "anonymous"}
  67             else:
  68                 self.auth = {'Username' : username,
  69                              'AuthMethod' : 'password',
  70                              'AuthString' : password}
  71 class PLC:
  72     def __init__(self, auth, url):
  73         self.auth = auth
  74         self.url = url
  75         self.api = xmlrpclib.Server(self.url, verbose=False, allow_none=True)
  76
  77     def __getattr__(self, name):
  78         method = getattr(self.api, name)
  79         if method is None:
  80             raise AssertionError("method does not exist")
  81
  82         return lambda *params : method(self.auth.auth, *params)
  83
  84     def __repr__(self):
  85         return self.api.__repr__()
  86
  87 def extract_from(filename, pattern):
  88     f = os.popen("grep -E {} {}".format(pattern, filename))
  89     val = f.read().strip()
  90     return val
  91
  92 def check_running(commandname):
  93     f = os.popen("ps ax | grep -E {} | grep -v grep".format(commandname))
  94     val = f.read().strip()
  95     return val
  96
  97
  98 def save_pid():
  99     # save PID
 100     try:
 101         pid = os.getpid()
 102         f = open(RLA_PID_FILE, 'w')
 103         f.write("{}\n".format(pid))
 104         f.close()
 105     except:
 106         print "Uuuhhh.... this should not occur."
 107         sys.exit(1)
 108
 109 def start_and_run():
 110
 111     save_pid()
 112
 113     # Keep trying to authenticate session, waiting for NM to re-write the
 114     # session file, or DNS to succeed, until AuthCheck succeeds.
 115     while True:
 116         try:
 117             f = open(SESSION_FILE, 'r')
 118             session_str = f.read().strip()
 119             api = PLC(Auth(session=session_str), api_server_url)
 120             # NOTE: What should we do if this call fails?
 121             # TODO: handle dns failure here.
 122             api.AuthCheck()
 123             break
 124         except:
 125             print "Retry in 30 seconds: ", os.popen("uptime").read().strip()
 126             traceback.print_exc()
 127             time.sleep(30)
 128
 129     try:
 130         env = 'production'
 131         if len(sys.argv) > 2:
 132             env = sys.argv[2]
 133     except:
 134         traceback.print_exc()
 135
 136     while True:
 137         try:
 138             # NOTE: here we are inferring the runlevel by environmental
 139             #         observations.  We know how this process was started by the
 140             #         given command line argument.  Then in bootmanager
 141             #         runlevel, the bm.log gives information about the current
 142             #         activity.
 143             # other options:
 144             #   call plc for current boot state?
 145             #   how long have we been running?
 146             if env == "bootmanager":
 147                 bs_val = extract_from('/tmp/bm.log', "'Current boot state:'")
 148                 if len(bs_val) > 0: bs_val = bs_val.split()[-1]
 149                 ex_val = extract_from('/tmp/bm.log', 'Exception')
 150                 fs_val = extract_from('/tmp/bm.log', 'mke2fs')
 151                 bm_val = check_running("BootManager.py")
 152
 153                 if bs_val in ['diag', 'diagnose', 'safeboot', 'disabled', 'disable']:
 154                     api.ReportRunlevel({'run_level' : 'safeboot'})
 155
 156                 elif len(ex_val) > len("Exception"):
 157                     api.ReportRunlevel({'run_level' : 'failboot'})
 158
 159                 elif len(fs_val) > 0 and len(bm_val) > 0:
 160                     api.ReportRunlevel({'run_level' : 'reinstall'})
 161
 162                 else:
 163                     api.ReportRunlevel({'run_level' : 'failboot'})
 164
 165             elif env == "production":
 166                 api.ReportRunlevel({'run_level' : 'boot'})
 167             else:
 168                 api.ReportRunlevel({'run_level' : 'failboot'})
 169
 170         except:
 171             print "reporting error: ", os.popen("uptime").read().strip()
 172             traceback.print_exc()
 173
 174         sys.stdout.flush()
 175         # TODO: change to a configurable value
 176         time.sleep(60*15)
 177
 178 def agent_running():
 179     try:
 180         os.stat(RLA_PID_FILE)
 181         f = os.popen("ps ax | grep RunlevelAgent | grep -Ev 'grep|vim' | awk '{print $1}' | wc -l")
 182         l = f.read().strip()
 183         if int(l) >= 2:
 184             return True
 185         else:
 186             try:
 187                 os.unlink(RLA_PID_FILE)
 188             except:
 189                 pass
 190             return False
 191     except:
 192         return False
 193
 194
 195 def shutdown():
 196     import signal
 197
 198     pid = open(RLA_PID_FILE, 'r').read().strip()
 199
 200     # Try three different ways to kill the process.  Just to be sure.
 201     os.kill(int(pid), signal.SIGKILL)
 202     os.system("pkill RunlevelAgent.py")
 203     os.system("ps ax | grep RunlevelAgent | grep -v grep | awk '{print $1}' | xargs kill -9 ")
 204
 205 if __name__ == "__main__":
 206     if "start" in sys.argv and not agent_running():
 207         start_and_run()
 208
 209     if "stop" in sys.argv and agent_running():
 210         shutdown()