source/RunlevelAgent.py

   1 #!/usr/bin/python
   2 #
   3 # RunlevelAgent - acts as a heartbeat back to myplc reporting that the node is
   4 #     online and whether it is in boot or pre-boot run-level.
   5 #   This is useful to identify nodes that are behind a firewall, as well as to
   6 #   have the machine report run-time status both in safeboot and boot modes,
   7 #   so that it is immediately visible at myplc (gui or api).
   8 #
   9
  10 from __future__ import print_function
  11
  12 import xml, xmlrpclib
  13 import logging
  14 import time
  15 import traceback
  16 import sys
  17 import os
  18 import string
  19
# Boot-time configuration file; read as a fallback source for the API server
# URL when plc_config cannot be used.
CONFIG_FILE = "/tmp/source/configuration"
# File holding the session string used to authenticate against the API.
SESSION_FILE = "/etc/planetlab/session"
# PID file written when the agent starts; checked and read when testing
# whether the agent is running and when stopping it.
RLA_PID_FILE = "/var/run/rla.pid"
  23
  24 def read_config_file(filename):
  25     ## NOTE: text copied from BootManager.py
  26     # TODO: unify this code to make it common. i.e. use ConfigParser module
  27     vars = {}
  28     vars_file = file(filename,'r')
  29     validConfFile = True
  30     for line in vars_file:
  31         # if its a comment or a whitespace line, ignore
  32         if line[:1] == "#" or string.strip(line) == "":
  33             continue
  34
  35         parts = string.split(line, "=")
  36         if len(parts) != 2:
  37             print("Invalid line in vars file: {}".format(line))
  38             validConfFile = False
  39             break
  40
  41         name = string.strip(parts[0])
  42         value = string.strip(parts[1])
  43         vars[name] = value
  44
  45     vars_file.close()
  46     if not validConfFile:
  47         print("Unable to read configuration vars.")
  48
  49     return vars
  50
  51 try:
  52     sys.path = ['/etc/planetlab'] + sys.path
  53     import plc_config
  54     api_server_url = "https://" + plc_config.PLC_API_HOST + plc_config.PLC_API_PATH
  55 except:
  56     filename  = CONFIG_FILE
  57     vars = read_config_file(filename)
  58     api_server_url = vars['BOOT_API_SERVER']
  59
  60
  61 class Auth:
  62     def __init__(self, username=None, password=None, **kwargs):
  63         if 'session' in kwargs:
  64             self.auth = { 'AuthMethod' : 'session',
  65                           'session' : kwargs['session'] }
  66         else:
  67             if username is None and password is None:
  68                 self.auth = {'AuthMethod': "anonymous"}
  69             else:
  70                 self.auth = {'Username' : username,
  71                              'AuthMethod' : 'password',
  72                              'AuthString' : password}
  73 class PLC:
  74     def __init__(self, auth, url):
  75         self.auth = auth
  76         self.url = url
  77         self.api = xmlrpclib.Server(self.url, verbose=False, allow_none=True)
  78
  79     def __getattr__(self, name):
  80         method = getattr(self.api, name)
  81         if method is None:
  82             raise AssertionError("method does not exist")
  83
  84         return lambda *params : method(self.auth.auth, *params)
  85
  86     def __repr__(self):
  87         return self.api.__repr__()
  88
  89 def extract_from(filename, pattern):
  90     f = os.popen("grep -E {} {}".format(pattern, filename))
  91     val = f.read().strip()
  92     return val
  93
  94 def check_running(commandname):
  95     f = os.popen("ps ax | grep -E {} | grep -v grep".format(commandname))
  96     val = f.read().strip()
  97     return val
  98
  99
 100 def save_pid():
 101     # save PID
 102     try:
 103         pid = os.getpid()
 104         f = open(RLA_PID_FILE, 'w')
 105         f.write("{}\n".format(pid))
 106         f.close()
 107     except:
 108         print("Uuuhhh.... this should not occur.")
 109         sys.exit(1)
 110
 111 def start_and_run():
 112
 113     save_pid()
 114
 115     # Keep trying to authenticate session, waiting for NM to re-write the
 116     # session file, or DNS to succeed, until AuthCheck succeeds.
 117     while True:
 118         try:
 119             f = open(SESSION_FILE, 'r')
 120             session_str = f.read().strip()
 121             api = PLC(Auth(session=session_str), api_server_url)
 122             # NOTE: What should we do if this call fails?
 123             # TODO: handle dns failure here.
 124             api.AuthCheck()
 125             break
 126         except:
 127             print("Retry in 30 seconds: ", os.popen("uptime").read().strip())
 128             traceback.print_exc()
 129             time.sleep(30)
 130
 131     try:
 132         env = 'production'
 133         if len(sys.argv) > 2:
 134             env = sys.argv[2]
 135     except:
 136         traceback.print_exc()
 137
 138     while True:
 139         try:
 140             # NOTE: here we are inferring the runlevel by environmental
 141             #         observations.  We know how this process was started by the
 142             #         given command line argument.  Then in bootmanager
 143             #         runlevel, the bm.log gives information about the current
 144             #         activity.
 145             # other options:
 146             #   call plc for current boot state?
 147             #   how long have we been running?
 148             if env == "bootmanager":
 149                 bs_val = extract_from('/tmp/bm.log', "'Current boot state:'")
 150                 if len(bs_val) > 0: bs_val = bs_val.split()[-1]
 151                 ex_val = extract_from('/tmp/bm.log', 'Exception')
 152                 fs_val = extract_from('/tmp/bm.log', 'mke2fs')
 153                 bm_val = check_running("BootManager.py")
 154
 155                 if bs_val in ['diag', 'diagnose', 'safeboot', 'disabled', 'disable']:
 156                     api.ReportRunlevel({'run_level' : 'safeboot'})
 157
 158                 elif len(ex_val) > len("Exception"):
 159                     api.ReportRunlevel({'run_level' : 'failboot'})
 160
 161                 elif len(fs_val) > 0 and len(bm_val) > 0:
 162                     api.ReportRunlevel({'run_level' : 'reinstall'})
 163
 164                 else:
 165                     api.ReportRunlevel({'run_level' : 'failboot'})
 166
 167             elif env == "production":
 168                 api.ReportRunlevel({'run_level' : 'boot'})
 169             else:
 170                 api.ReportRunlevel({'run_level' : 'failboot'})
 171
 172         except:
 173             print("reporting error: ", os.popen("uptime").read().strip())
 174             traceback.print_exc()
 175
 176         sys.stdout.flush()
 177         # TODO: change to a configurable value
 178         time.sleep(60*15)
 179
 180 def agent_running():
 181     try:
 182         os.stat(RLA_PID_FILE)
 183         f = os.popen("ps ax | grep RunlevelAgent | grep -Ev 'grep|vim' | awk '{print $1}' | wc -l")
 184         l = f.read().strip()
 185         if int(l) >= 2:
 186             return True
 187         else:
 188             try:
 189                 os.unlink(RLA_PID_FILE)
 190             except:
 191                 pass
 192             return False
 193     except:
 194         return False
 195
 196
 197 def shutdown():
 198     import signal
 199
 200     pid = open(RLA_PID_FILE, 'r').read().strip()
 201
 202     # Try three different ways to kill the process.  Just to be sure.
 203     os.kill(int(pid), signal.SIGKILL)
 204     os.system("pkill RunlevelAgent.py")
 205     os.system("ps ax | grep RunlevelAgent | grep -v grep | awk '{print $1}' | xargs kill -9 ")
 206
 207 if __name__ == "__main__":
 208     if "start" in sys.argv and not agent_running():
 209         start_and_run()
 210
 211     if "stop" in sys.argv and agent_running():
 212         shutdown()