def __init__(self, ec):
self._ec = weakref.ref(ec)
self._failure_level = FailureLevel.OK
+ self._abort = False
@property
def ec(self):
@property
def abort(self):
+ return self._abort
+
+ def eval_failure(self, guid):
if self._failure_level == FailureLevel.OK:
- for guid in self.ec.resources:
- try:
- state = self.ec.state(guid)
- critical = self.ec.get(guid, "critical")
- if state == ResourceState.FAILED and critical:
- self._failure_level = FailureLevel.RM_FAILURE
- self.ec.logger.debug("RM critical failure occurred on guid %d." \
- " Setting EC FAILURE LEVEL to RM_FAILURE" % guid)
- break
- except:
- # An error might occure because a RM was deleted abruptly.
- # In this case the error should be ignored.
- if guid in self.ec._resources:
- raise
-
- return self._failure_level != FailureLevel.OK
+ rm = self.ec.get_resource(guid)
+ state = rm.state
+ critical = rm.get("critical")
+
+ if state == ResourceState.FAILED and critical:
+ self._failure_level = FailureLevel.RM_FAILURE
+ self._abort = True
+ self.ec.logger.debug("RM critical failure occurred on guid %d." \
+ " Setting EC FAILURE LEVEL to RM_FAILURE" % guid)
def set_ec_failure(self):
self._failure_level = FailureLevel.EC_FAILURE
# EC state
self._state = ECState.RUNNING
- # Blacklist file for PL nodes
- nepi_home = os.path.join(os.path.expanduser("~"), ".nepi")
- plblacklist_file = os.path.join(nepi_home, "plblacklist.txt")
- if not os.path.exists(plblacklist_file):
- if os.path.isdir(nepi_home):
- open(plblacklist_file, 'w').close()
- else:
- os.makedirs(nepi_home)
- open(plblacklist_file, 'w').close()
-
# The runner is a pool of threads used to parallelize
# execution of tasks
- nthreads = int(os.environ.get("NEPI_NTHREADS", "50"))
+ nthreads = int(os.environ.get("NEPI_NTHREADS", "20"))
self._runner = ParallelRun(maxthreads = nthreads)
# Event processing thread
"""
return self._logger
+ @property
+ def failure_level(self):
+ """ Returns the level of FAILURE of the experiment
+
+ """
+
+ return self._fm._failure_level
+
@property
def ecstate(self):
""" Returns the state of the Experiment Controller
"""
return self._fm.abort
+ def inform_failure(self, guid):
+ """ Reports a failure in a RM to the EC for evaluation
+
+ :param guid: Resource id
+ :type guid: int
+
+ """
+
+ return self._fm.eval_failure(guid)
+
def wait_finished(self, guids):
""" Blocking method that waits until all RMs in the 'guids' list
have reached a state >= STOPPED (i.e. STOPPED, FAILED or
:type guids: list
"""
-
if isinstance(guids, int):
guids = [guids]
break
# If a guid reached one of the target states, remove it from list
- guid = guids[0]
- rstate = self.state(guid)
+ guid = guids.pop()
+ rm = self.get_resource(guid)
+ rstate = rm.state
- hrrstate = ResourceState2str.get(rstate)
- hrstate = ResourceState2str.get(state)
-
if rstate >= state:
- guids.remove(guid)
- rm = self.get_resource(guid)
self.logger.debug(" %s guid %d DONE - state is %s, required is >= %s " % (
- rm.get_rtype(), guid, hrrstate, hrstate))
+ rm.get_rtype(), guid, rstate, state))
else:
# Debug...
self.logger.debug(" WAITING FOR guid %d - state is %s, required is >= %s " % (
- guid, hrrstate, hrstate))
+ guid, rstate, state))
+
+ guids.append(guid)
+
time.sleep(0.5)
def get_task(self, tid):
if not guids:
# If no guids list was passed, all 'NEW' RMs will be deployed
guids = []
- for guid in self.resources:
- if self.state(guid) == ResourceState.NEW:
+ for guid, rm in self._resources.iteritems():
+ if rm.state == ResourceState.NEW:
guids.append(guid)
if isinstance(guids, int):