class FailurePolicy(object):
    """Integer constants selecting how to respond to experiment failures.

    FailureManager compares its configured policy against these values:

    * IGNORE_RM_FAILURE (1): FailureManager.abort stays False for the
      RM_FAILURE and TASK_FAILURE levels.
    * ABORT_ON_RM_FAILURE (2): FailureManager.abort becomes True for the
      RM_FAILURE and TASK_FAILURE levels.

    Under either policy an EC_FAILURE level makes abort True.
    """

    IGNORE_RM_FAILURE, ABORT_ON_RM_FAILURE = 1, 2
+
class FailureLevel(object):
    """Integer codes describing the system failure state.

    OK (1) is the level a FailureManager starts at; RM_FAILURE (2),
    TASK_FAILURE (3) and EC_FAILURE (4) are the levels its set_*_failure
    methods record. EC_FAILURE always makes FailureManager.abort True;
    the other two failure levels do so only under
    FailurePolicy.ABORT_ON_RM_FAILURE.
    """

    OK, RM_FAILURE, TASK_FAILURE, EC_FAILURE = 1, 2, 3, 4
+
class FailureManager(object):
    """Record reported failures and decide whether to abort the experiment.

    The manager starts at FailureLevel.OK and remembers the most severe
    level reported through the set_*_failure methods. A less severe report
    arriving after a more severe one never lowers the recorded level, so
    ``abort`` cannot flip back from True to False once it has been raised.
    """

    def __init__(self, failure_policy=None):
        """Create a manager with no failure recorded.

        :param failure_policy: one of the FailurePolicy constants. Any
            falsy value (including None) selects
            FailurePolicy.ABORT_ON_RM_FAILURE.
        """
        self._failure_level = FailureLevel.OK
        self._failure_policy = (failure_policy or
                                FailurePolicy.ABORT_ON_RM_FAILURE)

    @property
    def abort(self):
        """True when the recorded failure level requires aborting.

        EC_FAILURE always aborts. RM_FAILURE and TASK_FAILURE abort only
        when the policy is FailurePolicy.ABORT_ON_RM_FAILURE.
        """
        level = self._failure_level

        if level == FailureLevel.EC_FAILURE:
            return True

        if self._failure_policy != FailurePolicy.ABORT_ON_RM_FAILURE:
            return False

        return level in (FailureLevel.TASK_FAILURE, FailureLevel.RM_FAILURE)

    def _escalate(self, level):
        """Raise the recorded failure level to ``level``; never lower it.

        Relies on the FailureLevel constants being numbered in increasing
        severity (OK < RM_FAILURE < TASK_FAILURE < EC_FAILURE).
        """
        if level > self._failure_level:
            self._failure_level = level

    def set_rm_failure(self):
        """Record an RM failure unless a more severe one is already set."""
        self._escalate(FailureLevel.RM_FAILURE)

    def set_task_failure(self):
        """Record a task failure unless a more severe one is already set."""
        self._escalate(FailureLevel.TASK_FAILURE)

    def set_ec_failure(self):
        """Record an EC failure, the most severe level."""
        self._escalate(FailureLevel.EC_FAILURE)
+