2 # NEPI, a framework to manage network experiments
3 # Copyright (C) 2013 INRIA
5 # This program is free software: you can redistribute it and/or modify
6 # it under the terms of the GNU General Public License as published by
7 # the Free Software Foundation, either version 3 of the License, or
8 # (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # Author: Alina Quereilhac <alina.quereilhac@inria.fr>
20 from nepi.util.timefuncs import tnow, tdiff, tdiffsec, stabsformat
21 from nepi.util.logger import Logger
22 from nepi.execution.attribute import Attribute, Flags, Types
23 from nepi.execution.trace import TraceAttr
34 reschedule_delay = "1s"
37 """ Action that a user can order to a Resource Manager
45 """ State of a Resource Manager
57 ResourceState2str = dict({
58 ResourceState.NEW : "NEW",
59 ResourceState.DISCOVERED : "DISCOVERED",
60 ResourceState.PROVISIONED : "PROVISIONED",
61 ResourceState.READY : "READY",
62 ResourceState.STARTED : "STARTED",
63 ResourceState.STOPPED : "STOPPED",
64 ResourceState.FAILED : "FAILED",
65 ResourceState.RELEASED : "RELEASED",
69 """ Initializes template information (i.e. attributes and traces)
70 on classes derived from the ResourceManager class.
72 It is used as a decorator in the class declaration as follows:
75 class MyResourceManager(ResourceManager):
84 def clsinit_copy(cls):
85 """ Initializes template information (i.e. attributes and traces)
86 on classes derived from the ResourceManager class.
87 It differs from the clsinit method in that it forces inheritance
88 of attributes and traces from the parent class.
90 It is used as a decorator in the class declaration as follows:
93 class MyResourceManager(ResourceManager):
98 clsinit_copy should be preferred to clsinit when creating new
99 ResourceManager child classes.
107 """ Decorator function for instance methods that should set the
108 RM state to FAILED when an error is raised. The methods that must be
109 decorated are: discover, provision, deploy, start, stop.
112 def wrapped(self, *args, **kwargs):
114 return func(self, *args, **kwargs)
117 err = traceback.format_exc()
119 self.debug("SETTING guid %d to state FAILED" % self.guid)
126 class ResourceManager(Logger):
127 """ Base class for all ResourceManagers.
129 A ResourceManager is specific to a resource type (e.g. Node,
130 Switch, Application, etc) on a specific backend (e.g. PlanetLab,
133 The ResourceManager instances are responsible for interacting with
134 and controlling concrete (physical or virtual) resources in the
135 experimental backends.
145 def _register_attribute(cls, attr):
146 """ Resource subclasses will invoke this method to add a
151 cls._attributes[attr.name] = attr
154 def _remove_attribute(cls, name):
155 """ Resource subclasses will invoke this method to remove a
160 del cls._attributes[name]
163 def _register_trace(cls, trace):
164 """ Resource subclasses will invoke this method to add a
169 cls._traces[trace.name] = trace
172 def _remove_trace(cls, name):
173 """ Resource subclasses will invoke this method to remove a
178 del cls._traces[name]
181 def _register_attributes(cls):
182 """ Resource subclasses will invoke this method to register
185 This method should be overriden in the RMs that define
189 critical = Attribute("critical",
190 "Defines whether the resource is critical. "
191 "A failure on a critical resource will interrupt "
195 flags = Flags.Design)
196 hard_release = Attribute("hardRelease",
197 "Forces removal of all result files and directories associated "
198 "to the RM upon resource release. After release the RM will "
199 "be removed from the EC and the results will not longer be "
203 flags = Flags.Design)
205 cls._register_attribute(critical)
206 cls._register_attribute(hard_release)
209 def _register_traces(cls):
210 """ Resource subclasses will invoke this method to register
213 This method should be overriden in the RMs that define traces.
221 """ ResourceManager classes have different attributes and traces.
222 Attribute and traces are stored in 'class attribute' dictionaries.
223 When a new ResourceManager class is created, the _clsinit method is
224 called to create a new instance of those dictionaries and initialize
227 The _clsinit method is called by the clsinit decorator method.
231 # static template for resource attributes
232 cls._attributes = dict()
233 cls._register_attributes()
235 # static template for resource traces
237 cls._register_traces()
240 def _clsinit_copy(cls):
241 """ Same as _clsinit, except that after creating new instances of the
242 dictionaries it copies all the attributes and traces from the parent
245 The _clsinit_copy method is called by the clsinit_copy decorator method.
248 # static template for resource attributes
249 cls._attributes = copy.deepcopy(cls._attributes)
250 cls._register_attributes()
252 # static template for resource traces
253 cls._traces = copy.deepcopy(cls._traces)
254 cls._register_traces()
258 """ Returns the type of the Resource Manager
264 def get_attributes(cls):
265 """ Returns a copy of the attributes
268 return copy.deepcopy(cls._attributes.values())
271 def get_attribute(cls, name):
272 """ Returns a copy of the attribute with name 'name'
275 return copy.deepcopy(cls._attributes[name])
280 """ Returns a copy of the traces
283 return copy.deepcopy(cls._traces.values())
287 """ Returns the description of the type of Resource
293 def get_backend(cls):
294 """ Returns the identifier of the backend (i.e. testbed, environment)
301 def get_global(cls, name):
302 """ Returns the value of a global attribute
303 Global attribute meaning an attribute for
304 all the resources from a rtype
306 :param name: Name of the attribute
310 global_attr = cls._attributes[name]
311 return global_attr.value
314 def set_global(cls, name, value):
315 """ Set value for a global attribute
317 :param name: Name of the attribute
319 :param value: Value of the attribute
322 global_attr = cls._attributes[name]
323 global_attr.value = value
326 def __init__(self, ec, guid):
327 super(ResourceManager, self).__init__(self.get_rtype())
330 self._ec = weakref.ref(ec)
331 self._connections = set()
332 self._conditions = dict()
334 # the resource instance gets a copy of all attributes
335 self._attrs = copy.deepcopy(self._attributes)
337 # the resource instance gets a copy of all traces
338 self._trcs = copy.deepcopy(self._traces)
340 # Each resource is placed on a deployment group by the EC
342 self.deployment_group = None
344 self._start_time = None
345 self._stop_time = None
346 self._discover_time = None
347 self._provision_time = None
348 self._ready_time = None
349 self._release_time = None
350 self._failed_time = None
352 self._state = ResourceState.NEW
354 # instance lock to synchronize exclusive state change methods (such
355 # as deploy and release methods), in order to prevent them from being
356 # executed at the same time
357 self._release_lock = threading.Lock()
361 """ Returns the global unique identifier of the RM """
366 """ Returns the Experiment Controller of the RM """
def connections(self):
    """ Gives access to the guids of the RMs connected to this one.

    The internal ``_connections`` set itself is handed back (no copy
    is made), so changes made by the caller are visible to the RM.
    """
    return self._connections
375 def conditions(self):
376 """ Returns the conditions to which the RM is subjected to.
378 This method returns a dictionary of conditions lists indexed by
382 return self._conditions
def start_time(self):
    """ Gives the value recorded in ``_start_time``.

    The value is None until a start timestamp has been stored
    on the instance.
    """
    return self._start_time
391 """ Returns the stop time of the RM as a timestamp """
392 return self._stop_time
def discover_time(self):
    """ Gives the value recorded in ``_discover_time``.

    The value is None until a discovery timestamp has been stored
    on the instance.
    """
    return self._discover_time
def provision_time(self):
    """ Gives the value recorded in ``_provision_time``.

    The value is None until a provisioning timestamp has been stored
    on the instance.
    """
    return self._provision_time
def ready_time(self):
    """ Gives the value recorded in ``_ready_time``.

    This is the timestamp stored when the RM was marked READY;
    it is None until then.
    """
    return self._ready_time
def release_time(self):
    """ Gives the value recorded in ``_release_time``.

    The value is None until a release timestamp has been stored
    on the instance.
    """
    return self._release_time
def failed_time(self):
    """ Gives the value recorded in ``_failed_time``.

    This is the timestamp stored when a failure occurred on the RM;
    it is None until then.
    """
    return self._failed_time
421 """ Get the current state of the RM """
424 def log_message(self, msg):
425 """ Returns the log message formatted with added information.
427 :param msg: text message
432 return " %s guid %d - %s " % (self._rtype, self.guid, msg)
434 def register_connection(self, guid):
435 """ Registers a connection to the RM identified by guid
437 This method should not be overriden. Specific functionality
438 should be added in the do_connect method.
440 :param guid: Global unique identifier of the RM to connect to
444 if self.valid_connection(guid):
445 self.do_connect(guid)
446 self._connections.add(guid)
448 def unregister_connection(self, guid):
449 """ Removes a registered connection to the RM identified by guid
451 This method should not be overriden. Specific functionality
452 should be added in the do_disconnect method.
454 :param guid: Global unique identifier of the RM to disconnect from
458 if guid in self._connections:
459 self.do_disconnect(guid)
460 self._connections.remove(guid)
464 """ Performs resource discovery.
466 This method is responsible for selecting an individual resource
467 matching user requirements.
469 This method should not be overriden directly. Specific functionality
470 should be added in the do_discover method.
473 with self._release_lock:
474 if self._state != ResourceState.RELEASED:
479 """ Performs resource provisioning.
481 This method is responsible for provisioning one resource.
482 After this method has been successfully invoked, the resource
483 should be accessible/controllable by the RM.
485 This method should not be overriden directly. Specific functionality
486 should be added in the do_provision method.
489 with self._release_lock:
490 if self._state != ResourceState.RELEASED:
495 """ Starts the RM (e.g. launch remote process).
497 There is no standard start behavior. Some RMs will not need to perform
498 any actions upon start.
500 This method should not be overriden directly. Specific functionality
501 should be added in the do_start method.
505 if not self.state in [ResourceState.READY, ResourceState.STOPPED]:
506 self.error("Wrong state %s for start" % self.state)
509 with self._release_lock:
510 if self._state != ResourceState.RELEASED:
515 """ Interrupts the RM, stopping any tasks the RM was performing.
517 There is no standard stop behavior. Some RMs will not need to perform
518 any actions upon stop.
520 This method should not be overriden directly. Specific functionality
521 should be added in the do_stop method.
524 if not self.state in [ResourceState.STARTED]:
525 self.error("Wrong state %s for stop" % self.state)
528 with self._release_lock:
533 """ Execute all steps required for the RM to reach the state READY.
535 This method is responsible for deploying the resource (and invoking
536 the discover and provision methods).
538 This method should not be overriden directly. Specific functionality
539 should be added in the do_deploy method.
542 if self.state > ResourceState.READY:
543 self.error("Wrong state %s for deploy" % self.state)
546 with self._release_lock:
547 if self._state != ResourceState.RELEASED:
551 """ Perform actions to free resources used by the RM.
553 This method is responsible for releasing resources that were
554 used during the experiment by the RM.
556 This method should not be overriden directly. Specific functionality
557 should be added in the do_release method.
560 with self._release_lock:
565 err = traceback.format_exc()
571 """ Sets the RM to state FAILED.
573 This method should not be overriden directly. Specific functionality
574 should be added in the do_fail method.
577 with self._release_lock:
578 if self._state != ResourceState.RELEASED:
581 def set(self, name, value):
582 """ Set the value of the attribute
584 :param name: Name of the attribute
586 :param value: Value of the attribute
589 attr = self._attrs[name]
594 """ Returns the value of the attribute
596 :param name: Name of the attribute
600 attr = self._attrs[name]
601 if attr.has_flag(Flags.Global):
602 self.warning( "Attribute %s is global. Use get_global instead." % name)
606 def has_changed(self, name):
607 """ Returns True if the value of the attribute
608 has been modified by the user.
610 :param name: Name of the attribute
614 attr = self._attrs[name]
615 return attr.has_changed()
617 def has_flag(self, name, flag):
618 """ Returns true if the attribute has the flag 'flag'
620 :param flag: Flag to be checked
623 attr = self._attrs[name]
624 return attr.has_flag(flag)
626 def has_attribute(self, name):
627 """ Returns true if the RM has an attribute with name
629 :param name: name of the attribute
632 return name in self._attrs
634 def enable_trace(self, name):
635 """ Explicitly enable trace generation
637 :param name: Name of the trace
640 trace = self._trcs[name]
643 def trace_enabled(self, name):
644 """Returns True if trace is enabled
646 :param name: Name of the trace
649 trace = self._trcs[name]
652 def trace(self, name, attr = TraceAttr.ALL, block = 512, offset = 0):
653 """ Get information on collected trace
655 :param name: Name of the trace
658 :param attr: Can be one of:
659 - TraceAttr.ALL (complete trace content),
660 - TraceAttr.STREAM (block in bytes to read starting at offset),
661 - TraceAttr.PATH (full path to the trace file),
662 - TraceAttr.SIZE (size of trace file).
665 :param block: Number of bytes to retrieve from trace, when attr is TraceAttr.STREAM
668 :param offset: Number of 'blocks' to skip, when attr is TraceAttr.STREAM
675 def register_condition(self, action, group, state, time = None):
676 """ Registers a condition on the resource manager to allow execution
677 of 'action' only after 'time' has elapsed from the moment all resources
678 in 'group' reached state 'state'
680 :param action: Action to restrict to condition (either 'START' or 'STOP')
682 :param group: Group of RMs to wait for (list of guids)
683 :type group: int or list of int
684 :param state: State to wait for on all RM in group. (either 'STARTED', 'STOPPED' or 'READY')
686 :param time: Time to wait after 'state' is reached on all RMs in group. (e.g. '2s')
691 if not action in self.conditions:
692 self._conditions[action] = list()
694 conditions = self.conditions.get(action)
696 # For each condition to register a tuple of (group, state, time) is
697 # added to the 'action' list
698 if not isinstance(group, list):
701 conditions.append((group, state, time))
703 def unregister_condition(self, group, action = None):
704 """ Removes conditions for a certain group of guids
706 :param action: Action to restrict to condition (either 'START', 'STOP' or 'READY')
709 :param group: Group of RMs to wait for (list of guids)
710 :type group: int or list of int
713 # For each condition a tuple of (group, state, time) is
714 # added to the 'action' list
715 if not isinstance(group, list):
718 for act, conditions in self.conditions.iteritems():
719 if action and act != action:
722 for condition in list(conditions):
723 (grp, state, time) = condition
725 # If there is an intersection between grp and group,
726 # then remove intersected elements
727 intsec = set(group).intersection(set(grp))
729 idx = conditions.index(condition)
731 newgrp.difference_update(intsec)
732 conditions[idx] = (newgrp, state, time)
734 def get_connected(self, rtype = None):
735 """ Returns the list of RM with the type 'rtype'
737 :param rtype: Type of the RM we look for
739 :return: list of guid
742 rclass = ResourceFactory.get_resource_type(rtype)
743 for guid in self.connections:
745 rm = self.ec.get_resource(guid)
746 if not rtype or isinstance(rm, rclass):
751 def _needs_reschedule(self, group, state, time):
752 """ Internal method that verify if 'time' has elapsed since
753 all elements in 'group' have reached state 'state'.
755 :param group: Group of RMs to wait for (list of guids)
756 :type group: int or list of int
757 :param state: State to wait for on all RM in group. (either 'STARTED', 'STOPPED' or 'READY')
759 :param time: Time to wait after 'state' is reached on all RMs in group. (e.g. '2s')
762 .. note : time should be written like "2s" or "3m" with s for seconds, m for minutes, h for hours, ...
763 If for example, you need to wait 2min 30sec, time could be "150s" or "2.5m".
764 For the moment, 2m30s is not a correct syntax.
768 delay = reschedule_delay
770 # check state and time elapsed on all RMs
772 rm = self.ec.get_resource(guid)
774 # If one of the RMs this resource needs to wait for has FAILED
775 # and is critical we raise an exception
776 if rm.state == ResourceState.FAILED:
777 if not rm.get('critical'):
779 msg = "Resource can not wait for FAILED RM %d. Setting Resource to FAILED"
780 raise RuntimeError, msg
782 # If the RM state is lower than the requested state we must
783 # reschedule (e.g. if RM is READY but we required STARTED).
788 # If there is a time restriction, we must verify the
789 # restriction is satisfied
791 if state == ResourceState.DISCOVERED:
793 if state == ResourceState.PROVISIONED:
794 t = rm.provision_time
795 elif state == ResourceState.READY:
797 elif state == ResourceState.STARTED:
799 elif state == ResourceState.STOPPED:
801 elif state == ResourceState.RELEASED:
806 # time already elapsed since RM changed state
807 waited = "%fs" % tdiffsec(tnow(), t)
810 wait = tdiffsec(stabsformat(time), stabsformat(waited))
817 return reschedule, delay
819 def set_with_conditions(self, name, value, group, state, time):
820 """ Set value 'value' on attribute with name 'name' when 'time'
821 has elapsed since all elements in 'group' have reached state
824 :param name: Name of the attribute to set
826 :param value: Value of the attribute to set
828 :param group: Group of RMs to wait for (list of guids)
829 :type group: int or list of int
830 :param state: State to wait for on all RM in group. (either 'STARTED', 'STOPPED' or 'READY')
832 :param time: Time to wait after 'state' is reached on all RMs in group. (e.g. '2s')
837 delay = reschedule_delay
839 ## evaluate if set conditions are met
841 # only can set with conditions after the RM is started
842 if self.state != ResourceState.STARTED:
845 reschedule, delay = self._needs_reschedule(group, state, time)
848 callback = functools.partial(self.set_with_conditions,
849 name, value, group, state, time)
850 self.ec.schedule(delay, callback)
852 self.set(name, value)
854 def start_with_conditions(self):
855 """ Starts RM when all the conditions in self.conditions for
856 action 'START' are satisfied.
859 #import pdb;pdb.set_trace()
862 delay = reschedule_delay
865 ## evaluate if conditions to start are met
869 # Can only start when RM is either STOPPED or READY
870 if self.state not in [ResourceState.STOPPED, ResourceState.READY]:
872 self.debug("---- RESCHEDULING START ---- state %s " % self.state )
874 start_conditions = self.conditions.get(ResourceAction.START, [])
876 self.debug("---- START CONDITIONS ---- %s" % start_conditions)
878 # Verify all start conditions are met
879 for (group, state, time) in start_conditions:
880 # Uncomment for debug
883 rm = self.ec.get_resource(guid)
884 unmet.append((guid, rm._state))
886 self.debug("---- WAITED STATES ---- %s" % unmet )
888 reschedule, delay = self._needs_reschedule(group, state, time)
893 self.ec.schedule(delay, self.start_with_conditions)
895 self.debug("----- STARTING ---- ")
898 def stop_with_conditions(self):
899 """ Stops RM when all the conditions in self.conditions for
900 action 'STOP' are satisfied.
904 delay = reschedule_delay
906 ## evaluate if conditions to stop are met
910 # only can stop when RM is STARTED
911 if self.state != ResourceState.STARTED:
913 self.debug("---- RESCHEDULING STOP ---- state %s " % self.state )
915 self.debug(" ---- STOP CONDITIONS ---- %s" %
916 self.conditions.get(ResourceAction.STOP))
918 stop_conditions = self.conditions.get(ResourceAction.STOP, [])
919 for (group, state, time) in stop_conditions:
920 reschedule, delay = self._needs_reschedule(group, state, time)
925 callback = functools.partial(self.stop_with_conditions)
926 self.ec.schedule(delay, callback)
928 self.debug(" ----- STOPPING ---- ")
931 def deploy_with_conditions(self):
932 """ Deploy RM when all the conditions in self.conditions for
933 action 'READY' are satisfied.
937 delay = reschedule_delay
939 ## evaluate if conditions to deploy are met
943 # only can deploy when RM is either NEW, DISCOVERED or PROVISIONED
944 if self.state not in [ResourceState.NEW, ResourceState.DISCOVERED,
945 ResourceState.PROVISIONED]:
947 self.debug("---- RESCHEDULING DEPLOY ---- state %s " % self.state )
949 deploy_conditions = self.conditions.get(ResourceAction.DEPLOY, [])
951 self.debug("---- DEPLOY CONDITIONS ---- %s" % deploy_conditions)
953 # Verify all deploy conditions are met
954 for (group, state, time) in deploy_conditions:
955 # Uncomment for debug
958 # rm = self.ec.get_resource(guid)
959 # unmet.append((guid, rm._state))
961 #self.debug("---- WAITED STATES ---- %s" % unmet )
963 reschedule, delay = self._needs_reschedule(group, state, time)
968 self.ec.schedule(delay, self.deploy_with_conditions)
970 self.debug("----- DEPLOYING ---- ")
973 def do_connect(self, guid):
974 """ Performs actions that need to be taken upon associating RMs.
975 This method should be redefined when necessary in child classes.
979 def do_disconnect(self, guid):
980 """ Performs actions that need to be taken upon disassociating RMs.
981 This method should be redefined when necessary in child classes.
985 def valid_connection(self, guid):
986 """Checks whether a connection with the other RM
988 This method need to be redefined by each new Resource Manager.
990 :param guid: Guid of the current Resource Manager
def do_discover(self):
    """ Default discovery step.

    Performs no backend work of its own: it only calls
    ``set_discovered`` so that the RM is marked as DISCOVERED.
    """
    self.set_discovered()
def do_provision(self):
    """ Default provisioning step.

    Performs no backend work of its own: it only calls
    ``set_provisioned`` so that the RM is marked as PROVISIONED.
    """
    self.set_provisioned()
1010 def do_deploy(self):
1013 def do_release(self):
def set_started(self):
    """ Flag the ResourceManager as STARTED.

    Hands the STARTED state and the ``_start_time`` attribute name
    to ``set_state``, then writes a debug log line.
    """
    target_state = ResourceState.STARTED
    self.set_state(target_state, "_start_time")
    self.debug("----- STARTED ---- ")
def set_stopped(self):
    """ Flag the ResourceManager as STOPPED.

    Hands the STOPPED state and the ``_stop_time`` attribute name
    to ``set_state``, then writes a debug log line.
    """
    target_state = ResourceState.STOPPED
    self.set_state(target_state, "_stop_time")
    self.debug("----- STOPPED ---- ")
def set_ready(self):
    """ Flag the ResourceManager as READY.

    Hands the READY state and the ``_ready_time`` attribute name
    to ``set_state``, then writes a debug log line.
    """
    target_state = ResourceState.READY
    self.set_state(target_state, "_ready_time")
    self.debug("----- READY ---- ")
def set_released(self):
    """ Flag the ResourceManager as RELEASED.

    Hands the RELEASED state and the ``_release_time`` attribute name
    to ``set_state``, then writes a debug log line.
    """
    target_state = ResourceState.RELEASED
    self.set_state(target_state, "_release_time")
    self.debug("----- RELEASED ---- ")
def set_failed(self):
    """ Flag the ResourceManager as FAILED.

    Hands the FAILED state and the ``_failed_time`` attribute name
    to ``set_state``, then writes a debug log line.
    """
    target_state = ResourceState.FAILED
    self.set_state(target_state, "_failed_time")
    self.debug("----- FAILED ---- ")
def set_discovered(self):
    """ Flag the ResourceManager as DISCOVERED.

    Hands the DISCOVERED state and the ``_discover_time`` attribute
    name to ``set_state``, then writes a debug log line.
    """
    target_state = ResourceState.DISCOVERED
    self.set_state(target_state, "_discover_time")
    self.debug("----- DISCOVERED ---- ")
def set_provisioned(self):
    """ Flag the ResourceManager as PROVISIONED.

    Hands the PROVISIONED state and the ``_provision_time`` attribute
    name to ``set_state``, then writes a debug log line.
    """
    target_state = ResourceState.PROVISIONED
    self.set_state(target_state, "_provision_time")
    self.debug("----- PROVISIONED ---- ")
1054 def set_state(self, state, state_time_attr):
1055 """ Set the state of the RM while keeping a trace of the time """
1057 # Ensure that RM state will not change after released
1058 if self._state == ResourceState.RELEASED:
1061 setattr(self, state_time_attr, tnow())
1064 class ResourceFactory(object):
1065 _resource_types = dict()
def resource_types(cls):
    """ Gives the registry of known Resource Manager classes.

    The ``_resource_types`` dictionary itself is handed back
    (no copy is made); its keys are the values that
    ``register_type`` obtained from ``get_rtype()``.
    """
    return cls._resource_types
def get_resource_type(cls, rtype):
    """ Looks up the class registered under 'rtype'.

    :param rtype: Key under which the class was registered

    :return: The registered class, or None when 'rtype' is unknown
    """
    registry = cls._resource_types
    return registry.get(rtype)
def register_type(cls, rclass):
    """ Adds a Resource Manager class to the registry.

    The class is stored under the key returned by its own
    ``get_rtype()``; an existing entry with that key is replaced.

    :param rclass: Resource Manager class to register
    """
    key = rclass.get_rtype()
    cls._resource_types[key] = rclass
def create(cls, rtype, ec, guid):
    """ Builds a Resource Manager instance of the registered type 'rtype'.

    :param rtype: Key under which the class was registered
    :param ec: Passed as first argument to the class constructor
    :param guid: Passed as second argument to the class constructor

    :return: The object produced by calling the registered class

    Raises KeyError when no class is registered under 'rtype'.
    """
    manager_class = cls._resource_types[rtype]
    instance = manager_class(ec, guid)
    return instance
1088 def populate_factory():
1089 """Register all the possible RM that exists in the current version of Nepi.
1091 # Once the factory is populated, don't repopulate
1092 if not ResourceFactory.resource_types():
1093 for rclass in find_types():
1094 ResourceFactory.register_type(rclass)
1097 """Look into the different folders to find all the
1098 available Resource Managers
1100 search_path = os.environ.get("NEPI_SEARCH_PATH", "")
1101 search_path = set(search_path.split(" "))
1104 import nepi.resources
1105 path = os.path.dirname(nepi.resources.__file__)
1106 search_path.add(path)
1110 for importer, modname, ispkg in pkgutil.walk_packages(search_path,
1111 prefix = "nepi.resources."):
1113 loader = importer.find_module(modname)
1116 # Notice: Repeated calls to load_module will act as a reload of the module
1117 if modname in sys.modules:
1118 module = sys.modules.get(modname)
1120 module = loader.load_module(modname)
1122 for attrname in dir(module):
1123 if attrname.startswith("_"):
1126 attr = getattr(module, attrname)
1128 if attr == ResourceManager:
1131 if not inspect.isclass(attr):
1134 if issubclass(attr, ResourceManager):
1137 if not modname in sys.modules:
1138 sys.modules[modname] = module
1143 err = traceback.format_exc()
1144 logger = logging.getLogger("Resource.find_types()")
1145 logger.error("Error while loading Resource Managers %s" % err)