Fix slave retrials: was cleaning up, deleting keys, but it still needed them.
[nepi.git] / src / nepi / testbeds / planetlab / application.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from constants import TESTBED_ID
5 import plcapi
6 import operator
7 import os
8 import os.path
9 import sys
10 import nepi.util.server as server
11 import cStringIO
12 import subprocess
13 import rspawn
14 import random
15 import time
16 import socket
17 import threading
18 import logging
19 import re
20
21 from nepi.util.constants import ApplicationStatus as AS
22
class Dependency(object):
    """
    A Dependency is in every respect like an application.
    
    It depends on some packages, it may require building binaries, it must deploy
    them...
    
    But it has no command. Dependencies aren't ever started, or stopped, and have
    no status.
    """

    # Names of the traces this object exposes. This has to be a real tuple:
    # a bare ('buildlog') is just the string 'buildlog', which would turn every
    # "whichtrace in self.TRACES" test into a substring match (so 'build' or
    # 'log' would be accepted as trace names).
    TRACES = ('buildlog',)
35
36     def __init__(self, api=None):
37         if not api:
38             api = plcapi.PLCAPI()
39         self._api = api
40         
41         # Attributes
42         self.command = None
43         self.sudo = False
44         
45         self.build = None
46         self.install = None
47         self.depends = None
48         self.buildDepends = None
49         self.sources = None
50         self.rpmFusion = False
51         self.env = {}
52         
53         self.stdin = None
54         self.stdout = None
55         self.stderr = None
56         self.buildlog = None
57         
58         self.add_to_path = True
59         
60         # Those are filled when the app is configured
61         self.home_path = None
62         
63         # Those are filled when an actual node is connected
64         self.node = None
65         
66         # Those are filled when the app is started
67         #   Having both pid and ppid makes it harder
68         #   for pid rollover to induce tracking mistakes
69         self._started = False
70         self._setup = False
71         self._setuper = None
72         self._pid = None
73         self._ppid = None
74
75         # Spanning tree deployment
76         self._master = None
77         self._master_passphrase = None
78         self._master_prk = None
79         self._master_puk = None
80         self._master_token = os.urandom(8).encode("hex")
81         self._build_pid = None
82         self._build_ppid = None
83         
84         # Logging
85         self._logger = logging.getLogger('nepi.testbeds.planetlab')
86         
87     
88     def __str__(self):
89         return "%s<%s>" % (
90             self.__class__.__name__,
91             ' '.join(filter(bool,(self.depends, self.sources)))
92         )
93     
94     def validate(self):
95         if self.home_path is None:
96             raise AssertionError, "Misconfigured application: missing home path"
97         if self.node.ident_path is None or not os.access(self.node.ident_path, os.R_OK):
98             raise AssertionError, "Misconfigured application: missing slice SSH key"
99         if self.node is None:
100             raise AssertionError, "Misconfigured application: unconnected node"
101         if self.node.hostname is None:
102             raise AssertionError, "Misconfigured application: misconfigured node"
103         if self.node.slicename is None:
104             raise AssertionError, "Misconfigured application: unspecified slice"
105     
106     def check_bad_host(self, out, err):
107         """
108         Called whenever an operation fails, it's given the output to be checked for
109         telltale signs of unhealthy hosts.
110         """
111         return False
112     
113     def remote_trace_path(self, whichtrace):
114         if whichtrace in self.TRACES:
115             tracefile = os.path.join(self.home_path, whichtrace)
116         else:
117             tracefile = None
118         
119         return tracefile
120
121     def remote_trace_name(self, whichtrace):
122         if whichtrace in self.TRACES:
123             return whichtrace
124         return None
125
126     def sync_trace(self, local_dir, whichtrace):
127         tracefile = self.remote_trace_path(whichtrace)
128         if not tracefile:
129             return None
130         
131         local_path = os.path.join(local_dir, tracefile)
132         
133         # create parent local folders
134         proc = subprocess.Popen(
135             ["mkdir", "-p", os.path.dirname(local_path)],
136             stdout = open("/dev/null","w"),
137             stdin = open("/dev/null","r"))
138
139         if proc.wait():
140             raise RuntimeError, "Failed to synchronize trace"
141         
142         # sync files
143         try:
144             self._popen_scp(
145                 '%s@%s:%s' % (self.node.slicename, self.node.hostname,
146                     tracefile),
147                 local_path
148                 )
149         except RuntimeError, e:
150             raise RuntimeError, "Failed to synchronize trace: %s %s" \
151                     % (e.args[0], e.args[1],)
152         
153         return local_path
154     
155     def recover(self):
156         # We assume a correct deployment, so recovery only
157         # means we mark this dependency as deployed
158         self._setup = True
159
160     def setup(self):
161         self._logger.info("Setting up %s", self)
162         self._make_home()
163         self._launch_build()
164         self._finish_build()
165         self._setup = True
166     
167     def async_setup(self):
168         if not self._setuper:
169             def setuper():
170                 try:
171                     self.setup()
172                 except:
173                     self._setuper._exc.append(sys.exc_info())
174             self._setuper = threading.Thread(
175                 target = setuper)
176             self._setuper._exc = []
177             self._setuper.start()
178     
179     def async_setup_wait(self):
180         if not self._setup:
181             self._logger.info("Waiting for %s to be setup", self)
182             if self._setuper:
183                 self._setuper.join()
184                 if not self._setup:
185                     if self._setuper._exc:
186                         exctyp,exval,exctrace = self._setuper._exc[0]
187                         raise exctyp,exval,exctrace
188                     else:
189                         raise RuntimeError, "Failed to setup application"
190                 else:
191                     self._logger.info("Setup ready: %s at %s", self, self.node.hostname)
192             else:
193                 self.setup()
194         
195     def _make_home(self):
196         # Make sure all the paths are created where 
197         # they have to be created for deployment
198         # sync files
199         try:
200             self._popen_ssh_command(
201                 "mkdir -p %(home)s && ( rm -f %(home)s/{pid,build-pid,nepi-build.sh} >/dev/null 2>&1 || /bin/true )" \
202                     % { 'home' : server.shell_escape(self.home_path) },
203                 timeout = 120,
204                 retry = 3
205                 )
206         except RuntimeError, e:
207             raise RuntimeError, "Failed to set up application %s: %s %s" % (self.home_path, e.args[0], e.args[1],)
208         
209         if self.stdin:
210             # Write program input
211             try:
212                 self._popen_scp(
213                     cStringIO.StringIO(self.stdin),
214                     '%s@%s:%s' % (self.node.slicename, self.node.hostname, 
215                         os.path.join(self.home_path, 'stdin') ),
216                     )
217             except RuntimeError, e:
218                 raise RuntimeError, "Failed to set up application %s: %s %s" \
219                         % (self.home_path, e.args[0], e.args[1],)
220
221     def _replace_paths(self, command):
222         """
223         Replace all special path tags with shell-escaped actual paths.
224         """
225         # need to append ${HOME} if paths aren't absolute, to MAKE them absolute.
226         root = '' if self.home_path.startswith('/') else "${HOME}/"
227         return ( command
228             .replace("${SOURCES}", root+server.shell_escape(self.home_path))
229             .replace("${BUILD}", root+server.shell_escape(os.path.join(self.home_path,'build'))) )
230
231     def _launch_build(self, trial=0):
232         if self._master is not None:
233             if not trial or self._master_prk is not None:
234                 self._do_install_keys()
235             buildscript = self._do_build_slave()
236         else:
237             buildscript = self._do_build_master()
238             
239         if buildscript is not None:
240             self._logger.info("Building %s at %s", self, self.node.hostname)
241             
242             # upload build script
243             try:
244                 self._popen_scp(
245                     buildscript,
246                     '%s@%s:%s' % (self.node.slicename, self.node.hostname, 
247                         os.path.join(self.home_path, 'nepi-build.sh') )
248                     )
249             except RuntimeError, e:
250                 raise RuntimeError, "Failed to set up application %s: %s %s" \
251                         % (self.home_path, e.args[0], e.args[1],)
252             
253             # launch build
254             self._do_launch_build()
255     
256     def _finish_build(self):
257         self._do_wait_build()
258         self._do_install()
259
    def _do_build_slave(self):
        """
        Generate the build script for a slave node.

        A slave does not build anything itself: the script it runs waits for
        the master node to finish and then copies the sources and the built
        archive from the master over scp.

        Returns a cStringIO object holding the shell script, or None when
        there is neither `sources` nor `build` (nothing to fetch).
        """
        if not self.sources and not self.build:
            return None
            
        # Create build script
        # Set of scp-style remote paths (user@host:path) to fetch from the master
        files = set()
        
        if self.sources:
            # Each source is looked up by base name inside the master's home
            sources = self.sources.split(' ')
            files.update(
                "%s@%s:%s" % (self._master.node.slicename, self._master.node.hostname, 
                    os.path.join(self._master.home_path, os.path.basename(source)),)
                for source in sources
            )
        
        if self.build:
            # The master packs its build folder as build.tar.gz
            files.add(
                "%s@%s:%s" % (self._master.node.slicename, self._master.node.hostname, 
                    os.path.join(self._master.home_path, 'build.tar.gz'),)
            )
        
        sshopts = "-o ConnectTimeout=30 -o ConnectionAttempts=3 -o ServerAliveInterval=30 -o TCPKeepAlive=yes"
        
        # Start an ssh-agent (its environment is saved in .ssh-agent.sh), add
        # the private key using a cat-based askpass helper that is fed the
        # passphrase from $NEPI_MASTER_PASSPHRASE, then delete the key files.
        # The key file names are the ones recorded by install_keys().
        launch_agent = "{ ( echo -e '#!/bin/sh\\ncat' > .ssh-askpass ) && chmod u+x .ssh-askpass"\
                        " && export SSH_ASKPASS=$(pwd)/.ssh-askpass "\
                        " && ssh-agent > .ssh-agent.sh ; } && . ./.ssh-agent.sh && ( echo $NEPI_MASTER_PASSPHRASE | ssh-add %(prk)s ) && rm -rf %(prk)s %(puk)s" %  \
        {
            'prk' : server.shell_escape(self._master_prk_name),
            'puk' : server.shell_escape(self._master_puk_name),
        }
        
        kill_agent = "kill $SSH_AGENT_PID"
        
        # 1. Check the master can be pinged and reached over ssh (exit 1 if not).
        # 2. Poll the master's build.token.retcode every 5 seconds until it
        #    holds the master's token.
        # 3. Check the master's build.token holds that token too; otherwise
        #    print BAD TOKEN and exit 1.
        waitmaster = (
            "{ "
            "echo 'Checking master reachability' ; "
            "if ping -c 3 %(master_host)s && (. ./.ssh-agent.sh > /dev/null ; ssh -o UserKnownHostsFile=%(hostkey)s %(sshopts)s %(master)s echo MASTER SAYS HI ) ; then "
            "echo 'Master node reachable' ; "
            "else "
            "echo 'MASTER NODE UNREACHABLE' && "
            "exit 1 ; "
            "fi ; "
            ". ./.ssh-agent.sh ; "
            "while [[ $(. ./.ssh-agent.sh > /dev/null ; ssh -q -o UserKnownHostsFile=%(hostkey)s %(sshopts)s %(master)s cat %(token_path)s.retcode || /bin/true) != %(token)s ]] ; do sleep 5 ; done ; "
            "if [[ $(. ./.ssh-agent.sh > /dev/null ; ssh -q -o UserKnownHostsFile=%(hostkey)s %(sshopts)s %(master)s cat %(token_path)s || /bin/true) != %(token)s ]] ; then echo BAD TOKEN ; exit 1 ; fi ; "
            "}" 
        ) % {
            'hostkey' : 'master_known_hosts',
            'master' : "%s@%s" % (self._master.node.slicename, self._master.node.hostname),
            'master_host' : self._master.node.hostname,
            'token_path' : os.path.join(self._master.home_path, 'build.token'),
            'token' : server.shell_escape(self._master._master_token),
            'sshopts' : sshopts,
        }
        
        # Copy the files from the master, unpack the build archive if there is
        # one, then write this slave's own token to build.token and
        # build.token.retcode.
        syncfiles = ". ./.ssh-agent.sh && scp -p -o UserKnownHostsFile=%(hostkey)s %(sshopts)s %(files)s ." % {
            'hostkey' : 'master_known_hosts',
            'files' : ' '.join(files),
            'sshopts' : sshopts,
        }
        if self.build:
            syncfiles += " && tar xzf build.tar.gz"
        syncfiles += " && ( echo %s > build.token )" % (server.shell_escape(self._master_token),)
        syncfiles += " && ( echo %s > build.token.retcode )" % (server.shell_escape(self._master_token),)
        syncfiles = "{ . ./.ssh-agent.sh ; %s ; }" % (syncfiles,)
        
        # Kill the agent and remove the keys, the known hosts file and the
        # askpass helper.
        cleanup = "{ . ./.ssh-agent.sh ; kill $SSH_AGENT_PID ; rm -rf %(prk)s %(puk)s master_known_hosts .ssh-askpass ; }" % {
            'prk' : server.shell_escape(self._master_prk_name),
            'puk' : server.shell_escape(self._master_puk_name),
        }
        
        # The cleanup step runs whether the chain succeeds or fails, and the
        # token is written to build.token.retcode at the very end either way.
        # NOTE: the 'home' key below is not referenced by the template.
        slavescript = "( ( %(launch_agent)s && %(waitmaster)s && %(syncfiles)s && %(kill_agent)s && %(cleanup)s ) || %(cleanup)s ) ; echo %(token)s > build.token.retcode" % {
            'waitmaster' : waitmaster,
            'syncfiles' : syncfiles,
            'cleanup' : cleanup,
            'kill_agent' : kill_agent,
            'launch_agent' : launch_agent,
            'home' : server.shell_escape(self.home_path),
            'token' : server.shell_escape(self._master_token),
        }
        
        return cStringIO.StringIO(slavescript)
342          
343     def _do_launch_build(self):
344         script = "bash ./nepi-build.sh"
345         if self._master_passphrase:
346             script = "NEPI_MASTER_PASSPHRASE=%s %s" % (
347                 server.shell_escape(self._master_passphrase),
348                 script
349             )
350         (out,err),proc = rspawn.remote_spawn(
351             script,
352             pidfile = 'build-pid',
353             home = self.home_path,
354             stdin = '/dev/null',
355             stdout = 'buildlog',
356             stderr = rspawn.STDOUT,
357             
358             host = self.node.hostname,
359             port = None,
360             user = self.node.slicename,
361             agent = None,
362             ident_key = self.node.ident_path,
363             server_key = self.node.server_key
364             )
365         
366         if proc.wait():
367             if self.check_bad_host(out, err):
368                 self.node.blacklist()
369             raise RuntimeError, "Failed to set up build slave %s: %s %s" % (self.home_path, out,err,)
370         
371         
372         pid = ppid = None
373         delay = 1.0
374         for i in xrange(5):
375             pidtuple = rspawn.remote_check_pid(
376                 os.path.join(self.home_path,'build-pid'),
377                 host = self.node.hostname,
378                 port = None,
379                 user = self.node.slicename,
380                 agent = None,
381                 ident_key = self.node.ident_path,
382                 server_key = self.node.server_key
383                 )
384             
385             if pidtuple:
386                 pid, ppid = pidtuple
387                 self._build_pid, self._build_ppid = pidtuple
388                 break
389             else:
390                 time.sleep(delay)
391                 delay = min(30,delay*1.2)
392         else:
393             raise RuntimeError, "Failed to set up build slave %s: cannot get pid" % (self.home_path,)
394
395         self._logger.info("Deploying %s at %s", self, self.node.hostname)
396         
397     def _do_wait_build(self, trial=0):
398         pid = self._build_pid
399         ppid = self._build_ppid
400         
401         if pid and ppid:
402             delay = 1.0
403             first = True
404             bustspin = 0
405             while True:
406                 status = rspawn.remote_status(
407                     pid, ppid,
408                     host = self.node.hostname,
409                     port = None,
410                     user = self.node.slicename,
411                     agent = None,
412                     ident_key = self.node.ident_path,
413                     server_key = self.node.server_key
414                     )
415                 
416                 if status is rspawn.FINISHED:
417                     self._build_pid = self._build_ppid = None
418                     break
419                 elif status is not rspawn.RUNNING:
420                     bustspin += 1
421                     time.sleep(delay*(5.5+random.random()))
422                     if bustspin > 12:
423                         self._build_pid = self._build_ppid = None
424                         break
425                 else:
426                     if first:
427                         self._logger.info("Waiting for %s to finish building at %s %s", self, self.node.hostname,
428                             "(build slave)" if self._master is not None else "(build master)")
429                         
430                         first = False
431                     time.sleep(delay*(0.5+random.random()))
432                     delay = min(30,delay*1.2)
433                     bustspin = 0
434             
435             # check build token
436             slave_token = ""
437             for i in xrange(3):
438                 (out, err), proc = self._popen_ssh_command(
439                     "cat %(token_path)s" % {
440                         'token_path' : os.path.join(self.home_path, 'build.token'),
441                     },
442                     timeout = 120,
443                     noerrors = True)
444                 if not proc.wait() and out:
445                     slave_token = out.strip()
446                 
447                 if slave_token:
448                     break
449                 else:
450                     time.sleep(2)
451             
452             if slave_token != self._master_token:
453                 # Get buildlog for the error message
454
455                 (buildlog, err), proc = self._popen_ssh_command(
456                     "cat %(buildlog)s" % {
457                         'buildlog' : os.path.join(self.home_path, 'buildlog'),
458                         'buildscript' : os.path.join(self.home_path, 'nepi-build.sh'),
459                     },
460                     timeout = 120,
461                     noerrors = True)
462                 
463                 proc.wait()
464                 
465                 if self.check_bad_host(buildlog, err):
466                     self.node.blacklist()
467                 elif self._master and trial < 3 and 'BAD TOKEN' in buildlog or 'BAD TOKEN' in err:
468                     # bad sync with master, may try again
469                     # but first wait for master
470                     self._master.async_setup_wait()
471                     self._launch_build(trial+1)
472                     self._do_wait_build(trial+1)
473                 else:
474                     # No longer need'em
475                     self._master_prk = None
476                     self._master_puk = None
477         
478                     raise RuntimeError, "Failed to set up application %s: "\
479                             "build failed, got wrong token from pid %s/%s "\
480                             "(expected %r, got %r), see buildlog at %s:\n%s" % (
481                         self.home_path, pid, ppid, self._master_token, slave_token, self.node.hostname, buildlog)
482
483             # No longer need'em
484             self._master_prk = None
485             self._master_puk = None
486         
487             self._logger.info("Built %s at %s", self, self.node.hostname)
488
489     def _do_kill_build(self):
490         pid = self._build_pid
491         ppid = self._build_ppid
492         
493         if pid and ppid:
494             self._logger.info("Killing build of %s", self)
495             rspawn.remote_kill(
496                 pid, ppid,
497                 host = self.node.hostname,
498                 port = None,
499                 user = self.node.slicename,
500                 agent = None,
501                 ident_key = self.node.ident_path
502                 )
503         
504         
505     def _do_build_master(self):
506         if not self.sources and not self.build and not self.buildDepends:
507             return None
508             
509         if self.sources:
510             sources = self.sources.split(' ')
511             
512             # Copy all sources
513             try:
514                 self._popen_scp(
515                     sources,
516                     "%s@%s:%s" % (self.node.slicename, self.node.hostname, 
517                         os.path.join(self.home_path,'.'),)
518                     )
519             except RuntimeError, e:
520                 raise RuntimeError, "Failed upload source file %r: %s %s" \
521                         % (sources, e.args[0], e.args[1],)
522             
523         buildscript = cStringIO.StringIO()
524         
525         buildscript.write("(\n")
526         
527         if self.buildDepends:
528             # Install build dependencies
529             buildscript.write(
530                 "sudo -S yum -y install %(packages)s\n" % {
531                     'packages' : self.buildDepends
532                 }
533             )
534         
535             
536         if self.build:
537             # Build sources
538             buildscript.write(
539                 "mkdir -p build && ( cd build && ( %(command)s ) )\n" % {
540                     'command' : self._replace_paths(self.build),
541                     'home' : server.shell_escape(self.home_path),
542                 }
543             )
544         
545             # Make archive
546             buildscript.write("tar czf build.tar.gz build\n")
547         
548         # Write token
549         buildscript.write("echo %(master_token)s > build.token ) ; echo %(master_token)s > build.token.retcode" % {
550             'master_token' : server.shell_escape(self._master_token)
551         })
552         
553         buildscript.seek(0)
554
555         return buildscript
556
557     def _do_install(self):
558         if self.install:
559             self._logger.info("Installing %s at %s", self, self.node.hostname)
560             
561             # Install application
562             try:
563                 self._popen_ssh_command(
564                     "cd %(home)s && cd build && ( %(command)s ) > ${HOME}/%(home)s/installlog 2>&1 || ( tail ${HOME}/%(home)s/{install,build}log >&2 && false )" % \
565                         {
566                         'command' : self._replace_paths(self.install),
567                         'home' : server.shell_escape(self.home_path),
568                         },
569                     )
570             except RuntimeError, e:
571                 if self.check_bad_host(e.args[0], e.args[1]):
572                     self.node.blacklist()
573                 raise RuntimeError, "Failed install build sources: %s %s" % (e.args[0], e.args[1],)
574
575     def set_master(self, master):
576         self._master = master
577         
578     def install_keys(self, prk, puk, passphrase):
579         # Install keys
580         self._master_passphrase = passphrase
581         self._master_prk = prk
582         self._master_puk = puk
583         self._master_prk_name = os.path.basename(prk.name)
584         self._master_puk_name = os.path.basename(puk.name)
585         
586     def _do_install_keys(self):
587         prk = self._master_prk
588         puk = self._master_puk
589        
590         try:
591             self._popen_scp(
592                 [ prk.name, puk.name ],
593                 '%s@%s:%s' % (self.node.slicename, self.node.hostname, self.home_path )
594                 )
595         except RuntimeError, e:
596             raise RuntimeError, "Failed to set up application deployment keys: %s %s" \
597                     % (e.args[0], e.args[1],)
598
599         try:
600             self._popen_scp(
601                 cStringIO.StringIO('%s,%s %s\n' % (
602                     self._master.node.hostname, socket.gethostbyname(self._master.node.hostname), 
603                     self._master.node.server_key)),
604                 '%s@%s:%s' % (self.node.slicename, self.node.hostname, 
605                     os.path.join(self.home_path,"master_known_hosts") )
606                 )
607         except RuntimeError, e:
608             raise RuntimeError, "Failed to set up application deployment keys: %s %s" \
609                     % (e.args[0], e.args[1],)
610         
611     
612     def cleanup(self):
613         # make sure there's no leftover build processes
614         self._do_kill_build()
615         
616         # No longer need'em
617         self._master_prk = None
618         self._master_puk = None
619
620     @server.eintr_retry
621     def _popen_scp(self, src, dst, retry = 3):
622         while 1:
623             try:
624                 (out,err),proc = server.popen_scp(
625                     src,
626                     dst, 
627                     port = None,
628                     agent = None,
629                     ident_key = self.node.ident_path,
630                     server_key = self.node.server_key
631                     )
632
633                 if server.eintr_retry(proc.wait)():
634                     raise RuntimeError, (out, err)
635                 return (out, err), proc
636             except:
637                 if retry <= 0:
638                     raise
639                 else:
640                     retry -= 1
641   
642
643     @server.eintr_retry
644     def _popen_ssh_command(self, command, retry = 0, noerrors=False, timeout=None):
645         (out,err),proc = server.popen_ssh_command(
646             command,
647             host = self.node.hostname,
648             port = None,
649             user = self.node.slicename,
650             agent = None,
651             ident_key = self.node.ident_path,
652             server_key = self.node.server_key,
653             timeout = timeout,
654             retry = retry
655             )
656
657         if server.eintr_retry(proc.wait)():
658             if not noerrors:
659                 raise RuntimeError, (out, err)
660         return (out, err), proc
661
class Application(Dependency):
    """
    An application is a dependency that additionally has a command to be
    run and monitored.
    
    It adds the output of that command as traces.
    """
    
    # Trace names accepted by remote_trace_path() and remote_trace_name()
    TRACES = ('stdout','stderr','buildlog', 'output')
670     
671     def __init__(self, api=None):
672         super(Application,self).__init__(api)
673         
674         # Attributes
675         self.command = None
676         self.sudo = False
677         
678         self.stdin = None
679         self.stdout = None
680         self.stderr = None
681         self.output = None
682         
683         # Those are filled when the app is started
684         #   Having both pid and ppid makes it harder
685         #   for pid rollover to induce tracking mistakes
686         self._started = False
687         self._pid = None
688         self._ppid = None
689
690         # Do not add to the python path of nodes
691         self.add_to_path = False
692     
693     def __str__(self):
694         return "%s<command:%s%s>" % (
695             self.__class__.__name__,
696             "sudo " if self.sudo else "",
697             self.command,
698         )
699     
700     def start(self):
701         self._logger.info("Starting %s", self)
702         
703         # Create shell script with the command
704         # This way, complex commands and scripts can be ran seamlessly
705         # sync files
706         command = cStringIO.StringIO()
707         command.write('export PYTHONPATH=$PYTHONPATH:%s\n' % (
708             ':'.join(["${HOME}/"+server.shell_escape(s) for s in self.node.pythonpath])
709         ))
710         command.write('export PATH=$PATH:%s\n' % (
711             ':'.join(["${HOME}/"+server.shell_escape(s) for s in self.node.pythonpath])
712         ))
713         if self.node.env:
714             for envkey, envvals in self.node.env.iteritems():
715                 for envval in envvals:
716                     command.write('export %s=%s\n' % (envkey, envval))
717         command.write(self.command)
718         command.seek(0)
719
720         try:
721             self._popen_scp(
722                 command,
723                 '%s@%s:%s' % (self.node.slicename, self.node.hostname, 
724                     os.path.join(self.home_path, "app.sh"))
725                 )
726         except RuntimeError, e:
727             raise RuntimeError, "Failed to set up application: %s %s" \
728                     % (e.args[0], e.args[1],)
729         
730         # Start process in a "daemonized" way, using nohup and heavy
731         # stdin/out redirection to avoid connection issues
732         (out,err),proc = rspawn.remote_spawn(
733             self._replace_paths("bash ./app.sh"),
734             
735             pidfile = './pid',
736             home = self.home_path,
737             stdin = 'stdin' if self.stdin is not None else '/dev/null',
738             stdout = 'stdout' if self.stdout else '/dev/null',
739             stderr = 'stderr' if self.stderr else '/dev/null',
740             sudo = self.sudo,
741             
742             host = self.node.hostname,
743             port = None,
744             user = self.node.slicename,
745             agent = None,
746             ident_key = self.node.ident_path,
747             server_key = self.node.server_key
748             )
749         
750         if proc.wait():
751             if self.check_bad_host(out, err):
752                 self.node.blacklist()
753             raise RuntimeError, "Failed to set up application: %s %s" % (out,err,)
754
755         self._started = True
756     
757     def recover(self):
758         # Assuming the application is running on PlanetLab,
759         # proper pidfiles should be present at the app's home path.
760         # So we mark this application as started, and check the pidfiles
761         self._started = True
762         self.checkpid()
763
764     def checkpid(self):            
765         # Get PID/PPID
766         # NOTE: wait a bit for the pidfile to be created
767         if self._started and not self._pid or not self._ppid:
768             pidtuple = rspawn.remote_check_pid(
769                 os.path.join(self.home_path,'pid'),
770                 host = self.node.hostname,
771                 port = None,
772                 user = self.node.slicename,
773                 agent = None,
774                 ident_key = self.node.ident_path,
775                 server_key = self.node.server_key
776                 )
777             
778             if pidtuple:
779                 self._pid, self._ppid = pidtuple
780     
781     def status(self):
782         self.checkpid()
783         if not self._started:
784             return AS.STATUS_NOT_STARTED
785         elif not self._pid or not self._ppid:
786             return AS.STATUS_NOT_STARTED
787         else:
788             status = rspawn.remote_status(
789                 self._pid, self._ppid,
790                 host = self.node.hostname,
791                 port = None,
792                 user = self.node.slicename,
793                 agent = None,
794                 ident_key = self.node.ident_path,
795                 server_key = self.node.server_key
796                 )
797             
798             if status is rspawn.NOT_STARTED:
799                 return AS.STATUS_NOT_STARTED
800             elif status is rspawn.RUNNING:
801                 return AS.STATUS_RUNNING
802             elif status is rspawn.FINISHED:
803                 return AS.STATUS_FINISHED
804             else:
805                 # WTF?
806                 return AS.STATUS_NOT_STARTED
807     
808     def kill(self):
809         status = self.status()
810         if status == AS.STATUS_RUNNING:
811             # kill by ppid+pid - SIGTERM first, then try SIGKILL
812             rspawn.remote_kill(
813                 self._pid, self._ppid,
814                 host = self.node.hostname,
815                 port = None,
816                 user = self.node.slicename,
817                 agent = None,
818                 ident_key = self.node.ident_path,
819                 server_key = self.node.server_key,
820                 sudo = self.sudo
821                 )
822             self._logger.info("Killed %s", self)
823
824
class NepiDependency(Dependency):
    """
    This dependency adds nepi itself to the python path,
    so that you may run testbeds within PL nodes.
    
    The local nepi package is packed into a temporary tarball, which is
    used as the dependency's sources. The build step moves the tarball
    into place and the install step unpacks it.
    """
    
    # Class attribute holding a *weak* reference to the shared NEPI tar file
    # so that they may share it. Don't operate on the file itself, it would
    # be a mess, just use its path.
    _shared_nepi_tar = None
    
    def __init__(self, api = None):
        super(NepiDependency, self).__init__(api)
        
        self._tarball = None
        
        self.depends = 'python python-ipaddr python-setuptools'
        
        # our sources are in our ad-hoc tarball
        self.sources = self.tarball.name
        
        tarname = os.path.basename(self.tarball.name)
        
        # it's already built - just move the tarball into place
        self.build = "mv -f ${SOURCES}/%s ." % (tarname,)
        
        # unpack it into sources, and we're done
        self.install = "tar xzf ${BUILD}/%s -C .." % (tarname,)
    
    @property
    def tarball(self):
        """
        Temporary file object holding the gzipped tarball of the nepi sources.
        
        The tarball is created on first access, unless another instance
        has created one that is still alive, in which case that one is
        reused. Each instance keeps a strong reference to the file it
        uses; the class only keeps a weak one.
        
        Raises RuntimeError if the tar command exits with a nonzero status.
        """
        if self._tarball is None:
            # Dereference the class-level weak reference, if there is one
            shared_tar = self._shared_nepi_tar and self._shared_nepi_tar()
            if shared_tar is not None:
                self._tarball = shared_tar
            else:
                # Build an ad-hoc tarball
                # Prebuilt
                import nepi
                import tempfile
                import weakref
                
                shared_tar = tempfile.NamedTemporaryFile(prefix='nepi-src-', suffix='.tar.gz')
                
                # Make sure the /dev/null handles don't outlive the tar process
                devnull_out = open("/dev/null","w")
                try:
                    devnull_in = open("/dev/null","r")
                    try:
                        proc = subprocess.Popen(
                            ["tar", "czf", shared_tar.name, 
                                '-C', os.path.join(os.path.dirname(os.path.dirname(nepi.__file__)),'.'), 
                                'nepi'],
                            stdout = devnull_out,
                            stdin = devnull_in)
                        
                        retcode = proc.wait()
                    finally:
                        devnull_in.close()
                finally:
                    devnull_out.close()
                
                if retcode:
                    raise RuntimeError("Failed to create nepi tarball")
                
                self._tarball = shared_tar
                
                # Publish it on the class (not on the instance), and only
                # weakly, so that other instances can find it while the
                # instances using it remain the only strong holders.
                NepiDependency._shared_nepi_tar = weakref.ref(shared_tar)
                
        return self._tarball
881
class NS3Dependency(Dependency):
    """
    This dependency adds NS3 libraries to the library paths,
    so that you may run the NS3 testbed within PL nodes.
    
    You'll also need the NepiDependency.
    
    Both the build and install commands first check whether a working
    installation is already present, and only do the actual work when
    that check fails.
    """
    
    def __init__(self, api = None):
        super(NS3Dependency, self).__init__(api)
        
        self.buildDepends = 'make waf gcc gcc-c++ gccxml unzip'
        
        # We have to download the sources, untar, build...
        pybindgen_source_url = "http://yans.pl.sophia.inria.fr/trac/nepi/raw-attachment/wiki/WikiStart/pybindgen-r794.tar.gz"
        pygccxml_source_url = "http://leaseweb.dl.sourceforge.net/project/pygccxml/pygccxml/pygccxml-1.0/pygccxml-1.0.0.zip"
        ns3_source_url = "http://yans.pl.sophia.inria.fr/code/hgwebdir.cgi/ns-3.11-nepi/archive/tip.tar.gz"
        passfd_source_url = "http://yans.pl.sophia.inria.fr/code/hgwebdir.cgi/python-passfd/archive/tip.tar.gz"
        self.build =(
            " ( "
            "  cd .. && "
            "  python -c 'import pygccxml, pybindgen, passfd' && "
            "  test -f lib/ns/_core.so && "
            "  test -f lib/ns/__init__.py && "
            "  test -f lib/ns/core.py && "
            "  test -f lib/libns3-core.so && "
            "  LD_LIBRARY_PATH=lib PYTHONPATH=lib python -c 'import ns.core' "
            " ) || ( "
                # Not working, rebuild
                     # Archive SHA1 sums to check
                     "echo '7158877faff2254e6c094bf18e6b4283cac19137  pygccxml-1.0.0.zip' > archive_sums.txt && "
                     "echo 'a18c2ccffd0df517bc37e2f3a2475092517c43f2  pybindgen-src.tar.gz' >> archive_sums.txt && "
                     " ( " # check existing files
                     " sha1sum -c archive_sums.txt && "
                     " test -f passfd-src.tar.gz && "
                     " test -f ns3-src.tar.gz "
                     " ) || ( " # nope? re-download
                     # pybindgen is fetched as pybindgen-src.tar.gz, so that is
                     # the name that must be removed for a corrupt copy to be
                     # replaced (the old .zip name is still cleaned up too)
                     " rm -f pybindgen-src.zip pybindgen-src.tar.gz pygccxml-1.0.0.zip passfd-src.tar.gz ns3-src.tar.gz && "
                     " wget -q -c -O pybindgen-src.tar.gz %(pybindgen_source_url)s && " # continue, to exploit the case when it has already been dl'ed
                     " wget -q -c -O pygccxml-1.0.0.zip %(pygccxml_source_url)s && " 
                     " wget -q -c -O passfd-src.tar.gz %(passfd_source_url)s && "
                     " wget -q -c -O ns3-src.tar.gz %(ns3_source_url)s && "  
                     " sha1sum -c archive_sums.txt " # Check SHA1 sums when applicable
                     " ) && "
                     "unzip -n pygccxml-1.0.0.zip && "
                     "mkdir -p pybindgen-src && "
                     "mkdir -p ns3-src && "
                     "mkdir -p passfd-src && "
                     "tar xzf ns3-src.tar.gz --strip-components=1 -C ns3-src && "
                     "tar xzf passfd-src.tar.gz --strip-components=1 -C passfd-src && "
                     "tar xzf pybindgen-src.tar.gz --strip-components=1 -C pybindgen-src && "
                     "rm -rf target && "    # mv doesn't like unclean targets
                     "mkdir -p target && "
                     "cd pygccxml-1.0.0 && "
                     "rm -rf unittests docs && " # pygccxml has ~100M of unit tests - excessive - docs aren't needed either
                     "python setup.py build && "
                     "python setup.py install --install-lib ${BUILD}/target && "
                     "python setup.py clean && "
                     "cd ../pybindgen-src && "
                     "export PYTHONPATH=$PYTHONPATH:${BUILD}/target && "
                     "./waf configure --prefix=${BUILD}/target -d release && "
                     "./waf && "
                     "./waf install && "
                     "./waf clean && "
                     "mv -f ${BUILD}/target/lib/python*/site-packages/pybindgen ${BUILD}/target/. && "
                     "rm -rf ${BUILD}/target/lib && "
                     "cd ../passfd-src && "
                     "python setup.py build && "
                     "python setup.py install --install-lib ${BUILD}/target && "
                     "python setup.py clean && "
                     "cd ../ns3-src && "
                     "./waf configure --prefix=${BUILD}/target --with-pybindgen=../pybindgen-src -d release --disable-examples --disable-tests && "
                     "./waf &&"
                     "./waf install && "
                     "rm -f ${BUILD}/target/lib/*.so && "
                     "cp -a ${BUILD}/ns3-src/build/release/libns3*.so ${BUILD}/target/lib && "
                     "cp -a ${BUILD}/ns3-src/build/release/bindings/python/ns ${BUILD}/target/lib &&"
                     "./waf clean "
             " )"
                     % dict(
                        pybindgen_source_url = server.shell_escape(pybindgen_source_url),
                        pygccxml_source_url = server.shell_escape(pygccxml_source_url),
                        ns3_source_url = server.shell_escape(ns3_source_url),
                        passfd_source_url = server.shell_escape(passfd_source_url),
                     ))
        
        # Just move ${BUILD}/target
        self.install = (
            " ( "
            "  cd .. && "
            "  python -c 'import pygccxml, pybindgen, passfd' && "
            "  test -f lib/ns/_core.so && "
            "  test -f lib/ns/__init__.py && "
            "  test -f lib/ns/core.py && "
            "  test -f lib/libns3-core.so && "
            "  LD_LIBRARY_PATH=lib PYTHONPATH=lib python -c 'import ns.core' "
            " ) || ( "
                # Not working, reinstall
                    "test -d ${BUILD}/target && "
                    "[[ \"x\" != \"x$(find ${BUILD}/target -mindepth 1 -print -quit)\" ]] &&"
                    "( for i in ${BUILD}/target/* ; do rm -rf ${SOURCES}/${i##*/} ; done ) && " # mv doesn't like unclean targets
                    "mv -f ${BUILD}/target/* ${SOURCES}"
            " )"
        )
        
        # Set extra environment paths
        self.env['NEPI_NS3BINDINGS'] = "${SOURCES}/lib"
        self.env['NEPI_NS3LIBRARY'] = "${SOURCES}/lib"
    
    @property
    def tarball(self):
        """
        Temporary file object holding a gzipped tarball of the nepi sources.
        
        Raises RuntimeError if the tar command exits with a nonzero status.
        """
        # NOTE(review): this looks like a verbatim copy of
        #   NepiDependency.tarball. Nothing in this class sets _tarball
        #   or _shared_nepi_tar - confirm whether the base class does, or
        #   whether this property is dead code that can go away.
        if self._tarball is None:
            shared_tar = self._shared_nepi_tar and self._shared_nepi_tar()
            if shared_tar is not None:
                self._tarball = shared_tar
            else:
                # Build an ad-hoc tarball
                # Prebuilt
                import nepi
                import tempfile
                
                shared_tar = tempfile.NamedTemporaryFile(prefix='nepi-src-', suffix='.tar.gz')
                
                proc = subprocess.Popen(
                    ["tar", "czf", shared_tar.name, 
                        '-C', os.path.join(os.path.dirname(os.path.dirname(nepi.__file__)),'.'), 
                        'nepi'],
                    stdout = open("/dev/null","w"),
                    stdin = open("/dev/null","r"))

                if proc.wait():
                    raise RuntimeError("Failed to create nepi tarball")
                
                self._tarball = self._shared_nepi_tar = shared_tar
                
        return self._tarball
1018
1019 class YumDependency(Dependency):
1020     """
1021     This dependency is an internal helper class used to
1022     efficiently distribute yum-downloaded rpms.
1023     
1024     It temporarily sets the yum cache as persistent in the
1025     build master, and installs all the required packages.
1026     
1027     The rpm packages left in the yum cache are gathered and
1028     distributed by the underlying Dependency in an efficient
1029     manner. Build slaves will then install those rpms back in
1030     the cache before issuing the install command.
1031     
1032     When packages have been installed already, nothing but an
1033     empty tar is distributed.
1034     """
1035     
1036     # Class attribute holding a *weak* reference to the shared NEPI tar file
1037     # so that they may share it. Don't operate on the file itself, it would
1038     # be a mess, just use its path.
1039     _shared_nepi_tar = None
1040     
1041     def _build_get(self):
1042         # canonical representation of dependencies
1043         depends = ' '.join( sorted( (self.depends or "").split(' ') ) )
1044         
1045         # download rpms and pack into a tar archive
1046         return (
1047             "sudo -S nice yum -y makecache && "
1048             "sudo -S sed -i -r 's/keepcache *= *0/keepcache=1/' /etc/yum.conf && "
1049             " ( ( "
1050                 "sudo -S nice yum -y install %s ; "
1051                 "rm -f ${BUILD}/packages.tar ; "
1052                 "tar -C /var/cache/yum -rf ${BUILD}/packages.tar $(cd /var/cache/yum ; find -iname '*.rpm')"
1053             " ) || /bin/true ) && "
1054             "sudo -S sed -i -r 's/keepcache *= *1/keepcache=0/' /etc/yum.conf && "
1055             "( sudo -S nice yum -y clean packages || /bin/true ) "
1056         ) % ( depends, )
1057     def _build_set(self, value):
1058         # ignore
1059         return
1060     build = property(_build_get, _build_set)
1061     
1062     def _install_get(self):
1063         # canonical representation of dependencies
1064         depends = ' '.join( sorted( (self.depends or "").split(' ') ) )
1065         
1066         # unpack cached rpms into yum cache, install, and cleanup
1067         return (
1068             "sudo -S tar -k --keep-newer-files -C /var/cache/yum -xf packages.tar && "
1069             "sudo -S nice yum -y install %s && "
1070             "( sudo -S nice yum -y clean packages || /bin/true ) "
1071         ) % ( depends, )
1072     def _install_set(self, value):
1073         # ignore
1074         return
1075     install = property(_install_get, _install_set)
1076         
1077     def check_bad_host(self, out, err):
1078         badre = re.compile(r'(?:'
1079                            r'The GPG keys listed for the ".*" repository are already installed but they are not correct for this package'
1080                            r'|Error: Cannot retrieve repository metadata (repomd.xml) for repository: .*[.] Please verify its path and try again'
1081                            r'|Error: disk I/O error'
1082                            r'|MASTER NODE UNREACHABLE'
1083                            r')', 
1084                            re.I)
1085         return badre.search(out) or badre.search(err)