Retry more - retry retry retry
[nepi.git] / src / nepi / testbeds / planetlab / application.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from constants import TESTBED_ID
5 import plcapi
6 import operator
7 import os
8 import os.path
9 import sys
10 import nepi.util.server as server
11 import cStringIO
12 import subprocess
13 import rspawn
14 import random
15 import time
16 import socket
17 import threading
18 import logging
19 import re
20
21 from nepi.util.constants import ApplicationStatus as AS
22
class Dependency(object):
    """
    A Dependency is in every respect like an application.
    
    It depends on some packages, it may require building binaries, it must deploy
    them...
    
    But it has no command. Dependencies aren't ever started, or stopped, and have
    no status.
    """

    # Names of the traces this kind of element produces.
    # The trailing comma is required: without it this is just the string
    # 'buildlog', and ``whichtrace in self.TRACES`` silently turns into a
    # substring test (accepting e.g. 'build' or 'log').
    TRACES = ('buildlog',)
35
36     def __init__(self, api=None):
37         if not api:
38             api = plcapi.PLCAPI()
39         self._api = api
40         
41         # Attributes
42         self.command = None
43         self.sudo = False
44         
45         self.build = None
46         self.install = None
47         self.depends = None
48         self.buildDepends = None
49         self.sources = None
50         self.rpmFusion = False
51         self.env = {}
52         
53         self.stdin = None
54         self.stdout = None
55         self.stderr = None
56         self.buildlog = None
57         
58         self.add_to_path = True
59         
60         # Those are filled when the app is configured
61         self.home_path = None
62         
63         # Those are filled when an actual node is connected
64         self.node = None
65         
66         # Those are filled when the app is started
67         #   Having both pid and ppid makes it harder
68         #   for pid rollover to induce tracking mistakes
69         self._started = False
70         self._setup = False
71         self._setuper = None
72         self._pid = None
73         self._ppid = None
74
75         # Spanning tree deployment
76         self._master = None
77         self._master_passphrase = None
78         self._master_prk = None
79         self._master_puk = None
80         self._master_token = os.urandom(8).encode("hex")
81         self._build_pid = None
82         self._build_ppid = None
83         
84         # Logging
85         self._logger = logging.getLogger('nepi.testbeds.planetlab')
86         
87     
88     def __str__(self):
89         return "%s<%s>" % (
90             self.__class__.__name__,
91             ' '.join(filter(bool,(self.depends, self.sources)))
92         )
93     
94     def validate(self):
95         if self.home_path is None:
96             raise AssertionError, "Misconfigured application: missing home path"
97         if self.node.ident_path is None or not os.access(self.node.ident_path, os.R_OK):
98             raise AssertionError, "Misconfigured application: missing slice SSH key"
99         if self.node is None:
100             raise AssertionError, "Misconfigured application: unconnected node"
101         if self.node.hostname is None:
102             raise AssertionError, "Misconfigured application: misconfigured node"
103         if self.node.slicename is None:
104             raise AssertionError, "Misconfigured application: unspecified slice"
105     
106     def check_bad_host(self, out, err):
107         """
108         Called whenever an operation fails, it's given the output to be checked for
109         telltale signs of unhealthy hosts.
110         """
111         return False
112     
113     def remote_trace_path(self, whichtrace):
114         if whichtrace in self.TRACES:
115             tracefile = os.path.join(self.home_path, whichtrace)
116         else:
117             tracefile = None
118         
119         return tracefile
120
121     def remote_trace_name(self, whichtrace):
122         if whichtrace in self.TRACES:
123             return whichtrace
124         return None
125
126     def sync_trace(self, local_dir, whichtrace):
127         tracefile = self.remote_trace_path(whichtrace)
128         if not tracefile:
129             return None
130         
131         local_path = os.path.join(local_dir, tracefile)
132         
133         # create parent local folders
134         proc = subprocess.Popen(
135             ["mkdir", "-p", os.path.dirname(local_path)],
136             stdout = open("/dev/null","w"),
137             stdin = open("/dev/null","r"))
138
139         if proc.wait():
140             raise RuntimeError, "Failed to synchronize trace"
141         
142         # sync files
143         try:
144             self._popen_scp(
145                 '%s@%s:%s' % (self.node.slicename, self.node.hostname,
146                     tracefile),
147                 local_path
148                 )
149         except RuntimeError, e:
150             raise RuntimeError, "Failed to synchronize trace: %s %s" \
151                     % (e.args[0], e.args[1],)
152         
153         return local_path
154     
155     def recover(self):
156         # We assume a correct deployment, so recovery only
157         # means we mark this dependency as deployed
158         self._setup = True
159
160     def setup(self):
161         self._logger.info("Setting up %s", self)
162         self._make_home()
163         self._launch_build()
164         self._finish_build()
165         self._setup = True
166     
167     def async_setup(self):
168         if not self._setuper:
169             def setuper():
170                 try:
171                     self.setup()
172                 except:
173                     self._setuper._exc.append(sys.exc_info())
174             self._setuper = threading.Thread(
175                 target = setuper)
176             self._setuper._exc = []
177             self._setuper.start()
178     
179     def async_setup_wait(self):
180         if not self._setup:
181             self._logger.info("Waiting for %s to be setup", self)
182             if self._setuper:
183                 self._setuper.join()
184                 if not self._setup:
185                     if self._setuper._exc:
186                         exctyp,exval,exctrace = self._setuper._exc[0]
187                         raise exctyp,exval,exctrace
188                     else:
189                         raise RuntimeError, "Failed to setup application"
190                 else:
191                     self._logger.info("Setup ready: %s at %s", self, self.node.hostname)
192             else:
193                 self.setup()
194         
195     def _make_home(self):
196         # Make sure all the paths are created where 
197         # they have to be created for deployment
198         # sync files
199         try:
200             self._popen_ssh_command(
201                 "mkdir -p %(home)s && ( rm -f %(home)s/{pid,build-pid,nepi-build.sh} >/dev/null 2>&1 || /bin/true )" \
202                     % { 'home' : server.shell_escape(self.home_path) },
203                 timeout = 120,
204                 retry = 3
205                 )
206         except RuntimeError, e:
207             raise RuntimeError, "Failed to set up application %s: %s %s" % (self.home_path, e.args[0], e.args[1],)
208         
209         if self.stdin:
210             # Write program input
211             try:
212                 self._popen_scp(
213                     cStringIO.StringIO(self.stdin),
214                     '%s@%s:%s' % (self.node.slicename, self.node.hostname, 
215                         os.path.join(self.home_path, 'stdin') ),
216                     )
217             except RuntimeError, e:
218                 raise RuntimeError, "Failed to set up application %s: %s %s" \
219                         % (self.home_path, e.args[0], e.args[1],)
220
221     def _replace_paths(self, command):
222         """
223         Replace all special path tags with shell-escaped actual paths.
224         """
225         # need to append ${HOME} if paths aren't absolute, to MAKE them absolute.
226         root = '' if self.home_path.startswith('/') else "${HOME}/"
227         return ( command
228             .replace("${SOURCES}", root+server.shell_escape(self.home_path))
229             .replace("${BUILD}", root+server.shell_escape(os.path.join(self.home_path,'build'))) )
230
231     def _launch_build(self, trial=0):
232         if self._master is not None:
233             if not trial or self._master_prk is not None:
234                 self._do_install_keys()
235             buildscript = self._do_build_slave()
236         else:
237             buildscript = self._do_build_master()
238             
239         if buildscript is not None:
240             self._logger.info("Building %s at %s", self, self.node.hostname)
241             
242             # upload build script
243             try:
244                 self._popen_scp(
245                     buildscript,
246                     '%s@%s:%s' % (self.node.slicename, self.node.hostname, 
247                         os.path.join(self.home_path, 'nepi-build.sh') )
248                     )
249             except RuntimeError, e:
250                 raise RuntimeError, "Failed to set up application %s: %s %s" \
251                         % (self.home_path, e.args[0], e.args[1],)
252             
253             # launch build
254             self._do_launch_build()
255     
256     def _finish_build(self):
257         self._do_wait_build()
258         self._do_install()
259
260     def _do_build_slave(self):
261         if not self.sources and not self.build:
262             return None
263             
264         # Create build script
265         files = set()
266         
267         if self.sources:
268             sources = self.sources.split(' ')
269             files.update(
270                 "%s@%s:%s" % (self._master.node.slicename, self._master.node.hostip, 
271                     os.path.join(self._master.home_path, os.path.basename(source)),)
272                 for source in sources
273             )
274         
275         if self.build:
276             files.add(
277                 "%s@%s:%s" % (self._master.node.slicename, self._master.node.hostip, 
278                     os.path.join(self._master.home_path, 'build.tar.gz'),)
279             )
280         
281         sshopts = "-o ConnectTimeout=30 -o ConnectionAttempts=3 -o ServerAliveInterval=30 -o TCPKeepAlive=yes"
282         
283         launch_agent = "{ ( echo -e '#!/bin/sh\\ncat' > .ssh-askpass ) && chmod u+x .ssh-askpass"\
284                         " && export SSH_ASKPASS=$(pwd)/.ssh-askpass "\
285                         " && ssh-agent > .ssh-agent.sh ; } && . ./.ssh-agent.sh && ( echo $NEPI_MASTER_PASSPHRASE | ssh-add %(prk)s ) && rm -rf %(prk)s %(puk)s" %  \
286         {
287             'prk' : server.shell_escape(self._master_prk_name),
288             'puk' : server.shell_escape(self._master_puk_name),
289         }
290         
291         kill_agent = "kill $SSH_AGENT_PID"
292         
293         waitmaster = (
294             "{ "
295             "echo 'Checking master reachability' ; "
296             "if ping -c 3 %(master_host)s && (. ./.ssh-agent.sh > /dev/null ; ssh -o UserKnownHostsFile=%(hostkey)s %(sshopts)s %(master)s echo MASTER SAYS HI ) ; then "
297             "echo 'Master node reachable' ; "
298             "else "
299             "echo 'MASTER NODE UNREACHABLE' && "
300             "exit 1 ; "
301             "fi ; "
302             ". ./.ssh-agent.sh ; "
303             "while [[ $(. ./.ssh-agent.sh > /dev/null ; ssh -q -o UserKnownHostsFile=%(hostkey)s %(sshopts)s %(master)s cat %(token_path)s.retcode || /bin/true) != %(token)s ]] ; do sleep 5 ; done ; "
304             "if [[ $(. ./.ssh-agent.sh > /dev/null ; ssh -q -o UserKnownHostsFile=%(hostkey)s %(sshopts)s %(master)s cat %(token_path)s || /bin/true) != %(token)s ]] ; then echo BAD TOKEN ; exit 1 ; fi ; "
305             "}" 
306         ) % {
307             'hostkey' : 'master_known_hosts',
308             'master' : "%s@%s" % (self._master.node.slicename, self._master.node.hostip),
309             'master_host' : self._master.node.hostip,
310             'token_path' : os.path.join(self._master.home_path, 'build.token'),
311             'token' : server.shell_escape(self._master._master_token),
312             'sshopts' : sshopts,
313         }
314         
315         syncfiles = ". ./.ssh-agent.sh && scp -p -o UserKnownHostsFile=%(hostkey)s %(sshopts)s %(files)s ." % {
316             'hostkey' : 'master_known_hosts',
317             'files' : ' '.join(files),
318             'sshopts' : sshopts,
319         }
320         if self.build:
321             syncfiles += " && tar xzf build.tar.gz"
322         syncfiles += " && ( echo %s > build.token )" % (server.shell_escape(self._master_token),)
323         syncfiles += " && ( echo %s > build.token.retcode )" % (server.shell_escape(self._master_token),)
324         syncfiles = "{ . ./.ssh-agent.sh ; %s ; }" % (syncfiles,)
325         
326         cleanup = "{ . ./.ssh-agent.sh ; kill $SSH_AGENT_PID ; rm -rf %(prk)s %(puk)s master_known_hosts .ssh-askpass ; }" % {
327             'prk' : server.shell_escape(self._master_prk_name),
328             'puk' : server.shell_escape(self._master_puk_name),
329         }
330         
331         slavescript = "( ( %(launch_agent)s && %(waitmaster)s && %(syncfiles)s && %(kill_agent)s && %(cleanup)s ) || %(cleanup)s ) ; echo %(token)s > build.token.retcode" % {
332             'waitmaster' : waitmaster,
333             'syncfiles' : syncfiles,
334             'cleanup' : cleanup,
335             'kill_agent' : kill_agent,
336             'launch_agent' : launch_agent,
337             'home' : server.shell_escape(self.home_path),
338             'token' : server.shell_escape(self._master_token),
339         }
340         
341         return cStringIO.StringIO(slavescript)
342          
343     def _do_launch_build(self):
344         script = "bash ./nepi-build.sh"
345         if self._master_passphrase:
346             script = "NEPI_MASTER_PASSPHRASE=%s %s" % (
347                 server.shell_escape(self._master_passphrase),
348                 script
349             )
350         (out,err),proc = rspawn.remote_spawn(
351             script,
352             pidfile = 'build-pid',
353             home = self.home_path,
354             stdin = '/dev/null',
355             stdout = 'buildlog',
356             stderr = rspawn.STDOUT,
357             
358             host = self.node.hostname,
359             port = None,
360             user = self.node.slicename,
361             agent = None,
362             ident_key = self.node.ident_path,
363             server_key = self.node.server_key,
364             hostip = self.node.hostip,
365             )
366         
367         if proc.wait():
368             if self.check_bad_host(out, err):
369                 self.node.blacklist()
370             raise RuntimeError, "Failed to set up build slave %s: %s %s" % (self.home_path, out,err,)
371         
372         
373         pid = ppid = None
374         delay = 1.0
375         for i in xrange(5):
376             pidtuple = rspawn.remote_check_pid(
377                 os.path.join(self.home_path,'build-pid'),
378                 host = self.node.hostname,
379                 port = None,
380                 user = self.node.slicename,
381                 agent = None,
382                 ident_key = self.node.ident_path,
383                 server_key = self.node.server_key,
384                 hostip = self.node.hostip
385                 )
386             
387             if pidtuple:
388                 pid, ppid = pidtuple
389                 self._build_pid, self._build_ppid = pidtuple
390                 break
391             else:
392                 time.sleep(delay)
393                 delay = min(30,delay*1.2)
394         else:
395             raise RuntimeError, "Failed to set up build slave %s: cannot get pid" % (self.home_path,)
396
397         self._logger.info("Deploying %s at %s", self, self.node.hostname)
398         
399     def _do_wait_build(self, trial=0):
400         pid = self._build_pid
401         ppid = self._build_ppid
402         
403         if pid and ppid:
404             delay = 1.0
405             first = True
406             bustspin = 0
407             while True:
408                 status = rspawn.remote_status(
409                     pid, ppid,
410                     host = self.node.hostname,
411                     port = None,
412                     user = self.node.slicename,
413                     agent = None,
414                     ident_key = self.node.ident_path,
415                     server_key = self.node.server_key,
416                     hostip = self.node.hostip
417                     )
418                 
419                 if status is rspawn.FINISHED:
420                     self._build_pid = self._build_ppid = None
421                     break
422                 elif status is not rspawn.RUNNING:
423                     self._logger.warn("Busted waiting for %s to finish building at %s %s", self, self.node.hostname,
424                             "(build slave)" if self._master is not None else "(build master)")
425                     bustspin += 1
426                     time.sleep(delay*(5.5+random.random()))
427                     if bustspin > 12:
428                         self._build_pid = self._build_ppid = None
429                         break
430                 else:
431                     if first:
432                         self._logger.info("Waiting for %s to finish building at %s %s", self, self.node.hostname,
433                             "(build slave)" if self._master is not None else "(build master)")
434                         
435                         first = False
436                     time.sleep(delay*(0.5+random.random()))
437                     delay = min(30,delay*1.2)
438                     bustspin = 0
439             
440             # check build token
441             slave_token = ""
442             for i in xrange(3):
443                 (out, err), proc = self._popen_ssh_command(
444                     "cat %(token_path)s" % {
445                         'token_path' : os.path.join(self.home_path, 'build.token'),
446                     },
447                     timeout = 120,
448                     noerrors = True)
449                 if not proc.wait() and out:
450                     slave_token = out.strip()
451                 
452                 if slave_token:
453                     break
454                 else:
455                     time.sleep(2)
456             
457             if slave_token != self._master_token:
458                 # Get buildlog for the error message
459
460                 (buildlog, err), proc = self._popen_ssh_command(
461                     "cat %(buildlog)s" % {
462                         'buildlog' : os.path.join(self.home_path, 'buildlog'),
463                         'buildscript' : os.path.join(self.home_path, 'nepi-build.sh'),
464                     },
465                     timeout = 120,
466                     noerrors = True)
467                 
468                 proc.wait()
469                 
470                 if self.check_bad_host(buildlog, err):
471                     self.node.blacklist()
472                 elif self._master and trial < 3 and 'BAD TOKEN' in buildlog or 'BAD TOKEN' in err:
473                     # bad sync with master, may try again
474                     # but first wait for master
475                     self._master.async_setup_wait()
476                     self._launch_build(trial+1)
477                     return self._do_wait_build(trial+1)
478                 elif trial < 3:
479                     return self._do_wait_build(trial+1)
480                 else:
481                     # No longer need'em
482                     self._master_prk = None
483                     self._master_puk = None
484         
485                     raise RuntimeError, "Failed to set up application %s: "\
486                             "build failed, got wrong token from pid %s/%s "\
487                             "(expected %r, got %r), see buildlog at %s:\n%s" % (
488                         self.home_path, pid, ppid, self._master_token, slave_token, self.node.hostname, buildlog)
489
490             # No longer need'em
491             self._master_prk = None
492             self._master_puk = None
493         
494             self._logger.info("Built %s at %s", self, self.node.hostname)
495
496     def _do_kill_build(self):
497         pid = self._build_pid
498         ppid = self._build_ppid
499         
500         if pid and ppid:
501             self._logger.info("Killing build of %s", self)
502             rspawn.remote_kill(
503                 pid, ppid,
504                 host = self.node.hostname,
505                 port = None,
506                 user = self.node.slicename,
507                 agent = None,
508                 ident_key = self.node.ident_path,
509                 hostip = self.node.hostip
510                 )
511         
512         
513     def _do_build_master(self):
514         if not self.sources and not self.build and not self.buildDepends:
515             return None
516             
517         if self.sources:
518             sources = self.sources.split(' ')
519             
520             # Copy all sources
521             try:
522                 self._popen_scp(
523                     sources,
524                     "%s@%s:%s" % (self.node.slicename, self.node.hostname, 
525                         os.path.join(self.home_path,'.'),)
526                     )
527             except RuntimeError, e:
528                 raise RuntimeError, "Failed upload source file %r: %s %s" \
529                         % (sources, e.args[0], e.args[1],)
530             
531         buildscript = cStringIO.StringIO()
532         
533         buildscript.write("(\n")
534         
535         if self.buildDepends:
536             # Install build dependencies
537             buildscript.write(
538                 "sudo -S yum -y install %(packages)s\n" % {
539                     'packages' : self.buildDepends
540                 }
541             )
542         
543             
544         if self.build:
545             # Build sources
546             buildscript.write(
547                 "mkdir -p build && ( cd build && ( %(command)s ) )\n" % {
548                     'command' : self._replace_paths(self.build),
549                     'home' : server.shell_escape(self.home_path),
550                 }
551             )
552         
553             # Make archive
554             buildscript.write("tar czf build.tar.gz build\n")
555         
556         # Write token
557         buildscript.write("echo %(master_token)s > build.token ) ; echo %(master_token)s > build.token.retcode" % {
558             'master_token' : server.shell_escape(self._master_token)
559         })
560         
561         buildscript.seek(0)
562
563         return buildscript
564
565     def _do_install(self):
566         if self.install:
567             self._logger.info("Installing %s at %s", self, self.node.hostname)
568             
569             # Install application
570             try:
571                 self._popen_ssh_command(
572                     "cd %(home)s && cd build && ( %(command)s ) > ${HOME}/%(home)s/installlog 2>&1 || ( tail ${HOME}/%(home)s/{install,build}log >&2 && false )" % \
573                         {
574                         'command' : self._replace_paths(self.install),
575                         'home' : server.shell_escape(self.home_path),
576                         },
577                     )
578             except RuntimeError, e:
579                 if self.check_bad_host(e.args[0], e.args[1]):
580                     self.node.blacklist()
581                 raise RuntimeError, "Failed install build sources: %s %s" % (e.args[0], e.args[1],)
582
583     def set_master(self, master):
584         self._master = master
585         
586     def install_keys(self, prk, puk, passphrase):
587         # Install keys
588         self._master_passphrase = passphrase
589         self._master_prk = prk
590         self._master_puk = puk
591         self._master_prk_name = os.path.basename(prk.name)
592         self._master_puk_name = os.path.basename(puk.name)
593         
594     def _do_install_keys(self):
595         prk = self._master_prk
596         puk = self._master_puk
597        
598         try:
599             self._popen_scp(
600                 [ prk.name, puk.name ],
601                 '%s@%s:%s' % (self.node.slicename, self.node.hostname, self.home_path )
602                 )
603         except RuntimeError, e:
604             raise RuntimeError, "Failed to set up application deployment keys: %s %s" \
605                     % (e.args[0], e.args[1],)
606
607         try:
608             self._popen_scp(
609                 cStringIO.StringIO('%s,%s %s\n' % (
610                     self._master.node.hostname, self._master.node.hostip, 
611                     self._master.node.server_key)),
612                 '%s@%s:%s' % (self.node.slicename, self.node.hostname, 
613                     os.path.join(self.home_path,"master_known_hosts") )
614                 )
615         except RuntimeError, e:
616             raise RuntimeError, "Failed to set up application deployment keys: %s %s" \
617                     % (e.args[0], e.args[1],)
618         
619     
620     def cleanup(self):
621         # make sure there's no leftover build processes
622         self._do_kill_build()
623         
624         # No longer need'em
625         self._master_prk = None
626         self._master_puk = None
627
628     @server.eintr_retry
629     def _popen_scp(self, src, dst, retry = 3):
630         while 1:
631             try:
632                 (out,err),proc = server.popen_scp(
633                     src,
634                     dst, 
635                     port = None,
636                     agent = None,
637                     ident_key = self.node.ident_path,
638                     server_key = self.node.server_key
639                     )
640
641                 if server.eintr_retry(proc.wait)():
642                     raise RuntimeError, (out, err)
643                 return (out, err), proc
644             except:
645                 if retry <= 0:
646                     raise
647                 else:
648                     retry -= 1
649   
650
651     @server.eintr_retry
652     def _popen_ssh_command(self, command, retry = 0, noerrors=False, timeout=None):
653         (out,err),proc = server.popen_ssh_command(
654             command,
655             host = self.node.hostname,
656             port = None,
657             user = self.node.slicename,
658             agent = None,
659             ident_key = self.node.ident_path,
660             server_key = self.node.server_key,
661             timeout = timeout,
662             retry = retry
663             )
664
665         if server.eintr_retry(proc.wait)():
666             if not noerrors:
667                 raise RuntimeError, (out, err)
668         return (out, err), proc
669
class Application(Dependency):
    """
    An application is a dependency that additionally has a command, which is
    run and monitored.
    
    The output of that command is exposed as additional traces.
    """
    
    TRACES = ('stdout','stderr','buildlog', 'output')
678     
679     def __init__(self, api=None):
680         super(Application,self).__init__(api)
681         
682         # Attributes
683         self.command = None
684         self.sudo = False
685         
686         self.stdin = None
687         self.stdout = None
688         self.stderr = None
689         self.output = None
690         
691         # Those are filled when the app is started
692         #   Having both pid and ppid makes it harder
693         #   for pid rollover to induce tracking mistakes
694         self._started = False
695         self._pid = None
696         self._ppid = None
697
698         # Do not add to the python path of nodes
699         self.add_to_path = False
700     
701     def __str__(self):
702         return "%s<command:%s%s>" % (
703             self.__class__.__name__,
704             "sudo " if self.sudo else "",
705             self.command,
706         )
707     
708     def start(self):
709         self._logger.info("Starting %s", self)
710         
711         # Create shell script with the command
712         # This way, complex commands and scripts can be ran seamlessly
713         # sync files
714         command = cStringIO.StringIO()
715         command.write('export PYTHONPATH=$PYTHONPATH:%s\n' % (
716             ':'.join(["${HOME}/"+server.shell_escape(s) for s in self.node.pythonpath])
717         ))
718         command.write('export PATH=$PATH:%s\n' % (
719             ':'.join(["${HOME}/"+server.shell_escape(s) for s in self.node.pythonpath])
720         ))
721         if self.node.env:
722             for envkey, envvals in self.node.env.iteritems():
723                 for envval in envvals:
724                     command.write('export %s=%s\n' % (envkey, envval))
725         command.write(self.command)
726         command.seek(0)
727
728         try:
729             self._popen_scp(
730                 command,
731                 '%s@%s:%s' % (self.node.slicename, self.node.hostname, 
732                     os.path.join(self.home_path, "app.sh"))
733                 )
734         except RuntimeError, e:
735             raise RuntimeError, "Failed to set up application: %s %s" \
736                     % (e.args[0], e.args[1],)
737         
738         # Start process in a "daemonized" way, using nohup and heavy
739         # stdin/out redirection to avoid connection issues
740         (out,err),proc = rspawn.remote_spawn(
741             self._replace_paths("bash ./app.sh"),
742             
743             pidfile = './pid',
744             home = self.home_path,
745             stdin = 'stdin' if self.stdin is not None else '/dev/null',
746             stdout = 'stdout' if self.stdout else '/dev/null',
747             stderr = 'stderr' if self.stderr else '/dev/null',
748             sudo = self.sudo,
749             
750             host = self.node.hostname,
751             port = None,
752             user = self.node.slicename,
753             agent = None,
754             ident_key = self.node.ident_path,
755             server_key = self.node.server_key
756             )
757         
758         if proc.wait():
759             if self.check_bad_host(out, err):
760                 self.node.blacklist()
761             raise RuntimeError, "Failed to set up application: %s %s" % (out,err,)
762
763         self._started = True
764     
765     def recover(self):
766         # Assuming the application is running on PlanetLab,
767         # proper pidfiles should be present at the app's home path.
768         # So we mark this application as started, and check the pidfiles
769         self._started = True
770         self.checkpid()
771
772     def checkpid(self):            
773         # Get PID/PPID
774         # NOTE: wait a bit for the pidfile to be created
775         if self._started and not self._pid or not self._ppid:
776             pidtuple = rspawn.remote_check_pid(
777                 os.path.join(self.home_path,'pid'),
778                 host = self.node.hostname,
779                 port = None,
780                 user = self.node.slicename,
781                 agent = None,
782                 ident_key = self.node.ident_path,
783                 server_key = self.node.server_key
784                 )
785             
786             if pidtuple:
787                 self._pid, self._ppid = pidtuple
788     
789     def status(self):
790         self.checkpid()
791         if not self._started:
792             return AS.STATUS_NOT_STARTED
793         elif not self._pid or not self._ppid:
794             return AS.STATUS_NOT_STARTED
795         else:
796             status = rspawn.remote_status(
797                 self._pid, self._ppid,
798                 host = self.node.hostname,
799                 port = None,
800                 user = self.node.slicename,
801                 agent = None,
802                 ident_key = self.node.ident_path,
803                 server_key = self.node.server_key
804                 )
805             
806             if status is rspawn.NOT_STARTED:
807                 return AS.STATUS_NOT_STARTED
808             elif status is rspawn.RUNNING:
809                 return AS.STATUS_RUNNING
810             elif status is rspawn.FINISHED:
811                 return AS.STATUS_FINISHED
812             else:
813                 # WTF?
814                 return AS.STATUS_NOT_STARTED
815     
816     def kill(self):
817         status = self.status()
818         if status == AS.STATUS_RUNNING:
819             # kill by ppid+pid - SIGTERM first, then try SIGKILL
820             rspawn.remote_kill(
821                 self._pid, self._ppid,
822                 host = self.node.hostname,
823                 port = None,
824                 user = self.node.slicename,
825                 agent = None,
826                 ident_key = self.node.ident_path,
827                 server_key = self.node.server_key,
828                 sudo = self.sudo
829                 )
830             self._logger.info("Killed %s", self)
831
832
class NepiDependency(Dependency):
    """
    This dependency adds nepi itself to the python path,
    so that you may run testbeds within PL nodes.

    The locally installed nepi package is packed into a temporary
    tarball, which is used as this dependency's sources. The build step
    just moves the tarball into place and the install step unpacks it.
    """

    # Class attribute holding a *weak* reference to the shared NEPI tar file
    # so that they may share it. Don't operate on the file itself, it would
    # be a mess, just use its path.
    _shared_nepi_tar = None

    def __init__(self, api = None):
        super(NepiDependency, self).__init__(api)

        self._tarball = None

        self.depends = 'python python-ipaddr python-setuptools'

        # our sources are in our ad-hoc tarball
        self.sources = self.tarball.name

        tarname = os.path.basename(self.tarball.name)

        # it's already built - just move the tarball into place
        self.build = "mv -f ${SOURCES}/%s ." % (tarname,)

        # unpack it into sources, and we're done
        self.install = "tar xzf ${BUILD}/%s -C .." % (tarname,)

    @property
    def tarball(self):
        """
        Temporary file object holding the gzipped tarball of the nepi package.

        The tarball is created on first access. Instances share a single
        file through the weak reference kept in the class attribute
        _shared_nepi_tar: while any instance still holds the file it is
        reused, and once every holder is gone a new one is built on demand.

        Raises RuntimeError if the tar command fails.
        """
        if self._tarball is None:
            import weakref

            shared_ref = NepiDependency._shared_nepi_tar
            shared_tar = shared_ref() if shared_ref is not None else None

            if shared_tar is None:
                shared_tar = self._build_nepi_tarball()
                # Store a weak reference on the class. The original stored
                # the file object itself on the instance, so nothing was
                # ever shared between instances.
                NepiDependency._shared_nepi_tar = weakref.ref(shared_tar)

            self._tarball = shared_tar

        return self._tarball

    @staticmethod
    def _build_nepi_tarball():
        """Pack the importable nepi package into a new temporary .tar.gz file."""
        # Build an ad-hoc tarball
        # Prebuilt
        import nepi
        import tempfile

        tar_file = tempfile.NamedTemporaryFile(prefix='nepi-src-', suffix='.tar.gz')
        package_parent = os.path.join(os.path.dirname(os.path.dirname(nepi.__file__)),'.')

        # Close the /dev/null handles once tar has finished
        with open("/dev/null","w") as sink:
            with open("/dev/null","r") as source:
                proc = subprocess.Popen(
                    ["tar", "czf", tar_file.name,
                        '-C', package_parent,
                        'nepi'],
                    stdout = sink,
                    stdin = source)
                failed = proc.wait()

        if failed:
            raise RuntimeError("Failed to create nepi tarball")

        return tar_file
889
class NS3Dependency(Dependency):
    """
    This dependency adds NS3 libraries to the library paths,
    so that you may run the NS3 testbed within PL nodes.
    
    You'll also need the NepiDependency.

    Both the build and the install commands start by probing for a working
    installation (python modules importable, ns3 libraries present) and
    only do the actual work when that probe fails.
    """

    # Defaults for the state read by the tarball property below; nothing in
    # this class assigned them before the property touched them.
    # NOTE(review): the tarball property looks copied from NepiDependency
    # and nothing in this class uses it - confirm whether it is needed.
    _shared_nepi_tar = None
    _tarball = None
    
    def __init__(self, api = None):
        super(NS3Dependency, self).__init__(api)
        
        self.buildDepends = 'make waf gcc gcc-c++ gccxml unzip'
        
        # We have to download the sources, untar, build...
        pybindgen_source_url = "http://yans.pl.sophia.inria.fr/trac/nepi/raw-attachment/wiki/WikiStart/pybindgen-r794.tar.gz"
        pygccxml_source_url = "http://leaseweb.dl.sourceforge.net/project/pygccxml/pygccxml/pygccxml-1.0/pygccxml-1.0.0.zip"
        ns3_source_url = "http://yans.pl.sophia.inria.fr/code/hgwebdir.cgi/ns-3.11-nepi/archive/tip.tar.gz"
        passfd_source_url = "http://yans.pl.sophia.inria.fr/code/hgwebdir.cgi/python-passfd/archive/tip.tar.gz"
        self.build =(
            " ( "
            "  cd .. && "
            "  python -c 'import pygccxml, pybindgen, passfd' && "
            "  test -f lib/ns/_core.so && "
            "  test -f lib/ns/__init__.py && "
            "  test -f lib/ns/core.py && "
            "  test -f lib/libns3-core.so && "
            "  LD_LIBRARY_PATH=lib PYTHONPATH=lib python -c 'import ns.core' "
            " ) || ( "
                # Not working, rebuild
                     # Archive SHA1 sums to check
                     "echo '7158877faff2254e6c094bf18e6b4283cac19137  pygccxml-1.0.0.zip' > archive_sums.txt && "
                     "echo 'a18c2ccffd0df517bc37e2f3a2475092517c43f2  pybindgen-src.tar.gz' >> archive_sums.txt && "
                     " ( " # check existing files
                     " sha1sum -c archive_sums.txt && "
                     " test -f passfd-src.tar.gz && "
                     " test -f ns3-src.tar.gz "
                     " ) || ( " # nope? re-download
                     # pybindgen-src.tar.gz must be removed too: it is the
                     # archive that gets downloaded and checksummed, and a
                     # corrupt copy would otherwise never be replaced.
                     # pybindgen-src.zip is kept to clean up legacy leftovers.
                     " rm -f pybindgen-src.zip pybindgen-src.tar.gz pygccxml-1.0.0.zip passfd-src.tar.gz ns3-src.tar.gz && "
                     " wget -q -c -O pybindgen-src.tar.gz %(pybindgen_source_url)s && "
                     " wget -q -c -O pygccxml-1.0.0.zip %(pygccxml_source_url)s && " 
                     " wget -q -c -O passfd-src.tar.gz %(passfd_source_url)s && "
                     " wget -q -c -O ns3-src.tar.gz %(ns3_source_url)s && "  
                     " sha1sum -c archive_sums.txt " # Check SHA1 sums when applicable
                     " ) && "
                     "unzip -n pygccxml-1.0.0.zip && "
                     "mkdir -p pybindgen-src && "
                     "mkdir -p ns3-src && "
                     "mkdir -p passfd-src && "
                     "tar xzf ns3-src.tar.gz --strip-components=1 -C ns3-src && "
                     "tar xzf passfd-src.tar.gz --strip-components=1 -C passfd-src && "
                     "tar xzf pybindgen-src.tar.gz --strip-components=1 -C pybindgen-src && "
                     "rm -rf target && "    # mv doesn't like unclean targets
                     "mkdir -p target && "
                     "cd pygccxml-1.0.0 && "
                     "rm -rf unittests docs && " # pygccxml has ~100M of unit tests - excessive - docs aren't needed either
                     "python setup.py build && "
                     "python setup.py install --install-lib ${BUILD}/target && "
                     "python setup.py clean && "
                     "cd ../pybindgen-src && "
                     "export PYTHONPATH=$PYTHONPATH:${BUILD}/target && "
                     "./waf configure --prefix=${BUILD}/target -d release && "
                     "./waf && "
                     "./waf install && "
                     "./waf clean && "
                     "mv -f ${BUILD}/target/lib/python*/site-packages/pybindgen ${BUILD}/target/. && "
                     "rm -rf ${BUILD}/target/lib && "
                     "cd ../passfd-src && "
                     "python setup.py build && "
                     "python setup.py install --install-lib ${BUILD}/target && "
                     "python setup.py clean && "
                     "cd ../ns3-src && "
                     "./waf configure --prefix=${BUILD}/target --with-pybindgen=../pybindgen-src -d release --disable-examples --disable-tests && "
                     "./waf &&"
                     "./waf install && "
                     "rm -f ${BUILD}/target/lib/*.so && "
                     "cp -a ${BUILD}/ns3-src/build/release/libns3*.so ${BUILD}/target/lib && "
                     "cp -a ${BUILD}/ns3-src/build/release/bindings/python/ns ${BUILD}/target/lib &&"
                     "./waf clean "
             " )"
                     % dict(
                        pybindgen_source_url = server.shell_escape(pybindgen_source_url),
                        pygccxml_source_url = server.shell_escape(pygccxml_source_url),
                        ns3_source_url = server.shell_escape(ns3_source_url),
                        passfd_source_url = server.shell_escape(passfd_source_url),
                     ))
        
        # Just move ${BUILD}/target
        self.install = (
            " ( "
            "  cd .. && "
            "  python -c 'import pygccxml, pybindgen, passfd' && "
            "  test -f lib/ns/_core.so && "
            "  test -f lib/ns/__init__.py && "
            "  test -f lib/ns/core.py && "
            "  test -f lib/libns3-core.so && "
            "  LD_LIBRARY_PATH=lib PYTHONPATH=lib python -c 'import ns.core' "
            " ) || ( "
                # Not working, reinstall
                    "test -d ${BUILD}/target && "
                    "[[ \"x\" != \"x$(find ${BUILD}/target -mindepth 1 -print -quit)\" ]] &&"
                    "( for i in ${BUILD}/target/* ; do rm -rf ${SOURCES}/${i##*/} ; done ) && " # mv doesn't like unclean targets
                    "mv -f ${BUILD}/target/* ${SOURCES}"
            " )"
        )
        
        # Set extra environment paths
        self.env['NEPI_NS3BINDINGS'] = "${SOURCES}/lib"
        self.env['NEPI_NS3LIBRARY'] = "${SOURCES}/lib"
    
    @property
    def tarball(self):
        """
        Temporary file object holding a gzipped tarball of the nepi package.

        Built on first access and shared between instances of this class
        through the weak reference kept in _shared_nepi_tar.

        Raises RuntimeError if the tar command fails.
        """
        if self._tarball is None:
            import weakref

            shared_ref = NS3Dependency._shared_nepi_tar
            shared_tar = shared_ref() if shared_ref is not None else None

            if shared_tar is None:
                shared_tar = self._build_nepi_tarball()
                # Store a weak reference on the class. The original stored
                # the file object itself on the instance, which defeated
                # the sharing.
                NS3Dependency._shared_nepi_tar = weakref.ref(shared_tar)

            self._tarball = shared_tar

        return self._tarball

    @staticmethod
    def _build_nepi_tarball():
        """Pack the importable nepi package into a new temporary .tar.gz file."""
        # Build an ad-hoc tarball
        # Prebuilt
        import nepi
        import tempfile

        tar_file = tempfile.NamedTemporaryFile(prefix='nepi-src-', suffix='.tar.gz')
        package_parent = os.path.join(os.path.dirname(os.path.dirname(nepi.__file__)),'.')

        # Close the /dev/null handles once tar has finished
        with open("/dev/null","w") as sink:
            with open("/dev/null","r") as source:
                proc = subprocess.Popen(
                    ["tar", "czf", tar_file.name,
                        '-C', package_parent,
                        'nepi'],
                    stdout = sink,
                    stdin = source)
                failed = proc.wait()

        if failed:
            raise RuntimeError("Failed to create nepi tarball")

        return tar_file
1026
class YumDependency(Dependency):
    """
    This dependency is an internal helper class used to
    efficiently distribute yum-downloaded rpms.
    
    It temporarily sets the yum cache as persistent in the
    build master, and installs all the required packages.
    
    The rpm packages left in the yum cache are gathered and
    distributed by the underlying Dependency in an efficient
    manner. Build slaves will then install those rpms back in
    the cache before issuing the install command.
    
    When packages have been installed already, nothing but an
    empty tar is distributed.

    The build and install attributes are read-only computed properties
    derived from self.depends; assignments to them are silently ignored.
    """
    
    # Class attribute holding a *weak* reference to the shared NEPI tar file
    # so that they may share it. Don't operate on the file itself, it would
    # be a mess, just use its path.
    # NOTE(review): no method of this class shown here reads this attribute.
    _shared_nepi_tar = None

    # Output patterns that mark the host as unusable.
    # The parentheses around repomd.xml are escaped so they match yum's
    # literal message; unescaped they formed a group, and that alternative
    # could never match the real text.
    _BAD_HOST_RE = re.compile(r'(?:'
                              r'The GPG keys listed for the ".*" repository are already installed but they are not correct for this package'
                              r'|Error: Cannot retrieve repository metadata \(repomd[.]xml\) for repository: .*[.] Please verify its path and try again'
                              r'|Error: disk I/O error'
                              r'|MASTER NODE UNREACHABLE'
                              r')',
                              re.I)

    def _canonical_depends(self):
        """Return self.depends as a space-separated, sorted package list."""
        # canonical representation of dependencies
        return ' '.join( sorted( (self.depends or "").split(' ') ) )
    
    def _build_get(self):
        """Shell command that downloads the rpms and packs them into packages.tar."""
        depends = self._canonical_depends()
        
        # download rpms and pack into a tar archive
        return (
            "sudo -S nice yum -y makecache && "
            "sudo -S sed -i -r 's/keepcache *= *0/keepcache=1/' /etc/yum.conf && "
            " ( ( "
                "sudo -S nice yum -y install %s ; "
                "rm -f ${BUILD}/packages.tar ; "
                "tar -C /var/cache/yum -rf ${BUILD}/packages.tar $(cd /var/cache/yum ; find -iname '*.rpm')"
            " ) || /bin/true ) && "
            "sudo -S sed -i -r 's/keepcache *= *1/keepcache=0/' /etc/yum.conf && "
            "( sudo -S nice yum -y clean packages || /bin/true ) "
        ) % ( depends, )
    def _build_set(self, value):
        # ignore
        return
    build = property(_build_get, _build_set)
    
    def _install_get(self):
        """Shell command that restores the cached rpms and installs the packages."""
        depends = self._canonical_depends()
        
        # unpack cached rpms into yum cache, install, and cleanup
        return (
            "sudo -S tar -k --keep-newer-files -C /var/cache/yum -xf packages.tar && "
            "sudo -S nice yum -y install %s && "
            "( sudo -S nice yum -y clean packages || /bin/true ) "
        ) % ( depends, )
    def _install_set(self, value):
        # ignore
        return
    install = property(_install_get, _install_set)
        
    def check_bad_host(self, out, err):
        """
        Tell whether command output reveals a broken host.

        Returns a truthy value when either out or err matches one of the
        known yum/host failure messages (case-insensitively), and otherwise
        whatever self.node.check_bad_host(out, err) returns.
        """
        badre = self._BAD_HOST_RE
        return badre.search(out) or badre.search(err) or self.node.check_bad_host(out,err)