Robustness improvements:
[nepi.git] / src / nepi / testbeds / planetlab / application.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from constants import TESTBED_ID
5 import plcapi
6 import operator
7 import os
8 import os.path
9 import sys
10 import nepi.util.server as server
11 import cStringIO
12 import subprocess
13 import rspawn
14 import random
15 import time
16 import socket
17 import threading
18 import logging
19 import re
20
21 from nepi.util.constants import ApplicationStatus as AS
22
23 class Dependency(object):
24     """
25     A Dependency is in every respect like an application.
26     
27     It depends on some packages, it may require building binaries, it must deploy
28     them...
29     
30     But it has no command. Dependencies aren't ever started, or stopped, and have
31     no status.
32     """
33
34     TRACES = ('buildlog')
35
36     def __init__(self, api=None):
37         if not api:
38             api = plcapi.PLCAPI()
39         self._api = api
40         
41         # Attributes
42         self.command = None
43         self.sudo = False
44         
45         self.build = None
46         self.install = None
47         self.depends = None
48         self.buildDepends = None
49         self.sources = None
50         self.rpmFusion = False
51         self.env = {}
52         
53         self.stdin = None
54         self.stdout = None
55         self.stderr = None
56         self.buildlog = None
57         
58         self.add_to_path = True
59         
60         # Those are filled when the app is configured
61         self.home_path = None
62         
63         # Those are filled when an actual node is connected
64         self.node = None
65         
66         # Those are filled when the app is started
67         #   Having both pid and ppid makes it harder
68         #   for pid rollover to induce tracking mistakes
69         self._started = False
70         self._setup = False
71         self._setuper = None
72         self._pid = None
73         self._ppid = None
74
75         # Spanning tree deployment
76         self._master = None
77         self._master_passphrase = None
78         self._master_prk = None
79         self._master_puk = None
80         self._master_token = os.urandom(8).encode("hex")
81         self._build_pid = None
82         self._build_ppid = None
83         
84         # Logging
85         self._logger = logging.getLogger('nepi.testbeds.planetlab')
86         
87     
88     def __str__(self):
89         return "%s<%s>" % (
90             self.__class__.__name__,
91             ' '.join(filter(bool,(self.depends, self.sources)))
92         )
93     
94     def validate(self):
95         if self.home_path is None:
96             raise AssertionError, "Misconfigured application: missing home path"
97         if self.node.ident_path is None or not os.access(self.node.ident_path, os.R_OK):
98             raise AssertionError, "Misconfigured application: missing slice SSH key"
99         if self.node is None:
100             raise AssertionError, "Misconfigured application: unconnected node"
101         if self.node.hostname is None:
102             raise AssertionError, "Misconfigured application: misconfigured node"
103         if self.node.slicename is None:
104             raise AssertionError, "Misconfigured application: unspecified slice"
105     
106     def check_bad_host(self, out, err):
107         """
108         Called whenever an operation fails, it's given the output to be checked for
109         telltale signs of unhealthy hosts.
110         """
111         return False
112     
113     def remote_trace_path(self, whichtrace):
114         if whichtrace in self.TRACES:
115             tracefile = os.path.join(self.home_path, whichtrace)
116         else:
117             tracefile = None
118         
119         return tracefile
120
121     def remote_trace_name(self, whichtrace):
122         if whichtrace in self.TRACES:
123             return whichtrace
124         return None
125
126     def sync_trace(self, local_dir, whichtrace):
127         tracefile = self.remote_trace_path(whichtrace)
128         if not tracefile:
129             return None
130         
131         local_path = os.path.join(local_dir, tracefile)
132         
133         # create parent local folders
134         proc = subprocess.Popen(
135             ["mkdir", "-p", os.path.dirname(local_path)],
136             stdout = open("/dev/null","w"),
137             stdin = open("/dev/null","r"))
138
139         if proc.wait():
140             raise RuntimeError, "Failed to synchronize trace"
141         
142         # sync files
143         try:
144             self._popen_scp(
145                 '%s@%s:%s' % (self.node.slicename, self.node.hostname,
146                     tracefile),
147                 local_path
148                 )
149         except RuntimeError, e:
150             raise RuntimeError, "Failed to synchronize trace: %s %s" \
151                     % (e.args[0], e.args[1],)
152         
153         return local_path
154     
155     def recover(self):
156         # We assume a correct deployment, so recovery only
157         # means we mark this dependency as deployed
158         self._setup = True
159
160     def setup(self):
161         self._logger.info("Setting up %s", self)
162         self._make_home()
163         self._launch_build()
164         self._finish_build()
165         self._setup = True
166     
167     def async_setup(self):
168         if not self._setuper:
169             def setuper():
170                 try:
171                     self.setup()
172                 except:
173                     self._setuper._exc.append(sys.exc_info())
174             self._setuper = threading.Thread(
175                 target = setuper)
176             self._setuper._exc = []
177             self._setuper.start()
178     
179     def async_setup_wait(self):
180         if not self._setup:
181             self._logger.info("Waiting for %s to be setup", self)
182             if self._setuper:
183                 self._setuper.join()
184                 if not self._setup:
185                     if self._setuper._exc:
186                         exctyp,exval,exctrace = self._setuper._exc[0]
187                         raise exctyp,exval,exctrace
188                     else:
189                         raise RuntimeError, "Failed to setup application"
190                 else:
191                     self._logger.info("Setup ready: %s at %s", self, self.node.hostname)
192             else:
193                 self.setup()
194         
195     def _make_home(self):
196         # Make sure all the paths are created where 
197         # they have to be created for deployment
198         # sync files
199         try:
200             self._popen_ssh_command(
201                 "mkdir -p %(home)s && ( rm -f %(home)s/{pid,build-pid,nepi-build.sh} >/dev/null 2>&1 || /bin/true )" \
202                     % { 'home' : server.shell_escape(self.home_path) },
203                 timeout = 120,
204                 retry = 3
205                 )
206         except RuntimeError, e:
207             raise RuntimeError, "Failed to set up application %s: %s %s" % (self.home_path, e.args[0], e.args[1],)
208         
209         if self.stdin:
210             # Write program input
211             try:
212                 self._popen_scp(
213                     cStringIO.StringIO(self.stdin),
214                     '%s@%s:%s' % (self.node.slicename, self.node.hostname, 
215                         os.path.join(self.home_path, 'stdin') ),
216                     )
217             except RuntimeError, e:
218                 raise RuntimeError, "Failed to set up application %s: %s %s" \
219                         % (self.home_path, e.args[0], e.args[1],)
220
221     def _replace_paths(self, command):
222         """
223         Replace all special path tags with shell-escaped actual paths.
224         """
225         # need to append ${HOME} if paths aren't absolute, to MAKE them absolute.
226         root = '' if self.home_path.startswith('/') else "${HOME}/"
227         return ( command
228             .replace("${SOURCES}", root+server.shell_escape(self.home_path))
229             .replace("${BUILD}", root+server.shell_escape(os.path.join(self.home_path,'build'))) )
230
231     def _launch_build(self, trial=0):
232         if self._master is not None:
233             if not trial or self._master_prk is not None:
234                 self._do_install_keys()
235             buildscript = self._do_build_slave()
236         else:
237             buildscript = self._do_build_master()
238             
239         if buildscript is not None:
240             self._logger.info("Building %s at %s", self, self.node.hostname)
241             
242             # upload build script
243             try:
244                 self._popen_scp(
245                     buildscript,
246                     '%s@%s:%s' % (self.node.slicename, self.node.hostname, 
247                         os.path.join(self.home_path, 'nepi-build.sh') )
248                     )
249             except RuntimeError, e:
250                 raise RuntimeError, "Failed to set up application %s: %s %s" \
251                         % (self.home_path, e.args[0], e.args[1],)
252             
253             # launch build
254             self._do_launch_build()
255     
256     def _finish_build(self):
257         self._do_wait_build()
258         self._do_install()
259
    def _do_build_slave(self):
        """
        Generate the script a build slave runs to fetch sources and build
        results from its master node instead of building them itself.

        Returns None when there are neither sources nor a build command,
        otherwise a cStringIO object holding the shell script.

        Reads ``_master_prk_name``/``_master_puk_name``, which are only
        assigned by install_keys().
        """
        if not self.sources and not self.build:
            return None
            
        # Create build script
        # 'files' collects the scp locations (user@masterip:path) to fetch
        files = set()
        
        if self.sources:
            sources = self.sources.split(' ')
            files.update(
                "%s@%s:%s" % (self._master.node.slicename, self._master.node.hostip, 
                    os.path.join(self._master.home_path, os.path.basename(source)),)
                for source in sources
            )
        
        if self.build:
            files.add(
                "%s@%s:%s" % (self._master.node.slicename, self._master.node.hostip, 
                    os.path.join(self._master.home_path, 'build.tar.gz'),)
            )
        
        sshopts = "-o ConnectTimeout=30 -o ConnectionAttempts=3 -o ServerAliveInterval=30 -o TCPKeepAlive=yes"
        
        # Start an ssh-agent, feed it the private key using the passphrase
        # from $NEPI_MASTER_PASSPHRASE (via an askpass helper that just runs
        # 'cat'), then delete the key files from the node.
        launch_agent = "{ ( echo -e '#!/bin/sh\\ncat' > .ssh-askpass ) && chmod u+x .ssh-askpass"\
                        " && export SSH_ASKPASS=$(pwd)/.ssh-askpass "\
                        " && ssh-agent > .ssh-agent.sh ; } && . ./.ssh-agent.sh && ( echo $NEPI_MASTER_PASSPHRASE | ssh-add %(prk)s ) && rm -rf %(prk)s %(puk)s" %  \
        {
            'prk' : server.shell_escape(self._master_prk_name),
            'puk' : server.shell_escape(self._master_puk_name),
        }
        
        kill_agent = "kill $SSH_AGENT_PID"
        
        # Check the master answers ping and ssh (exit 1 otherwise), poll its
        # build.token.retcode every 5 seconds until it holds the master's
        # token, then require build.token to hold it too ('BAD TOKEN' and
        # exit 1 otherwise).
        waitmaster = (
            "{ "
            "echo 'Checking master reachability' ; "
            "if ping -c 3 %(master_host)s && (. ./.ssh-agent.sh > /dev/null ; ssh -o UserKnownHostsFile=%(hostkey)s %(sshopts)s %(master)s echo MASTER SAYS HI ) ; then "
            "echo 'Master node reachable' ; "
            "else "
            "echo 'MASTER NODE UNREACHABLE' && "
            "exit 1 ; "
            "fi ; "
            ". ./.ssh-agent.sh ; "
            "while [[ $(. ./.ssh-agent.sh > /dev/null ; ssh -q -o UserKnownHostsFile=%(hostkey)s %(sshopts)s %(master)s cat %(token_path)s.retcode || /bin/true) != %(token)s ]] ; do sleep 5 ; done ; "
            "if [[ $(. ./.ssh-agent.sh > /dev/null ; ssh -q -o UserKnownHostsFile=%(hostkey)s %(sshopts)s %(master)s cat %(token_path)s || /bin/true) != %(token)s ]] ; then echo BAD TOKEN ; exit 1 ; fi ; "
            "}" 
        ) % {
            'hostkey' : 'master_known_hosts',
            'master' : "%s@%s" % (self._master.node.slicename, self._master.node.hostip),
            'master_host' : self._master.node.hostip,
            'token_path' : os.path.join(self._master.home_path, 'build.token'),
            'token' : server.shell_escape(self._master._master_token),
            'sshopts' : sshopts,
        }
        
        # Copy the collected files from the master, unpack the build archive
        # when there is one, and write this slave's own token files.
        syncfiles = ". ./.ssh-agent.sh && scp -p -o UserKnownHostsFile=%(hostkey)s %(sshopts)s %(files)s ." % {
            'hostkey' : 'master_known_hosts',
            'files' : ' '.join(files),
            'sshopts' : sshopts,
        }
        if self.build:
            syncfiles += " && tar xzf build.tar.gz"
        syncfiles += " && ( echo %s > build.token )" % (server.shell_escape(self._master_token),)
        syncfiles += " && ( echo %s > build.token.retcode )" % (server.shell_escape(self._master_token),)
        syncfiles = "{ . ./.ssh-agent.sh ; %s ; }" % (syncfiles,)
        
        # Kill the agent and remove keys, known-hosts file and askpass helper
        cleanup = "{ . ./.ssh-agent.sh ; kill $SSH_AGENT_PID ; rm -rf %(prk)s %(puk)s master_known_hosts .ssh-askpass ; }" % {
            'prk' : server.shell_escape(self._master_prk_name),
            'puk' : server.shell_escape(self._master_puk_name),
        }
        
        # Cleanup runs on both the success and the failure path, and
        # build.token.retcode is written unconditionally at the end.
        # The 'home' entry is not referenced by the template.
        slavescript = "( ( %(launch_agent)s && %(waitmaster)s && %(syncfiles)s && %(kill_agent)s && %(cleanup)s ) || %(cleanup)s ) ; echo %(token)s > build.token.retcode" % {
            'waitmaster' : waitmaster,
            'syncfiles' : syncfiles,
            'cleanup' : cleanup,
            'kill_agent' : kill_agent,
            'launch_agent' : launch_agent,
            'home' : server.shell_escape(self.home_path),
            'token' : server.shell_escape(self._master_token),
        }
        
        return cStringIO.StringIO(slavescript)
342          
343     def _do_launch_build(self):
344         script = "bash ./nepi-build.sh"
345         if self._master_passphrase:
346             script = "NEPI_MASTER_PASSPHRASE=%s %s" % (
347                 server.shell_escape(self._master_passphrase),
348                 script
349             )
350         (out,err),proc = rspawn.remote_spawn(
351             script,
352             pidfile = 'build-pid',
353             home = self.home_path,
354             stdin = '/dev/null',
355             stdout = 'buildlog',
356             stderr = rspawn.STDOUT,
357             
358             host = self.node.hostname,
359             port = None,
360             user = self.node.slicename,
361             agent = None,
362             ident_key = self.node.ident_path,
363             server_key = self.node.server_key,
364             hostip = self.node.hostip,
365             )
366         
367         if proc.wait():
368             if self.check_bad_host(out, err):
369                 self.node.blacklist()
370             raise RuntimeError, "Failed to set up build slave %s: %s %s" % (self.home_path, out,err,)
371         
372         
373         pid = ppid = None
374         delay = 1.0
375         for i in xrange(5):
376             pidtuple = rspawn.remote_check_pid(
377                 os.path.join(self.home_path,'build-pid'),
378                 host = self.node.hostname,
379                 port = None,
380                 user = self.node.slicename,
381                 agent = None,
382                 ident_key = self.node.ident_path,
383                 server_key = self.node.server_key,
384                 hostip = self.node.hostip
385                 )
386             
387             if pidtuple:
388                 pid, ppid = pidtuple
389                 self._build_pid, self._build_ppid = pidtuple
390                 break
391             else:
392                 time.sleep(delay)
393                 delay = min(30,delay*1.2)
394         else:
395             raise RuntimeError, "Failed to set up build slave %s: cannot get pid" % (self.home_path,)
396
397         self._logger.info("Deploying %s at %s", self, self.node.hostname)
398         
399     def _do_wait_build(self, trial=0):
400         pid = self._build_pid
401         ppid = self._build_ppid
402         
403         if pid and ppid:
404             delay = 1.0
405             first = True
406             bustspin = 0
407             while True:
408                 status = rspawn.remote_status(
409                     pid, ppid,
410                     host = self.node.hostname,
411                     port = None,
412                     user = self.node.slicename,
413                     agent = None,
414                     ident_key = self.node.ident_path,
415                     server_key = self.node.server_key,
416                     hostip = self.node.hostip
417                     )
418                 
419                 if status is rspawn.FINISHED:
420                     self._build_pid = self._build_ppid = None
421                     break
422                 elif status is not rspawn.RUNNING:
423                     self._logger.warn("Busted waiting for %s to finish building at %s %s", self, self.node.hostname,
424                             "(build slave)" if self._master is not None else "(build master)")
425                     bustspin += 1
426                     time.sleep(delay*(5.5+random.random()))
427                     if bustspin > 12:
428                         self._build_pid = self._build_ppid = None
429                         break
430                 else:
431                     if first:
432                         self._logger.info("Waiting for %s to finish building at %s %s", self, self.node.hostname,
433                             "(build slave)" if self._master is not None else "(build master)")
434                         
435                         first = False
436                     time.sleep(delay*(0.5+random.random()))
437                     delay = min(30,delay*1.2)
438                     bustspin = 0
439             
440             # check build token
441             slave_token = ""
442             for i in xrange(3):
443                 (out, err), proc = self._popen_ssh_command(
444                     "cat %(token_path)s" % {
445                         'token_path' : os.path.join(self.home_path, 'build.token'),
446                     },
447                     timeout = 120,
448                     noerrors = True)
449                 if not proc.wait() and out:
450                     slave_token = out.strip()
451                 
452                 if slave_token:
453                     break
454                 else:
455                     time.sleep(2)
456             
457             if slave_token != self._master_token:
458                 # Get buildlog for the error message
459
460                 (buildlog, err), proc = self._popen_ssh_command(
461                     "cat %(buildlog)s" % {
462                         'buildlog' : os.path.join(self.home_path, 'buildlog'),
463                         'buildscript' : os.path.join(self.home_path, 'nepi-build.sh'),
464                     },
465                     timeout = 120,
466                     noerrors = True)
467                 
468                 proc.wait()
469                 
470                 if self.check_bad_host(buildlog, err):
471                     self.node.blacklist()
472                 elif self._master and trial < 3 and 'BAD TOKEN' in buildlog or 'BAD TOKEN' in err:
473                     # bad sync with master, may try again
474                     # but first wait for master
475                     self._master.async_setup_wait()
476                     self._launch_build(trial+1)
477                     self._do_wait_build(trial+1)
478                 else:
479                     # No longer need'em
480                     self._master_prk = None
481                     self._master_puk = None
482         
483                     raise RuntimeError, "Failed to set up application %s: "\
484                             "build failed, got wrong token from pid %s/%s "\
485                             "(expected %r, got %r), see buildlog at %s:\n%s" % (
486                         self.home_path, pid, ppid, self._master_token, slave_token, self.node.hostname, buildlog)
487
488             # No longer need'em
489             self._master_prk = None
490             self._master_puk = None
491         
492             self._logger.info("Built %s at %s", self, self.node.hostname)
493
494     def _do_kill_build(self):
495         pid = self._build_pid
496         ppid = self._build_ppid
497         
498         if pid and ppid:
499             self._logger.info("Killing build of %s", self)
500             rspawn.remote_kill(
501                 pid, ppid,
502                 host = self.node.hostname,
503                 port = None,
504                 user = self.node.slicename,
505                 agent = None,
506                 ident_key = self.node.ident_path,
507                 hostip = self.node.hostip
508                 )
509         
510         
    def _do_build_master(self):
        """
        Upload the sources to the node and generate the build script a
        master (or stand-alone) node runs.

        Returns None when there are no sources, no build command and no
        build dependencies; otherwise a cStringIO object, rewound to its
        start, holding the shell script.

        Raises RuntimeError if the sources cannot be uploaded.
        """
        if not self.sources and not self.build and not self.buildDepends:
            return None
            
        if self.sources:
            sources = self.sources.split(' ')
            
            # Copy all sources
            try:
                self._popen_scp(
                    sources,
                    "%s@%s:%s" % (self.node.slicename, self.node.hostname, 
                        os.path.join(self.home_path,'.'),)
                    )
            except RuntimeError, e:
                raise RuntimeError, "Failed upload source file %r: %s %s" \
                        % (sources, e.args[0], e.args[1],)
            
        buildscript = cStringIO.StringIO()
        
        # Everything up to the build.token write runs inside one subshell
        buildscript.write("(\n")
        
        if self.buildDepends:
            # Install build dependencies
            buildscript.write(
                "sudo -S yum -y install %(packages)s\n" % {
                    'packages' : self.buildDepends
                }
            )
        
            
        if self.build:
            # Build sources
            # (the 'home' entry is not referenced by the template)
            buildscript.write(
                "mkdir -p build && ( cd build && ( %(command)s ) )\n" % {
                    'command' : self._replace_paths(self.build),
                    'home' : server.shell_escape(self.home_path),
                }
            )
        
            # Make archive
            buildscript.write("tar czf build.tar.gz build\n")
        
        # Write token
        # NOTE(review): the commands above are newline-separated rather than
        # chained with &&, so build.token appears to be written even when an
        # earlier step fails -- confirm this is intended.
        buildscript.write("echo %(master_token)s > build.token ) ; echo %(master_token)s > build.token.retcode" % {
            'master_token' : server.shell_escape(self._master_token)
        })
        
        # Rewind so that readers start at the beginning of the script
        buildscript.seek(0)

        return buildscript
562
563     def _do_install(self):
564         if self.install:
565             self._logger.info("Installing %s at %s", self, self.node.hostname)
566             
567             # Install application
568             try:
569                 self._popen_ssh_command(
570                     "cd %(home)s && cd build && ( %(command)s ) > ${HOME}/%(home)s/installlog 2>&1 || ( tail ${HOME}/%(home)s/{install,build}log >&2 && false )" % \
571                         {
572                         'command' : self._replace_paths(self.install),
573                         'home' : server.shell_escape(self.home_path),
574                         },
575                     )
576             except RuntimeError, e:
577                 if self.check_bad_host(e.args[0], e.args[1]):
578                     self.node.blacklist()
579                 raise RuntimeError, "Failed install build sources: %s %s" % (e.args[0], e.args[1],)
580
581     def set_master(self, master):
582         self._master = master
583         
584     def install_keys(self, prk, puk, passphrase):
585         # Install keys
586         self._master_passphrase = passphrase
587         self._master_prk = prk
588         self._master_puk = puk
589         self._master_prk_name = os.path.basename(prk.name)
590         self._master_puk_name = os.path.basename(puk.name)
591         
592     def _do_install_keys(self):
593         prk = self._master_prk
594         puk = self._master_puk
595        
596         try:
597             self._popen_scp(
598                 [ prk.name, puk.name ],
599                 '%s@%s:%s' % (self.node.slicename, self.node.hostname, self.home_path )
600                 )
601         except RuntimeError, e:
602             raise RuntimeError, "Failed to set up application deployment keys: %s %s" \
603                     % (e.args[0], e.args[1],)
604
605         try:
606             self._popen_scp(
607                 cStringIO.StringIO('%s,%s %s\n' % (
608                     self._master.node.hostname, self._master.node.hostip, 
609                     self._master.node.server_key)),
610                 '%s@%s:%s' % (self.node.slicename, self.node.hostname, 
611                     os.path.join(self.home_path,"master_known_hosts") )
612                 )
613         except RuntimeError, e:
614             raise RuntimeError, "Failed to set up application deployment keys: %s %s" \
615                     % (e.args[0], e.args[1],)
616         
617     
618     def cleanup(self):
619         # make sure there's no leftover build processes
620         self._do_kill_build()
621         
622         # No longer need'em
623         self._master_prk = None
624         self._master_puk = None
625
626     @server.eintr_retry
627     def _popen_scp(self, src, dst, retry = 3):
628         while 1:
629             try:
630                 (out,err),proc = server.popen_scp(
631                     src,
632                     dst, 
633                     port = None,
634                     agent = None,
635                     ident_key = self.node.ident_path,
636                     server_key = self.node.server_key
637                     )
638
639                 if server.eintr_retry(proc.wait)():
640                     raise RuntimeError, (out, err)
641                 return (out, err), proc
642             except:
643                 if retry <= 0:
644                     raise
645                 else:
646                     retry -= 1
647   
648
649     @server.eintr_retry
650     def _popen_ssh_command(self, command, retry = 0, noerrors=False, timeout=None):
651         (out,err),proc = server.popen_ssh_command(
652             command,
653             host = self.node.hostname,
654             port = None,
655             user = self.node.slicename,
656             agent = None,
657             ident_key = self.node.ident_path,
658             server_key = self.node.server_key,
659             timeout = timeout,
660             retry = retry
661             )
662
663         if server.eintr_retry(proc.wait)():
664             if not noerrors:
665                 raise RuntimeError, (out, err)
666         return (out, err), proc
667
668 class Application(Dependency):
669     """
670     An application also has dependencies, but also a command to be ran and monitored.
671     
672     It adds the output of that command as traces.
673     """
674     
    # Trace names accepted by remote_trace_path() and remote_trace_name(),
    # which test membership in TRACES.
    TRACES = ('stdout','stderr','buildlog', 'output')
676     
677     def __init__(self, api=None):
678         super(Application,self).__init__(api)
679         
680         # Attributes
681         self.command = None
682         self.sudo = False
683         
684         self.stdin = None
685         self.stdout = None
686         self.stderr = None
687         self.output = None
688         
689         # Those are filled when the app is started
690         #   Having both pid and ppid makes it harder
691         #   for pid rollover to induce tracking mistakes
692         self._started = False
693         self._pid = None
694         self._ppid = None
695
696         # Do not add to the python path of nodes
697         self.add_to_path = False
698     
699     def __str__(self):
700         return "%s<command:%s%s>" % (
701             self.__class__.__name__,
702             "sudo " if self.sudo else "",
703             self.command,
704         )
705     
706     def start(self):
707         self._logger.info("Starting %s", self)
708         
709         # Create shell script with the command
710         # This way, complex commands and scripts can be ran seamlessly
711         # sync files
712         command = cStringIO.StringIO()
713         command.write('export PYTHONPATH=$PYTHONPATH:%s\n' % (
714             ':'.join(["${HOME}/"+server.shell_escape(s) for s in self.node.pythonpath])
715         ))
716         command.write('export PATH=$PATH:%s\n' % (
717             ':'.join(["${HOME}/"+server.shell_escape(s) for s in self.node.pythonpath])
718         ))
719         if self.node.env:
720             for envkey, envvals in self.node.env.iteritems():
721                 for envval in envvals:
722                     command.write('export %s=%s\n' % (envkey, envval))
723         command.write(self.command)
724         command.seek(0)
725
726         try:
727             self._popen_scp(
728                 command,
729                 '%s@%s:%s' % (self.node.slicename, self.node.hostname, 
730                     os.path.join(self.home_path, "app.sh"))
731                 )
732         except RuntimeError, e:
733             raise RuntimeError, "Failed to set up application: %s %s" \
734                     % (e.args[0], e.args[1],)
735         
736         # Start process in a "daemonized" way, using nohup and heavy
737         # stdin/out redirection to avoid connection issues
738         (out,err),proc = rspawn.remote_spawn(
739             self._replace_paths("bash ./app.sh"),
740             
741             pidfile = './pid',
742             home = self.home_path,
743             stdin = 'stdin' if self.stdin is not None else '/dev/null',
744             stdout = 'stdout' if self.stdout else '/dev/null',
745             stderr = 'stderr' if self.stderr else '/dev/null',
746             sudo = self.sudo,
747             
748             host = self.node.hostname,
749             port = None,
750             user = self.node.slicename,
751             agent = None,
752             ident_key = self.node.ident_path,
753             server_key = self.node.server_key
754             )
755         
756         if proc.wait():
757             if self.check_bad_host(out, err):
758                 self.node.blacklist()
759             raise RuntimeError, "Failed to set up application: %s %s" % (out,err,)
760
761         self._started = True
762     
763     def recover(self):
764         # Assuming the application is running on PlanetLab,
765         # proper pidfiles should be present at the app's home path.
766         # So we mark this application as started, and check the pidfiles
767         self._started = True
768         self.checkpid()
769
770     def checkpid(self):            
771         # Get PID/PPID
772         # NOTE: wait a bit for the pidfile to be created
773         if self._started and not self._pid or not self._ppid:
774             pidtuple = rspawn.remote_check_pid(
775                 os.path.join(self.home_path,'pid'),
776                 host = self.node.hostname,
777                 port = None,
778                 user = self.node.slicename,
779                 agent = None,
780                 ident_key = self.node.ident_path,
781                 server_key = self.node.server_key
782                 )
783             
784             if pidtuple:
785                 self._pid, self._ppid = pidtuple
786     
787     def status(self):
788         self.checkpid()
789         if not self._started:
790             return AS.STATUS_NOT_STARTED
791         elif not self._pid or not self._ppid:
792             return AS.STATUS_NOT_STARTED
793         else:
794             status = rspawn.remote_status(
795                 self._pid, self._ppid,
796                 host = self.node.hostname,
797                 port = None,
798                 user = self.node.slicename,
799                 agent = None,
800                 ident_key = self.node.ident_path,
801                 server_key = self.node.server_key
802                 )
803             
804             if status is rspawn.NOT_STARTED:
805                 return AS.STATUS_NOT_STARTED
806             elif status is rspawn.RUNNING:
807                 return AS.STATUS_RUNNING
808             elif status is rspawn.FINISHED:
809                 return AS.STATUS_FINISHED
810             else:
811                 # WTF?
812                 return AS.STATUS_NOT_STARTED
813     
814     def kill(self):
815         status = self.status()
816         if status == AS.STATUS_RUNNING:
817             # kill by ppid+pid - SIGTERM first, then try SIGKILL
818             rspawn.remote_kill(
819                 self._pid, self._ppid,
820                 host = self.node.hostname,
821                 port = None,
822                 user = self.node.slicename,
823                 agent = None,
824                 ident_key = self.node.ident_path,
825                 server_key = self.node.server_key,
826                 sudo = self.sudo
827                 )
828             self._logger.info("Killed %s", self)
829
830
class NepiDependency(Dependency):
    """
    This dependency adds nepi itself to the python path,
    so that you may run testbeds within PL nodes.

    The sources it ships are an ad-hoc gzipped tarball of the locally
    importable ``nepi`` package. The tarball is a temporary file created on
    first use and shared between instances for as long as at least one of
    them keeps it referenced.
    """

    # Class attribute holding a *weak* reference to the shared NEPI tar file
    # so that instances may share it. Don't operate on the file itself, it
    # would be a mess, just use its path.
    _shared_nepi_tar = None

    def __init__(self, api = None):
        super(NepiDependency, self).__init__(api)

        # Strong reference to the tarball, keeps the temporary file alive
        self._tarball = None

        self.depends = 'python python-ipaddr python-setuptools'

        # our sources are in our ad-hoc tarball
        self.sources = self.tarball.name

        tarname = os.path.basename(self.tarball.name)

        # it's already built - just move the tarball into place
        self.build = "mv -f ${SOURCES}/%s ." % (tarname,)

        # unpack it into sources, and we're done
        self.install = "tar xzf ${BUILD}/%s -C .." % (tarname,)

    @property
    def tarball(self):
        """
        The temporary file object holding the nepi source tarball.

        Reuses the tarball published by another live instance when there is
        one; otherwise runs ``tar czf`` over the directory containing the
        ``nepi`` package and publishes the result.

        Raises RuntimeError if the tar command exits with a non-zero status.
        """
        if self._tarball is None:
            shared_ref = self._shared_nepi_tar
            shared_tar = shared_ref() if shared_ref is not None else None

            if shared_tar is None:
                # Build an ad-hoc tarball
                # Prebuilt
                import nepi
                import tempfile
                import weakref

                shared_tar = tempfile.NamedTemporaryFile(prefix='nepi-src-', suffix='.tar.gz')

                devnull_out = open("/dev/null", "w")
                devnull_in = open("/dev/null", "r")
                try:
                    proc = subprocess.Popen(
                        ["tar", "czf", shared_tar.name,
                            '-C', os.path.join(os.path.dirname(os.path.dirname(nepi.__file__)),'.'),
                            'nepi'],
                        stdout = devnull_out,
                        stdin = devnull_in)
                    failed = proc.wait()
                finally:
                    # Don't leak the /dev/null handles
                    devnull_out.close()
                    devnull_in.close()

                if failed:
                    raise RuntimeError("Failed to create nepi tarball")

                # Publish on the class (assigning through self would only
                # create an instance attribute) and as a weak reference, so
                # the cache alone never keeps the temporary file alive.
                NepiDependency._shared_nepi_tar = weakref.ref(shared_tar)

            self._tarball = shared_tar

        return self._tarball
887
class NS3Dependency(Dependency):
    """
    This dependency adds NS3 libraries to the library paths,
    so that you may run the NS3 testbed within PL nodes.
    
    You'll also need the NepiDependency.

    Both the build and the install command are single shell command lines
    of the form "( is it already working? ) || ( do the work )", so they
    are cheap to re-run on a node that is already set up.
    """

    # Defaults read by the tarball property below.
    # NOTE(review): nothing else in this class initialises them and the base
    # class is not visible from here - confirm these defaults are wanted.
    _shared_nepi_tar = None
    _tarball = None
    
    def __init__(self, api = None):
        super(NS3Dependency, self).__init__(api)
        
        self.buildDepends = 'make waf gcc gcc-c++ gccxml unzip'
        
        # We have to download the sources, untar, build...
        pybindgen_source_url = "http://yans.pl.sophia.inria.fr/trac/nepi/raw-attachment/wiki/WikiStart/pybindgen-r794.tar.gz"
        pygccxml_source_url = "http://leaseweb.dl.sourceforge.net/project/pygccxml/pygccxml/pygccxml-1.0/pygccxml-1.0.0.zip"
        ns3_source_url = "http://yans.pl.sophia.inria.fr/code/hgwebdir.cgi/ns-3.11-nepi/archive/tip.tar.gz"
        passfd_source_url = "http://yans.pl.sophia.inria.fr/code/hgwebdir.cgi/python-passfd/archive/tip.tar.gz"
        self.build =(
            " ( "
            "  cd .. && "
            "  python -c 'import pygccxml, pybindgen, passfd' && "
            "  test -f lib/ns/_core.so && "
            "  test -f lib/ns/__init__.py && "
            "  test -f lib/ns/core.py && "
            "  test -f lib/libns3-core.so && "
            "  LD_LIBRARY_PATH=lib PYTHONPATH=lib python -c 'import ns.core' "
            " ) || ( "
                # Not working, rebuild
                     # Archive SHA1 sums to check
                     "echo '7158877faff2254e6c094bf18e6b4283cac19137  pygccxml-1.0.0.zip' > archive_sums.txt && "
                     "echo 'a18c2ccffd0df517bc37e2f3a2475092517c43f2  pybindgen-src.tar.gz' >> archive_sums.txt && "
                     " ( " # check existing files
                     " sha1sum -c archive_sums.txt && "
                     " test -f passfd-src.tar.gz && "
                     " test -f ns3-src.tar.gz "
                     " ) || ( " # nope? re-download
                     # Remove the archives under the names they are actually
                     # saved as: with "wget -c", a complete-but-corrupt file
                     # left behind would never be replaced and the checksum
                     # below would keep failing.
                     " rm -f pybindgen-src.tar.gz pygccxml-1.0.0.zip passfd-src.tar.gz ns3-src.tar.gz && "
                     " wget -q -c -O pybindgen-src.tar.gz %(pybindgen_source_url)s && "
                     " wget -q -c -O pygccxml-1.0.0.zip %(pygccxml_source_url)s && "
                     " wget -q -c -O passfd-src.tar.gz %(passfd_source_url)s && "
                     " wget -q -c -O ns3-src.tar.gz %(ns3_source_url)s && "
                     " sha1sum -c archive_sums.txt " # Check SHA1 sums when applicable
                     " ) && "
                     "unzip -n pygccxml-1.0.0.zip && "
                     "mkdir -p pybindgen-src && "
                     "mkdir -p ns3-src && "
                     "mkdir -p passfd-src && "
                     "tar xzf ns3-src.tar.gz --strip-components=1 -C ns3-src && "
                     "tar xzf passfd-src.tar.gz --strip-components=1 -C passfd-src && "
                     "tar xzf pybindgen-src.tar.gz --strip-components=1 -C pybindgen-src && "
                     "rm -rf target && "    # mv doesn't like unclean targets
                     "mkdir -p target && "
                     "cd pygccxml-1.0.0 && "
                     "rm -rf unittests docs && " # pygccxml has ~100M of unit tests - excessive - docs aren't needed either
                     "python setup.py build && "
                     "python setup.py install --install-lib ${BUILD}/target && "
                     "python setup.py clean && "
                     "cd ../pybindgen-src && "
                     "export PYTHONPATH=$PYTHONPATH:${BUILD}/target && "
                     "./waf configure --prefix=${BUILD}/target -d release && "
                     "./waf && "
                     "./waf install && "
                     "./waf clean && "
                     "mv -f ${BUILD}/target/lib/python*/site-packages/pybindgen ${BUILD}/target/. && "
                     "rm -rf ${BUILD}/target/lib && "
                     "cd ../passfd-src && "
                     "python setup.py build && "
                     "python setup.py install --install-lib ${BUILD}/target && "
                     "python setup.py clean && "
                     "cd ../ns3-src && "
                     "./waf configure --prefix=${BUILD}/target --with-pybindgen=../pybindgen-src -d release --disable-examples --disable-tests && "
                     "./waf &&"
                     "./waf install && "
                     "rm -f ${BUILD}/target/lib/*.so && "
                     "cp -a ${BUILD}/ns3-src/build/release/libns3*.so ${BUILD}/target/lib && "
                     "cp -a ${BUILD}/ns3-src/build/release/bindings/python/ns ${BUILD}/target/lib &&"
                     "./waf clean "
             " )"
                     % dict(
                        pybindgen_source_url = server.shell_escape(pybindgen_source_url),
                        pygccxml_source_url = server.shell_escape(pygccxml_source_url),
                        ns3_source_url = server.shell_escape(ns3_source_url),
                        passfd_source_url = server.shell_escape(passfd_source_url),
                     ))
        
        # Just move ${BUILD}/target
        self.install = (
            " ( "
            "  cd .. && "
            "  python -c 'import pygccxml, pybindgen, passfd' && "
            "  test -f lib/ns/_core.so && "
            "  test -f lib/ns/__init__.py && "
            "  test -f lib/ns/core.py && "
            "  test -f lib/libns3-core.so && "
            "  LD_LIBRARY_PATH=lib PYTHONPATH=lib python -c 'import ns.core' "
            " ) || ( "
                # Not working, reinstall
                    "test -d ${BUILD}/target && "
                    "[[ \"x\" != \"x$(find ${BUILD}/target -mindepth 1 -print -quit)\" ]] &&"
                    "( for i in ${BUILD}/target/* ; do rm -rf ${SOURCES}/${i##*/} ; done ) && " # mv doesn't like unclean targets
                    "mv -f ${BUILD}/target/* ${SOURCES}"
            " )"
        )
        
        # Set extra environment paths
        self.env['NEPI_NS3BINDINGS'] = "${SOURCES}/lib"
        self.env['NEPI_NS3LIBRARY'] = "${SOURCES}/lib"
    
    @property
    def tarball(self):
        """
        A temporary file object holding a gzipped tarball of the ``nepi``
        package, created on first access.

        A tarball published by another live NS3Dependency instance is reused
        when there is one; otherwise ``tar czf`` is run over the directory
        containing the ``nepi`` package and the result is published.

        Raises RuntimeError if the tar command exits with a non-zero status.
        """
        if self._tarball is None:
            shared_ref = self._shared_nepi_tar
            shared_tar = shared_ref() if shared_ref is not None else None

            if shared_tar is None:
                # Build an ad-hoc tarball
                # Prebuilt
                import nepi
                import tempfile
                import weakref
                
                shared_tar = tempfile.NamedTemporaryFile(prefix='nepi-src-', suffix='.tar.gz')
                
                devnull_out = open("/dev/null", "w")
                devnull_in = open("/dev/null", "r")
                try:
                    proc = subprocess.Popen(
                        ["tar", "czf", shared_tar.name,
                            '-C', os.path.join(os.path.dirname(os.path.dirname(nepi.__file__)),'.'),
                            'nepi'],
                        stdout = devnull_out,
                        stdin = devnull_in)
                    failed = proc.wait()
                finally:
                    # Don't leak the /dev/null handles
                    devnull_out.close()
                    devnull_in.close()

                if failed:
                    raise RuntimeError("Failed to create nepi tarball")
                
                # Publish on the class (assigning through self would only
                # create an instance attribute) and as a weak reference, so
                # the cache alone never keeps the temporary file alive.
                NS3Dependency._shared_nepi_tar = weakref.ref(shared_tar)

            self._tarball = shared_tar
                
        return self._tarball
1024
class YumDependency(Dependency):
    """
    This dependency is an internal helper class used to
    efficiently distribute yum-downloaded rpms.
    
    It temporarily sets the yum cache as persistent in the
    build master, and installs all the required packages.
    
    The rpm packages left in the yum cache are gathered and
    distributed by the underlying Dependency in an efficient
    manner. Build slaves will then install those rpms back in
    the cache before issuing the install command.
    
    When packages have been installed already, nothing but an
    empty tar is distributed.

    The ``build`` and ``install`` attributes are read-only here: both are
    derived from ``depends`` on every access, and assignments to them are
    silently ignored.
    """
    
    # NOTE(review): not referenced by any code visible in this class; kept
    # because it is part of the class' existing attributes.
    _shared_nepi_tar = None

    # Output that marks the host itself (rather than the request) as broken.
    # The parentheses around repomd.xml are escaped because they are literal
    # text in the message, not a regex group.
    _BAD_HOST_RE = re.compile(
        r'(?:'
        r'The GPG keys listed for the ".*" repository are already installed but they are not correct for this package'
        r'|Error: Cannot retrieve repository metadata \(repomd\.xml\) for repository: .*[.] Please verify its path and try again'
        r'|Error: disk I/O error'
        r'|MASTER NODE UNREACHABLE'
        r')',
        re.I)

    def _canonical_depends(self):
        """
        Return the package names from ``depends``, sorted and joined by
        single spaces.

        Splitting on any run of whitespace keeps empty tokens out, so lists
        that differ only in spacing yield the same string. A missing or
        empty ``depends`` yields the empty string.
        """
        return ' '.join(sorted((self.depends or "").split()))
    
    def _build_get(self):
        """
        Shell command for the build master: switches yum's keepcache on,
        installs the packages, packs the cached rpms into
        ${BUILD}/packages.tar, then switches keepcache off again and cleans
        the package cache.
        """
        # download rpms and pack into a tar archive
        return (
            "sudo -S nice yum -y makecache && "
            "sudo -S sed -i -r 's/keepcache *= *0/keepcache=1/' /etc/yum.conf && "
            " ( ( "
                "sudo -S nice yum -y install %s ; "
                "rm -f ${BUILD}/packages.tar ; "
                "tar -C /var/cache/yum -rf ${BUILD}/packages.tar $(cd /var/cache/yum ; find -iname '*.rpm')"
            " ) || /bin/true ) && "
            "sudo -S sed -i -r 's/keepcache *= *1/keepcache=0/' /etc/yum.conf && "
            "( sudo -S nice yum -y clean packages || /bin/true ) "
        ) % ( self._canonical_depends(), )
    def _build_set(self, value):
        # ignore - the command is always derived from depends
        return
    build = property(_build_get, _build_set)
    
    def _install_get(self):
        """
        Shell command for every node: unpacks packages.tar into the yum
        cache, installs the packages and cleans the package cache.
        """
        # unpack cached rpms into yum cache, install, and cleanup
        return (
            "sudo -S tar -k --keep-newer-files -C /var/cache/yum -xf packages.tar && "
            "sudo -S nice yum -y install %s && "
            "( sudo -S nice yum -y clean packages || /bin/true ) "
        ) % ( self._canonical_depends(), )
    def _install_set(self, value):
        # ignore - the command is always derived from depends
        return
    install = property(_install_get, _install_set)
        
    def check_bad_host(self, out, err):
        """
        Tell whether command output shows one of the known bad-host errors.

        ``out`` and ``err`` are the captured output strings. Returns a regex
        match object (truthy) if either contains one of the known messages,
        case-insensitively, and None otherwise.
        """
        badre = self._BAD_HOST_RE
        return badre.search(out) or badre.search(err)