From: Thierry Parmentelat Date: Thu, 11 Feb 2010 12:12:37 +0000 (+0000) Subject: fix log_call deadlock X-Git-Tag: NodeManager-2.0-2~2 X-Git-Url: http://git.onelab.eu/?p=nodemanager.git;a=commitdiff_plain;h=33f472e3e3589569f0272f7f53d07b011d2a5490 fix log_call deadlock initscript cleaned up - now supports 'service nm restartverbose' run vuseradd and vuserdel through bash -x (should be in verbose only but..) --- diff --git a/initscripts/nm b/initscripts/nm index 25b3c26..cb58689 100755 --- a/initscripts/nm +++ b/initscripts/nm @@ -12,22 +12,23 @@ # Source function library. . /etc/init.d/functions -if [ -f /etc/sysconfig/NodeManager ]; then - . /etc/sysconfig/NodeManager -fi +[ -f /etc/sysconfig/NodeManager ] && . /etc/sysconfig/NodeManager -nm=${NM-"python /usr/share/NodeManager/nm.py"} -prog="Node Manager" options=${OPTIONS-"-d -s"} restartoptions=${RESTARTOPTIONS-"-d"} +# turn on verbosity +verboseoptions=${DEBUGOPTIONS-"-v -d"} # debug mode is interactive, and has faster period -debugoptions=${DEBUGOPTIONS-"-v -p 60 -r 31"} +debugoptions=${DEBUGOPTIONS-"-v -p 30 -r 15"} + +nm=${NM-"python /usr/share/NodeManager/nm.py"} +prog="Node Manager" pidfile=${PIDFILE-/var/run/nm.pid} lockfile=${LOCKFILE-/var/lock/subsys/nm} + RETVAL=0 -do_start() -{ +function start() { echo -n $"Starting $prog: " daemon --check=nm $nm "$@" RETVAL=$? @@ -36,13 +37,7 @@ do_start() return $RETVAL } -start() -{ - do_start $options -} - -stop() -{ +function stop() { echo -n $"Stopping $prog: " killproc nm RETVAL=$? @@ -52,7 +47,7 @@ stop() case "$1" in start) - start + start $options ;; stop) stop @@ -62,23 +57,29 @@ case "$1" in RETVAL=$? ;; restart|reload) + shift stop - do_start $restartoptions + start $restartoptions "$@" + ;; + condrestart) + shift + [ -f ${pidfile} ] && { stop; start $restartoptions "$@"; } + ;; + restartverbose) + shift + stop + $nm $verboseoptions "$@" ;; restartdebug) shift stop - [[ -z "$@" ]] && echo "Running interactively .." + echo "Restarting with $debugoptions $@ .." $nm $debugoptions "$@" ;; - condrestart) - if [ -f ${pidfile} ] ; then - restart - fi - ;; *) - echo $"Usage: $0 {start|stop|restart|condrestart|status|restartdebug [-d]}" - exit 1 + echo $"Usage: $0 {start|stop|status|restart|condrestart|restartdebug [-d]}" + exit 1 + ;; esac -exit 0 +exit $RETVAL diff --git a/logger.py b/logger.py index dc65bc2..c4f6ce6 100644 --- a/logger.py +++ b/logger.py @@ -41,44 +41,6 @@ def log(msg,level=LOG_NODE): sys.stderr.write(msg) sys.stderr.flush() -#################### child processes -# avoid waiting until the process returns; -# that makes debugging of hanging children hard - -# time out in seconds - avoid hanging subprocesses - default is 5 minutes -default_timeout_minutes=5 - -def log_call(command,timeout=default_timeout_minutes*60,poll=0.3): - log('log_call: running command %s' % ' '.join(command)) - verbose('log_call: timeout %r s' % timeout) - verbose('log_call: poll %r s' % poll) - trigger=time.time()+timeout - try: - child = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True) - while True: - # see if anything can be read within the poll interval - (r,w,x)=select.select([child.stdout,child.stderr],[],[],poll) - # read and log it - for fd in r: - input=fd.read() - if input: log(input) - # is process over ? - returncode=child.poll() - # yes - if returncode != None: - # child is done and return 0 - if returncode == 0: - verbose('log_call: command completed %s' % ' '.join(command)) - break - # child has failed - else: - raise Exception("log_call: failed with returncode %d"%returncode) - # no : still within timeout ? - if time.time() >= trigger: - child.terminate() - raise Exception("log_call: terminated command - exceeded timeout %d s"%timeout) - except: log_exc('failed to run command %s' % ' '.join(command)) - def log_exc(msg="",name=None): """Log the traceback resulting from an exception.""" if name: @@ -86,6 +48,7 @@ def log_exc(msg="",name=None): else: log("EXCEPTION caught <%s> \n %s" %(msg, traceback.format_exc())) +########## snapshot data to a file # for some reason the various modules are still triggered even when the # data from PLC cannot be reached # we show this message instead of the exception stack instead in this case @@ -107,3 +70,59 @@ def log_data_in_file (data, file, message=""): def log_slivers (data): log_data_in_file (data, LOG_SLIVERS, "raw GetSlivers") + +#################### child processes +# avoid waiting until the process returns; +# that makes debugging of hanging children hard + +class Buffer: + def __init__ (self,message='log_call: '): + self.buffer='' + self.message=message + + def add (self,c): + self.buffer += c + if c=='\n': self.flush() + + def flush (self): + if self.buffer: + log (self.message + self.buffer) + self.buffer='' + +# time out in seconds - avoid hanging subprocesses - default is 5 minutes +default_timeout_minutes=5 + +def log_call(command,timeout=default_timeout_minutes*60,poll=1): + message=" ".join(command) + log("log_call: running command %s" % message) + verbose("log_call: timeout=%r s" % timeout) + verbose("log_call: poll=%r s" % poll) + trigger=time.time()+timeout + try: + child = subprocess.Popen(command, bufsize=1, + stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True) + buffer = Buffer() + while True: + # see if anything can be read within the poll interval + (r,w,x)=select.select([child.stdout],[],[],poll) + if r: buffer.add(child.stdout.read(1)) + # is process over ? + returncode=child.poll() + # yes + if returncode != None: + buffer.flush() + # child is done and return 0 + if returncode == 0: + log("log_call: command completed (%s)" % message) + break + # child has failed + else: + log("log_call: command return=%d (%s)" %(returncode,message)) + raise Exception("log_call: failed with returncode %d"%returncode) + # no : still within timeout ? + if time.time() >= trigger: + buffer.flush() + child.terminate() + raise Exception("log_call: terminated command - exceeded timeout %d s"%timeout) + except: log_exc("failed to run command %s" % message) + diff --git a/sliver_vs.py b/sliver_vs.py index 85cbac2..cb80f7b 100644 --- a/sliver_vs.py +++ b/sliver_vs.py @@ -106,9 +106,7 @@ class Sliver_VS(accounts.Account, vserver.VServer): personality="linux64" return personality - # temporary : run vuseradd verbosely - mostly a check for log_call to work properly, - # since the issue with the kernel seems to have been spotted -# logger.log_call(['/usr/sbin/vuseradd', '-t', vref, name, ], timeout=10*60) +# logger.log_call(['/usr/sbin/vuseradd', '-t', vref, name, ], timeout=15*60) logger.log_call(['/bin/bash','-x','/usr/sbin/vuseradd', '-t', vref, name, ], timeout=15*60) # export slicename to the slice in /etc/slicename file('/vservers/%s/etc/slicename' % name, 'w').write(name) @@ -119,7 +117,9 @@ class Sliver_VS(accounts.Account, vserver.VServer): logger.log('sliver_vs: %s: set personality to %s'%(name,personality(arch))) @staticmethod - def destroy(name): logger.log_call(['/usr/sbin/vuserdel', name, ]) + def destroy(name): +# logger.log_call(['/usr/sbin/vuserdel', name, ]) + logger.log_call(['/bin/bash','-x','/usr/sbin/vuserdel', name, ]) def configure(self, rec): new_rspec = rec['_rspec']