X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=trunk%2Fpycurl%2Fexamples%2Fretriever.py;fp=trunk%2Fpycurl%2Fexamples%2Fretriever.py;h=2c91d07f4ec18ba0dbf3bbbe0192414fe68f7168;hb=5a4c1b1278ffa01e630fde47f7c54888ed20a576;hp=0000000000000000000000000000000000000000;hpb=cee5ab52df1c9f38b6eaff2dd354cb22f59028c7;p=plcapi.git

diff --git a/trunk/pycurl/examples/retriever.py b/trunk/pycurl/examples/retriever.py
new file mode 100644
index 0000000..2c91d07
--- /dev/null
+++ b/trunk/pycurl/examples/retriever.py
@@ -0,0 +1,99 @@
+#! /usr/bin/env python
+# -*- coding: iso-8859-1 -*-
+# vi:ts=4:et
+# $Id$
+
+#
+# Usage: python retriever.py <file with URLs to fetch> [<# of
+#          concurrent connections>]
+#
+
+import sys, threading, Queue
+import pycurl
+
+# We should ignore SIGPIPE when using pycurl.NOSIGNAL - see
+# the libcurl tutorial for more info.
+try:
+    import signal
+    from signal import SIGPIPE, SIG_IGN
+    signal.signal(signal.SIGPIPE, signal.SIG_IGN)
+except ImportError:
+    pass
+
+
+# Get args
+num_conn = 10
+try:
+    if sys.argv[1] == "-":
+        urls = sys.stdin.readlines()
+    else:
+        urls = open(sys.argv[1]).readlines()
+    if len(sys.argv) >= 3:
+        num_conn = int(sys.argv[2])
+except:
+    print "Usage: %s <file with URLs to fetch> [<# of concurrent connections>]" % sys.argv[0]
+    raise SystemExit
+
+
+# Make a queue with (url, filename) tuples
+queue = Queue.Queue()
+for url in urls:
+    url = url.strip()
+    if not url or url[0] == "#":
+        continue
+    filename = "doc_%03d.dat" % (len(queue.queue) + 1)
+    queue.put((url, filename))
+
+
+# Check args
+assert queue.queue, "no URLs given"
+num_urls = len(queue.queue)
+num_conn = min(num_conn, num_urls)
+assert 1 <= num_conn <= 10000, "invalid number of concurrent connections"
+print "PycURL %s (compiled against 0x%x)" % (pycurl.version, pycurl.COMPILE_LIBCURL_VERSION_NUM)
+print "----- Getting", num_urls, "URLs using", num_conn, "connections -----"
+
+
+class WorkerThread(threading.Thread):
+    def __init__(self, queue):
+        threading.Thread.__init__(self)
+        self.queue = queue
+
+    def run(self):
+        while 1:
+            try:
+                url, filename = self.queue.get_nowait()
+            except Queue.Empty:
+                raise SystemExit
+            fp = open(filename, "wb")
+            curl = pycurl.Curl()
+            curl.setopt(pycurl.URL, url)
+            curl.setopt(pycurl.FOLLOWLOCATION, 1)
+            curl.setopt(pycurl.MAXREDIRS, 5)
+            curl.setopt(pycurl.CONNECTTIMEOUT, 30)
+            curl.setopt(pycurl.TIMEOUT, 300)
+            curl.setopt(pycurl.NOSIGNAL, 1)
+            curl.setopt(pycurl.WRITEDATA, fp)
+            try:
+                curl.perform()
+            except:
+                import traceback
+                traceback.print_exc(file=sys.stderr)
+                sys.stderr.flush()
+            curl.close()
+            fp.close()
+            sys.stdout.write(".")
+            sys.stdout.flush()
+
+
+# Start a bunch of threads
+threads = []
+for dummy in range(num_conn):
+    t = WorkerThread(queue)
+    t.start()
+    threads.append(t)
+
+
+# Wait for all threads to finish
+for thread in threads:
+    thread.join()
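
Usage note: as the code above shows, the script takes a file containing one URL per line (blank lines and lines starting with "#" are skipped), or "-" to read the URL list from stdin, plus an optional connection count (default 10, capped at the number of URLs). A hypothetical invocation, where urls.txt is a placeholder filename:

    python retriever.py urls.txt 5

This fetches the listed URLs over 5 worker threads, saving each response body as doc_001.dat, doc_002.dat, and so on, and printing a "." per completed transfer.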
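
The example targets Python 2 (the Queue module, print statements). For reference, a minimal sketch of the same worker-thread pattern under Python 3 (assuming a current pycurl, whose option constants are unchanged; the URL list and connection count below are placeholder values, not part of the original):

    #! /usr/bin/env python3
    # Sketch only: the retriever pattern rewritten for Python 3.
    import queue
    import sys
    import threading

    import pycurl

    urls = ["http://example.com/", "http://example.org/"]  # placeholder list
    num_conn = 2                                           # placeholder count

    # Build a queue of (url, output filename) pairs, as in the original.
    jobs = queue.Queue()
    for i, url in enumerate(urls, 1):
        jobs.put((url, "doc_%03d.dat" % i))

    class WorkerThread(threading.Thread):
        def __init__(self, jobs):
            threading.Thread.__init__(self)
            self.jobs = jobs

        def run(self):
            while True:
                try:
                    url, filename = self.jobs.get_nowait()
                except queue.Empty:
                    return  # queue drained; this worker exits
                with open(filename, "wb") as fp:
                    curl = pycurl.Curl()
                    curl.setopt(pycurl.URL, url)
                    curl.setopt(pycurl.FOLLOWLOCATION, 1)
                    curl.setopt(pycurl.MAXREDIRS, 5)
                    curl.setopt(pycurl.CONNECTTIMEOUT, 30)
                    curl.setopt(pycurl.TIMEOUT, 300)
                    curl.setopt(pycurl.NOSIGNAL, 1)
                    curl.setopt(pycurl.WRITEDATA, fp)
                    try:
                        curl.perform()
                    except pycurl.error:
                        import traceback
                        traceback.print_exc(file=sys.stderr)
                    finally:
                        curl.close()
                sys.stdout.write(".")
                sys.stdout.flush()

    threads = [WorkerThread(jobs) for _ in range(num_conn)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

The shutdown logic is the same as in the original: each worker pulls jobs with get_nowait() and exits when the queue raises Empty, so no explicit stop signal is needed.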