# -*- coding: iso-8859-1 -*-
# $Id: retriever.py,v 1.17 2005/02/13 08:28:01 mfx Exp $
#
# Usage: python retriever.py <file with URLs to fetch> [<# of
#          concurrent connections>]
import sys, threading, Queue
import pycurl  # used below (setopt constants, version banner); import was missing from the paste

# We should ignore SIGPIPE when using pycurl.NOSIGNAL - see
# the libcurl tutorial for more info.
# NOTE(review): the bare `from signal import ...` left `signal.signal(...)`
# referencing an unimported module; import the module itself and guard the
# whole thing, since the signal module (or SIGPIPE) is absent on some
# platforms (e.g. Windows) and this setup is best-effort.
try:
    import signal
    from signal import SIGPIPE, SIG_IGN
    signal.signal(signal.SIGPIPE, signal.SIG_IGN)
except ImportError:
    pass
27 if sys.argv[1] == "-":
28 urls = sys.stdin.readlines()
30 urls = open(sys.argv[1]).readlines()
31 if len(sys.argv) >= 3:
32 num_conn = int(sys.argv[2])
34 print "Usage: %s <file with URLs to fetch> [<# of concurrent connections>]" % sys.argv[0]
38 # Make a queue with (url, filename) tuples
42 if not url or url[0] == "#":
44 filename = "doc_%03d.dat" % (len(queue.queue) + 1)
45 queue.put((url, filename))
49 assert queue.queue, "no URLs given"
50 num_urls = len(queue.queue)
51 num_conn = min(num_conn, num_urls)
52 assert 1 <= num_conn <= 10000, "invalid number of concurrent connections"
53 print "PycURL %s (compiled against 0x%x)" % (pycurl.version, pycurl.COMPILE_LIBCURL_VERSION_NUM)
54 print "----- Getting", num_urls, "URLs using", num_conn, "connections -----"
class WorkerThread(threading.Thread):
    """Worker that repeatedly pulls (url, filename) jobs off the shared
    queue and downloads each URL into its file with pycurl.

    The thread exits when the queue is empty (Queue.Empty from
    get_nowait). The queue is fully populated before any worker starts,
    so get_nowait is a safe termination test.
    """

    def __init__(self, queue):
        threading.Thread.__init__(self)
        # The paste dropped this assignment, but run() reads self.queue.
        self.queue = queue

    def run(self):
        while 1:
            try:
                url, filename = self.queue.get_nowait()
            except Queue.Empty:
                raise SystemExit  # no work left: terminate this thread
            fp = open(filename, "wb")
            curl = pycurl.Curl()  # was missing before the setopt calls
            curl.setopt(pycurl.URL, url)
            curl.setopt(pycurl.FOLLOWLOCATION, 1)
            curl.setopt(pycurl.MAXREDIRS, 5)
            curl.setopt(pycurl.CONNECTTIMEOUT, 30)
            curl.setopt(pycurl.TIMEOUT, 300)
            # NOSIGNAL stops libcurl from using signals for timeouts -
            # required in multi-threaded programs (see SIGPIPE note at top).
            curl.setopt(pycurl.NOSIGNAL, 1)
            curl.setopt(pycurl.WRITEDATA, fp)
            try:
                curl.perform()
            except:
                # Report the failure but keep the worker alive for the
                # remaining URLs (best-effort batch download).
                import traceback
                traceback.print_exc(file=sys.stderr)
            # Always release the handle and close the file - the original
            # paste leaked fp on every iteration.
            curl.close()
            fp.close()
            sys.stdout.write(".")
            sys.stdout.flush()
# Start a bunch of worker threads - one per allowed connection.
# The paste lost the list initialization, t.start(), the append, and the
# join body; without them the L43 loop iterated an undefined name.
threads = []
for dummy in range(num_conn):
    t = WorkerThread(queue)
    t.start()
    threads.append(t)

# Wait for all threads to finish before the script exits.
for thread in threads:
    thread.join()