# -*- coding: iso-8859-1 -*-
# $Id: retriever-multi.py,v 1.25 2005/02/13 08:28:01 mfx Exp $

# Usage: python retriever-multi.py <file with URLs to fetch> [<# of
#        concurrent connections>]

import sys

import pycurl

# We should ignore SIGPIPE when using pycurl.NOSIGNAL - see
# the libcurl tutorial for more info.  Without this, a peer closing a
# connection mid-transfer can kill the whole process with SIGPIPE.
# NOTE(review): the original listing imported only the SIGPIPE/SIG_IGN
# names but then referenced them as `signal.SIGPIPE` / `signal.SIG_IGN`;
# importing the module itself makes that reference valid.
try:
    import signal
    from signal import SIGPIPE, SIG_IGN
    signal.signal(signal.SIGPIPE, signal.SIG_IGN)
except ImportError:
    # Platforms without SIGPIPE (e.g. Windows) simply skip the handler.
    pass
27 if sys.argv[1] == "-":
28 urls = sys.stdin.readlines()
30 urls = open(sys.argv[1]).readlines()
31 if len(sys.argv) >= 3:
32 num_conn = int(sys.argv[2])
34 print "Usage: %s <file with URLs to fetch> [<# of concurrent connections>]" % sys.argv[0]
38 # Make a queue with (url, filename) tuples
42 if not url or url[0] == "#":
44 filename = "doc_%03d.dat" % (len(queue) + 1)
45 queue.append((url, filename))
49 assert queue, "no URLs given"
51 num_conn = min(num_conn, num_urls)
52 assert 1 <= num_conn <= 10000, "invalid number of concurrent connections"
53 print "PycURL %s (compiled against 0x%x)" % (pycurl.version, pycurl.COMPILE_LIBCURL_VERSION_NUM)
54 print "----- Getting", num_urls, "URLs using", num_conn, "connections -----"
# Pre-allocate a list of curl objects: one easy handle per connection,
# all attached to a single multi handle.  The handle list is stashed on
# the multi object so the main loop can recycle handles via a freelist.
m = pycurl.CurlMulti()
m.handles = []
for i in range(num_conn):
    c = pycurl.Curl()
    c.fp = None  # output file object; set when a transfer is assigned
    c.setopt(pycurl.FOLLOWLOCATION, 1)  # follow HTTP redirects...
    c.setopt(pycurl.MAXREDIRS, 5)       # ...but at most 5 of them
    c.setopt(pycurl.CONNECTTIMEOUT, 30) # seconds to establish a connection
    c.setopt(pycurl.TIMEOUT, 300)       # seconds for the whole transfer
    # NOSIGNAL stops libcurl from using signals for timeouts, which is
    # required for thread safety and pairs with the SIGPIPE handler above.
    c.setopt(pycurl.NOSIGNAL, 1)
    m.handles.append(c)
72 freelist = m.handles[:]
74 while num_processed < num_urls:
75 # If there is an url to process and a free curl object, add to multi stack
76 while queue and freelist:
77 url, filename = queue.pop(0)
79 c.fp = open(filename, "wb")
80 c.setopt(pycurl.URL, url)
81 c.setopt(pycurl.WRITEDATA, c.fp)
86 # Run the internal curl state machine for the multi stack
88 ret, num_handles = m.perform()
89 if ret != pycurl.E_CALL_MULTI_PERFORM:
91 # Check for curl objects which have terminated, and add them to the freelist
93 num_q, ok_list, err_list = m.info_read()
98 print "Success:", c.filename, c.url, c.getinfo(pycurl.EFFECTIVE_URL)
100 for c, errno, errmsg in err_list:
104 print "Failed: ", c.filename, c.url, errno, errmsg
106 num_processed = num_processed + len(ok_list) + len(err_list)
109 # Currently no more I/O is pending, could do something in the meantime
110 # (display a progress bar, etc.).
111 # We just call select() to sleep until some more data is available.