#! /usr/bin/env python
# -*- coding: iso-8859-1 -*-
# vi:ts=4:et
# $Id: retriever.py,v 1.17 2005/02/13 08:28:01 mfx Exp $

#
# Usage: python retriever.py <file with URLs to fetch> [<# of
#          concurrent connections>]
#
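# Example (urls.txt is a hypothetical input file; pass "-" to read the
# URL list from stdin instead):
#
#   $ cat urls.txt
#   http://example.com/
#   # lines starting with "#" and blank lines are skipped
#   http://example.org/robots.txt
#   $ python retriever.py urls.txt 3
#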

import sys, threading, Queue
import pycurl

# We should ignore SIGPIPE when using pycurl.NOSIGNAL - see
# the libcurl tutorial for more info.
try:
    import signal
    from signal import SIGPIPE, SIG_IGN
    signal.signal(signal.SIGPIPE, signal.SIG_IGN)
except ImportError:
    pass
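# (The "from signal import SIGPIPE" line doubles as a platform check:
# on systems that have no SIGPIPE, such as Windows, it raises
# ImportError and the handler is simply not installed.)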


# Get args
num_conn = 10
try:
    if sys.argv[1] == "-":
        urls = sys.stdin.readlines()
    else:
        urls = open(sys.argv[1]).readlines()
    if len(sys.argv) >= 3:
        num_conn = int(sys.argv[2])
except:
    print "Usage: %s <file with URLs to fetch> [<# of concurrent connections>]" % sys.argv[0]
    raise SystemExit
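# (The bare "except:" above is deliberate: a missing argument, an
# unreadable file or a non-numeric connection count all fall through
# to the usage message.)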


# Make a queue with (url, filename) tuples
queue = Queue.Queue()
for url in urls:
    url = url.strip()
    if not url or url[0] == "#":
        continue
    filename = "doc_%03d.dat" % (len(queue.queue) + 1)
    queue.put((url, filename))
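# (len(queue.queue) peeks at the Queue's internal deque to number the
# output files; that is only safe here because the worker threads have
# not been started yet.)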


# Check args
assert queue.queue, "no URLs given"
num_urls = len(queue.queue)
num_conn = min(num_conn, num_urls)
assert 1 <= num_conn <= 10000, "invalid number of concurrent connections"
print "PycURL %s (compiled against 0x%x)" % (pycurl.version, pycurl.COMPILE_LIBCURL_VERSION_NUM)
print "----- Getting", num_urls, "URLs using", num_conn, "connections -----"


class WorkerThread(threading.Thread):
    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.queue = queue

    def run(self):
        while 1:
            try:
                url, filename = self.queue.get_nowait()
            except Queue.Empty:
                # No URLs left - terminate this thread.
                raise SystemExit
            fp = open(filename, "wb")
            # A fresh Curl object per URL; reusing one handle per worker
            # would let libcurl keep connections to the same host alive.
            curl = pycurl.Curl()
            curl.setopt(pycurl.URL, url)
            curl.setopt(pycurl.FOLLOWLOCATION, 1)   # follow redirects,
            curl.setopt(pycurl.MAXREDIRS, 5)        # but no more than 5 of them
            curl.setopt(pycurl.CONNECTTIMEOUT, 30)  # 30 s to connect,
            curl.setopt(pycurl.TIMEOUT, 300)        # 300 s for the whole transfer
            curl.setopt(pycurl.NOSIGNAL, 1)         # needed for multi-threaded use
            curl.setopt(pycurl.WRITEDATA, fp)       # write response body to fp
            try:
                curl.perform()
            except:
                # Report the failure but keep going; any partially
                # written file is left on disk.
                import traceback
                traceback.print_exc(file=sys.stderr)
                sys.stderr.flush()
            curl.close()
            fp.close()
            # One dot per finished URL (successful or not) as progress.
            sys.stdout.write(".")
            sys.stdout.flush()


# Start a bunch of threads
threads = []
for dummy in range(num_conn):
    t = WorkerThread(queue)
    t.start()
    threads.append(t)


# Wait for all threads to finish
for thread in threads:
    thread.join()
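
# When the script finishes, each URL's data sits in doc_001.dat,
# doc_002.dat, ... (numbered in input order), and one dot has been
# printed per finished transfer.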