# -*- coding: iso-8859-1 -*-
# $Id: retriever.py,v 1.17 2005/02/13 08:28:01 mfx Exp $
#
# Usage: python retriever.py <file with URLs to fetch> [<# of
#          concurrent connections>]
import sys, threading, Queue
import pycurl  # used below (setopt constants, version banner); import was missing from the paste

# We should ignore SIGPIPE when using pycurl.NOSIGNAL - see
# the libcurl tutorial for more info.
# NOTE(review): the bare `from signal import ...` left `signal.signal(...)`
# referencing an unimported module; import the module itself and guard the
# whole thing, since the signal module (or SIGPIPE) is absent on some
# platforms (e.g. Windows) and this setup is best-effort.
try:
    import signal
    from signal import SIGPIPE, SIG_IGN
    signal.signal(signal.SIGPIPE, signal.SIG_IGN)
except ImportError:
    pass
27 if sys.argv[1] == "-":
28 urls = sys.stdin.readlines()
30 urls = open(sys.argv[1]).readlines()
31 if len(sys.argv) >= 3:
32 num_conn = int(sys.argv[2])
34 print "Usage: %s <file with URLs to fetch> [<# of concurrent connections>]" % sys.argv[0]
38 # Make a queue with (url, filename) tuples
42 if not url or url[0] == "#":
44 filename = "doc_%03d.dat" % (len(queue.queue) + 1)
45 queue.put((url, filename))
49 assert queue.queue, "no URLs given"
50 num_urls = len(queue.queue)
51 num_conn = min(num_conn, num_urls)
52 assert 1 <= num_conn <= 10000, "invalid number of concurrent connections"
53 print "PycURL %s (compiled against 0x%x)" % (pycurl.version, pycurl.COMPILE_LIBCURL_VERSION_NUM)
54 print "----- Getting", num_urls, "URLs using", num_conn, "connections -----"
class WorkerThread(threading.Thread):
    """Worker that repeatedly pulls (url, filename) jobs off the shared
    queue and downloads each URL into its file with pycurl.

    The thread exits when the queue is empty (Queue.Empty from
    get_nowait). The queue is fully populated before any worker starts,
    so get_nowait is a safe termination test.
    """

    def __init__(self, queue):
        threading.Thread.__init__(self)
        # The paste dropped this assignment, but run() reads self.queue.
        self.queue = queue

    def run(self):
        while 1:
            try:
                url, filename = self.queue.get_nowait()
            except Queue.Empty:
                raise SystemExit  # no work left: terminate this thread
            fp = open(filename, "wb")
            curl = pycurl.Curl()  # was missing before the setopt calls
            curl.setopt(pycurl.URL, url)
            curl.setopt(pycurl.FOLLOWLOCATION, 1)
            curl.setopt(pycurl.MAXREDIRS, 5)
            curl.setopt(pycurl.CONNECTTIMEOUT, 30)
            curl.setopt(pycurl.TIMEOUT, 300)
            # NOSIGNAL stops libcurl from using signals for timeouts -
            # required in multi-threaded programs (see SIGPIPE note at top).
            curl.setopt(pycurl.NOSIGNAL, 1)
            curl.setopt(pycurl.WRITEDATA, fp)
            try:
                curl.perform()
            except:
                # Report the failure but keep the worker alive for the
                # remaining URLs (best-effort batch download).
                import traceback
                traceback.print_exc(file=sys.stderr)
            # Always release the handle and close the file - the original
            # paste leaked fp on every iteration.
            curl.close()
            fp.close()
            sys.stdout.write(".")
            sys.stdout.flush()
# Start a bunch of worker threads - one per allowed connection.
# The paste lost the list initialization, t.start(), the append, and the
# join body; without them the L43 loop iterated an undefined name.
threads = []
for dummy in range(num_conn):
    t = WorkerThread(queue)
    t.start()
    threads.append(t)

# Wait for all threads to finish before the script exits.
for thread in threads:
    thread.join()