trunk/pycurl/examples/retriever-multi.py

   1 #! /usr/bin/env python
   2 # -*- coding: iso-8859-1 -*-
   3 # vi:ts=4:et
   4 # $Id$
   5
   6 #
   7 # Usage: python retriever-multi.py <file with URLs to fetch> [<# of
   8 #          concurrent connections>]
   9 #
  10
  11 import sys
  12 import pycurl
  13
  14 # We should ignore SIGPIPE when using pycurl.NOSIGNAL - see
  15 # the libcurl tutorial for more info.
  16 try:
  17     import signal
  18     from signal import SIGPIPE, SIG_IGN
  19     signal.signal(signal.SIGPIPE, signal.SIG_IGN)
  20 except ImportError:
  21     pass
  22
  23
  24 # Get args
  25 num_conn = 10
  26 try:
  27     if sys.argv[1] == "-":
  28         urls = sys.stdin.readlines()
  29     else:
  30         urls = open(sys.argv[1]).readlines()
  31     if len(sys.argv) >= 3:
  32         num_conn = int(sys.argv[2])
  33 except:
  34     print "Usage: %s <file with URLs to fetch> [<# of concurrent connections>]" % sys.argv[0]
  35     raise SystemExit
  36
  37
  38 # Make a queue with (url, filename) tuples
  39 queue = []
  40 for url in urls:
  41     url = url.strip()
  42     if not url or url[0] == "#":
  43         continue
  44     filename = "doc_%03d.dat" % (len(queue) + 1)
  45     queue.append((url, filename))
  46
  47
  48 # Check args
  49 assert queue, "no URLs given"
  50 num_urls = len(queue)
  51 num_conn = min(num_conn, num_urls)
  52 assert 1 <= num_conn <= 10000, "invalid number of concurrent connections"
  53 print "PycURL %s (compiled against 0x%x)" % (pycurl.version, pycurl.COMPILE_LIBCURL_VERSION_NUM)
  54 print "----- Getting", num_urls, "URLs using", num_conn, "connections -----"
  55
  56
  57 # Pre-allocate a list of curl objects
  58 m = pycurl.CurlMulti()
  59 m.handles = []
  60 for i in range(num_conn):
  61     c = pycurl.Curl()
  62     c.fp = None
  63     c.setopt(pycurl.FOLLOWLOCATION, 1)
  64     c.setopt(pycurl.MAXREDIRS, 5)
  65     c.setopt(pycurl.CONNECTTIMEOUT, 30)
  66     c.setopt(pycurl.TIMEOUT, 300)
  67     c.setopt(pycurl.NOSIGNAL, 1)
  68     m.handles.append(c)
  69
  70
  71 # Main loop
  72 freelist = m.handles[:]
  73 num_processed = 0
  74 while num_processed < num_urls:
  75     # If there is an url to process and a free curl object, add to multi stack
  76     while queue and freelist:
  77         url, filename = queue.pop(0)
  78         c = freelist.pop()
  79         c.fp = open(filename, "wb")
  80         c.setopt(pycurl.URL, url)
  81         c.setopt(pycurl.WRITEDATA, c.fp)
  82         m.add_handle(c)
  83         # store some info
  84         c.filename = filename
  85         c.url = url
  86     # Run the internal curl state machine for the multi stack
  87     while 1:
  88         ret, num_handles = m.perform()
  89         if ret != pycurl.E_CALL_MULTI_PERFORM:
  90             break
  91     # Check for curl objects which have terminated, and add them to the freelist
  92     while 1:
  93         num_q, ok_list, err_list = m.info_read()
  94         for c in ok_list:
  95             c.fp.close()
  96             c.fp = None
  97             m.remove_handle(c)
  98             print "Success:", c.filename, c.url, c.getinfo(pycurl.EFFECTIVE_URL)
  99             freelist.append(c)
 100         for c, errno, errmsg in err_list:
 101             c.fp.close()
 102             c.fp = None
 103             m.remove_handle(c)
 104             print "Failed: ", c.filename, c.url, errno, errmsg
 105             freelist.append(c)
 106         num_processed = num_processed + len(ok_list) + len(err_list)
 107         if num_q == 0:
 108             break
 109     # Currently no more I/O is pending, could do something in the meantime
 110     # (display a progress bar, etc.).
 111     # We just call select() to sleep until some more data is available.
 112     m.select()
 113
 114
 115 # Cleanup
 116 for c in m.handles:
 117     if c.fp is not None:
 118         c.fp.close()
 119         c.fp = None
 120     c.close()
 121 m.close()
 122