3 # Copyright 2009 Facebook
5 # Licensed under the Apache License, Version 2.0 (the "License"); you may
6 # not use this file except in compliance with the License. You may obtain
7 # a copy of the License at
9 # http://www.apache.org/licenses/LICENSE-2.0
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13 # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14 # License for the specific language governing permissions and limitations
17 """Blocking and non-blocking HTTP client implementations using pycurl."""
class HTTPClient(object):
    """A blocking HTTP client backed with pycurl.

    Typical usage looks like this:

        http_client = httpclient.HTTPClient()
        response = http_client.fetch("http://www.google.com/")
        except httpclient.HTTPError, e:

    fetch() can take a string URL or an HTTPRequest instance, which offers
    more options, like executing POST/PUT/DELETE requests.
    """

    def __init__(self, max_simultaneous_connections=None):
        # A single curl handle, reused across fetch() calls.
        self._curl = _curl_create(max_simultaneous_connections)

    def fetch(self, request, **kwargs):
        """Executes an HTTPRequest, returning an HTTPResponse.

        If an error occurs during the fetch, we raise an HTTPError.
        """
        # Accept a bare URL string for convenience; in that case kwargs
        # are forwarded to the HTTPRequest constructor.
        if not isinstance(request, HTTPRequest):
            request = HTTPRequest(url=request, **kwargs)
        buffer = cStringIO.StringIO()
        headers = httputil.HTTPHeaders()
        _curl_setup_request(self._curl, request, buffer, headers)
        # NOTE(review): the "try:" header and the blocking
        # "self._curl.perform()" call appear to be missing from this copy.
        code = self._curl.getinfo(pycurl.HTTP_CODE)
        effective_url = self._curl.getinfo(pycurl.EFFECTIVE_URL)
        response = HTTPResponse(
            request=request, code=code, headers=headers,
            buffer=buffer, effective_url=effective_url)
        # Any non-2xx status is reported as an HTTPError.
        if code < 200 or code >= 300:
            raise HTTPError(code, response=response)
        # NOTE(review): "return response" and the curl-error handling body
        # appear to be missing around the orphaned handler below.
        except pycurl.error, e:
class AsyncHTTPClient(object):
    """An non-blocking HTTP client backed with pycurl.

    Example usage (parts of this docstring appear to have been lost in
    this copy)::

        def handle_request(response):
            print "Error:", response.error
            ioloop.IOLoop.instance().stop()

        http_client = httpclient.AsyncHTTPClient()
        http_client.fetch("http://www.google.com/", handle_request)
        ioloop.IOLoop.instance().start()

    fetch() can take a string URL or an HTTPRequest instance, which offers
    more options, like executing POST/PUT/DELETE requests.

    The keyword argument max_clients to the AsyncHTTPClient constructor
    determines the maximum number of simultaneous fetch() operations that
    can execute in parallel on each IOLoop.
    """

    # One shared client instance per IOLoop (enforced in __new__).
    _ASYNC_CLIENTS = weakref.WeakKeyDictionary()

    def __new__(cls, io_loop=None, max_clients=10,
                max_simultaneous_connections=None):
        # There is one client per IOLoop since they share curl instances
        io_loop = io_loop or ioloop.IOLoop.instance()
        if io_loop in cls._ASYNC_CLIENTS:
            return cls._ASYNC_CLIENTS[io_loop]
        instance = super(AsyncHTTPClient, cls).__new__(cls)
        instance.io_loop = io_loop
        instance._multi = pycurl.CurlMulti()
        # Pool of reusable curl handles; _free_list tracks the idle ones.
        instance._curls = [_curl_create(max_simultaneous_connections)
                           for i in xrange(max_clients)]
        instance._free_list = instance._curls[:]
        instance._requests = collections.deque()
        # NOTE(review): an "instance._fds = {}" initializer appears to be
        # missing (self._fds is read in _perform below).
        instance._events = {}
        instance._added_perform_callback = False
        instance._timeout = None
        instance._closed = False
        cls._ASYNC_CLIENTS[io_loop] = instance
        # NOTE(review): a "return instance" statement appears to be
        # missing here in this copy.

    # NOTE(review): the "def close(self):" header appears to be missing
    # above the docstring below.
        """Destroys this http client, freeing any file descriptors used.
        Not needed in normal use, but may be helpful in unittests that
        create and destroy http clients. No other methods may be called
        on the AsyncHTTPClient after close().
        """
        del AsyncHTTPClient._ASYNC_CLIENTS[self.io_loop]
        for curl in self._curls:
        # NOTE(review): the loop body (presumably curl.close()) appears
        # to be missing from this copy.

    def fetch(self, request, callback, **kwargs):
        """Executes an HTTPRequest, calling callback with an HTTPResponse.

        If an error occurs during the fetch, the HTTPResponse given to the
        callback has a non-None error attribute that contains the exception
        encountered during the request. You can call response.reraise() to
        throw the exception (if any) in the callback.
        """
        if not isinstance(request, HTTPRequest):
            request = HTTPRequest(url=request, **kwargs)
        # Queue the request; _perform starts it once a handle is free.
        self._requests.append((request, callback))
        self._add_perform_callback()

    def _add_perform_callback(self):
        # Coalesce multiple wake-ups into at most one pending _perform.
        if not self._added_perform_callback:
            self.io_loop.add_callback(self._perform)
            self._added_perform_callback = True

    def _handle_events(self, fd, events):
        # Record which events fired so _perform can act on them.
        self._events[fd] = events
        self._add_perform_callback()

    def _handle_timeout(self):
        # NOTE(review): the body of _handle_timeout and the
        # "def _perform(self):" header appear to be missing from this
        # copy; the statements below belong to _perform.
        self._added_perform_callback = False

        # Drive libcurl until it stops asking to be called again.
        ret, num_handles = self._multi.perform()
        if ret != pycurl.E_CALL_MULTI_PERFORM:
        # NOTE(review): the loop/exit statements around perform() appear
        # to be missing from this copy.

        # Update the set of active file descriptors. It is important
        # that this happen immediately after perform() because
        # fds that have been removed from fdset are free to be reused
        # NOTE(review): an "fds = {}" initializer and the
        # "for fd in readable:" / "for fd in writable:" loop headers
        # appear to be missing around the lines below.
        (readable, writable, exceptable) = self._multi.fdset()
        fds[fd] = fds.get(fd, 0) | 0x1 | 0x2
        fds[fd] = fds.get(fd, 0) | 0x4
        for fd in exceptable:
            fds[fd] = fds.get(fd, 0) | 0x8 | 0x10

        if fds and max(fds.iterkeys()) > 900:
            # Libcurl has a bug in which it behaves unpredictably with
            # file descriptors greater than 1024. (This is because
            # even though it uses poll() instead of select(), it still
            # uses FD_SET internally) Since curl opens its own file
            # descriptors we can't catch this problem when it happens,
            # and the best we can do is detect that it's about to
            # happen. Exiting is a lousy way to handle this error,
            # but there's not much we can do at this point. Exiting
            # (and getting restarted by whatever monitoring process
            # is handling crashed tornado processes) will at least
            # get things working again and hopefully bring the issue
            # to someone's attention.
            # If you run into this issue, you either have a file descriptor
            # leak or need to run more tornado processes (so that none
            # of them are handling more than 1000 simultaneous connections)
            print >> sys.stderr, "ERROR: File descriptor too high for libcurl. Exiting."
            logging.error("File descriptor too high for libcurl. Exiting.")
            # NOTE(review): the process-exit call appears to be missing.

        # NOTE(review): the loop over stale fds and a "try:" header appear
        # to be missing around the removal below.
            self.io_loop.remove_handler(fd)
            except (OSError, IOError), e:
                if e[0] != errno.ENOENT:
                # NOTE(review): the re-raise appears to be missing.

        # Register/refresh IOLoop handlers for the fds curl cares about.
        for fd, events in fds.iteritems():
            old_events = self._fds.get(fd, None)
            if old_events is None:
                self.io_loop.add_handler(fd, self._handle_events, events)
            elif old_events != events:
                # NOTE(review): a "try:" header appears to be missing here.
                self.io_loop.update_handler(fd, events)
                except (OSError, IOError), e:
                    if e[0] == errno.ENOENT:
                        self.io_loop.add_handler(fd, self._handle_events,
                        # NOTE(review): trailing arguments and re-raise
                        # appear to be missing.

        # Handle completed fetches
        # NOTE(review): the info_read() loop header and the ok_list loop
        # appear to be missing around the lines below.
        num_q, ok_list, err_list = self._multi.info_read()
        for curl, errnum, errmsg in err_list:
            self._finish(curl, errnum, errmsg)

        # Start fetching new URLs
        while self._free_list and self._requests:
            curl = self._free_list.pop()
            (request, callback) = self._requests.popleft()
            # NOTE(review): the "curl.info = {" opener and the "request"
            # entry / closing brace appear to be missing around this
            # dict literal.
                "headers": httputil.HTTPHeaders(),
                "buffer": cStringIO.StringIO(),
                "callback": callback,
                "start_time": time.time(),
            _curl_setup_request(curl, request, curl.info["buffer"],
                                curl.info["headers"])
            self._multi.add_handle(curl)

        # Reschedule a poll timeout while transfers are still in flight.
        if not started and not completed:
            if self._timeout is not None:
                self.io_loop.remove_timeout(self._timeout)
            self._timeout = self.io_loop.add_timeout(
                time.time() + 0.2, self._handle_timeout)

    def _finish(self, curl, curl_error=None, curl_message=None):
        # NOTE(review): "info = curl.info" / "curl.info = None" lines
        # appear to be missing here (info is read below).
        self._multi.remove_handle(curl)
        self._free_list.append(curl)
        buffer = info["buffer"]
        # NOTE(review): the "if curl_error:" guard (and its else) appear
        # to be missing around the error construction below.
            error = CurlError(curl_error, curl_message)
        code = curl.getinfo(pycurl.HTTP_CODE)
        effective_url = curl.getinfo(pycurl.EFFECTIVE_URL)
        # NOTE(review): a "try:" header appears to be missing here;
        # exceptions raised by the user callback are logged below.
            info["callback"](HTTPResponse(
                request=info["request"], code=code, headers=info["headers"],
                buffer=buffer, effective_url=effective_url, error=error,
                request_time=time.time() - info["start_time"]))
        except (KeyboardInterrupt, SystemExit):
            # NOTE(review): the re-raise and generic exception handler
            # appear to be missing around the logging call below.
            logging.error("Exception in callback %r", info["callback"],
class AsyncHTTPClient2(object):
    """Alternate implementation of AsyncHTTPClient.

    This class has the same interface as AsyncHTTPClient (so see that class
    for usage documentation) but is implemented with a different set of
    libcurl APIs (curl_multi_socket_action instead of fdset/perform).
    This implementation will likely become the default in the future, but
    for now should be considered somewhat experimental.

    The main advantage of this class over the original implementation is
    that it is immune to the fd > 1024 bug, so applications with a large
    number of simultaneous requests (e.g. long-polling) may prefer this
    version.

    Known issue:
    * Timeouts connecting to localhost
      In some situations, this implementation will return a connection
      timeout when the old implementation would be able to connect. This
      has only been observed when connecting to localhost when using
      the kqueue-based IOLoop (mac/bsd), but it may also occur on epoll (linux)
      and, in principle, for non-localhost sites.
      While the bug is unrelated to IPv6, disabling IPv6 will avoid the
      most common manifestations of the bug, so this class disables IPv6 when
      it detects an affected version of libcurl.
      The underlying cause is a libcurl bug in versions up to and including
      7.21.0 (it will be fixed in the not-yet-released 7.21.1)
      http://sourceforge.net/tracker/?func=detail&aid=3017819&group_id=976&atid=100976
    """

    # One shared client instance per IOLoop (enforced in __new__).
    _ASYNC_CLIENTS = weakref.WeakKeyDictionary()

    def __new__(cls, io_loop=None, max_clients=10,
                max_simultaneous_connections=None):
        # There is one client per IOLoop since they share curl instances
        io_loop = io_loop or ioloop.IOLoop.instance()
        if io_loop in cls._ASYNC_CLIENTS:
            return cls._ASYNC_CLIENTS[io_loop]
        instance = super(AsyncHTTPClient2, cls).__new__(cls)
        instance.io_loop = io_loop
        instance._multi = pycurl.CurlMulti()
        # Let libcurl drive our timeout scheduling and fd registration.
        instance._multi.setopt(pycurl.M_TIMERFUNCTION,
                               instance._set_timeout)
        instance._multi.setopt(pycurl.M_SOCKETFUNCTION,
                               instance._handle_socket)
        instance._curls = [_curl_create(max_simultaneous_connections)
                           for i in xrange(max_clients)]
        instance._free_list = instance._curls[:]
        instance._requests = collections.deque()
        # NOTE(review): an "instance._fds = {}" initializer appears to be
        # missing (self._fds is read in _handle_socket below).
        instance._timeout = None
        cls._ASYNC_CLIENTS[io_loop] = instance
        # NOTE(review): a "return instance" statement appears to be
        # missing here in this copy.

    # NOTE(review): the "def close(self):" header appears to be missing
    # above the docstring below.
        """Destroys this http client, freeing any file descriptors used.
        Not needed in normal use, but may be helpful in unittests that
        create and destroy http clients. No other methods may be called
        on the AsyncHTTPClient after close().
        """
        del AsyncHTTPClient2._ASYNC_CLIENTS[self.io_loop]
        for curl in self._curls:
        # NOTE(review): the loop body (presumably curl.close()) appears
        # to be missing from this copy.

    def fetch(self, request, callback, **kwargs):
        """Executes an HTTPRequest, calling callback with an HTTPResponse.

        If an error occurs during the fetch, the HTTPResponse given to the
        callback has a non-None error attribute that contains the exception
        encountered during the request. You can call response.reraise() to
        throw the exception (if any) in the callback.
        """
        if not isinstance(request, HTTPRequest):
            request = HTTPRequest(url=request, **kwargs)
        # Queue the request; it starts as soon as a handle is free.
        self._requests.append((request, callback))
        self._process_queue()

    def _handle_socket(self, event, fd, multi, data):
        """Called by libcurl when it wants to change the file descriptors
        it cares about.
        """
        # NOTE(review): the "event_map = {" opener appears to be missing
        # above the mapping entries below.
            pycurl.POLL_NONE: ioloop.IOLoop.NONE,
            pycurl.POLL_IN: ioloop.IOLoop.READ,
            pycurl.POLL_OUT: ioloop.IOLoop.WRITE,
            pycurl.POLL_INOUT: ioloop.IOLoop.READ | ioloop.IOLoop.WRITE
        if event == pycurl.POLL_REMOVE:
            self.io_loop.remove_handler(fd)
            # NOTE(review): the "del self._fds[fd]" line and the "else:"
            # branch header appear to be missing here.
            ioloop_event = event_map[event]
            if fd not in self._fds:
                self._fds[fd] = ioloop_event
                self.io_loop.add_handler(fd, self._handle_events,
                # NOTE(review): the handler's event argument and the
                # "else:" branch appear to be missing here.
                self._fds[fd] = ioloop_event
                self.io_loop.update_handler(fd, ioloop_event)

    def _set_timeout(self, msecs):
        """Called by libcurl to schedule a timeout."""
        # Replace any previously scheduled timeout with the new one.
        if self._timeout is not None:
            self.io_loop.remove_timeout(self._timeout)
        self._timeout = self.io_loop.add_timeout(
            time.time() + msecs/1000.0, self._handle_timeout)

    def _handle_events(self, fd, events):
        """Called by IOLoop when there is activity on one of our
        file descriptors.
        """
        # Translate IOLoop event bits into libcurl CSELECT flags.
        # NOTE(review): an "action = 0" initializer appears to be missing.
        if events & ioloop.IOLoop.READ: action |= pycurl.CSELECT_IN
        if events & ioloop.IOLoop.WRITE: action |= pycurl.CSELECT_OUT
        # NOTE(review): the "while True:"/"try:" wrapper around
        # socket_action appears to be missing here.
        ret, num_handles = self._multi.socket_action(fd, action)
        if ret != pycurl.E_CALL_MULTI_PERFORM:
        # NOTE(review): the loop-exit statement appears to be missing.
        self._finish_pending_requests()

    def _handle_timeout(self):
        """Called by IOLoop when the requested timeout has passed."""
        # NOTE(review): "self._timeout = None" and the retry loop around
        # socket_action appear to be missing here.
            ret, num_handles = self._multi.socket_action(
                pycurl.SOCKET_TIMEOUT, 0)
            if ret != pycurl.E_CALL_MULTI_PERFORM:
        # NOTE(review): the loop-exit statement appears to be missing.
        self._finish_pending_requests()

        # In theory, we shouldn't have to do this because curl will
        # call _set_timeout whenever the timeout changes. However,
        # sometimes after _handle_timeout we will need to reschedule
        # immediately even though nothing has changed from curl's
        # perspective. This is because when socket_action is
        # called with SOCKET_TIMEOUT, libcurl decides internally which
        # timeouts need to be processed by using a monotonic clock
        # (where available) while tornado uses python's time.time()
        # to decide when timeouts have occurred. When those clocks
        # disagree on elapsed time (as they will whenever there is an
        # NTP adjustment), tornado might call _handle_timeout before
        # libcurl is ready. After each timeout, resync the scheduled
        # timeout with libcurl's current state.
        new_timeout = self._multi.timeout()
        if new_timeout != -1:
            self._set_timeout(new_timeout)

    def _finish_pending_requests(self):
        """Process any requests that were completed by the last
        call to multi.socket_action.
        """
        # NOTE(review): the "while True:" loop header and the ok_list
        # completion loop appear to be missing around the lines below.
        num_q, ok_list, err_list = self._multi.info_read()
        for curl, errnum, errmsg in err_list:
            self._finish(curl, errnum, errmsg)
        self._process_queue()

    def _process_queue(self):
        # Start new fetches while idle curl handles and queued requests
        # are both available.
        while self._free_list and self._requests:
            curl = self._free_list.pop()
            (request, callback) = self._requests.popleft()
            # NOTE(review): the "curl.info = {" opener and the "request"
            # entry / closing brace appear to be missing around this
            # dict literal.
                "headers": httputil.HTTPHeaders(),
                "buffer": cStringIO.StringIO(),
                "callback": callback,
                "start_time": time.time(),
            # Disable IPv6 to mitigate the effects of this bug
            # on curl versions <= 7.21.0
            # http://sourceforge.net/tracker/?func=detail&aid=3017819&group_id=976&atid=100976
            if pycurl.version_info()[2] <= 0x71500: # 7.21.0
                curl.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_V4)
            _curl_setup_request(curl, request, curl.info["buffer"],
                                curl.info["headers"])
            self._multi.add_handle(curl)

    def _finish(self, curl, curl_error=None, curl_message=None):
        # NOTE(review): "info = curl.info" / "curl.info = None" lines
        # appear to be missing here (info is read below).
        self._multi.remove_handle(curl)
        self._free_list.append(curl)
        buffer = info["buffer"]
        # NOTE(review): the "if curl_error:" guard (and its else) appear
        # to be missing around the error construction below.
            error = CurlError(curl_error, curl_message)
        code = curl.getinfo(pycurl.HTTP_CODE)
        effective_url = curl.getinfo(pycurl.EFFECTIVE_URL)
        # NOTE(review): a "try:" header appears to be missing here;
        # exceptions raised by the user callback are logged below.
            info["callback"](HTTPResponse(
                request=info["request"], code=code, headers=info["headers"],
                buffer=buffer, effective_url=effective_url, error=error,
                request_time=time.time() - info["start_time"]))
        except (KeyboardInterrupt, SystemExit):
            # NOTE(review): the re-raise and generic exception handler
            # appear to be missing around the logging call below.
            logging.error("Exception in callback %r", info["callback"],
class HTTPRequest(object):
    # Container for all per-request options accepted by the HTTP clients
    # in this module (URL, method, headers, timeouts, callbacks, ...).
    def __init__(self, url, method="GET", headers=None, body=None,
                 auth_username=None, auth_password=None,
                 connect_timeout=20.0, request_timeout=20.0,
                 if_modified_since=None, follow_redirects=True,
                 max_redirects=5, user_agent=None, use_gzip=True,
                 network_interface=None, streaming_callback=None,
                 header_callback=None, prepare_curl_callback=None,
                 allow_nonstandard_methods=False):
        # NOTE(review): an "if headers is None:" guard appears to be
        # missing above this line in this copy.
        headers = httputil.HTTPHeaders()
        if if_modified_since:
            # Render the datetime as an RFC 2822 GMT date for the
            # If-Modified-Since header.
            timestamp = calendar.timegm(if_modified_since.utctimetuple())
            headers["If-Modified-Since"] = email.utils.formatdate(
                timestamp, localtime=False, usegmt=True)
        # Send an explicit empty Pragma unless the caller set one.
        if "Pragma" not in headers:
            headers["Pragma"] = ""
        self.url = _utf8(url)
        self.headers = headers
        # NOTE(review): assignments for self.method and self.body appear
        # to be missing from this copy.
        self.auth_username = _utf8(auth_username)
        self.auth_password = _utf8(auth_password)
        self.connect_timeout = connect_timeout
        self.request_timeout = request_timeout
        self.follow_redirects = follow_redirects
        self.max_redirects = max_redirects
        self.user_agent = user_agent
        self.use_gzip = use_gzip
        self.network_interface = network_interface
        self.streaming_callback = streaming_callback
        self.header_callback = header_callback
        self.prepare_curl_callback = prepare_curl_callback
        self.allow_nonstandard_methods = allow_nonstandard_methods
class HTTPResponse(object):
    # Result object returned by HTTPClient.fetch() and passed to
    # AsyncHTTPClient callbacks.
    # NOTE(review): the mutable default "headers={}" is shared across all
    # calls that omit headers; confirm no caller mutates it (a None
    # default would be safer).
    def __init__(self, request, code, headers={}, buffer=None, effective_url=None,
                 error=None, request_time=None):
        self.request = request
        self.headers = headers
        # NOTE(review): assignments for self.code and self.buffer appear
        # to be missing from this copy (both are read below).
        if effective_url is None:
            # Fall back to the requested URL when curl reported none.
            self.effective_url = request.url
            # NOTE(review): an "else:" line appears to be missing above.
            self.effective_url = effective_url
        # Synthesize an HTTPError for non-2xx responses.
        if self.code < 200 or self.code >= 300:
            self.error = HTTPError(self.code, response=self)
        self.request_time = request_time

    # NOTE(review): the "def _get_body(self):" header appears to be
    # missing above; the statements below lazily materialize the body
    # from the response buffer.
        if self.buffer is None:
        elif self._body is None:
            self._body = self.buffer.getvalue()
    body = property(_get_body)

    # NOTE(review): the "def __repr__(self):" header appears to be
    # missing above the two lines below.
        args = ",".join("%s=%r" % i for i in self.__dict__.iteritems())
        return "%s(%s)" % (self.__class__.__name__, args)

    # NOTE(review): the "def __del__(self):" header appears to be missing
    # above; presumably it closes the buffer — confirm against upstream.
        if self.buffer is not None:
class HTTPError(Exception):
    """Exception thrown for an unsuccessful HTTP request.

    Attributes:
      code - HTTP error integer error code, e.g. 404. Error code 599 is
          used when no HTTP response was received, e.g. for a timeout.
      response - HTTPResponse object, if any.

    Note that if follow_redirects is False, redirects become HTTPErrors,
    and you can look at error.response.headers['Location'] to see the
    destination of the redirect.
    """
    def __init__(self, code, message=None, response=None):
        # Fix: self.code was read below but never assigned in this copy.
        self.code = code
        # Fall back to the standard reason phrase when the caller did not
        # supply an explicit message.
        message = message or httplib.responses.get(code, "Unknown")
        self.response = response
        Exception.__init__(self, "HTTP %d: %s" % (self.code, message))
class CurlError(HTTPError):
    """HTTPError (code 599) raised for errors reported by libcurl itself.

    Attributes:
      errno - the libcurl/pycurl error number for the failure.
    """
    def __init__(self, errno, message):
        HTTPError.__init__(self, 599, message)
        # Fix: the errno parameter was silently discarded in this copy;
        # preserve it so callers can distinguish failure kinds (e.g.
        # connect timeout vs. DNS error).
        self.errno = errno
def _curl_create(max_simultaneous_connections=None):
    """Create and configure a pycurl.Curl handle.

    max_simultaneous_connections bounds libcurl's connection cache for
    this handle (defaults to 5 when None).
    """
    # Fix: this copy used `curl` without creating it and never returned
    # the handle; create and return it.
    curl = pycurl.Curl()
    if logging.getLogger().isEnabledFor(logging.DEBUG):
        # Route libcurl's verbose trace through our debug callback.
        curl.setopt(pycurl.VERBOSE, 1)
        curl.setopt(pycurl.DEBUGFUNCTION, _curl_debug)
    curl.setopt(pycurl.MAXCONNECTS, max_simultaneous_connections or 5)
    return curl
def _curl_setup_request(curl, request, buffer, headers):
    # Translate an HTTPRequest into setopt() calls on the given curl
    # handle, directing the response body into `buffer` and parsed
    # response headers into `headers`.
    curl.setopt(pycurl.URL, request.url)
    # Request headers may be either a regular dict or HTTPHeaders object
    if isinstance(request.headers, httputil.HTTPHeaders):
        curl.setopt(pycurl.HTTPHEADER,
                    [_utf8("%s: %s" % i) for i in request.headers.get_all()])
        # NOTE(review): an "else:" line for the plain-dict case appears
        # to be missing above the following call.
        curl.setopt(pycurl.HTTPHEADER,
                    [_utf8("%s: %s" % i) for i in request.headers.iteritems()])
    if request.header_callback:
        curl.setopt(pycurl.HEADERFUNCTION, request.header_callback)
        # NOTE(review): an "else:" line appears to be missing above.
        curl.setopt(pycurl.HEADERFUNCTION,
                    lambda line: _curl_header_callback(headers, line))
    if request.streaming_callback:
        curl.setopt(pycurl.WRITEFUNCTION, request.streaming_callback)
        # NOTE(review): an "else:" line appears to be missing above.
        curl.setopt(pycurl.WRITEFUNCTION, buffer.write)
    curl.setopt(pycurl.FOLLOWLOCATION, request.follow_redirects)
    curl.setopt(pycurl.MAXREDIRS, request.max_redirects)
    # libcurl timeouts are whole seconds.
    curl.setopt(pycurl.CONNECTTIMEOUT, int(request.connect_timeout))
    curl.setopt(pycurl.TIMEOUT, int(request.request_timeout))
    if request.user_agent:
        curl.setopt(pycurl.USERAGENT, _utf8(request.user_agent))
        # NOTE(review): an "else:" line appears to be missing above.
        curl.setopt(pycurl.USERAGENT, "Mozilla/5.0 (compatible; pycurl)")
    if request.network_interface:
        curl.setopt(pycurl.INTERFACE, request.network_interface)
    # NOTE(review): an "if request.use_gzip:" guard and its "else:"
    # appear to be missing around the two ENCODING calls below.
    curl.setopt(pycurl.ENCODING, "gzip,deflate")
    curl.setopt(pycurl.ENCODING, "none")

    # Set the request method through curl's retarded interface which makes
    # up names for almost every single method
    # NOTE(review): the "curl_options = {" opener, the "POST" entry and
    # the closing brace appear to be missing around the entries below.
        "GET": pycurl.HTTPGET,
        "PUT": pycurl.UPLOAD,
        "HEAD": pycurl.NOBODY,
    custom_methods = set(["DELETE"])
    # Clear any method option left over from a previous request on this
    # reused handle.
    for o in curl_options.values():
        curl.setopt(o, False)
    if request.method in curl_options:
        curl.unsetopt(pycurl.CUSTOMREQUEST)
        curl.setopt(curl_options[request.method], True)
    elif request.allow_nonstandard_methods or request.method in custom_methods:
        curl.setopt(pycurl.CUSTOMREQUEST, request.method)
    # NOTE(review): an "else:" line appears to be missing above the raise.
        raise KeyError('unknown method ' + request.method)

    # Handle curl's cryptic options for every individual HTTP method
    if request.method in ("POST", "PUT"):
        # Serve the body to libcurl from an in-memory file object.
        request_buffer = cStringIO.StringIO(escape.utf8(request.body))
        curl.setopt(pycurl.READFUNCTION, request_buffer.read)
        if request.method == "POST":
            # NOTE(review): the "def ioctl(cmd):" header appears to be
            # missing above (it lets libcurl rewind the body on redirect).
            if cmd == curl.IOCMD_RESTARTREAD:
                request_buffer.seek(0)
            curl.setopt(pycurl.IOCTLFUNCTION, ioctl)
            curl.setopt(pycurl.POSTFIELDSIZE, len(request.body))
        # NOTE(review): an "else:" (PUT) line appears to be missing above.
            curl.setopt(pycurl.INFILESIZE, len(request.body))

    if request.auth_username and request.auth_password:
        userpwd = "%s:%s" % (request.auth_username, request.auth_password)
        curl.setopt(pycurl.HTTPAUTH, pycurl.HTTPAUTH_BASIC)
        curl.setopt(pycurl.USERPWD, userpwd)
        logging.info("%s %s (username: %r)", request.method, request.url,
                     request.auth_username)
    # NOTE(review): an "else:" line appears to be missing above; clear
    # credentials left on this reused handle by a previous request.
        curl.unsetopt(pycurl.USERPWD)
        logging.info("%s %s", request.method, request.url)
    # Give the caller a last chance to customize the curl handle.
    if request.prepare_curl_callback is not None:
        request.prepare_curl_callback(curl)
def _curl_header_callback(headers, header_line):
    # Feed one raw response-header line from libcurl into the HTTPHeaders
    # accumulator, skipping the status line and the blank terminator.
    # NOTE(review): the bodies of the two guards below (presumably bare
    # "return" statements) appear to be missing from this copy.
    if header_line.startswith("HTTP/"):
    if header_line == "\r\n":
    headers.parse_line(header_line)
def _curl_debug(debug_type, debug_msg):
    # Log libcurl's verbose trace output (installed as DEBUGFUNCTION in
    # _curl_create).  Type 0 is informational text, 1/2 are header data
    # in/out, 4 is raw data.
    debug_types = ('I', '<', '>', '<', '>')
    # NOTE(review): a leading "if debug_type == 0:" guard appears to be
    # missing above this line in this copy.
    logging.debug('%s', debug_msg.strip())
    elif debug_type in (1, 2):
        # Header traffic: log line by line with a direction marker.
        for line in debug_msg.splitlines():
            logging.debug('%s %s', debug_types[debug_type], line)
    elif debug_type == 4:
        logging.debug('%s %r', debug_types[debug_type], debug_msg)
747 if isinstance(value, unicode):
748 return value.encode("utf-8")
749 assert isinstance(value, str)