1 |
'''OpenAnything: a kind and thoughtful library for HTTP web services

This program is part of 'Dive Into Python', a free Python book for
experienced programmers. Visit http://diveintopython.org/ for the
latest version.
'''
---|
7 |
|
---|
8 |
# Authorship, revision, and licensing metadata for this module.
__author__ = 'Mark Pilgrim (mark@diveintopython.org)'

# The slice drops the leading '$Revision: ' (11 characters) and the
# trailing ' $', leaving only the bare revision number ('1.6').
__version__ = '$Revision: 1.6 $'[11:-2]

# Kept verbatim, including the surrounding '$Date: ... $' markers.
__date__ = '$Date: 2004/04/16 21:16:24 $'

__copyright__ = 'Copyright (c) 2004 Mark Pilgrim'
__license__ = 'Python'
---|
13 |
|
---|
14 |
import urllib2, urlparse, gzip |
---|
15 |
from StringIO import StringIO |
---|
16 |
|
---|
17 |
# Default value for the User-Agent request header; the module revision
# number is interpolated into the product token.
USER_AGENT = (
    'OpenAnything/%s +http://diveintopython.org/http_web_services/'
    % (__version__,))
---|
18 |
|
---|
19 |
class SmartRedirectHandler(urllib2.HTTPRedirectHandler): |
---|
20 |
def http_error_301(self, req, fp, code, msg, headers): |
---|
21 |
result = urllib2.HTTPRedirectHandler.http_error_301( |
---|
22 |
self, req, fp, code, msg, headers) |
---|
23 |
result.status = code |
---|
24 |
return result |
---|
25 |
|
---|
26 |
def http_error_302(self, req, fp, code, msg, headers): |
---|
27 |
result = urllib2.HTTPRedirectHandler.http_error_302( |
---|
28 |
self, req, fp, code, msg, headers) |
---|
29 |
result.status = code |
---|
30 |
return result |
---|
31 |
|
---|
32 |
class DefaultErrorHandler(urllib2.HTTPDefaultErrorHandler): |
---|
33 |
def http_error_default(self, req, fp, code, msg, headers): |
---|
34 |
result = urllib2.HTTPError( |
---|
35 |
req.get_full_url(), code, msg, headers, fp) |
---|
36 |
result.status = code |
---|
37 |
return result |
---|
38 |
|
---|
39 |
def openAnything(source, etag=None, lastmodified=None, agent=USER_AGENT):
    """URL, filename, or string --> stream

    This function lets you define parsers that take any input source
    (URL, pathname to local or network file, or actual data as a string)
    and deal with it in a uniform manner.  Just .close() the returned
    object when you're done with it.

    The source is resolved in this order:

    1. An object that already has a ``read`` attribute is returned as-is.
    2. The string '-' returns standard input.
    3. A URL whose scheme is exactly 'http' is opened with urllib2.  HTTP
       error responses are returned as objects rather than raised, and
       redirect responses are followed.  Other schemes are not fetched;
       they fall through to the next steps.
    4. Anything else is tried as a filename with the built-in open().
    5. If that fails, the source itself is wrapped in a StringIO and
       treated as the data.

    If the etag argument is supplied, it will be used as the value of an
    If-None-Match request header.

    If the lastmodified argument is supplied, it must be a formatted
    date/time string in GMT (as returned in the Last-Modified header of
    a previous request).  The formatted date/time will be used
    as the value of an If-Modified-Since request header.

    If the agent argument is supplied, it will be used as the value of a
    User-Agent request header.
    """

    # Already a stream: nothing to open.
    if hasattr(source, 'read'):
        return source

    if source == '-':
        # Imported locally because the module-level imports do not bring
        # in sys; without this the '-' case fails with NameError.
        import sys
        return sys.stdin

    if urlparse.urlparse(source)[0] == 'http':
        # open URL with urllib2
        request = urllib2.Request(source)
        request.add_header('User-Agent', agent)
        if lastmodified:
            request.add_header('If-Modified-Since', lastmodified)
        if etag:
            request.add_header('If-None-Match', etag)
        # Ask for compressed content; the caller is responsible for
        # decompressing a gzip-encoded body.
        request.add_header('Accept-encoding', 'gzip')
        opener = urllib2.build_opener(
            SmartRedirectHandler(), DefaultErrorHandler())
        return opener.open(request)

    # try to open with native open function (if source is a filename)
    try:
        return open(source)
    except (IOError, OSError, TypeError):
        # IOError/OSError: not an openable file.  TypeError: open()
        # rejects strings with embedded NUL bytes, which means the source
        # is data rather than a path.  Either way, fall through.
        pass

    # treat source as string
    return StringIO(str(source))
---|
86 |
|
---|
87 |
def fetch(source, etag=None, lastmodified=None, agent=USER_AGENT):
    '''Fetch data and metadata from a URL, file, stream, or string

    The source and the optional etag, lastmodified, and agent arguments
    are passed straight through to openAnything().

    Returns a dictionary containing:

    - 'data': everything read from the stream, gunzipped if the stream's
      headers report a gzip content-encoding
    - 'etag' and 'lastmodified': the ETag and Last-Modified header
      values (None if absent); only set when the stream has headers
    - 'url': the stream's url attribute; only set when it has one
    - 'status': 200 for a stream with a url attribute, overridden by the
      stream's own status attribute when it has one

    The stream is always closed before returning, including when reading
    or decompressing raises an exception.
    '''
    result = {}
    f = openAnything(source, etag, lastmodified, agent)
    try:
        result['data'] = f.read()
        if hasattr(f, 'headers'):
            # save ETag, if the server sent one
            result['etag'] = f.headers.get('ETag')
            # save Last-Modified header, if the server sent one
            result['lastmodified'] = f.headers.get('Last-Modified')
            if f.headers.get('content-encoding') == 'gzip':
                # data came back gzip-compressed, decompress it
                compressed = StringIO(result['data'])
                result['data'] = gzip.GzipFile(fileobj=compressed).read()
        if hasattr(f, 'url'):
            result['url'] = f.url
            result['status'] = 200
        if hasattr(f, 'status'):
            # A status recorded on the stream takes precedence over the
            # default of 200.
            result['status'] = f.status
    finally:
        # Close even if read() or decompression failed, so the stream is
        # not leaked.
        f.close()
    return result
---|
107 |
|
---|