]>
git.proxmox.com Git - mirror_edk2.git/blob - AppPkg/Applications/Python/Python-2.7.2/Lib/urllib.py
1 """Open an arbitrary URL.
3 See the following document for more info on URLs:
4 "Names and Addresses, URIs, URLs, URNs, URCs", at
5 http://www.w3.org/pub/WWW/Addressing/Overview.html
7 See also the HTTP spec (from which the error codes are derived):
8 "HTTP - Hypertext Transfer Protocol", at
9 http://www.w3.org/pub/WWW/Protocols/
11 Related standards and specs:
12 - RFC1808: the "relative URL" spec. (authoritative status)
13 - RFC1738 - the "URL standard". (authoritative status)
14 - RFC1630 - the "URI spec". (informational status)
16 The object returned by URLopener().open(file) will differ per
17 protocol. All you know is that it has methods read(), readline(),
18 readlines(), fileno(), close() and info(). The read*(), fileno()
19 and close() methods work like those of open files.
20 The info() method returns a mimetools.Message object which can be
21 used to query various info about the object, if available.
22 (mimetools.Message objects are queried with the getheader() method.)
30 from urlparse
import urljoin
as basejoin
32 __all__
= ["urlopen", "URLopener", "FancyURLopener", "urlretrieve",
33 "urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus",
34 "urlencode", "url2pathname", "pathname2url", "splittag",
35 "localhost", "thishost", "ftperrors", "basejoin", "unwrap",
36 "splittype", "splithost", "splituser", "splitpasswd", "splitport",
37 "splitnport", "splitquery", "splitattr", "splitvalue",
40 __version__
= '1.17' # XXX This version is not always updated :-(
42 MAXFTPCACHE
= 10 # Trim the ftp cache beyond this size
44 # Helper for non-unix systems
46 from nturl2path
import url2pathname
, pathname2url
47 elif os
.name
== 'riscos':
48 from rourl2path
import url2pathname
, pathname2url
def url2pathname(pathname):
    """Map a relative URL of the 'file' scheme to a file system path.

    Generic-platform fallback; not recommended for general use.
    """
    # On platforms without a dedicated converter module the only work
    # required is undoing the percent-encoding.
    return unquote(pathname)
def pathname2url(pathname):
    """Map a file system path to a relative URL of the 'file' scheme.

    Generic-platform fallback; not recommended for general use.
    """
    # The inverse of url2pathname: percent-encode the path as-is.
    return quote(pathname)
60 # This really consists of two pieces:
61 # (1) a class which handles opening of all sorts of URLs
62 # (plus assorted utilities etc.)
63 # (2) a set of functions for parsing URLs
64 # XXX Should these be separated out into different modules?
67 # Shortcut for basic usage
69 def urlopen(url
, data
=None, proxies
=None):
70 """Create a file-like object for the specified URL to read from."""
71 from warnings
import warnpy3k
72 warnpy3k("urllib.urlopen() has been removed in Python 3.0 in "
73 "favor of urllib2.urlopen()", stacklevel
=2)
76 if proxies
is not None:
77 opener
= FancyURLopener(proxies
=proxies
)
79 opener
= FancyURLopener()
84 return opener
.open(url
)
86 return opener
.open(url
, data
)
87 def urlretrieve(url
, filename
=None, reporthook
=None, data
=None):
90 _urlopener
= FancyURLopener()
91 return _urlopener
.retrieve(url
, filename
, reporthook
, data
)
106 # exception raised when downloaded size does not match content-length
class ContentTooShortError(IOError):
    """Raised when a download delivers fewer bytes than Content-Length promised."""

    def __init__(self, message, content):
        IOError.__init__(self, message)
        # Preserve whatever data was actually received so callers
        # can inspect or salvage the partial download.
        self.content = content
114 """Class to open URLs.
115 This is a class rather than just a subroutine because we may need
116 more than one set of global protocol-specific options.
117 Note -- this is a base class for those who don't want the
118 automatic handling of errors type 302 (relocated) and 401
119 (authorization needed)."""
123 version
= "Python-urllib/%s" % __version__
126 def __init__(self
, proxies
=None, **x509
):
128 proxies
= getproxies()
129 assert hasattr(proxies
, 'has_key'), "proxies must be a mapping"
130 self
.proxies
= proxies
131 self
.key_file
= x509
.get('key_file')
132 self
.cert_file
= x509
.get('cert_file')
133 self
.addheaders
= [('User-Agent', self
.version
)]
134 self
.__tempfiles
= []
135 self
.__unlink
= os
.unlink
# See cleanup()
136 self
.tempcache
= None
137 # Undocumented feature: if you assign {} to tempcache,
138 # it is used to cache files retrieved with
139 # self.retrieve(). This is not enabled by default
140 # since it does not work for changing documents (and I
141 # haven't got the logic to check expiration headers
143 self
.ftpcache
= ftpcache
144 # Undocumented feature: you can use a different
145 # ftp cache by assigning to the .ftpcache member;
146 # in case you want logically independent URL openers
147 # XXX This is not threadsafe. Bah.
156 # This code sometimes runs when the rest of this module
157 # has already been deleted, so it can't use any globals
158 # or import anything.
160 for file in self
.__tempfiles
:
165 del self
.__tempfiles
[:]
167 self
.tempcache
.clear()
169 def addheader(self
, *args
):
170 """Add a header to be used by the HTTP interface only
171 e.g. u.addheader('Accept', 'sound/basic')"""
172 self
.addheaders
.append(args
)
175 def open(self
, fullurl
, data
=None):
176 """Use URLopener().open(file) instead of open(file, 'r')."""
177 fullurl
= unwrap(toBytes(fullurl
))
178 # percent encode url, fixing lame server errors for e.g, like space
180 fullurl
= quote(fullurl
, safe
="%/:=&?~#+!$,;'@()*[]|")
181 if self
.tempcache
and fullurl
in self
.tempcache
:
182 filename
, headers
= self
.tempcache
[fullurl
]
183 fp
= open(filename
, 'rb')
184 return addinfourl(fp
, headers
, fullurl
)
185 urltype
, url
= splittype(fullurl
)
188 if urltype
in self
.proxies
:
189 proxy
= self
.proxies
[urltype
]
190 urltype
, proxyhost
= splittype(proxy
)
191 host
, selector
= splithost(proxyhost
)
192 url
= (host
, fullurl
) # Signal special case to open_*()
195 name
= 'open_' + urltype
197 name
= name
.replace('-', '_')
198 if not hasattr(self
, name
):
200 return self
.open_unknown_proxy(proxy
, fullurl
, data
)
202 return self
.open_unknown(fullurl
, data
)
205 return getattr(self
, name
)(url
)
207 return getattr(self
, name
)(url
, data
)
208 except socket
.error
, msg
:
209 raise IOError, ('socket error', msg
), sys
.exc_info()[2]
def open_unknown(self, fullurl, data=None):
    """Overridable hook called when no handler exists for a URL scheme."""
    scheme, _ = splittype(fullurl)
    # Equivalent to the classic ``raise IOError, (...)`` form.
    raise IOError(('url error', 'unknown url type', scheme))
def open_unknown_proxy(self, proxy, fullurl, data=None):
    """Overridable hook called when the configured proxy cannot serve the URL."""
    scheme, _ = splittype(fullurl)
    # Equivalent to the classic ``raise IOError, (...)`` form.
    raise IOError(('url error', 'invalid proxy for %s' % scheme, proxy))
222 def retrieve(self
, url
, filename
=None, reporthook
=None, data
=None):
223 """retrieve(url) returns (filename, headers) for a local object
224 or (tempfilename, headers) for a remote object."""
225 url
= unwrap(toBytes(url
))
226 if self
.tempcache
and url
in self
.tempcache
:
227 return self
.tempcache
[url
]
228 type, url1
= splittype(url
)
229 if filename
is None and (not type or type == 'file'):
231 fp
= self
.open_local_file(url1
)
234 return url2pathname(splithost(url1
)[1]), hdrs
237 fp
= self
.open(url
, data
)
241 tfp
= open(filename
, 'wb')
244 garbage
, path
= splittype(url
)
245 garbage
, path
= splithost(path
or "")
246 path
, garbage
= splitquery(path
or "")
247 path
, garbage
= splitattr(path
or "")
248 suffix
= os
.path
.splitext(path
)[1]
249 (fd
, filename
) = tempfile
.mkstemp(suffix
)
250 self
.__tempfiles
.append(filename
)
251 tfp
= os
.fdopen(fd
, 'wb')
253 result
= filename
, headers
254 if self
.tempcache
is not None:
255 self
.tempcache
[url
] = result
261 if "content-length" in headers
:
262 size
= int(headers
["Content-Length"])
263 reporthook(blocknum
, bs
, size
)
272 reporthook(blocknum
, bs
, size
)
278 # raise exception if actual size does not match content-length header
279 if size
>= 0 and read
< size
:
280 raise ContentTooShortError("retrieval incomplete: got only %i out "
281 "of %i bytes" % (read
, size
), result
)
285 # Each method named open_<type> knows how to open that type of URL
287 def open_http(self
, url
, data
=None):
288 """Use HTTP protocol."""
292 if isinstance(url
, str):
293 host
, selector
= splithost(url
)
295 user_passwd
, host
= splituser(host
)
300 # check whether the proxy contains authorization information
301 proxy_passwd
, host
= splituser(host
)
302 # now we proceed with the url we want to obtain
303 urltype
, rest
= splittype(selector
)
306 if urltype
.lower() != 'http':
309 realhost
, rest
= splithost(rest
)
311 user_passwd
, realhost
= splituser(realhost
)
313 selector
= "%s://%s%s" % (urltype
, realhost
, rest
)
314 if proxy_bypass(realhost
):
317 #print "proxy via http:", host, selector
318 if not host
: raise IOError, ('http error', 'no host given')
322 proxy_auth
= base64
.b64encode(proxy_passwd
).strip()
328 auth
= base64
.b64encode(user_passwd
).strip()
331 h
= httplib
.HTTP(host
)
333 h
.putrequest('POST', selector
)
334 h
.putheader('Content-Type', 'application/x-www-form-urlencoded')
335 h
.putheader('Content-Length', '%d' % len(data
))
337 h
.putrequest('GET', selector
)
338 if proxy_auth
: h
.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth
)
339 if auth
: h
.putheader('Authorization', 'Basic %s' % auth
)
340 if realhost
: h
.putheader('Host', realhost
)
341 for args
in self
.addheaders
: h
.putheader(*args
)
343 errcode
, errmsg
, headers
= h
.getreply()
347 # something went wrong with the HTTP status line
348 raise IOError, ('http protocol error', 0,
349 'got a bad status line', None)
350 # According to RFC 2616, "2xx" code indicates that the client's
351 # request was successfully received, understood, and accepted.
352 if (200 <= errcode
< 300):
353 return addinfourl(fp
, headers
, "http:" + url
, errcode
)
356 return self
.http_error(url
, fp
, errcode
, errmsg
, headers
)
358 return self
.http_error(url
, fp
, errcode
, errmsg
, headers
, data
)
360 def http_error(self
, url
, fp
, errcode
, errmsg
, headers
, data
=None):
361 """Handle http errors.
362 Derived class can override this, or provide specific handlers
363 named http_error_DDD where DDD is the 3-digit error code."""
364 # First check if there's a specific handler for this error
365 name
= 'http_error_%d' % errcode
366 if hasattr(self
, name
):
367 method
= getattr(self
, name
)
369 result
= method(url
, fp
, errcode
, errmsg
, headers
)
371 result
= method(url
, fp
, errcode
, errmsg
, headers
, data
)
372 if result
: return result
373 return self
.http_error_default(url
, fp
, errcode
, errmsg
, headers
)
375 def http_error_default(self
, url
, fp
, errcode
, errmsg
, headers
):
376 """Default error handler: close the connection and raise IOError."""
379 raise IOError, ('http error', errcode
, errmsg
, headers
)
382 def open_https(self
, url
, data
=None):
383 """Use HTTPS protocol."""
388 if isinstance(url
, str):
389 host
, selector
= splithost(url
)
391 user_passwd
, host
= splituser(host
)
396 # here, we determine, whether the proxy contains authorization information
397 proxy_passwd
, host
= splituser(host
)
398 urltype
, rest
= splittype(selector
)
401 if urltype
.lower() != 'https':
404 realhost
, rest
= splithost(rest
)
406 user_passwd
, realhost
= splituser(realhost
)
408 selector
= "%s://%s%s" % (urltype
, realhost
, rest
)
409 #print "proxy via https:", host, selector
410 if not host
: raise IOError, ('https error', 'no host given')
413 proxy_auth
= base64
.b64encode(proxy_passwd
).strip()
418 auth
= base64
.b64encode(user_passwd
).strip()
421 h
= httplib
.HTTPS(host
, 0,
422 key_file
=self
.key_file
,
423 cert_file
=self
.cert_file
)
425 h
.putrequest('POST', selector
)
426 h
.putheader('Content-Type',
427 'application/x-www-form-urlencoded')
428 h
.putheader('Content-Length', '%d' % len(data
))
430 h
.putrequest('GET', selector
)
431 if proxy_auth
: h
.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth
)
432 if auth
: h
.putheader('Authorization', 'Basic %s' % auth
)
433 if realhost
: h
.putheader('Host', realhost
)
434 for args
in self
.addheaders
: h
.putheader(*args
)
436 errcode
, errmsg
, headers
= h
.getreply()
440 # something went wrong with the HTTP status line
441 raise IOError, ('http protocol error', 0,
442 'got a bad status line', None)
443 # According to RFC 2616, "2xx" code indicates that the client's
444 # request was successfully received, understood, and accepted.
445 if (200 <= errcode
< 300):
446 return addinfourl(fp
, headers
, "https:" + url
, errcode
)
449 return self
.http_error(url
, fp
, errcode
, errmsg
, headers
)
451 return self
.http_error(url
, fp
, errcode
, errmsg
, headers
,
454 def open_file(self
, url
):
455 """Use local file or FTP depending on form of URL."""
456 if not isinstance(url
, str):
457 raise IOError, ('file error', 'proxy support for file protocol currently not implemented')
458 if url
[:2] == '//' and url
[2:3] != '/' and url
[2:12].lower() != 'localhost/':
459 return self
.open_ftp(url
)
461 return self
.open_local_file(url
)
463 def open_local_file(self
, url
):
464 """Use local file."""
465 import mimetypes
, mimetools
, email
.utils
467 from cStringIO
import StringIO
469 from StringIO
import StringIO
470 host
, file = splithost(url
)
471 localname
= url2pathname(file)
473 stats
= os
.stat(localname
)
475 raise IOError(e
.errno
, e
.strerror
, e
.filename
)
477 modified
= email
.utils
.formatdate(stats
.st_mtime
, usegmt
=True)
478 mtype
= mimetypes
.guess_type(url
)[0]
479 headers
= mimetools
.Message(StringIO(
480 'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
481 (mtype
or 'text/plain', size
, modified
)))
485 urlfile
= 'file://' + file
486 return addinfourl(open(localname
, 'rb'),
488 host
, port
= splitport(host
)
490 and socket
.gethostbyname(host
) in (localhost(), thishost()):
493 urlfile
= 'file://' + file
494 return addinfourl(open(localname
, 'rb'),
496 raise IOError, ('local file error', 'not on local host')
498 def open_ftp(self
, url
):
499 """Use FTP protocol."""
500 if not isinstance(url
, str):
501 raise IOError, ('ftp error', 'proxy support for ftp protocol currently not implemented')
502 import mimetypes
, mimetools
504 from cStringIO
import StringIO
506 from StringIO
import StringIO
507 host
, path
= splithost(url
)
508 if not host
: raise IOError, ('ftp error', 'no host given')
509 host
, port
= splitport(host
)
510 user
, host
= splituser(host
)
511 if user
: user
, passwd
= splitpasswd(user
)
515 passwd
= passwd
or ''
516 host
= socket
.gethostbyname(host
)
519 port
= ftplib
.FTP_PORT
522 path
, attrs
= splitattr(path
)
524 dirs
= path
.split('/')
525 dirs
, file = dirs
[:-1], dirs
[-1]
526 if dirs
and not dirs
[0]: dirs
= dirs
[1:]
527 if dirs
and not dirs
[0]: dirs
[0] = '/'
528 key
= user
, host
, port
, '/'.join(dirs
)
530 if len(self
.ftpcache
) > MAXFTPCACHE
:
531 # Prune the cache, rather arbitrarily
532 for k
in self
.ftpcache
.keys():
538 if not key
in self
.ftpcache
:
539 self
.ftpcache
[key
] = \
540 ftpwrapper(user
, passwd
, host
, port
, dirs
)
541 if not file: type = 'D'
544 attr
, value
= splitvalue(attr
)
545 if attr
.lower() == 'type' and \
546 value
in ('a', 'A', 'i', 'I', 'd', 'D'):
548 (fp
, retrlen
) = self
.ftpcache
[key
].retrfile(file, type)
549 mtype
= mimetypes
.guess_type("ftp:" + url
)[0]
552 headers
+= "Content-Type: %s\n" % mtype
553 if retrlen
is not None and retrlen
>= 0:
554 headers
+= "Content-Length: %d\n" % retrlen
555 headers
= mimetools
.Message(StringIO(headers
))
556 return addinfourl(fp
, headers
, "ftp:" + url
)
557 except ftperrors(), msg
:
558 raise IOError, ('ftp error', msg
), sys
.exc_info()[2]
560 def open_data(self
, url
, data
=None):
561 """Use "data" URL."""
562 if not isinstance(url
, str):
563 raise IOError, ('data error', 'proxy support for data protocol currently not implemented')
566 # syntax of data URLs:
567 # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data
568 # mediatype := [ type "/" subtype ] *( ";" parameter )
570 # parameter := attribute "=" value
573 from cStringIO
import StringIO
575 from StringIO
import StringIO
577 [type, data
] = url
.split(',', 1)
579 raise IOError, ('data error', 'bad data URL')
581 type = 'text/plain;charset=US-ASCII'
582 semi
= type.rfind(';')
583 if semi
>= 0 and '=' not in type[semi
:]:
584 encoding
= type[semi
+1:]
589 msg
.append('Date: %s'%time
.strftime('%a, %d %b %Y %H:%M:%S GMT',
590 time
.gmtime(time
.time())))
591 msg
.append('Content-type: %s' % type)
592 if encoding
== 'base64':
594 data
= base64
.decodestring(data
)
597 msg
.append('Content-Length: %d' % len(data
))
602 headers
= mimetools
.Message(f
, 0)
603 #f.fileno = None # needed for addinfourl
604 return addinfourl(f
, headers
, url
)
607 class FancyURLopener(URLopener
):
608 """Derived class with handlers for errors we can handle (perhaps)."""
610 def __init__(self
, *args
, **kwargs
):
611 URLopener
.__init
__(self
, *args
, **kwargs
)
def http_error_default(self, url, fp, errcode, errmsg, headers):
    """Default error handling -- don't raise an exception.

    Fancy-opener policy: hand the error body back to the caller as an
    ordinary response object carrying the status code.
    """
    full_url = "http:" + url
    return addinfourl(fp, headers, full_url, errcode)
620 def http_error_302(self
, url
, fp
, errcode
, errmsg
, headers
, data
=None):
621 """Error 302 -- relocated (temporarily)."""
623 if self
.maxtries
and self
.tries
>= self
.maxtries
:
624 if hasattr(self
, "http_error_500"):
625 meth
= self
.http_error_500
627 meth
= self
.http_error_default
629 return meth(url
, fp
, 500,
630 "Internal Server Error: Redirect Recursion", headers
)
631 result
= self
.redirect_internal(url
, fp
, errcode
, errmsg
, headers
,
636 def redirect_internal(self
, url
, fp
, errcode
, errmsg
, headers
, data
):
637 if 'location' in headers
:
638 newurl
= headers
['location']
639 elif 'uri' in headers
:
640 newurl
= headers
['uri']
645 # In case the server sent a relative URL, join with original:
646 newurl
= basejoin(self
.type + ":" + url
, newurl
)
648 # For security reasons we do not allow redirects to protocols
649 # other than HTTP, HTTPS or FTP.
650 newurl_lower
= newurl
.lower()
651 if not (newurl_lower
.startswith('http://') or
652 newurl_lower
.startswith('https://') or
653 newurl_lower
.startswith('ftp://')):
654 raise IOError('redirect error', errcode
,
655 errmsg
+ " - Redirection to url '%s' is not allowed" %
659 return self
.open(newurl
)
def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
    """Error 301 -- also relocated (permanently)."""
    # Permanent moves reuse the temporary-redirect machinery unchanged.
    return self.http_error_302(url, fp, errcode, errmsg, headers, data)
def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
    """Error 303 -- also relocated (essentially identical to 302)."""
    # See-Other responses are delegated to the 302 handler.
    return self.http_error_302(url, fp, errcode, errmsg, headers, data)
669 def http_error_307(self
, url
, fp
, errcode
, errmsg
, headers
, data
=None):
670 """Error 307 -- relocated, but turn POST into error."""
672 return self
.http_error_302(url
, fp
, errcode
, errmsg
, headers
, data
)
674 return self
.http_error_default(url
, fp
, errcode
, errmsg
, headers
)
676 def http_error_401(self
, url
, fp
, errcode
, errmsg
, headers
, data
=None):
677 """Error 401 -- authentication required.
678 This function supports Basic authentication only."""
679 if not 'www-authenticate' in headers
:
680 URLopener
.http_error_default(self
, url
, fp
,
681 errcode
, errmsg
, headers
)
682 stuff
= headers
['www-authenticate']
684 match
= re
.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff
)
686 URLopener
.http_error_default(self
, url
, fp
,
687 errcode
, errmsg
, headers
)
688 scheme
, realm
= match
.groups()
689 if scheme
.lower() != 'basic':
690 URLopener
.http_error_default(self
, url
, fp
,
691 errcode
, errmsg
, headers
)
692 name
= 'retry_' + self
.type + '_basic_auth'
694 return getattr(self
,name
)(url
, realm
)
696 return getattr(self
,name
)(url
, realm
, data
)
698 def http_error_407(self
, url
, fp
, errcode
, errmsg
, headers
, data
=None):
699 """Error 407 -- proxy authentication required.
700 This function supports Basic authentication only."""
701 if not 'proxy-authenticate' in headers
:
702 URLopener
.http_error_default(self
, url
, fp
,
703 errcode
, errmsg
, headers
)
704 stuff
= headers
['proxy-authenticate']
706 match
= re
.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff
)
708 URLopener
.http_error_default(self
, url
, fp
,
709 errcode
, errmsg
, headers
)
710 scheme
, realm
= match
.groups()
711 if scheme
.lower() != 'basic':
712 URLopener
.http_error_default(self
, url
, fp
,
713 errcode
, errmsg
, headers
)
714 name
= 'retry_proxy_' + self
.type + '_basic_auth'
716 return getattr(self
,name
)(url
, realm
)
718 return getattr(self
,name
)(url
, realm
, data
)
720 def retry_proxy_http_basic_auth(self
, url
, realm
, data
=None):
721 host
, selector
= splithost(url
)
722 newurl
= 'http://' + host
+ selector
723 proxy
= self
.proxies
['http']
724 urltype
, proxyhost
= splittype(proxy
)
725 proxyhost
, proxyselector
= splithost(proxyhost
)
726 i
= proxyhost
.find('@') + 1
727 proxyhost
= proxyhost
[i
:]
728 user
, passwd
= self
.get_user_passwd(proxyhost
, realm
, i
)
729 if not (user
or passwd
): return None
730 proxyhost
= quote(user
, safe
='') + ':' + quote(passwd
, safe
='') + '@' + proxyhost
731 self
.proxies
['http'] = 'http://' + proxyhost
+ proxyselector
733 return self
.open(newurl
)
735 return self
.open(newurl
, data
)
737 def retry_proxy_https_basic_auth(self
, url
, realm
, data
=None):
738 host
, selector
= splithost(url
)
739 newurl
= 'https://' + host
+ selector
740 proxy
= self
.proxies
['https']
741 urltype
, proxyhost
= splittype(proxy
)
742 proxyhost
, proxyselector
= splithost(proxyhost
)
743 i
= proxyhost
.find('@') + 1
744 proxyhost
= proxyhost
[i
:]
745 user
, passwd
= self
.get_user_passwd(proxyhost
, realm
, i
)
746 if not (user
or passwd
): return None
747 proxyhost
= quote(user
, safe
='') + ':' + quote(passwd
, safe
='') + '@' + proxyhost
748 self
.proxies
['https'] = 'https://' + proxyhost
+ proxyselector
750 return self
.open(newurl
)
752 return self
.open(newurl
, data
)
754 def retry_http_basic_auth(self
, url
, realm
, data
=None):
755 host
, selector
= splithost(url
)
756 i
= host
.find('@') + 1
758 user
, passwd
= self
.get_user_passwd(host
, realm
, i
)
759 if not (user
or passwd
): return None
760 host
= quote(user
, safe
='') + ':' + quote(passwd
, safe
='') + '@' + host
761 newurl
= 'http://' + host
+ selector
763 return self
.open(newurl
)
765 return self
.open(newurl
, data
)
767 def retry_https_basic_auth(self
, url
, realm
, data
=None):
768 host
, selector
= splithost(url
)
769 i
= host
.find('@') + 1
771 user
, passwd
= self
.get_user_passwd(host
, realm
, i
)
772 if not (user
or passwd
): return None
773 host
= quote(user
, safe
='') + ':' + quote(passwd
, safe
='') + '@' + host
774 newurl
= 'https://' + host
+ selector
776 return self
.open(newurl
)
778 return self
.open(newurl
, data
)
780 def get_user_passwd(self
, host
, realm
, clear_cache
=0):
781 key
= realm
+ '@' + host
.lower()
782 if key
in self
.auth_cache
:
784 del self
.auth_cache
[key
]
786 return self
.auth_cache
[key
]
787 user
, passwd
= self
.prompt_user_passwd(host
, realm
)
788 if user
or passwd
: self
.auth_cache
[key
] = (user
, passwd
)
791 def prompt_user_passwd(self
, host
, realm
):
792 """Override this in a GUI environment!"""
795 user
= raw_input("Enter username for %s at %s: " % (realm
,
797 passwd
= getpass
.getpass("Enter password for %s in %s at %s: " %
800 except KeyboardInterrupt:
809 """Return the IP address of the magic hostname 'localhost'."""
811 if _localhost
is None:
812 _localhost
= socket
.gethostbyname('localhost')
817 """Return the IP address of the current host."""
819 if _thishost
is None:
820 _thishost
= socket
.gethostbyname(socket
.gethostname())
825 """Return the set of errors raised by the FTP class."""
827 if _ftperrors
is None:
829 _ftperrors
= ftplib
.all_errors
834 """Return an empty mimetools.Message object."""
836 if _noheaders
is None:
839 from cStringIO
import StringIO
841 from StringIO
import StringIO
842 _noheaders
= mimetools
.Message(StringIO(), 0)
843 _noheaders
.fp
.close() # Recycle file descriptor
850 """Class used by open_ftp() for cache of open FTP connections."""
852 def __init__(self
, user
, passwd
, host
, port
, dirs
,
853 timeout
=socket
._GLOBAL
_DEFAULT
_TIMEOUT
):
859 self
.timeout
= timeout
865 self
.ftp
= ftplib
.FTP()
866 self
.ftp
.connect(self
.host
, self
.port
, self
.timeout
)
867 self
.ftp
.login(self
.user
, self
.passwd
)
868 for dir in self
.dirs
:
871 def retrfile(self
, file, type):
874 if type in ('d', 'D'): cmd
= 'TYPE A'; isdir
= 1
875 else: cmd
= 'TYPE ' + type; isdir
= 0
877 self
.ftp
.voidcmd(cmd
)
878 except ftplib
.all_errors
:
880 self
.ftp
.voidcmd(cmd
)
882 if file and not isdir
:
883 # Try to retrieve as a file
886 conn
= self
.ftp
.ntransfercmd(cmd
)
887 except ftplib
.error_perm
, reason
:
888 if str(reason
)[:3] != '550':
889 raise IOError, ('ftp error', reason
), sys
.exc_info()[2]
891 # Set transfer mode to ASCII!
892 self
.ftp
.voidcmd('TYPE A')
893 # Try a directory listing. Verify that directory exists.
899 except ftplib
.error_perm
, reason
:
900 raise IOError, ('ftp error', reason
), sys
.exc_info()[2]
906 conn
= self
.ftp
.ntransfercmd(cmd
)
908 # Pass back both a suitably decorated object and a retrieval length
909 return (addclosehook(conn
[0].makefile('rb'),
910 self
.endtransfer
), conn
[1])
911 def endtransfer(self
):
928 """Base class for addinfo and addclosehook."""
930 def __init__(self
, fp
):
932 self
.read
= self
.fp
.read
933 self
.readline
= self
.fp
.readline
934 if hasattr(self
.fp
, "readlines"): self
.readlines
= self
.fp
.readlines
935 if hasattr(self
.fp
, "fileno"):
936 self
.fileno
= self
.fp
.fileno
938 self
.fileno
= lambda: None
939 if hasattr(self
.fp
, "__iter__"):
940 self
.__iter
__ = self
.fp
.__iter
__
941 if hasattr(self
.fp
, "next"):
942 self
.next
= self
.fp
.next
945 return '<%s at %r whose fp = %r>' % (self
.__class
__.__name
__,
951 self
.readlines
= None
953 if self
.fp
: self
.fp
.close()
956 class addclosehook(addbase
):
957 """Class to add a close hook to an open file."""
959 def __init__(self
, fp
, closehook
, *hookargs
):
960 addbase
.__init
__(self
, fp
)
961 self
.closehook
= closehook
962 self
.hookargs
= hookargs
967 self
.closehook(*self
.hookargs
)
968 self
.closehook
= None
971 class addinfo(addbase
):
972 """class to add an info() method to an open file."""
974 def __init__(self
, fp
, headers
):
975 addbase
.__init
__(self
, fp
)
976 self
.headers
= headers
981 class addinfourl(addbase
):
982 """class to add info() and geturl() methods to an open file."""
984 def __init__(self
, fp
, headers
, url
, code
=None):
985 addbase
.__init
__(self
, fp
)
986 self
.headers
= headers
1000 # Utilities to parse URLs (most of these return None for missing parts):
1001 # unwrap('<URL:type://host/path>') --> 'type://host/path'
1002 # splittype('type:opaquestring') --> 'type', 'opaquestring'
1003 # splithost('//host[:port]/path') --> 'host[:port]', '/path'
1004 # splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
1005 # splitpasswd('user:passwd') -> 'user', 'passwd'
1006 # splitport('host:port') --> 'host', 'port'
1007 # splitquery('/path?query') --> '/path', 'query'
1008 # splittag('/path#tag') --> '/path', 'tag'
1009 # splitattr('/path;attr1=value1;attr2=value2;...') ->
1010 # '/path', ['attr1=value1', 'attr2=value2', ...]
1011 # splitvalue('attr=value') --> 'attr', 'value'
1012 # unquote('abc%20def') -> 'abc def'
1013 # quote('abc def') -> 'abc%20def')
1022 return isinstance(x
, unicode)
1025 """toBytes(u"URL") --> 'URL'."""
1026 # Most URL schemes require ASCII. If that changes, the conversion
1028 if _is_unicode(url
):
1030 url
= url
.encode("ASCII")
1031 except UnicodeError:
1032 raise UnicodeError("URL " + repr(url
) +
1033 " contains non-ASCII characters")
1037 """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
1039 if url
[:1] == '<' and url
[-1:] == '>':
1040 url
= url
[1:-1].strip()
1041 if url
[:4] == 'URL:': url
= url
[4:].strip()
1046 """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
1048 if _typeprog
is None:
1050 _typeprog
= re
.compile('^([^/:]+):')
1052 match
= _typeprog
.match(url
)
1054 scheme
= match
.group(1)
1055 return scheme
.lower(), url
[len(scheme
) + 1:]
1060 """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
1062 if _hostprog
is None:
1064 _hostprog
= re
.compile('^//([^/?]*)(.*)$')
1066 match
= _hostprog
.match(url
)
1068 host_port
= match
.group(1)
1069 path
= match
.group(2)
1070 if path
and not path
.startswith('/'):
1072 return host_port
, path
1076 def splituser(host
):
1077 """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
1079 if _userprog
is None:
1081 _userprog
= re
.compile('^(.*)@(.*)$')
1083 match
= _userprog
.match(host
)
1084 if match
: return match
.group(1, 2)
1088 def splitpasswd(user
):
1089 """splitpasswd('user:passwd') -> 'user', 'passwd'."""
1091 if _passwdprog
is None:
1093 _passwdprog
= re
.compile('^([^:]*):(.*)$',re
.S
)
1095 match
= _passwdprog
.match(user
)
1096 if match
: return match
.group(1, 2)
1099 # splittag('/path#tag') --> '/path', 'tag'
1101 def splitport(host
):
1102 """splitport('host:port') --> 'host', 'port'."""
1104 if _portprog
is None:
1106 _portprog
= re
.compile('^(.*):([0-9]+)$')
1108 match
= _portprog
.match(host
)
1109 if match
: return match
.group(1, 2)
1113 def splitnport(host
, defport
=-1):
1114 """Split host and port, returning numeric port.
1115 Return given default port if no ':' found; defaults to -1.
1116 Return numerical port if a valid number is found after ':'.
1117 Return None if ':' but not a valid number."""
1119 if _nportprog
is None:
1121 _nportprog
= re
.compile('^(.*):(.*)$')
1123 match
= _nportprog
.match(host
)
1125 host
, port
= match
.group(1, 2)
1127 if not port
: raise ValueError, "no digits"
1132 return host
, defport
1135 def splitquery(url
):
1136 """splitquery('/path?query') --> '/path', 'query'."""
1138 if _queryprog
is None:
1140 _queryprog
= re
.compile('^(.*)\?([^?]*)$')
1142 match
= _queryprog
.match(url
)
1143 if match
: return match
.group(1, 2)
1148 """splittag('/path#tag') --> '/path', 'tag'."""
1150 if _tagprog
is None:
1152 _tagprog
= re
.compile('^(.*)#([^#]*)$')
1154 match
= _tagprog
.match(url
)
1155 if match
: return match
.group(1, 2)
1159 """splitattr('/path;attr1=value1;attr2=value2;...') ->
1160 '/path', ['attr1=value1', 'attr2=value2', ...]."""
1161 words
= url
.split(';')
1162 return words
[0], words
[1:]
1165 def splitvalue(attr
):
1166 """splitvalue('attr=value') --> 'attr', 'value'."""
1168 if _valueprog
is None:
1170 _valueprog
= re
.compile('^([^=]*)=(.*)$')
1172 match
= _valueprog
.match(attr
)
1173 if match
: return match
.group(1, 2)
1176 # urlparse contains a duplicate of this method to avoid a circular import. If
1177 # you update this method, also update the copy in urlparse. This code
1178 # duplication does not exist in Python3.
1180 _hexdig
= '0123456789ABCDEFabcdef'
1181 _hextochr
= dict((a
+ b
, chr(int(a
+ b
, 16)))
1182 for a
in _hexdig
for b
in _hexdig
)
1185 """unquote('abc%20def') -> 'abc def'."""
1191 for item
in res
[1:]:
1193 s
+= _hextochr
[item
[:2]] + item
[2:]
1196 except UnicodeDecodeError:
1197 s
+= unichr(int(item
[:2], 16)) + item
[2:]
1200 def unquote_plus(s
):
1201 """unquote('%7e/abc+def') -> '~/abc def'"""
1202 s
= s
.replace('+', ' ')
1205 always_safe
= ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
1206 'abcdefghijklmnopqrstuvwxyz'
1209 for i
, c
in zip(xrange(256), str(bytearray(xrange(256)))):
1210 _safe_map
[c
] = c
if (i
< 128 and c
in always_safe
) else '%{:02X}'.format(i
)
1213 def quote(s
, safe
='/'):
1214 """quote('abc def') -> 'abc%20def'
1216 Each part of a URL, e.g. the path info, the query, etc., has a
1217 different set of reserved characters that must be quoted.
1219 RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
1220 the following reserved characters.
1222 reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
1225 Each of these characters is reserved in some component of a URL,
1226 but not necessarily in all of them.
1228 By default, the quote function is intended for quoting the path
1229 section of a URL. Thus, it will not encode '/'. This character
1230 is reserved, but in typical usage the quote function is being
1231 called on a path where the existing slash characters are used as
1232 reserved characters.
1237 raise TypeError('None object cannot be quoted')
1239 cachekey
= (safe
, always_safe
)
1241 (quoter
, safe
) = _safe_quoters
[cachekey
]
1243 safe_map
= _safe_map
.copy()
1244 safe_map
.update([(c
, c
) for c
in safe
])
1245 quoter
= safe_map
.__getitem
__
1246 safe
= always_safe
+ safe
1247 _safe_quoters
[cachekey
] = (quoter
, safe
)
1248 if not s
.rstrip(safe
):
1250 return ''.join(map(quoter
, s
))
1252 def quote_plus(s
, safe
=''):
1253 """Quote the query fragment of a URL; replacing ' ' with '+'"""
1255 s
= quote(s
, safe
+ ' ')
1256 return s
.replace(' ', '+')
1257 return quote(s
, safe
)
1259 def urlencode(query
, doseq
=0):
1260 """Encode a sequence of two-element tuples or dictionary into a URL query string.
1262 If any values in the query arg are sequences and doseq is true, each
1263 sequence element is converted to a separate parameter.
1265 If the query arg is a sequence of two-element tuples, the order of the
1266 parameters in the output will match the order of parameters in the
1270 if hasattr(query
,"items"):
1272 query
= query
.items()
1274 # it's a bother at times that strings and string-like objects are
1277 # non-sequence items should not work with len()
1278 # non-empty strings will fail this
1279 if len(query
) and not isinstance(query
[0], tuple):
1281 # zero-length sequences of all types will get here and succeed,
1282 # but that's a minor nit - since the original implementation
1283 # allowed empty dicts that type of behavior probably should be
1284 # preserved for consistency
1286 ty
,va
,tb
= sys
.exc_info()
1287 raise TypeError, "not a valid non-string sequence or mapping object", tb
1291 # preserve old behavior
1293 k
= quote_plus(str(k
))
1294 v
= quote_plus(str(v
))
1295 l
.append(k
+ '=' + v
)
1298 k
= quote_plus(str(k
))
1299 if isinstance(v
, str):
1301 l
.append(k
+ '=' + v
)
1302 elif _is_unicode(v
):
1303 # is there a reasonable way to convert to ASCII?
1304 # encode generates a string, but "replace" or "ignore"
1305 # lose information and "strict" can raise UnicodeError
1306 v
= quote_plus(v
.encode("ASCII","replace"))
1307 l
.append(k
+ '=' + v
)
1310 # is this a sufficient test for sequence-ness?
1314 v
= quote_plus(str(v
))
1315 l
.append(k
+ '=' + v
)
1317 # loop over the sequence
1319 l
.append(k
+ '=' + quote_plus(str(elt
)))
1323 def getproxies_environment():
1324 """Return a dictionary of scheme -> proxy server URL mappings.
1326 Scan the environment for variables named <scheme>_proxy;
1327 this seems to be the standard convention. If you need a
1328 different way, you can pass a proxies dictionary to the
1329 [Fancy]URLopener constructor.
1333 for name
, value
in os
.environ
.items():
1335 if value
and name
[-6:] == '_proxy':
1336 proxies
[name
[:-6]] = value
1339 def proxy_bypass_environment(host
):
1340 """Test if proxies should not be used for a particular host.
1342 Checks the environment for a variable named no_proxy, which should
1343 be a list of DNS suffixes separated by commas, or '*' for all hosts.
1345 no_proxy
= os
.environ
.get('no_proxy', '') or os
.environ
.get('NO_PROXY', '')
1346 # '*' is special case for always bypass
1349 # strip port off host
1350 hostonly
, port
= splitport(host
)
1351 # check if the host ends with any of the DNS suffixes
1352 for name
in no_proxy
.split(','):
1353 if name
and (hostonly
.endswith(name
) or host
.endswith(name
)):
1355 # otherwise, don't bypass
if sys.platform == 'darwin':
    from _scproxy import _get_proxy_settings, _get_proxies

    def proxy_bypass_macosx_sysconf(host):
        """
        Return True iff this host shouldn't be accessed using a proxy

        This function uses the MacOSX framework SystemConfiguration
        to fetch the proxy information.
        """
        import re
        import socket
        from fnmatch import fnmatch

        hostonly, port = splitport(host)

        def ip2num(ipAddr):
            # Pack a (possibly partial) dotted-quad string such as
            # "169.254" into a 32-bit integer, zero-padding missing octets.
            parts = ipAddr.split('.')
            parts = map(int, parts)
            if len(parts) != 4:
                parts = (parts + [0, 0, 0, 0])[:4]
            return (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8) | parts[3]

        proxy_settings = _get_proxy_settings()

        # Check for simple host names:
        if '.' not in host:
            if proxy_settings['exclude_simple']:
                return True

        hostIP = None

        for value in proxy_settings.get('exceptions', ()):
            # Items in the list are strings like these: *.local, 169.254/16
            if not value: continue

            m = re.match(r"(\d+(?:\.\d+)*)(/\d+)?", value)
            if m is not None:
                # Numeric exception: compare network prefixes.  Resolve the
                # host to an IP address lazily, the first time it is needed.
                if hostIP is None:
                    try:
                        hostIP = socket.gethostbyname(hostonly)
                        hostIP = ip2num(hostIP)
                    except socket.error:
                        continue

                base = ip2num(m.group(1))
                mask = m.group(2)
                if mask is None:
                    # No explicit prefix length: infer one from the number
                    # of dotted components (e.g. "169.254" -> /16).
                    mask = 8 * (m.group(1).count('.') + 1)
                else:
                    mask = int(mask[1:])
                # Convert the prefix length to a low-bit shift count.
                mask = 32 - mask

                if (hostIP >> mask) == (base >> mask):
                    return True

            elif fnmatch(host, value):
                return True

        return False

    def getproxies_macosx_sysconf():
        """Return a dictionary of scheme -> proxy server URL mappings.

        This function uses the MacOSX framework SystemConfiguration
        to fetch the proxy information.
        """
        return _get_proxies()

    def proxy_bypass(host):
        """Test whether the proxy should be bypassed for *host*.

        Uses the no_proxy environment variable when any proxy environment
        variables are set, otherwise the system proxy configuration.
        """
        if getproxies_environment():
            return proxy_bypass_environment(host)
        else:
            return proxy_bypass_macosx_sysconf(host)

    def getproxies():
        return getproxies_environment() or getproxies_macosx_sysconf()
1438 elif os
.name
== 'nt':
1439 def getproxies_registry():
1440 """Return a dictionary of scheme -> proxy server URL mappings.
1442 Win32 uses the registry to store proxies.
1449 # Std module, so should be around - but you never know!
1452 internetSettings
= _winreg
.OpenKey(_winreg
.HKEY_CURRENT_USER
,
1453 r
'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
1454 proxyEnable
= _winreg
.QueryValueEx(internetSettings
,
1457 # Returned as Unicode but problems if not converted to ASCII
1458 proxyServer
= str(_winreg
.QueryValueEx(internetSettings
,
1460 if '=' in proxyServer
:
1461 # Per-protocol settings
1462 for p
in proxyServer
.split(';'):
1463 protocol
, address
= p
.split('=', 1)
1464 # See if address has a type:// prefix
1466 if not re
.match('^([^/:]+)://', address
):
1467 address
= '%s://%s' % (protocol
, address
)
1468 proxies
[protocol
] = address
1470 # Use one setting for all protocols
1471 if proxyServer
[:5] == 'http:':
1472 proxies
['http'] = proxyServer
1474 proxies
['http'] = 'http://%s' % proxyServer
1475 proxies
['https'] = 'https://%s' % proxyServer
1476 proxies
['ftp'] = 'ftp://%s' % proxyServer
1477 internetSettings
.Close()
1478 except (WindowsError, ValueError, TypeError):
1479 # Either registry key not found etc, or the value in an
1480 # unexpected format.
1481 # proxies already set up to be empty so nothing to do
1486 """Return a dictionary of scheme -> proxy server URL mappings.
1488 Returns settings gathered from the environment, if specified,
1492 return getproxies_environment() or getproxies_registry()
1494 def proxy_bypass_registry(host
):
1499 # Std modules, so should be around - but you never know!
1502 internetSettings
= _winreg
.OpenKey(_winreg
.HKEY_CURRENT_USER
,
1503 r
'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
1504 proxyEnable
= _winreg
.QueryValueEx(internetSettings
,
1506 proxyOverride
= str(_winreg
.QueryValueEx(internetSettings
,
1507 'ProxyOverride')[0])
1508 # ^^^^ Returned as Unicode but problems if not converted to ASCII
1509 except WindowsError:
1511 if not proxyEnable
or not proxyOverride
:
1513 # try to make a host list from name and IP address.
1514 rawHost
, port
= splitport(host
)
1517 addr
= socket
.gethostbyname(rawHost
)
1520 except socket
.error
:
1523 fqdn
= socket
.getfqdn(rawHost
)
1526 except socket
.error
:
1528 # make a check value list from the registry entry: replace the
1529 # '<local>' string by the localhost entry and the corresponding
1531 proxyOverride
= proxyOverride
.split(';')
1532 # now check if we match one of the registry values.
1533 for test
in proxyOverride
:
1534 if test
== '<local>':
1535 if '.' not in rawHost
:
1537 test
= test
.replace(".", r
"\.") # mask dots
1538 test
= test
.replace("*", r
".*") # change glob sequence
1539 test
= test
.replace("?", r
".") # change glob char
1541 # print "%s <--> %s" %( test, val )
1542 if re
.match(test
, val
, re
.I
):
1546 def proxy_bypass(host
):
1547 """Return a dictionary of scheme -> proxy server URL mappings.
1549 Returns settings gathered from the environment, if specified,
1553 if getproxies_environment():
1554 return proxy_bypass_environment(host
)
1556 return proxy_bypass_registry(host
)
1559 # By default use environment variables
1560 getproxies
= getproxies_environment
1561 proxy_bypass
= proxy_bypass_environment
1563 # Test and time quote() and unquote()
1566 for i
in range(256): s
= s
+ chr(i
)
1577 print round(t1
- t0
, 3), 'sec'
1580 def reporthook(blocknum
, blocksize
, totalsize
):
1581 # Report during remote transfers
1582 print "Block number: %d, Block size: %d, Total size: %d" % (
1583 blocknum
, blocksize
, totalsize
)
1591 'file://localhost/etc/passwd',
1592 'ftp://ftp.gnu.org/pub/README',
1593 'http://www.python.org/index.html',
1595 if hasattr(URLopener
, "open_https"):
1596 args
.append('https://synergy.as.cmu.edu/~geek/')
1599 print '-'*10, url
, '-'*10
1600 fn
, h
= urlretrieve(url
, None, reporthook
)
1604 for k
in h
.keys(): print k
+ ':', h
[k
]
1606 with
open(fn
, 'rb') as fp
:
1609 table
= string
.maketrans("", "")
1610 data
= data
.translate(table
, "\r")
1620 opts
, args
= getopt
.getopt(sys
.argv
[1:], "th")
1621 except getopt
.error
, msg
:
1623 print "Use -h for help"
1630 print "Usage: python urllib.py [-t] [url ...]"
1631 print "-t runs self-test;",
1632 print "otherwise, contents of urls are printed"
1640 print "Use -h for help"
1642 print urlopen(url
).read(),
1644 # Run test program when run as a script
1645 if __name__
== '__main__':