"""Parse (absolute and relative) URLs.

urlparse module is based upon the following RFC specifications.

RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
and L. Masinter, January 2005.

RFC 2732 : "Format for Literal IPv6 Addresses in URL's by R.Hinden, B.Carpenter
and L.Masinter, December 1999.

RFC 2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T.
Berners-Lee, R. Fielding, and L. Masinter, August 1998.

RFC 2368: "The mailto URL scheme", by P.Hoffman, L Masinter, J. Zwinski, July 1998.

RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
1995.

RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
McCahill, December 1994

RFC 3986 is considered the current standard and any future changes to
urlparse module should conform with it.  The urlparse module is
currently not entirely compliant with this RFC due to defacto
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained.  The testcases in
test_urlparse.py provides a good indicator of parsing behavior.
"""
# Public API re-exported by "from urlparse import *".
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "parse_qs", "parse_qsl"]
# A classification of schemes ('' means apply by default).
# Schemes listed here allow relative references to be resolved
# against a base URL (see urljoin()).
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '', 'sftp',
                 'svn', 'svn+ssh']
# Schemes whose URLs carry a //netloc (authority) component.
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh']
# Schemes whose path may carry a ';parameters' suffix (see urlparse()).
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', '', 'sftp', 'tel']
# These are not actually used anymore, but should stay for backwards
# compatibility.  (They are undocumented, but have a public-looking name.)
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
# Schemes whose URLs may carry a ?query component.
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
# Schemes whose URLs may carry a #fragment component.
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero', '']
# Characters valid in scheme names (RFC 3986: ALPHA / DIGIT / "+" / "-" / ".")
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')
69 """Clear the parse cache."""
class ResultMixin(object):
    """Shared methods for the parsed result objects."""

    @property
    def username(self):
        # Everything before the last '@' is userinfo; before the first
        # ':' inside it is the user name.
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                userinfo = userinfo.split(":", 1)[0]
            return userinfo
        return None

    @property
    def password(self):
        # The part of userinfo after the first ':' is the password.
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                return userinfo.split(":", 1)[1]
        return None

    @property
    def hostname(self):
        # Strip any userinfo, then handle the bracketed IPv6 form,
        # a trailing :port, and the empty netloc, in that order.
        netloc = self.netloc.split('@')[-1]
        if '[' in netloc and ']' in netloc:
            return netloc.split(']')[0][1:].lower()
        elif ':' in netloc:
            return netloc.split(':')[0].lower()
        elif netloc == '':
            return None
        else:
            return netloc.lower()

    @property
    def port(self):
        # Strip userinfo and any bracketed IPv6 host before looking
        # for the ':port' suffix; out-of-range values yield None.
        netloc = self.netloc.split('@')[-1].split(']')[-1]
        if ':' in netloc:
            port = netloc.split(':')[1]
            if port:
                port = int(port, 10)
                # verify legal port
                if (0 <= port <= 65535):
                    return port
        return None
from collections import namedtuple

class SplitResult(namedtuple('SplitResult', 'scheme netloc path query fragment'), ResultMixin):
    """5-tuple result type returned by urlsplit()."""

    __slots__ = ()

    def geturl(self):
        # Reassemble the original URL from the five components.
        return urlunsplit(self)
class ParseResult(namedtuple('ParseResult', 'scheme netloc path params query fragment'), ResultMixin):
    """6-tuple result type returned by urlparse()."""

    __slots__ = ()

    def geturl(self):
        # Reassemble the original URL from the six components.
        return urlunparse(self)
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    # Delegate the real splitting to urlsplit(), then peel a ';params'
    # suffix off the path for schemes known to use parameters.
    # (Renamed the local from 'tuple' so it no longer shadows the builtin.)
    split = urlsplit(url, scheme, allow_fragments)
    scheme, netloc, url, query, fragment = split
    if scheme in uses_params and ';' in url:
        url, params = _splitparams(url)
    else:
        params = ''
    return ParseResult(scheme, netloc, url, params, query, fragment)
151 def _splitparams(url
):
153 i
= url
.find(';', url
.rfind('/'))
158 return url
[:i
], url
[i
+1:]
160 def _splitnetloc(url
, start
=0):
161 delim
= len(url
) # position of end of domain part of url, default is end
162 for c
in '/?#': # look for delimiters; the order is NOT important
163 wdelim
= url
.find(c
, start
) # find first of this delim
164 if wdelim
>= 0: # if found
165 delim
= min(delim
, wdelim
) # use earliest delim position
166 return url
[start
:delim
], url
[delim
:] # return (domain, rest)
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    allow_fragments = bool(allow_fragments)
    # Results are memoized; the key includes the argument types so that
    # str and unicode inputs are cached separately.
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http':  # optimize the common case
            scheme = url[:i].lower()
            url = url[i+1:]
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
                # Brackets must come in matched pairs for IPv6 literals.
                if (('[' in netloc and ']' not in netloc) or
                        (']' in netloc and '[' not in netloc)):
                    raise ValueError("Invalid IPv6 URL")
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            v = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[key] = v
            return v
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            # make sure "url" is not actually a port number (in which case
            # "scheme" is really part of the path)
            rest = url[i+1:]
            if not rest or any(c not in '0123456789' for c in rest):
                # not a port number
                scheme, url = url[:i].lower(), rest

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        if (('[' in netloc and ']' not in netloc) or
                (']' in netloc and '[' not in netloc)):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and '#' in url:
        url, fragment = url.split('#', 1)
    if '?' in url:
        url, query = url.split('?', 1)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return v
def urlunparse(data):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent)."""
    scheme, netloc, url, params, query, fragment = data
    # Re-attach the path parameters, then hand off to urlunsplit().
    if params:
        url = "%s;%s" % (url, params)
    return urlunsplit((scheme, netloc, url, query, fragment))
def urlunsplit(data):
    """Combine the elements of a tuple as returned by urlsplit() into a
    complete URL as a string.  The data argument can be any five-item iterable.
    This may result in a slightly different, but equivalent URL, if the URL that
    was parsed originally had unnecessary delimiters (for example, a ? with an
    empty query; the RFC states that these are equivalent)."""
    scheme, netloc, url, query, fragment = data
    # A '//' authority section is emitted when there is a netloc, or when
    # the scheme customarily has one and the path could be misread as one.
    if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter."""
    # Degenerate cases: an empty base or an empty url leaves the other
    # untouched.
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    # A different (or non-relative) scheme means url stands on its own.
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # Absolute path: keep it, only inherit the authority.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path and not params:
        # Same-document reference: inherit path/params (and query if absent).
        path = bpath
        params = bparams
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    # Merge the base path (minus its last segment) with the relative path.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Collapse '..' segments against their predecessor, one pass at a time.
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
304 """Removes any existing fragment from URL.
306 Returns a tuple of the defragmented URL and the fragment. If
307 the URL contained no fragments, the second element is the
311 s
, n
, p
, a
, q
, frag
= urlparse(url
)
312 defrag
= urlunparse((s
, n
, p
, a
, q
, ''))
def _is_unicode(x):
    # True when x is a Python 2 'unicode' string (as opposed to 'str').
    return isinstance(x, unicode)
# unquote method for parse_qs and parse_qsl
# Cannot use directly from urllib as it would create a circular reference
# because urllib uses urlparse methods (urljoin).  If you update this function,
# update it also in urllib.  This code duplication does not exist in Python 3.
331 _hexdig
= '0123456789ABCDEFabcdef'
332 _hextochr
= dict((a
+b
, chr(int(a
+b
,16)))
333 for a
in _hexdig
for b
in _hexdig
)
334 _asciire
= re
.compile('([\x00-\x7f]+)')
337 """unquote('abc%20def') -> 'abc def'."""
341 bits
= _asciire
.split(s
)
344 for i
in range(1, len(bits
), 2):
345 append(unquote(str(bits
[i
])).decode('latin1'))
355 for item
in bits
[1:]:
357 append(_hextochr
[item
[:2]])
def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.
        A true value indicates that blanks should be retained as
        blank strings.  The default false value indicates that
        blank values are to be ignored and treated as if they were
        not included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.

    Returns a dict mapping each name to a list of its values.
    """
    # Renamed the accumulator from 'dict' so it no longer shadows the builtin.
    parsed = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        if name in parsed:
            parsed[name].append(value)
        else:
            parsed[name] = [value]
    return parsed
def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors.  If
        false (the default), errors are silently ignored.  If true,
        errors raise a ValueError exception.

    Returns a list, as G-d intended.
    """
    # Both '&' and ';' are accepted as pair separators.
    pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
    r = []
    for name_value in pairs:
        if not name_value and not strict_parsing:
            continue
        nv = name_value.split('=', 1)
        if len(nv) != 2:
            if strict_parsing:
                # Parenthesized raise form: identical behavior on Python 2,
                # replacing the legacy 'raise ValueError, msg' statement.
                raise ValueError("bad query field: %r" % (name_value,))
            # Handle case of a control-name with no equal sign
            if keep_blank_values:
                nv.append('')
            else:
                continue
        if len(nv[1]) or keep_blank_values:
            name = unquote(nv[0].replace('+', ' '))
            value = unquote(nv[1].replace('+', ' '))
            r.append((name, value))
    return r