"""Parse (absolute and relative) URLs.

The urlparse module is based upon the following RFC specifications:

RFC 3986 (STD 66): "Uniform Resource Identifier (URI): Generic Syntax" by
T. Berners-Lee, R. Fielding and L. Masinter, January 2005.

RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden,
B. Carpenter and L. Masinter, December 1999.

RFC 2396: "Uniform Resource Identifiers (URI): Generic Syntax" by
T. Berners-Lee, R. Fielding, and L. Masinter, August 1998.

RFC 2368: "The mailto URL scheme" by P. Hoffman, L. Masinter and J. Zawinski,
July 1998.

RFC 1808: "Relative Uniform Resource Locators" by R. Fielding, UC Irvine,
June 1995.

RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter and
M. McCahill, December 1994.

RFC 3986 is considered the current standard, and any future changes to the
urlparse module should conform to it.  The urlparse module is currently not
entirely compliant with this RFC: to preserve de facto parsing behavior and
backward compatibility, some parsing quirks from older RFCs are retained.
The test cases in test_urlparse.py provide a good indicator of parsing
behavior.

"""

__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "parse_qs", "parse_qsl"]

# A classification of schemes ('' means apply by default)
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '', 'sftp']
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh']
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', '', 'sftp']
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

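# Illustrative note (not part of the original source): these lists control how
# much structure urlsplit()/urlparse() extract for a given scheme.  For
# example, 'mailto' is not in uses_query, so a '?' is left inside the path:
#
#     urlsplit('mailto:user@example.com?subject=hi')
#     --> SplitResult(scheme='mailto', netloc='',
#                     path='user@example.com?subject=hi', query='', fragment='')
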
# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

MAX_CACHE_SIZE = 20
_parse_cache = {}

def clear_cache():
    """Clear the parse cache."""
    _parse_cache.clear()


class ResultMixin(object):
    """Shared methods for the parsed result objects."""

    @property
    def username(self):
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                userinfo = userinfo.split(":", 1)[0]
            return userinfo
        return None

    @property
    def password(self):
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                return userinfo.split(":", 1)[1]
        return None

    @property
    def hostname(self):
        netloc = self.netloc.split('@')[-1]
        if '[' in netloc and ']' in netloc:
            return netloc.split(']')[0][1:].lower()
        elif ':' in netloc:
            return netloc.split(':')[0].lower()
        elif netloc == '':
            return None
        else:
            return netloc.lower()

    @property
    def port(self):
        netloc = self.netloc.split('@')[-1].split(']')[-1]
        if ':' in netloc:
            port = netloc.split(':')[1]
            return int(port, 10)
        else:
            return None

from collections import namedtuple

class SplitResult(namedtuple('SplitResult', 'scheme netloc path query fragment'), ResultMixin):

    __slots__ = ()

    def geturl(self):
        return urlunsplit(self)


class ParseResult(namedtuple('ParseResult', 'scheme netloc path params query fragment'), ResultMixin):

    __slots__ = ()

    def geturl(self):
        return urlunparse(self)

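# Illustrative example (not part of the original source): both result types
# inherit username/password/hostname/port from ResultMixin, e.g.
#
#     p = urlsplit('http://user:secret@[::1]:8080/index.html')
#     p.username --> 'user'        p.password --> 'secret'
#     p.hostname --> '::1'         p.port     --> 8080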

def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    tuple = urlsplit(url, scheme, allow_fragments)
    scheme, netloc, url, query, fragment = tuple
    if scheme in uses_params and ';' in url:
        url, params = _splitparams(url)
    else:
        params = ''
    return ParseResult(scheme, netloc, url, params, query, fragment)

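# Illustrative example (not part of the original source): urlparse() returns a
# 6-tuple and, for schemes listed in uses_params, splits ';params' off the
# last path segment:
#
#     urlparse('http://www.example.com/path;param?q=1#frag')
#     --> ParseResult(scheme='http', netloc='www.example.com', path='/path',
#                     params='param', query='q=1', fragment='frag')
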
def _splitparams(url):
    if '/' in url:
        i = url.find(';', url.rfind('/'))
        if i < 0:
            return url, ''
    else:
        i = url.find(';')
    return url[:i], url[i+1:]

def _splitnetloc(url, start=0):
    delim = len(url)   # position of end of domain part of url, default is end
    for c in '/?#':    # look for delimiters; the order is NOT important
        wdelim = url.find(c, start)        # find first of this delim
        if wdelim >= 0:                    # if found
            delim = min(delim, wdelim)     # use earliest delim position
    return url[start:delim], url[delim:]   # return (domain, rest)

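# Illustrative example (not part of the original source): _splitnetloc() cuts
# the netloc off at the first '/', '?' or '#' at or after `start`, e.g.
#
#     _splitnetloc('//www.example.com/path?q=1', 2)
#     --> ('www.example.com', '/path?q=1')
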
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    allow_fragments = bool(allow_fragments)
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http': # optimize the common case
            scheme = url[:i].lower()
            url = url[i+1:]
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
                if (('[' in netloc and ']' not in netloc) or
                        (']' in netloc and '[' not in netloc)):
                    raise ValueError("Invalid IPv6 URL")
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            v = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[key] = v
            return v
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            try:
                # make sure "url" is not actually a port number (in which case
                # "scheme" is really part of the path)
                _testportnum = int(url[i+1:])
            except ValueError:
                scheme, url = url[:i].lower(), url[i+1:]

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        if (('[' in netloc and ']' not in netloc) or
                (']' in netloc and '[' not in netloc)):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and scheme in uses_fragment and '#' in url:
        url, fragment = url.split('#', 1)
    if scheme in uses_query and '?' in url:
        url, query = url.split('?', 1)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return v

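# Illustrative example (not part of the original source): urlsplit() returns
# a 5-tuple and never splits off ';params':
#
#     urlsplit('http://www.example.com/path;param?q=1#frag')
#     --> SplitResult(scheme='http', netloc='www.example.com',
#                     path='/path;param', query='q=1', fragment='frag')
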
def urlunparse(data):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent)."""
    scheme, netloc, url, params, query, fragment = data
    if params:
        url = "%s;%s" % (url, params)
    return urlunsplit((scheme, netloc, url, query, fragment))

def urlunsplit(data):
    """Combine the elements of a tuple as returned by urlsplit() into a
    complete URL as a string. The data argument can be any five-item iterable.
    This may result in a slightly different, but equivalent URL, if the URL that
    was parsed originally had unnecessary delimiters (for example, a ? with an
    empty query; the RFC states that these are equivalent)."""
    scheme, netloc, url, query, fragment = data
    if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
        if url and url[:1] != '/': url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url

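# Illustrative example (not part of the original source): urlunsplit() is the
# near-inverse of urlsplit(); redundant delimiters may be dropped:
#
#     urlunsplit(('http', 'www.example.com', '/path', 'q=1', ''))
#     --> 'http://www.example.com/path?q=1'
#     urlunsplit(urlsplit('http://www.example.com/path?'))
#     --> 'http://www.example.com/path'
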
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter."""
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path and not params:
        path = bpath
        params = bparams
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))

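# Illustrative example (not part of the original source): urljoin() resolves a
# relative reference against a base URL, collapsing '.' and '..' segments:
#
#     urljoin('http://www.example.com/a/b/c.html', 'd.html')
#     --> 'http://www.example.com/a/b/d.html'
#     urljoin('http://www.example.com/a/b/c.html', '../d.html')
#     --> 'http://www.example.com/a/d.html'
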
def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    if '#' in url:
        s, n, p, a, q, frag = urlparse(url)
        defrag = urlunparse((s, n, p, a, q, ''))
        return defrag, frag
    else:
        return url, ''

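# Illustrative example (not part of the original source):
#
#     urldefrag('http://www.example.com/page.html#section2')
#     --> ('http://www.example.com/page.html', 'section2')
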
# unquote method for parse_qs and parse_qsl
# Cannot use directly from urllib as it would create a circular reference
# because urllib uses urlparse methods (urljoin).  If you update this function,
# update it also in urllib.  This code duplication does not exist in Python 3.

_hexdig = '0123456789ABCDEFabcdef'
_hextochr = dict((a+b, chr(int(a+b, 16)))
                 for a in _hexdig for b in _hexdig)

def unquote(s):
    """unquote('abc%20def') -> 'abc def'."""
    res = s.split('%')
    # fastpath
    if len(res) == 1:
        return s
    s = res[0]
    for item in res[1:]:
        try:
            s += _hextochr[item[:2]] + item[2:]
        except KeyError:
            s += '%' + item
        except UnicodeDecodeError:
            s += unichr(int(item[:2], 16)) + item[2:]
    return s

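# Illustrative example (not part of the original source): malformed escapes
# are passed through unchanged rather than raising an error:
#
#     unquote('a%20b%2Fc')   --> 'a b/c'
#     unquote('100%25%zz')   --> '100%%zz'
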
def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.
        A true value indicates that blanks should be retained as
        blank strings.  The default false value indicates that
        blank values are to be ignored and treated as if they were
        not included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.
    """
    dict = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        if name in dict:
            dict[name].append(value)
        else:
            dict[name] = [value]
    return dict

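# Illustrative example (not part of the original source): repeated names are
# collected into a list per key:
#
#     parse_qs('a=1&a=2&b=hello%20world')
#     --> {'a': ['1', '2'], 'b': ['hello world']}
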
def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors. If
        false (the default), errors are silently ignored. If true,
        errors raise a ValueError exception.

    Returns a list of (name, value) tuples.
    """
    pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
    r = []
    for name_value in pairs:
        if not name_value and not strict_parsing:
            continue
        nv = name_value.split('=', 1)
        if len(nv) != 2:
            if strict_parsing:
                raise ValueError, "bad query field: %r" % (name_value,)
            # Handle case of a control-name with no equal sign
            if keep_blank_values:
                nv.append('')
            else:
                continue
        if len(nv[1]) or keep_blank_values:
            name = unquote(nv[0].replace('+', ' '))
            value = unquote(nv[1].replace('+', ' '))
            r.append((name, value))

    return r
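
# Illustrative example (not part of the original source): '&' and ';' both
# separate fields, '+' decodes to a space, and order is preserved:
#
#     parse_qsl('a=1;b=two+words&c=')
#     --> [('a', '1'), ('b', 'two words')]
#     parse_qsl('a=1;b=two+words&c=', keep_blank_values=True)
#     --> [('a', '1'), ('b', 'two words'), ('c', '')]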