+++ /dev/null
-"""Parse (absolute and relative) URLs.\r
-\r
-urlparse module is based upon the following RFC specifications.\r
-\r
-RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding\r
-and L. Masinter, January 2005.\r
-\r
-RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden, B. Carpenter\r
-and L. Masinter, December 1999.\r
-\r
-RFC 2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T.\r
-Berners-Lee, R. Fielding, and L. Masinter, August 1998.\r
-\r
-RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zwinski, July 1998.\r
-\r
-RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June\r
-1995.\r
-\r
-RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.\r
-McCahill, December 1994\r
-\r
-RFC 3986 is considered the current standard and any future changes to\r
-the urlparse module should conform with it. The urlparse module is\r
-currently not entirely compliant with this RFC due to de facto\r
-scenarios for parsing, and for backward compatibility purposes, some\r
-parsing quirks from older RFCs are retained. The test cases in\r
-test_urlparse.py provide a good indicator of parsing behavior.\r
-\r
-"""\r
-\r
-__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",\r
- "urlsplit", "urlunsplit", "parse_qs", "parse_qsl"]\r
-\r
-# A classification of schemes ('' means apply by default)\r
-uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',\r
- 'wais', 'file', 'https', 'shttp', 'mms',\r
- 'prospero', 'rtsp', 'rtspu', '', 'sftp']\r
-uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',\r
- 'imap', 'wais', 'file', 'mms', 'https', 'shttp',\r
- 'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',\r
- 'svn', 'svn+ssh', 'sftp','nfs','git', 'git+ssh']\r
-non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',\r
- 'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']\r
-uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',\r
- 'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',\r
- 'mms', '', 'sftp']\r
-uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',\r
- 'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']\r
-uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',\r
- 'nntp', 'wais', 'https', 'shttp', 'snews',\r
- 'file', 'prospero', '']\r
-\r
-# Characters valid in scheme names\r
-scheme_chars = ('abcdefghijklmnopqrstuvwxyz'\r
- 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'\r
- '0123456789'\r
- '+-.')\r
-\r
-MAX_CACHE_SIZE = 20\r
-_parse_cache = {}\r
-\r
-def clear_cache():\r
- """Clear the parse cache."""\r
- _parse_cache.clear()\r
-\r
-\r
class ResultMixin(object):
    """Shared accessors for the parsed result objects.

    The host class must provide a ``netloc`` string attribute.  Every
    property below is derived from ``netloc`` on each access; nothing is
    cached.
    """

    @property
    def username(self):
        """Return the user name from ``user[:password]@host``, or None.

        None is returned when the netloc contains no '@'.
        """
        netloc = self.netloc
        if "@" in netloc:
            # Split on the LAST '@' so that an '@' inside the userinfo
            # does not cut it short.
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                userinfo = userinfo.split(":", 1)[0]
            return userinfo
        return None

    @property
    def password(self):
        """Return the password from ``user:password@host``, or None.

        None is returned when there is no '@' or the userinfo has no ':'.
        """
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                # Only the first ':' separates user from password.
                return userinfo.split(":", 1)[1]
        return None

    @property
    def hostname(self):
        """Return the lower-cased host name, or None for an empty host.

        Userinfo and port are removed; for a bracketed literal such as
        ``[::1]:80`` the text between the brackets is returned.
        """
        netloc = self.netloc.split('@')[-1]
        if '[' in netloc and ']' in netloc:
            # Bracketed literal: keep what is between '[' and ']'.
            return netloc.split(']')[0][1:].lower()
        elif ':' in netloc:
            return netloc.split(':')[0].lower()
        elif netloc == '':
            return None
        else:
            return netloc.lower()

    @property
    def port(self):
        """Return the port as an int, or None when absent or empty.

        Raises ValueError if the port text is present but not a base-10
        integer.
        """
        # Drop userinfo, then anything up to a closing ']' so that colons
        # inside a bracketed literal are not mistaken for the port separator.
        netloc = self.netloc.split('@')[-1].split(']')[-1]
        if ':' in netloc:
            port = netloc.split(':')[1]
            if not port:
                # "host:" -- an empty port is legal URL syntax and means
                # "no port given"; int('') would raise ValueError here.
                return None
            return int(port, 10)
        return None
-\r
-from collections import namedtuple\r
-\r
_SplitResultBase = namedtuple('SplitResult', 'scheme netloc path query fragment')


class SplitResult(_SplitResultBase, ResultMixin):
    """Five-field tuple (scheme, netloc, path, query, fragment).

    Adds the netloc-derived accessors from ResultMixin on top of the
    named-tuple fields.
    """

    __slots__ = ()

    def geturl(self):
        """Reassemble the fields into a URL string using urlunsplit()."""
        return urlunsplit(self)
-\r
-\r
_ParseResultBase = namedtuple('ParseResult', 'scheme netloc path params query fragment')


class ParseResult(_ParseResultBase, ResultMixin):
    """Six-field tuple (scheme, netloc, path, params, query, fragment).

    Adds the netloc-derived accessors from ResultMixin on top of the
    named-tuple fields.
    """

    __slots__ = ()

    def geturl(self):
        """Reassemble the fields into a URL string using urlunparse()."""
        return urlunparse(self)
-\r
-\r
def urlparse(url, scheme='', allow_fragments=True):
    """Split a URL into six components.

    The general shape is
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    and the result is a ParseResult
    (scheme, netloc, path, params, query, fragment).

    Components are not broken down further (netloc stays one string) and
    % escapes are not expanded.  ``scheme`` and ``allow_fragments`` are
    passed straight through to urlsplit().
    """
    scheme, netloc, path, query, fragment = urlsplit(url, scheme,
                                                     allow_fragments)
    params = ''
    # Only schemes listed in uses_params get ';params' split off the path.
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    return ParseResult(scheme, netloc, path, params, query, fragment)
-\r
-def _splitparams(url):\r
- if '/' in url:\r
- i = url.find(';', url.rfind('/'))\r
- if i < 0:\r
- return url, ''\r
- else:\r
- i = url.find(';')\r
- return url[:i], url[i+1:]\r
-\r
-def _splitnetloc(url, start=0):\r
- delim = len(url) # position of end of domain part of url, default is end\r
- for c in '/?#': # look for delimiters; the order is NOT important\r
- wdelim = url.find(c, start) # find first of this delim\r
- if wdelim >= 0: # if found\r
- delim = min(delim, wdelim) # use earliest delim position\r
- return url[start:delim], url[delim:] # return (domain, rest)\r
-\r
def urlsplit(url, scheme='', allow_fragments=True):
    """Split a URL into five components.

    The general shape is
    <scheme>://<netloc>/<path>?<query>#<fragment>
    and the result is a SplitResult
    (scheme, netloc, path, query, fragment).

    Components are not broken down further (netloc stays one string) and
    % escapes are not expanded.  ``scheme`` is the value used when the URL
    itself carries none.  Results are memoised in the module-level parse
    cache.

    Raises ValueError("Invalid IPv6 URL") when the netloc contains only
    one of '[' and ']'.
    """
    allow_fragments = bool(allow_fragments)
    # The argument types are part of the key so that equal-comparing values
    # of different string types get separate cache entries.
    key = url, scheme, allow_fragments, type(url), type(scheme)
    hit = _parse_cache.get(key, None)
    if hit:
        return hit
    if len(_parse_cache) >= MAX_CACHE_SIZE:
        # Avoid runaway growth: throw the whole cache away.
        clear_cache()

    netloc = query = fragment = ''
    colon = url.find(':')
    if colon > 0:
        prefix = url[:colon]
        if prefix == 'http':
            # Fast path for the common case; note that it does not consult
            # uses_fragment / uses_query.
            scheme = prefix.lower()
            url = url[colon + 1:]
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
                # Exactly one kind of bracket present means a broken literal.
                if ('[' in netloc) != (']' in netloc):
                    raise ValueError("Invalid IPv6 URL")
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            result = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[key] = result
            return result
        if all(ch in scheme_chars for ch in prefix):
            try:
                # Make sure the text after ':' is not actually a port
                # number (in which case the "scheme" is really part of
                # the path).
                int(url[colon + 1:])
            except ValueError:
                scheme, url = prefix.lower(), url[colon + 1:]

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        if ('[' in netloc) != (']' in netloc):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and scheme in uses_fragment and '#' in url:
        url, fragment = url.split('#', 1)
    if scheme in uses_query and '?' in url:
        url, query = url.split('?', 1)
    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = result
    return result
-\r
def urlunparse(data):
    """Rebuild a URL string from a six-item (scheme, netloc, path, params,
    query, fragment) sequence.

    Non-empty params are re-attached to the path with ';' and the rest is
    delegated to urlunsplit().  The result may differ slightly from the
    originally parsed URL while being equivalent, e.g. when the original
    had redundant delimiters such as a '?' with an empty query.
    """
    scheme, netloc, path, params, query, fragment = data
    path_with_params = "%s;%s" % (path, params) if params else path
    return urlunsplit((scheme, netloc, path_with_params, query, fragment))
-\r
def urlunsplit(data):
    """Combine a five-item (scheme, netloc, path, query, fragment) iterable
    into a complete URL string.

    The result may differ slightly from the originally parsed URL while
    being equivalent, for example when the original had unnecessary
    delimiters such as a '?' with an empty query.
    """
    scheme, netloc, url, query, fragment = data
    # A '//' authority part is written when there is a netloc, or when the
    # scheme is one that uses a netloc and the path does not already start
    # with '//'.
    wants_authority = netloc or (
        scheme and scheme in uses_netloc and url[:2] != '//')
    if wants_authority:
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
-\r
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    An empty ``base`` returns ``url`` unchanged and an empty ``url``
    returns ``base``.  ``url`` is also returned unchanged when its scheme
    differs from the base's or is not listed in uses_relative.
    """
    if not base:
        return url
    if not url:
        return base

    # The base fragment is never used, so only five fields are unpacked.
    bscheme, bnetloc, bpath, bparams, bquery = \
        urlparse(base, '', allow_fragments)[:5]
    scheme, netloc, path, params, query, fragment = \
        urlparse(url, bscheme, allow_fragments)

    if not (scheme == bscheme and scheme in uses_relative):
        return url
    if scheme in uses_netloc:
        if netloc:
            # The reference has its own authority: nothing comes from base.
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # Absolute path: only scheme and netloc are inherited.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path and not params:
        # Reference without a path: reuse the base path and params, and
        # the base query too unless the reference supplies one.
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, bpath,
                           bparams, query, fragment))

    # Merge: base path minus its last segment, followed by the reference.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    segments = [seg for seg in segments if seg != '.']

    # Repeatedly drop the first interior "<name>/.." pair, restarting the
    # scan after each removal, until a full pass removes nothing.
    removed = True
    while removed:
        removed = False
        for idx in range(1, len(segments) - 1):
            if (segments[idx] == '..'
                    and segments[idx - 1] not in ('', '..')):
                del segments[idx - 1:idx + 1]
                removed = True
                break

    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        # A trailing '..' consumes the segment before it.
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
-\r
def urldefrag(url):
    """Remove any existing fragment from ``url``.

    Returns a tuple (defragmented URL, fragment).  If the URL contains no
    '#', it is returned unchanged and the second element is the empty
    string.
    """
    if '#' not in url:
        return url, ''
    parts = urlparse(url)
    fragment = parts[5]
    # Rebuild from the first five fields with the fragment blanked out.
    return urlunparse(parts[:5] + ('',)), fragment
-\r
-# unquote method for parse_qs and parse_qsl\r
-# Cannot use directly from urllib as it would create a circular reference\r
-# because urllib uses urlparse methods (urljoin). If you update this function,\r
-# update it also in urllib.  This code duplication does not exist in Python3.\r
-\r
-_hexdig = '0123456789ABCDEFabcdef'\r
-_hextochr = dict((a+b, chr(int(a+b,16)))\r
- for a in _hexdig for b in _hexdig)\r
-\r
-def unquote(s):\r
- """unquote('abc%20def') -> 'abc def'."""\r
- res = s.split('%')\r
- # fastpath\r
- if len(res) == 1:\r
- return s\r
- s = res[0]\r
- for item in res[1:]:\r
- try:\r
- s += _hextochr[item[:2]] + item[2:]\r
- except KeyError:\r
- s += '%' + item\r
- except UnicodeDecodeError:\r
- s += unichr(int(item[:2], 16)) + item[2:]\r
- return s\r
-\r
def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.
        A true value indicates that blanks should be retained as
        blank strings.  The default false value indicates that
        blank values are to be ignored and treated as if they were
        not included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.

    Returns a dictionary mapping each name to the list of its values, in
    the order parse_qsl() produced them.
    """
    grouped = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        grouped.setdefault(name, []).append(value)
    return grouped
-\r
def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors. If
        false (the default), errors are silently ignored. If true,
        errors raise a ValueError exception.

    Returns a list of (name, value) tuples in the order the fields appear
    in ``qs``.

    Raises ValueError when ``strict_parsing`` is true and a field has no
    '=' sign.
    """
    # Fields may be separated by either '&' or ';'.
    pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
    r = []
    for name_value in pairs:
        if not name_value and not strict_parsing:
            continue
        nv = name_value.split('=', 1)
        if len(nv) != 2:
            if strict_parsing:
                # Call form of raise: valid on both Python 2 and Python 3,
                # unlike the "raise E, msg" statement form.
                raise ValueError("bad query field: %r" % (name_value,))
            # Handle case of a control-name with no equal sign
            if keep_blank_values:
                nv.append('')
            else:
                continue
        if len(nv[1]) or keep_blank_values:
            # '+' encodes a space; translate it before undoing % escapes.
            name = unquote(nv[0].replace('+', ' '))
            value = unquote(nv[1].replace('+', ' '))
            r.append((name, value))

    return r