""" robotparser.py

    Copyright (C) 2000 Bastian Kleineidam

    You can choose between two licenses when using this package:
    1) GNU GPLv2
    2) PSF license for Python 2.2

    The robots.txt Exclusion Protocol is implemented as specified in
    http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
"""
import urlparse
import urllib

__all__ = ["RobotFileParser"]


class RobotFileParser:
    """ This class provides a set of methods to read, parse and answer
    questions about a single robots.txt file.

    """

    def __init__(self, url=''):
        self.entries = []
        self.default_entry = None
        self.disallow_all = False
        self.allow_all = False
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        """Returns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.

        """
        return self.last_checked

    def modified(self):
        """Sets the time the robots.txt file was last fetched to the
        current time.

        """
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
        self.url = url
        self.host, self.path = urlparse.urlparse(url)[1:3]

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
        opener = URLopener()
        f = opener.open(self.url)
        lines = [line.strip() for line in f]
        f.close()
        self.errcode = opener.errcode
        if self.errcode in (401, 403):
            self.disallow_all = True
        elif self.errcode >= 400:
            self.allow_all = True
        elif self.errcode == 200 and lines:
            self.parse(lines)
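
    # Typical use (an illustrative sketch only; the URL and agent name
    # below are placeholders, and read() performs a real network fetch):
    #
    #     rp = RobotFileParser("http://example.com/robots.txt")
    #     rp.read()
    #     rp.can_fetch("MyBot", "http://example.com/some/page.html")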

    def _add_entry(self, entry):
        if "*" in entry.useragents:
            # the default entry is considered last
            if self.default_entry is None:
                # the first default entry wins
                self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.

        The parser is lenient: a user-agent: line need not be
        preceded by one or more blank lines."""
        # states:
        #   0: start state
        #   1: saw user-agent line
        #   2: saw an allow or disallow line
        state = 0
        linenumber = 0
        entry = Entry()

        for line in lines:
            linenumber += 1
            if not line:
                if state == 1:
                    # a user-agent record with no rules: discard it
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state == 2:
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], False))
                        state = 2
                elif line[0] == "allow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], True))
                        state = 2
        if state == 2:
            self._add_entry(entry)
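
    # How the state machine above groups records, shown on a
    # hypothetical robots.txt (illustration only):
    #
    #     User-agent: BotA
    #     User-agent: BotB       <- state 1: agents collect in one entry
    #     Disallow: /tmp/        <- state 2: rules attach to that entry
    #                            <- blank line commits the entry
    #     User-agent: *
    #     Disallow: /cgi-bin/    <- the "*" record becomes default_entry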

    def can_fetch(self, useragent, url):
        """Using the parsed robots.txt, decide if useragent can fetch url."""
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # search for given user agent matches
        # the first match counts
        parsed_url = urlparse.urlparse(urllib.unquote(url))
        url = urlparse.urlunparse(('', '', parsed_url.path,
            parsed_url.params, parsed_url.query, parsed_url.fragment))
        url = urllib.quote(url)
        if not url:
            url = "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True
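
    # For example (hypothetical values): can_fetch("MyBot",
    # "http://example.com/a%20b.html") drops the scheme and host and
    # matches the rules against the re-quoted path "/a%20b.html".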

    def __str__(self):
        return ''.join([str(entry) + "\n" for entry in self.entries])


class RuleLine:
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
    (allowance==False) followed by a path."""
    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        self.path = urllib.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        return self.path == "*" or filename.startswith(self.path)

    def __str__(self):
        return (self.allowance and "Allow" or "Disallow") + ": " + self.path
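
# Note: RuleLine matching is a plain string-prefix test, e.g.
# (illustrative) RuleLine("/tmp", False).applies_to("/tmp/a.html") is
# True; an empty Disallow path is normalized to allow-all in __init__.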

class Entry:
    """An entry has one or more user-agents and zero or more rulelines"""
    def __init__(self):
        self.useragents = []
        self.rulelines = []

    def __str__(self):
        ret = []
        for agent in self.useragents:
            ret.extend(["User-agent: ", agent, "\n"])
        for line in self.rulelines:
            ret.extend([str(line), "\n"])
        return ''.join(ret)

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent == '*':
                # we have the catch-all agent
                return True
            agent = agent.lower()
            if agent in useragent:
                return True
        return False

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
        for line in self.rulelines:
            if line.applies_to(filename):
                return line.allowance
        return True


class URLopener(urllib.FancyURLopener):
    def __init__(self, *args):
        urllib.FancyURLopener.__init__(self, *args)
        self.errcode = 200

    def prompt_user_passwd(self, host, realm):
        ## If robots.txt file is accessible only with a password,
        ## we act as if the file wasn't there.
        return None, None

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        self.errcode = errcode
        return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
                                                        errmsg, headers)
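

if __name__ == '__main__':
    # A minimal offline sketch (not part of the module's API): feed a
    # hand-written robots.txt to parse() and query it.  The agent name
    # "CrawlerBot" and all paths below are made up for illustration.
    rp = RobotFileParser()
    rp.parse([
        "User-agent: *",
        "Allow: /private/public.html",   # more specific rule listed first
        "Disallow: /private/",
    ])
    print rp.can_fetch("CrawlerBot", "/private/public.html")  # True
    print rp.can_fetch("CrawlerBot", "/private/secret.html")  # False
    print rp.can_fetch("CrawlerBot", "/index.html")           # True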