4710c53d 1""" robotparser.py\r
2\r
3 Copyright (C) 2000 Bastian Kleineidam\r
4\r
5 You can choose between two licenses when using this package:\r
6 1) GNU GPLv2\r
7 2) PSF license for Python 2.2\r
8\r
9 The robots.txt Exclusion Protocol is implemented as specified in\r
10 http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html\r
11"""\r
12import urlparse\r
13import urllib\r
14\r
15__all__ = ["RobotFileParser"]\r
16\r
17\r
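# A minimal usage sketch (illustrative only, not part of the original module;
# the URL and agent name below are placeholders):
#
#     rp = RobotFileParser()
#     rp.set_url("http://www.example.com/robots.txt")
#     rp.read()        # fetch and parse the remote robots.txt
#     rp.modified()    # record the fetch time; mtime() later returns it
#     if rp.can_fetch("MyCrawler", "http://www.example.com/some/page.html"):
#         pass         # the crawler may download the page
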
class RobotFileParser:
    """ This class provides a set of methods to read, parse and answer
    questions about a single robots.txt file.

    """

    def __init__(self, url=''):
        self.entries = []
        self.default_entry = None
        self.disallow_all = False
        self.allow_all = False
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        """Returns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.

        """
        return self.last_checked

    def modified(self):
        """Sets the time the robots.txt file was last fetched to the
        current time.

        """
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
        self.url = url
        self.host, self.path = urlparse.urlparse(url)[1:3]

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
        opener = URLopener()
        f = opener.open(self.url)
        lines = [line.strip() for line in f]
        f.close()
        self.errcode = opener.errcode
        if self.errcode in (401, 403):
            self.disallow_all = True
        elif self.errcode >= 400:
            self.allow_all = True
        elif self.errcode == 200 and lines:
            self.parse(lines)

    def _add_entry(self, entry):
        if "*" in entry.useragents:
            # the default entry is considered last
            if self.default_entry is None:
                # the first default entry wins
                self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """parse the input lines from a robots.txt file.
           We allow that a user-agent: line is not preceded by
           one or more blank lines."""
        # states:
        #   0: start state
        #   1: saw user-agent line
        #   2: saw an allow or disallow line
        state = 0
        linenumber = 0
        entry = Entry()

        for line in lines:
            linenumber += 1
            if not line:
                if state == 1:
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state == 2:
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], False))
                        state = 2
                elif line[0] == "allow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], True))
                        state = 2
        if state == 2:
            self._add_entry(entry)

    def can_fetch(self, useragent, url):
        """using the parsed robots.txt decide if useragent can fetch url"""
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # search for given user agent matches
        # the first match counts
        parsed_url = urlparse.urlparse(urllib.unquote(url))
        url = urlparse.urlunparse(('', '', parsed_url.path,
            parsed_url.params, parsed_url.query, parsed_url.fragment))
        url = urllib.quote(url)
        if not url:
            url = "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True

    def __str__(self):
        return ''.join([str(entry) + "\n" for entry in self.entries])


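# Note on matching order (an illustrative sketch, not from the original source):
# can_fetch() keeps only the quoted path/params/query of the URL and compares it
# against the RuleLines in the order they appeared in robots.txt; the first
# matching rule decides.
#
#     rp = RobotFileParser()
#     rp.parse([
#         "User-agent: *",
#         "Allow: /private/public.html",
#         "Disallow: /private/",
#     ])
#     rp.can_fetch("AnyBot", "http://example.org/private/public.html")  # True
#     rp.can_fetch("AnyBot", "http://example.org/private/secret.html")  # False
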
class RuleLine:
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
       (allowance==False) followed by a path."""
    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        self.path = urllib.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        return self.path == "*" or filename.startswith(self.path)

    def __str__(self):
        return (self.allowance and "Allow" or "Disallow") + ": " + self.path


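# Illustrative examples of RuleLine behaviour (not part of the original file):
#
#     RuleLine("/cgi-bin/", False).applies_to("/cgi-bin/search")   # True: prefix match
#     RuleLine("/cgi-bin/", False).applies_to("/docs/index.html")  # False
#     RuleLine("", False).allowance  # True: an empty Disallow value means "allow all"
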
class Entry:
    """An entry has one or more user-agents and zero or more rulelines"""
    def __init__(self):
        self.useragents = []
        self.rulelines = []

    def __str__(self):
        ret = []
        for agent in self.useragents:
            ret.extend(["User-agent: ", agent, "\n"])
        for line in self.rulelines:
            ret.extend([str(line), "\n"])
        return ''.join(ret)

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent == '*':
                # we have the catch-all agent
                return True
            agent = agent.lower()
            if agent in useragent:
                return True
        return False

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
        for line in self.rulelines:
            if line.applies_to(filename):
                return line.allowance
        return True


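# Illustrative examples of Entry.applies_to() matching (not part of the original
# file): the agent name is taken from the token before "/" and the comparison is
# a case-insensitive substring test.
#
#     e = Entry()
#     e.useragents.append("FigTree")
#     e.applies_to("figtree/1.0")         # True: version suffix is ignored
#     e.applies_to("Mozilla-compatible")  # False
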
class URLopener(urllib.FancyURLopener):
    def __init__(self, *args):
        urllib.FancyURLopener.__init__(self, *args)
        self.errcode = 200

    def prompt_user_passwd(self, host, realm):
        ## If robots.txt file is accessible only with a password,
        ## we act as if the file wasn't there.
        return None, None

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        self.errcode = errcode
        return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
                                                        errmsg, headers)
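

if __name__ == "__main__":
    # Minimal self-contained demo (not part of the original module): parse a
    # hand-written robots.txt and print the stored entries plus a few access
    # decisions.  The agent names and paths below are made up.
    parser = RobotFileParser()
    parser.parse([
        "User-agent: figtree",
        "Disallow: /tmp",
        "",
        "User-agent: *",
        "Disallow: /cgi-bin/",
    ])
    print parser                                               # the named entry only
    print parser.can_fetch("figtree/1.0", "/tmp/index.html")   # False
    print parser.can_fetch("OtherBot", "/tmp/index.html")      # True
    print parser.can_fetch("OtherBot", "/cgi-bin/search?q=x")  # False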