""" robotparser.py

    Copyright (C) 2000 Bastian Kleineidam

    You can choose between two licenses when using this package:
    1) GNU GPLv2
    2) PSF license for Python 2.2

    The robots.txt Exclusion Protocol is implemented as specified in
    http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
"""
import urlparse
import urllib

__all__ = ["RobotFileParser"]


class RobotFileParser:
    """ This class provides a set of methods to read, parse and answer
    questions about a single robots.txt file.

    """

    def __init__(self, url=''):
        self.entries = []
        self.default_entry = None
        self.disallow_all = False
        self.allow_all = False
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        """Returns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.

        """
        return self.last_checked

    def modified(self):
        """Sets the time the robots.txt file was last fetched to the
        current time.

        """
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
        self.url = url
        self.host, self.path = urlparse.urlparse(url)[1:3]

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
        opener = URLopener()
        f = opener.open(self.url)
        lines = [line.strip() for line in f]
        f.close()
        self.errcode = opener.errcode
        if self.errcode in (401, 403):
            self.disallow_all = True
        elif self.errcode >= 400:
            self.allow_all = True
        elif self.errcode == 200 and lines:
            self.parse(lines)
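
    # Typical use (an illustrative sketch only; the URL and agent name
    # below are placeholders, and read() performs a real network fetch):
    #
    #     rp = RobotFileParser("http://example.com/robots.txt")
    #     rp.read()
    #     rp.can_fetch("MyBot", "http://example.com/some/page.html")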

    def _add_entry(self, entry):
        if "*" in entry.useragents:
            # the default entry is considered last
            if self.default_entry is None:
                # the first default entry wins
                self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.

        The parser is lenient: a user-agent: line need not be
        preceded by one or more blank lines."""
        # states:
        #   0: start state
        #   1: saw user-agent line
        #   2: saw an allow or disallow line
        state = 0
        linenumber = 0
        entry = Entry()

        for line in lines:
            linenumber += 1
            if not line:
                if state == 1:
                    # a user-agent record with no rules: discard it
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state == 2:
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], False))
                        state = 2
                elif line[0] == "allow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], True))
                        state = 2
        if state == 2:
            self._add_entry(entry)
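
    # How the state machine above groups records, shown on a
    # hypothetical robots.txt (illustration only):
    #
    #     User-agent: BotA
    #     User-agent: BotB       <- state 1: agents collect in one entry
    #     Disallow: /tmp/        <- state 2: rules attach to that entry
    #                            <- blank line commits the entry
    #     User-agent: *
    #     Disallow: /cgi-bin/    <- the "*" record becomes default_entry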

    def can_fetch(self, useragent, url):
        """Using the parsed robots.txt, decide if useragent can fetch url."""
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # search for given user agent matches
        # the first match counts
        parsed_url = urlparse.urlparse(urllib.unquote(url))
        url = urlparse.urlunparse(('', '', parsed_url.path,
            parsed_url.params, parsed_url.query, parsed_url.fragment))
        url = urllib.quote(url)
        if not url:
            url = "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True
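
    # For example (hypothetical values): can_fetch("MyBot",
    # "http://example.com/a%20b.html") drops the scheme and host and
    # matches the rules against the re-quoted path "/a%20b.html".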

    def __str__(self):
        return ''.join([str(entry) + "\n" for entry in self.entries])


class RuleLine:
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
    (allowance==False) followed by a path."""
    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        self.path = urllib.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        return self.path == "*" or filename.startswith(self.path)

    def __str__(self):
        return (self.allowance and "Allow" or "Disallow") + ": " + self.path
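
# Note: RuleLine matching is a plain string-prefix test, e.g.
# (illustrative) RuleLine("/tmp", False).applies_to("/tmp/a.html") is
# True; an empty Disallow path is normalized to allow-all in __init__.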

class Entry:
    """An entry has one or more user-agents and zero or more rulelines"""
    def __init__(self):
        self.useragents = []
        self.rulelines = []

    def __str__(self):
        ret = []
        for agent in self.useragents:
            ret.extend(["User-agent: ", agent, "\n"])
        for line in self.rulelines:
            ret.extend([str(line), "\n"])
        return ''.join(ret)

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent == '*':
                # we have the catch-all agent
                return True
            agent = agent.lower()
            if agent in useragent:
                return True
        return False

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
        for line in self.rulelines:
            if line.applies_to(filename):
                return line.allowance
        return True


class URLopener(urllib.FancyURLopener):
    def __init__(self, *args):
        urllib.FancyURLopener.__init__(self, *args)
        self.errcode = 200

    def prompt_user_passwd(self, host, realm):
        ## If robots.txt file is accessible only with a password,
        ## we act as if the file wasn't there.
        return None, None

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        self.errcode = errcode
        return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
                                                        errmsg, headers)
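

if __name__ == '__main__':
    # A minimal offline sketch (not part of the module's API): feed a
    # hand-written robots.txt to parse() and query it.  The agent name
    # "CrawlerBot" and all paths below are made up for illustration.
    rp = RobotFileParser()
    rp.parse([
        "User-agent: *",
        "Allow: /private/public.html",   # more specific rule listed first
        "Disallow: /private/",
    ])
    print rp.can_fetch("CrawlerBot", "/private/public.html")  # True
    print rp.can_fetch("CrawlerBot", "/private/secret.html")  # False
    print rp.can_fetch("CrawlerBot", "/index.html")           # True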