1 """ robotparser.py
2
3 Copyright (C) 2000 Bastian Kleineidam
4
5 You can choose between two licenses when using this package:
6 1) GNU GPLv2
7 2) PSF license for Python 2.2
8
9 The robots.txt Exclusion Protocol is implemented as specified in
10 http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
11 """
12 import urlparse
13 import urllib
14
15 __all__ = ["RobotFileParser"]
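
# Typical use, sketched here as a comment (the URL and agent name below are
# illustrative placeholders, not anything this module defines):
#
#   rp = RobotFileParser()
#   rp.set_url("http://www.example.com/robots.txt")
#   rp.read()
#   rp.can_fetch("MyCrawler/1.0", "http://www.example.com/some/page.html")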


class RobotFileParser:
    """ This class provides a set of methods to read, parse and answer
    questions about a single robots.txt file.

    """

    def __init__(self, url=''):
        self.entries = []
        self.default_entry = None
        self.disallow_all = False
        self.allow_all = False
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        """Returns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.

        """
        return self.last_checked

    def modified(self):
        """Sets the time the robots.txt file was last fetched to the
        current time.

        """
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
        self.url = url
        # keep only the network location (host) and path parts of the URL
        self.host, self.path = urlparse.urlparse(url)[1:3]

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
        opener = URLopener()
        f = opener.open(self.url)
        lines = [line.strip() for line in f]
        f.close()
        self.errcode = opener.errcode
        if self.errcode in (401, 403):
            # access to robots.txt is restricted: disallow everything
            self.disallow_all = True
        elif self.errcode >= 400:
            # no usable robots.txt (e.g. 404): everything is allowed
            self.allow_all = True
        elif self.errcode == 200 and lines:
            self.parse(lines)

    def _add_entry(self, entry):
        if "*" in entry.useragents:
            # the default entry is considered last
            if self.default_entry is None:
                # the first default entry wins
                self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.

        A user-agent: line is accepted even when it is not preceded
        by one or more blank lines.
        """
        # states:
        #   0: start state
        #   1: saw user-agent line
        #   2: saw an allow or disallow line
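        # Illustrative walk-through (an assumed example, not taken from the
        # RFC draft cited above):
        #
        #     User-agent: *        -> state 1, agent recorded in the entry
        #     Disallow: /private   -> state 2, rule recorded in the entry
        #     <blank line>         -> entry saved via _add_entry, state 0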
        state = 0
        linenumber = 0
        entry = Entry()

        for line in lines:
            linenumber += 1
            if not line:
                if state == 1:
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state == 2:
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], False))
                        state = 2
                elif line[0] == "allow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], True))
                        state = 2
        if state == 2:
            self._add_entry(entry)

    def can_fetch(self, useragent, url):
        """using the parsed robots.txt decide if useragent can fetch url"""
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # search for given user agent matches
        # the first match counts
        parsed_url = urlparse.urlparse(urllib.unquote(url))
        url = urlparse.urlunparse(('', '', parsed_url.path,
                                   parsed_url.params, parsed_url.query,
                                   parsed_url.fragment))
        url = urllib.quote(url)
        if not url:
            url = "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True
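
    # Matching precedence, summarized: entries for specific user agents are
    # consulted in file order, the "*" entry is tried only when none of them
    # match, and an agent with no applicable entry at all may fetch anything.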


    def __str__(self):
        return ''.join([str(entry) + "\n" for entry in self.entries])


class RuleLine:
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
       (allowance==False) followed by a path."""
    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        self.path = urllib.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        return self.path == "*" or filename.startswith(self.path)
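
    # Illustrative note: matching is a plain prefix test, so
    # RuleLine("/tmp", False) applies to "/tmp", "/tmp/" and "/tmpfiles".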

    def __str__(self):
        return (self.allowance and "Allow" or "Disallow") + ": " + self.path


class Entry:
    """An entry has one or more user-agents and zero or more rulelines"""
    def __init__(self):
        self.useragents = []
        self.rulelines = []

    def __str__(self):
        ret = []
        for agent in self.useragents:
            ret.extend(["User-agent: ", agent, "\n"])
        for line in self.rulelines:
            ret.extend([str(line), "\n"])
        return ''.join(ret)

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent == '*':
                # we have the catch-all agent
                return True
            agent = agent.lower()
            # a substring match suffices, e.g. "spam" matches "spambot"
            if agent in useragent:
                return True
        return False

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
        for line in self.rulelines:
            if line.applies_to(filename):
                return line.allowance
        return True


class URLopener(urllib.FancyURLopener):
    def __init__(self, *args):
        urllib.FancyURLopener.__init__(self, *args)
        self.errcode = 200

    def prompt_user_passwd(self, host, realm):
        ## If robots.txt file is accessible only with a password,
        ## we act as if the file wasn't there.
        return None, None

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        self.errcode = errcode
        return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
                                                        errmsg, headers)
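

# A minimal self-test (an addition to this listing, not part of the original
# Python 2.7.2 module): feed a small robots.txt straight to parse(), so no
# network access is needed, and check that the first matching rule line wins.
if __name__ == '__main__':
    rp = RobotFileParser()
    rp.parse("""\
User-agent: *
Allow: /cgi-bin/public/
Disallow: /cgi-bin/
""".splitlines())
    print(rp.can_fetch("AnyBot", "/cgi-bin/public/index.html"))  # True
    print(rp.can_fetch("AnyBot", "/cgi-bin/private.cgi"))        # False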