+++ /dev/null
-""" robotparser.py\r
-\r
- Copyright (C) 2000 Bastian Kleineidam\r
-\r
- You can choose between two licenses when using this package:\r
- 1) GNU GPLv2\r
- 2) PSF license for Python 2.2\r
-\r
- The robots.txt Exclusion Protocol is implemented as specified in\r
- http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html\r
-"""\r
-import urlparse\r
-import urllib\r
-\r
-__all__ = ["RobotFileParser"]\r
-\r
-\r
-class RobotFileParser:
-    """ This class provides a set of methods to read, parse and answer
-    questions about a single robots.txt file.
-
-    """
-
-    def __init__(self, url=''):
-        self.entries = []
-        self.default_entry = None
-        self.disallow_all = False
-        self.allow_all = False
-        self.set_url(url)
-        self.last_checked = 0
-
-    def mtime(self):
-        """Returns the time the robots.txt file was last fetched.
-
-        This is useful for long-running web spiders that need to
-        check for new robots.txt files periodically.
-
-        """
-        return self.last_checked
-
-    def modified(self):
-        """Sets the time the robots.txt file was last fetched to the
-        current time.
-
-        """
-        import time
-        self.last_checked = time.time()
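-
-    # Illustrative refresh pattern for a long-running spider (a sketch;
-    # the one-hour threshold is an arbitrary choice, and rp is a
-    # RobotFileParser instance whose caller has imported time):
-    #
-    #     if time.time() - rp.mtime() > 3600:
-    #         rp.read()
-    #         rp.modified()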
-
-    def set_url(self, url):
-        """Sets the URL referring to a robots.txt file."""
-        self.url = url
-        self.host, self.path = urlparse.urlparse(url)[1:3]
-
-    def read(self):
-        """Reads the robots.txt URL and feeds it to the parser."""
-        opener = URLopener()
-        f = opener.open(self.url)
-        lines = [line.strip() for line in f]
-        f.close()
-        self.errcode = opener.errcode
-        if self.errcode in (401, 403):
-            self.disallow_all = True
-        elif self.errcode >= 400:
-            self.allow_all = True
-        elif self.errcode == 200 and lines:
-            self.parse(lines)
-
-    def _add_entry(self, entry):
-        if "*" in entry.useragents:
-            # the default entry is considered last
-            if self.default_entry is None:
-                # the first default entry wins
-                self.default_entry = entry
-        else:
-            self.entries.append(entry)
-
-    def parse(self, lines):
-        """Parse the input lines from a robots.txt file.
-
-        A user-agent line need not be preceded by one or more blank
-        lines."""
-        # states:
-        #   0: start state
-        #   1: saw user-agent line
-        #   2: saw an allow or disallow line
-        state = 0
-        entry = Entry()
-
-        for line in lines:
-            if not line:
-                if state == 1:
-                    # a user-agent group with no rules: discard it
-                    entry = Entry()
-                    state = 0
-                elif state == 2:
-                    self._add_entry(entry)
-                    entry = Entry()
-                    state = 0
-            # remove optional comment and strip line
-            i = line.find('#')
-            if i >= 0:
-                line = line[:i]
-            line = line.strip()
-            if not line:
-                continue
-            line = line.split(':', 1)
-            if len(line) == 2:
-                line[0] = line[0].strip().lower()
-                line[1] = urllib.unquote(line[1].strip())
-                if line[0] == "user-agent":
-                    if state == 2:
-                        self._add_entry(entry)
-                        entry = Entry()
-                    entry.useragents.append(line[1])
-                    state = 1
-                elif line[0] == "disallow":
-                    if state != 0:
-                        entry.rulelines.append(RuleLine(line[1], False))
-                        state = 2
-                elif line[0] == "allow":
-                    if state != 0:
-                        entry.rulelines.append(RuleLine(line[1], True))
-                        state = 2
-        if state == 2:
-            self._add_entry(entry)
-
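-    # For illustration, parsing these (hypothetical) robots.txt lines:
-    #
-    #     User-agent: *
-    #     Disallow: /tmp/
-    #
-    #     User-agent: GoodBot
-    #     Allow: /
-    #
-    # leaves the "*" group in default_entry and appends one Entry with
-    # useragents == ["GoodBot"]; the blank line between groups drives
-    # the state machine from state 2 back to the start state.
-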
-    def can_fetch(self, useragent, url):
-        """Using the parsed robots.txt, decide if useragent can fetch url."""
-        if self.disallow_all:
-            return False
-        if self.allow_all:
-            return True
-        # search for given user agent matches
-        # the first match counts
-        parsed_url = urlparse.urlparse(urllib.unquote(url))
-        url = urlparse.urlunparse(('', '', parsed_url.path,
-            parsed_url.params, parsed_url.query, parsed_url.fragment))
-        url = urllib.quote(url)
-        if not url:
-            url = "/"
-        for entry in self.entries:
-            if entry.applies_to(useragent):
-                return entry.allowance(url)
-        # try the default entry last
-        if self.default_entry:
-            return self.default_entry.allowance(url)
-        # agent not found ==> access granted
-        return True
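-
-    # First match wins, in document order (an illustrative walk-through;
-    # the bot name and URL are made up):
-    #
-    #     rp.can_fetch("GoodBot/2.1", "http://example.com/tmp/x")
-    #
-    # reduces the URL to the quoted path "/tmp/x", scans self.entries for
-    # the first entry matching "goodbot", and consults the "*" default
-    # entry only when no specific entry applies.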
-
-    def __str__(self):
-        return ''.join([str(entry) + "\n" for entry in self.entries])
-
-
-class RuleLine:
-    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
-    (allowance==False) followed by a path."""
-    def __init__(self, path, allowance):
-        if path == '' and not allowance:
-            # an empty value means allow all
-            allowance = True
-        self.path = urllib.quote(path)
-        self.allowance = allowance
-
-    def applies_to(self, filename):
-        return self.path == "*" or filename.startswith(self.path)
-
-    def __str__(self):
-        return ("Allow" if self.allowance else "Disallow") + ": " + self.path
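-
-    # Prefix matching, illustrated on made-up paths:
-    #
-    #     RuleLine("/tmp/", False).applies_to("/tmp/scratch.txt")  # True
-    #     RuleLine("/tmp/", False).applies_to("/temp.html")        # False
-    #
-    # An empty Disallow value is stored as an allow-everything rule.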
-
-
-class Entry:
-    """An entry has one or more user-agents and zero or more rulelines"""
-    def __init__(self):
-        self.useragents = []
-        self.rulelines = []
-
-    def __str__(self):
-        ret = []
-        for agent in self.useragents:
-            ret.extend(["User-agent: ", agent, "\n"])
-        for line in self.rulelines:
-            ret.extend([str(line), "\n"])
-        return ''.join(ret)
-
-    def applies_to(self, useragent):
-        """check if this entry applies to the specified agent"""
-        # split the name token and make it lower case
-        useragent = useragent.split("/")[0].lower()
-        for agent in self.useragents:
-            if agent == '*':
-                # we have the catch-all agent
-                return True
-            agent = agent.lower()
-            if agent in useragent:
-                return True
-        return False
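-
-    # Agent matching drops the version suffix and uses a case-insensitive
-    # substring test (the names here are made up):
-    #
-    #     entry.useragents = ["figtree"]
-    #     entry.applies_to("FigTree/3.0")   # True
-    #     entry.applies_to("Mozilla/5.0")   # False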
-
-    def allowance(self, filename):
-        """Preconditions:
-        - our agent applies to this entry
-        - filename is URL decoded"""
-        for line in self.rulelines:
-            if line.applies_to(filename):
-                return line.allowance
-        return True
-
-
-class URLopener(urllib.FancyURLopener):
-    def __init__(self, *args):
-        urllib.FancyURLopener.__init__(self, *args)
-        self.errcode = 200
-
-    def prompt_user_passwd(self, host, realm):
-        # If the robots.txt file is accessible only with a password,
-        # we act as if the file wasn't there.
-        return None, None
-
-    def http_error_default(self, url, fp, errcode, errmsg, headers):
-        self.errcode = errcode
-        return urllib.FancyURLopener.http_error_default(self, url, fp,
-                                                        errcode, errmsg, headers)
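-
-
-if __name__ == '__main__':
-    # Minimal self-contained sketch of the parser in action. The
-    # robots.txt content and URLs below are made-up examples; parse()
-    # is fed inline lines, so no network access is needed.
-    rp = RobotFileParser()
-    rp.parse([
-        "User-agent: *",
-        "Allow: /private/public-notes.html",
-        "Disallow: /private/",
-    ])
-    # The first matching rule wins, so the more specific Allow comes first.
-    print rp.can_fetch("AnyBot/1.0", "http://example.com/private/secret.html")        # False
-    print rp.can_fetch("AnyBot/1.0", "http://example.com/private/public-notes.html")  # True
-    print rp.can_fetch("AnyBot/1.0", "http://example.com/index.html")                 # True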