+++ /dev/null
-#! /usr/bin/env python\r
-\r
-"""A variant on webchecker that creates a mirror copy of a remote site."""\r
-\r
-__version__ = "$Revision$"\r
-\r
-import os\r
-import sys\r
-import urllib\r
-import getopt\r
-\r
-import webchecker\r
-\r
# Extract real version number if necessary: when the string starts with
# '$' and splits into exactly three whitespace-separated fields, keep
# only the middle field; otherwise leave it unchanged.
if __version__[0] == '$':
    _v = __version__.split()
    __version__ = _v[1] if len(_v) == 3 else __version__
-\r
def main():
    """Command-line entry point: mirror every root URL given as an argument.

    Options (parsed from sys.argv with getopt):
      -q   set verbosity to 0
      -v   raise verbosity by one (may be repeated)

    Returns 2 after printing a usage message when the options cannot be
    parsed; otherwise returns None once the run has finished.
    """
    verbose = webchecker.VERBOSE
    try:
        opts, args = getopt.getopt(sys.argv[1:], "qv")
    except getopt.error as msg:
        # Single-argument print(...) produces the same output whether it
        # is parsed as a statement (Python 2) or a function call.
        print(msg)
        print("usage: %s [-qv] ... [rooturl] ..." % sys.argv[0])
        return 2
    for flag, _unused in opts:
        if flag == "-q":
            verbose = 0
        elif flag == "-v":
            verbose = verbose + 1
    sucker = Sucker()
    sucker.setflags(verbose=verbose)
    # Identify this tool (and its version) to the servers being mirrored.
    sucker.urlopener.addheaders = [
        ('User-agent', 'websucker/%s' % __version__),
    ]
    for root in args:
        print("Adding root %s" % root)
        sucker.addroot(root)
    print("Run...")
    sucker.run()
-\r
class Sucker(webchecker.Checker):
    """Checker variant that keeps a local copy of every page it reads.

    Pages are stored under ``<host>/<path>`` (see savefilename), relative
    to the current directory.  A page whose file already exists there is
    read back from disk instead of being fetched again.
    """

    # Class-level settings; presumably consulted by webchecker.Checker
    # (verify against that module).
    checkext = 0
    nonames = 1

    # SAM 11/13/99: in general, URLs are now URL pairs.
    # Since we've suppressed name anchor checking,
    # we can ignore the second dimension.

    def readhtml(self, url_pair):
        """Return ``(text, url)`` for the page identified by url_pair.

        Only url_pair[0], the URL itself, is used here.  If the mirror
        already holds a file for the URL it is read from disk; otherwise
        the page is fetched with self.openpage() and stored with
        savefile().

        text is None when the page could not be opened or when
        self.checkforhtml() rejects it.  url is the URL reported by the
        response's geturl(), which may differ from the one requested.
        """
        url = url_pair[0]
        text = None
        path = self.savefilename(url)
        try:
            f = open(path, "rb")
        except IOError:
            # Not mirrored yet: fetch it.
            f = self.openpage(url_pair)
            if f:
                # try/finally so the response is closed even if reading
                # it raises.
                try:
                    info = f.info()
                    nurl = f.geturl()
                    if nurl != url:
                        # The response came from a different URL; store
                        # the page under that one.
                        url = nurl
                        path = self.savefilename(url)
                    text = f.read()
                finally:
                    f.close()
                # The page is saved regardless of its type; only HTML
                # text is handed back to the caller.
                self.savefile(text, path)
                if not self.checkforhtml(info, url):
                    text = None
        else:
            # Already mirrored: no response headers are available, so
            # checkforhtml() gets an empty mapping.
            try:
                if self.checkforhtml({}, url):
                    text = f.read()
            finally:
                f.close()
        return text, url

    def savefile(self, text, path):
        """Write text to path, creating parent directories as needed.

        Saving is best-effort: any IOError/OSError raised while creating
        the directories or writing the file is reported through
        self.message() and not propagated.
        """
        try:
            makedirs(os.path.dirname(path))
            with open(path, "wb") as f:
                f.write(text)
        except (IOError, OSError) as msg:
            self.message("didn't save %s: %s", path, str(msg))
        else:
            self.message("saved %s", path)

    def savefilename(self, url):
        """Return the relative local path under which url is mirrored.

        The path is ``<host>/<url path>`` with the host lowercased and
        any user info and port removed.  "index.html" is appended when
        the URL path is empty or ends in a slash.  "." and ".." segments
        are dropped so that a URL can never name a file outside its host
        directory.
        """
        rest = urllib.splittype(url)[1]
        netloc, path = urllib.splithost(rest)
        host = urllib.splituser(netloc)[1]
        host = urllib.splitnport(host)[0].lower()
        # URLs come from remote pages and redirects, so refuse to let
        # dot segments climb out of the mirror tree.
        segments = [seg for seg in path.split("/") if seg not in (".", "..")]
        # Strip leading slashes after filtering: removing a dot segment
        # can expose a new leading "/" that os.path.join would treat as
        # an absolute path.
        path = "/".join(segments).lstrip("/")
        if not path or path[-1] == "/":
            path = path + "index.html"
        if os.sep != "/":
            path = os.sep.join(path.split("/"))
        return os.path.join(host, path)
-\r
def makedirs(dir):
    """Create directory dir together with any missing parent directories.

    An empty dir is a no-op, as is a dir that already exists as a
    directory.  If dir exists but is not a directory, it is converted:
    the existing file is moved to ``dir/index.html`` inside a newly made
    directory.  Errors during that conversion are deliberately ignored
    (best-effort).  An OSError from creating a new directory propagates.
    """
    if not dir:
        return
    if os.path.exists(dir):
        if not os.path.isdir(dir):
            # A file occupies the name we need for a directory: move it
            # aside, make the directory, then move it in as index.html.
            try:
                os.rename(dir, dir + ".bak")
                os.mkdir(dir)
                os.rename(dir + ".bak", os.path.join(dir, "index.html"))
            except OSError:
                pass
        return
    head, tail = os.path.split(dir)
    if not tail:
        # Nothing left to create (e.g. the path ends in a separator).
        print("Huh? Don't know how to make dir %s" % (dir,))
        return
    makedirs(head)
    os.mkdir(dir, 0o777)
-\r
# Run as a script: exit with main()'s return value, or 0 when it
# returns nothing.
if __name__ == '__main__':
    _status = main()
    sys.exit(_status or 0)