#! /usr/bin/env python

# Original code by Guido van Rossum; extensive changes by Sam Bayer,
# including code to check URL fragments.

"""Web tree checker.

This utility is handy to check a subweb of the world-wide web for
errors. A subweb is specified by giving one or more ``root URLs''; a
page belongs to the subweb if one of the root URLs is an initial
prefix of it.

File URL extension:

In order to ease the checking of subwebs via the local file system,
the interpretation of ``file:'' URLs is extended to mimic the behavior
of your average HTTP daemon: if a directory pathname is given, the
file index.html in that directory is returned if it exists, otherwise
a directory listing is returned. Now, you can point webchecker to the
document tree in the local file system of your HTTP daemon, and have
most of it checked. In fact the default works this way if your local
web tree is located at /usr/local/etc/httpd/htdocs (the default for
the NCSA HTTP daemon and probably others).

Report printed:

When done, it reports pages with bad links within the subweb. When
interrupted, it reports on the pages that it has checked already.

In verbose mode, additional messages are printed during the
information gathering phase. By default, it prints a summary of its
work status every 50 URLs (adjustable with the -r option), and it
reports errors as they are encountered. Use the -q option to disable
this output.

Checkpoint feature:

Whether interrupted or not, it dumps its state (a Python pickle) to a
checkpoint file and the -R option allows it to restart from the
checkpoint (assuming that the pages on the subweb that were already
processed haven't changed). Even when it has run till completion, -R
can still be useful -- it will print the reports again, and -Rq prints
the errors only. In this case, the checkpoint file is not written
again. The checkpoint file can be set with the -d option.

The checkpoint file is written as a Python pickle. Remember that
Python's pickle module is currently quite slow. Give it the time it
needs to load and save the checkpoint file. When interrupted while
writing the checkpoint file, the old checkpoint file is not
overwritten, but all work done in the current run is lost.
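
For example, a typical session might look like this (www.example.com
stands in for your own root URL):

    webchecker.py -x http://www.example.com/   # first run; skip external links
    webchecker.py -R                           # later: restart from the checkpoint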

Miscellaneous:

- You may find the (Tk-based) GUI version easier to use. See wcgui.py.

- Webchecker honors the "robots.txt" convention. Thanks to Skip
Montanaro for his robotparser.py module (included in this directory)!
The agent name is hardwired to "webchecker". URLs that are disallowed
by the robots.txt file are reported as external URLs.

- Because the SGML parser is a bit slow, very large SGML files are
skipped. The size limit can be set with the -m option.

- When the server or protocol does not tell us a file's type, we guess
it based on the URL's suffix. The mimetypes.py module (also in this
directory) has a built-in table mapping most currently known suffixes,
and in addition attempts to read the mime.types configuration files in
the default locations of Netscape and the NCSA HTTP daemon.

- We follow links indicated by <A>, <FRAME> and <IMG> tags. We also
honor the <BASE> tag.

- We now check internal NAME anchor links, as well as toplevel links.

- Checking external links is now done by default; use -x to *disable*
this feature. External links are now checked during normal
processing. (XXX The status of a checked link could be categorized
better. Later...)

- If external links are not checked, you can use the -t flag to
provide specific overrides to -x.

Usage: webchecker.py [option] ... [rooturl] ...

Options:

-R        -- restart from checkpoint file
-d file   -- checkpoint filename (default %(DUMPFILE)s)
-m bytes  -- skip HTML pages larger than this size (default %(MAXPAGE)d)
-n        -- reports only, no checking (use with -R)
-q        -- quiet operation (also suppresses external links report)
-r number -- number of links processed per round (default %(ROUNDSIZE)d)
-t root   -- specify root dir which should be treated as internal (can repeat)
-v        -- verbose operation; repeating -v will increase verbosity
-x        -- don't check external links (these are often slow to check)
-a        -- don't check name anchors

Arguments:

rooturl   -- URL to start checking
             (default %(DEFROOT)s)

"""


__version__ = "$Revision$"


import sys
import os
from types import *
import StringIO
import getopt
import pickle

import urllib
import urlparse
import sgmllib
import cgi

import mimetypes
import robotparser

# Extract real version number if necessary
if __version__[0] == '$':
    _v = __version__.split()
    if len(_v) == 3:
        __version__ = _v[1]


# Tunable parameters
DEFROOT = "file:/usr/local/etc/httpd/htdocs/"   # Default root URL
CHECKEXT = 1                                    # Check external references (1 deep)
VERBOSE = 1                                     # Verbosity level (0-3)
MAXPAGE = 150000                                # Ignore files bigger than this
ROUNDSIZE = 50                                  # Number of links processed per round
DUMPFILE = "@webchecker.pickle"                 # Pickled checkpoint
AGENTNAME = "webchecker"                        # Agent name for robots.txt parser
NONAMES = 0                                     # If true, skip name anchor checks (-a)


# Global variables

def main():
    checkext = CHECKEXT
    verbose = VERBOSE
    maxpage = MAXPAGE
    roundsize = ROUNDSIZE
    dumpfile = DUMPFILE
    restart = 0
    norun = 0

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'Rd:m:nqr:t:vxa')
    except getopt.error, msg:
        sys.stdout = sys.stderr
        print msg
        print __doc__%globals()
        sys.exit(2)

    # Extra roots from the -t option; they are applied below only if
    # external links are not being checked.
    extra_roots = []
    nonames = NONAMES

    for o, a in opts:
        if o == '-R':
            restart = 1
        if o == '-d':
            dumpfile = a
        if o == '-m':
            maxpage = int(a)
        if o == '-n':
            norun = 1
        if o == '-q':
            verbose = 0
        if o == '-r':
            roundsize = int(a)
        if o == '-t':
            extra_roots.append(a)
        if o == '-a':
            nonames = not nonames
        if o == '-v':
            verbose = verbose + 1
        if o == '-x':
            checkext = not checkext

    if verbose > 0:
        print AGENTNAME, "version", __version__

    if restart:
        c = load_pickle(dumpfile=dumpfile, verbose=verbose)
    else:
        c = Checker()

    c.setflags(checkext=checkext, verbose=verbose,
               maxpage=maxpage, roundsize=roundsize,
               nonames=nonames
               )

    if not restart and not args:
        args.append(DEFROOT)

    for arg in args:
        c.addroot(arg)

    # The -t flag is only needed if external links are not to be
    # checked. So -t values are ignored unless -x was specified.
    if not checkext:
        for root in extra_roots:
            # Make sure it's terminated by a slash,
            # so that addroot doesn't discard the last
            # directory component.
            if root[-1] != "/":
                root = root + "/"
            c.addroot(root, add_to_do = 0)

    try:

        if not norun:
            try:
                c.run()
            except KeyboardInterrupt:
                if verbose > 0:
                    print "[run interrupted]"

        try:
            c.report()
        except KeyboardInterrupt:
            if verbose > 0:
                print "[report interrupted]"

    finally:
        if c.save_pickle(dumpfile):
            if dumpfile == DUMPFILE:
                print "Use ``%s -R'' to restart." % sys.argv[0]
            else:
                print "Use ``%s -R -d %s'' to restart." % (sys.argv[0],
                                                           dumpfile)


def load_pickle(dumpfile=DUMPFILE, verbose=VERBOSE):
    if verbose > 0:
        print "Loading checkpoint from %s ..." % dumpfile
    f = open(dumpfile, "rb")
    c = pickle.load(f)
    f.close()
    if verbose > 0:
        print "Done."
        print "Root:", "\n ".join(c.roots)
    return c


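# The Checker class tracks the crawl in dictionaries keyed by
# (URL, fragment) pairs: "todo" and "done" map each pair to the list of
# (origin URL, raw link) pairs that referenced it, and "bad" maps it to
# an error message. "errors" is not pickled; it is rebuilt on loading
# and maps each origin page to its (URL, rawlink, msg) error triples.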
class Checker:

    checkext = CHECKEXT
    verbose = VERBOSE
    maxpage = MAXPAGE
    roundsize = ROUNDSIZE
    nonames = NONAMES

    validflags = tuple(dir())

    def __init__(self):
        self.reset()

    def setflags(self, **kw):
        for key in kw.keys():
            if key not in self.validflags:
                raise NameError, "invalid keyword argument: %s" % str(key)
        for key, value in kw.items():
            setattr(self, key, value)

    def reset(self):
        self.roots = []
        self.todo = {}
        self.done = {}
        self.bad = {}

        # Add a name table, so that the name URLs can be checked. Also
        # serves as an implicit cache for which URLs are done.
        self.name_table = {}

        self.round = 0
        # The following are not pickled:
        self.robots = {}
        self.errors = {}
        self.urlopener = MyURLopener()
        self.changed = 0

    def note(self, level, format, *args):
        if self.verbose > level:
            if args:
                format = format%args
            self.message(format)

    def message(self, format, *args):
        if args:
            format = format%args
        print format

    def __getstate__(self):
        return (self.roots, self.todo, self.done, self.bad, self.round)

    def __setstate__(self, state):
        self.reset()
        (self.roots, self.todo, self.done, self.bad, self.round) = state
        for root in self.roots:
            self.addrobot(root)
        for url in self.bad.keys():
            self.markerror(url)

    def addroot(self, root, add_to_do = 1):
        if root not in self.roots:
            troot = root
            scheme, netloc, path, params, query, fragment = \
                    urlparse.urlparse(root)
            i = path.rfind("/") + 1
            if 0 < i < len(path):
                path = path[:i]
                troot = urlparse.urlunparse((scheme, netloc, path,
                                             params, query, fragment))
            self.roots.append(troot)
            self.addrobot(root)
            if add_to_do:
                self.newlink((root, ""), ("<root>", root))

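    # One RobotFileParser is cached per site root in self.robots, so a
    # given host's /robots.txt is fetched and parsed at most once.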
    def addrobot(self, root):
        root = urlparse.urljoin(root, "/")
        if self.robots.has_key(root): return
        url = urlparse.urljoin(root, "/robots.txt")
        self.robots[root] = rp = robotparser.RobotFileParser()
        self.note(2, "Parsing %s", url)
        rp.debug = self.verbose > 3
        rp.set_url(url)
        try:
            rp.read()
        except (OSError, IOError), msg:
            self.note(1, "I/O error parsing %s: %s", url, msg)

    def run(self):
        while self.todo:
            self.round = self.round + 1
            self.note(0, "\nRound %d (%s)\n", self.round, self.status())
            urls = self.todo.keys()
            urls.sort()
            del urls[self.roundsize:]
            for url in urls:
                self.dopage(url)

    def status(self):
        return "%d total, %d to do, %d done, %d bad" % (
            len(self.todo)+len(self.done),
            len(self.todo), len(self.done),
            len(self.bad))

    def report(self):
        self.message("")
        if not self.todo: s = "Final"
        else: s = "Interim"
        self.message("%s Report (%s)", s, self.status())
        self.report_errors()

    def report_errors(self):
        if not self.bad:
            self.message("\nNo errors")
            return
        self.message("\nError Report:")
        sources = self.errors.keys()
        sources.sort()
        for source in sources:
            triples = self.errors[source]
            self.message("")
            if len(triples) > 1:
                self.message("%d Errors in %s", len(triples), source)
            else:
                self.message("Error in %s", source)
            # Call self.format_url() instead of referring
            # to the URL directly, since the URLs in these
            # triples are now (URL, fragment) pairs. The value
            # of the "source" variable comes from the list of
            # origins, and is a URL, not a pair.
            for url, rawlink, msg in triples:
                if rawlink != self.format_url(url): s = " (%s)" % rawlink
                else: s = ""
                self.message(" HREF %s%s\n msg %s",
                             self.format_url(url), s, msg)

    def dopage(self, url_pair):

        # All printing of URLs uses format_url(); argument changed to
        # url_pair for clarity.
        if self.verbose > 1:
            if self.verbose > 2:
                self.show("Check ", self.format_url(url_pair),
                          " from", self.todo[url_pair])
            else:
                self.message("Check %s", self.format_url(url_pair))
        url, local_fragment = url_pair
        if local_fragment and self.nonames:
            self.markdone(url_pair)
            return
        try:
            page = self.getpage(url_pair)
        except sgmllib.SGMLParseError, msg:
            msg = self.sanitize(msg)
            self.note(0, "Error parsing %s: %s",
                      self.format_url(url_pair), msg)
            # Don't actually mark the URL as bad - it exists, we just
            # can't parse it!
            page = None
        if page:
            # Store the page which corresponds to this URL.
            self.name_table[url] = page
            # If there is a fragment in this url_pair, and it's not
            # in the list of names for the page, call setbad(), since
            # it's a missing anchor.
            if local_fragment and local_fragment not in page.getnames():
                self.setbad(url_pair, ("Missing name anchor `%s'" % local_fragment))
            for info in page.getlinkinfos():
                # getlinkinfos() now returns the fragment as well,
                # and we store that fragment here in the "todo" dictionary.
                link, rawlink, fragment = info
                # However, we don't want the fragment as the origin, since
                # the origin is logically a page.
                origin = url, rawlink
                self.newlink((link, fragment), origin)
        else:
            # If no page has been created yet, we want to
            # record that fact.
            self.name_table[url_pair[0]] = None
        self.markdone(url_pair)

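    # Each link found lands either in "done" (already seen: just record
    # the new origin, and re-report any known error against it) or in
    # "todo" for a later round.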
    def newlink(self, url, origin):
        if self.done.has_key(url):
            self.newdonelink(url, origin)
        else:
            self.newtodolink(url, origin)

    def newdonelink(self, url, origin):
        if origin not in self.done[url]:
            self.done[url].append(origin)

        # Call self.format_url(), since the URL here
        # is now a (URL, fragment) pair.
        self.note(3, " Done link %s", self.format_url(url))

        # Make sure that if it's bad, the origin gets added.
        if self.bad.has_key(url):
            source, rawlink = origin
            triple = url, rawlink, self.bad[url]
            self.seterror(source, triple)

    def newtodolink(self, url, origin):
        # Call self.format_url(), since the URL here
        # is now a (URL, fragment) pair.
        if self.todo.has_key(url):
            if origin not in self.todo[url]:
                self.todo[url].append(origin)
            self.note(3, " Seen todo link %s", self.format_url(url))
        else:
            self.todo[url] = [origin]
            self.note(3, " New todo link %s", self.format_url(url))

    def format_url(self, url):
        link, fragment = url
        if fragment: return link + "#" + fragment
        else: return link

    def markdone(self, url):
        self.done[url] = self.todo[url]
        del self.todo[url]
        self.changed = 1

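    # A URL counts as internal only if some root is a literal string
    # prefix of it and the site's robots.txt allows fetching it;
    # disallowed URLs are treated like external ones.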
    def inroots(self, url):
        for root in self.roots:
            if url[:len(root)] == root:
                return self.isallowed(root, url)
        return 0

    def isallowed(self, root, url):
        root = urlparse.urljoin(root, "/")
        return self.robots[root].can_fetch(AGENTNAME, url)

    def getpage(self, url_pair):
        # Incoming argument name is a (URL, fragment) pair.
        # The page may have been cached in the name_table variable.
        url, fragment = url_pair
        if self.name_table.has_key(url):
            return self.name_table[url]

        scheme, path = urllib.splittype(url)
        if scheme in ('mailto', 'news', 'javascript', 'telnet'):
            self.note(1, " Not checking %s URL" % scheme)
            return None
        isint = self.inroots(url)

        # Ensure that openpage gets the URL pair to
        # print out its error message and record the error pair
        # correctly.
        if not isint:
            if not self.checkext:
                self.note(1, " Not checking ext link")
                return None
            f = self.openpage(url_pair)
            if f:
                self.safeclose(f)
            return None
        text, nurl = self.readhtml(url_pair)

        if nurl != url:
            self.note(1, " Redirected to %s", nurl)
            url = nurl
        if text:
            return Page(text, url, maxpage=self.maxpage, checker=self)

    # These next three functions take (URL, fragment) pairs as
    # arguments, so that openpage() receives the appropriate tuple to
    # record error messages.
    def readhtml(self, url_pair):
        url, fragment = url_pair
        text = None
        f, url = self.openhtml(url_pair)
        if f:
            text = f.read()
            f.close()
        return text, url

    def openhtml(self, url_pair):
        url, fragment = url_pair
        f = self.openpage(url_pair)
        if f:
            url = f.geturl()
            info = f.info()
            if not self.checkforhtml(info, url):
                self.safeclose(f)
                f = None
        return f, url

    def openpage(self, url_pair):
        url, fragment = url_pair
        try:
            return self.urlopener.open(url)
        except (OSError, IOError), msg:
            msg = self.sanitize(msg)
            self.note(0, "Error %s", msg)
            if self.verbose > 0:
                self.show(" HREF ", url, " from", self.todo[url_pair])
            self.setbad(url_pair, msg)
            return None

    def checkforhtml(self, info, url):
        if info.has_key('content-type'):
            ctype = cgi.parse_header(info['content-type'])[0].lower()
            if ';' in ctype:
                # handle content-type: text/html; charset=iso8859-1 :
                ctype = ctype.split(';', 1)[0].strip()
        else:
            if url[-1:] == "/":
                return 1
            ctype, encoding = mimetypes.guess_type(url)
        if ctype == 'text/html':
            return 1
        else:
            self.note(1, " Not HTML, mime type %s", ctype)
            return 0

    def setgood(self, url):
        if self.bad.has_key(url):
            del self.bad[url]
            self.changed = 1
            self.note(0, "(Clear previously seen error)")

    def setbad(self, url, msg):
        if self.bad.has_key(url) and self.bad[url] == msg:
            self.note(0, "(Seen this error before)")
            return
        self.bad[url] = msg
        self.changed = 1
        self.markerror(url)

    def markerror(self, url):
        try:
            origins = self.todo[url]
        except KeyError:
            origins = self.done[url]
        for source, rawlink in origins:
            triple = url, rawlink, self.bad[url]
            self.seterror(source, triple)

    def seterror(self, url, triple):
        try:
            # Because of the way the URLs are now processed, I need to
            # check to make sure the URL hasn't been entered in the
            # error list. The first element of the triple here is a
            # (URL, fragment) pair, but the URL key is not, since it's
            # from the list of origins.
            if triple not in self.errors[url]:
                self.errors[url].append(triple)
        except KeyError:
            self.errors[url] = [triple]

    # The following used to be toplevel functions; they have been
    # changed into methods so they can be overridden in subclasses.

    def show(self, p1, link, p2, origins):
        self.message("%s %s", p1, link)
        i = 0
        for source, rawlink in origins:
            i = i+1
            if i == 2:
                p2 = ' '*len(p2)
            if rawlink != link: s = " (%s)" % rawlink
            else: s = ""
            self.message("%s %s%s", p2, source, s)

    def sanitize(self, msg):
        if isinstance(IOError, ClassType) and isinstance(msg, IOError):
            # Do the other branch recursively
            msg.args = self.sanitize(msg.args)
        elif isinstance(msg, TupleType):
            if len(msg) >= 4 and msg[0] == 'http error' and \
               isinstance(msg[3], InstanceType):
                # Remove the Message instance -- it may contain
                # a file object which prevents pickling.
                msg = msg[:3] + msg[4:]
        return msg

    def safeclose(self, f):
        try:
            url = f.geturl()
        except AttributeError:
            pass
        else:
            if url[:4] == 'ftp:' or url[:7] == 'file://':
                # Apparently ftp connections don't like to be closed
                # prematurely...
                text = f.read()
        f.close()

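    # The checkpoint is written to dumpfile + ".new" and then renamed
    # into place, so an interrupted save leaves the previous checkpoint
    # file intact.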
    def save_pickle(self, dumpfile=DUMPFILE):
        if not self.changed:
            self.note(0, "\nNo need to save checkpoint")
        elif not dumpfile:
            self.note(0, "No dumpfile, won't save checkpoint")
        else:
            self.note(0, "\nSaving checkpoint to %s ...", dumpfile)
            newfile = dumpfile + ".new"
            f = open(newfile, "wb")
            pickle.dump(self, f)
            f.close()
            try:
                os.unlink(dumpfile)
            except os.error:
                pass
            os.rename(newfile, dumpfile)
            self.note(0, "Done.")
            return 1


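# A Page wraps the raw HTML of one fetched URL. Parsing happens in the
# constructor (unless the text exceeds maxpage); getnames() then returns
# the anchor names found, and getlinkinfos() the outgoing links as
# (absolute link, raw link, fragment) triples.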
class Page:

    def __init__(self, text, url, verbose=VERBOSE, maxpage=MAXPAGE, checker=None):
        self.text = text
        self.url = url
        self.verbose = verbose
        self.maxpage = maxpage
        self.checker = checker

        # The parsing of the page is done in the __init__() routine in
        # order to initialize the list of names the file
        # contains. Stored the parser in an instance variable. Passed
        # the URL to MyHTMLParser().
        size = len(self.text)
        if size > self.maxpage:
            self.note(0, "Skip huge file %s (%.0f Kbytes)", self.url, (size*0.001))
            self.parser = None
            return
        self.checker.note(2, " Parsing %s (%d bytes)", self.url, size)
        self.parser = MyHTMLParser(url, verbose=self.verbose,
                                   checker=self.checker)
        self.parser.feed(self.text)
        self.parser.close()

    def note(self, level, msg, *args):
        if self.checker:
            apply(self.checker.note, (level, msg) + args)
        else:
            if self.verbose >= level:
                if args:
                    msg = msg%args
                print msg

    # Method to retrieve names.
    def getnames(self):
        if self.parser:
            return self.parser.names
        else:
            return []

    def getlinkinfos(self):
        # File reading is done in __init__() routine. Store parser in
        # local variable to indicate success of parsing.

        # If no parser was stored, fail.
        if not self.parser: return []

        rawlinks = self.parser.getlinks()
        base = urlparse.urljoin(self.url, self.parser.getbase() or "")
        infos = []
        for rawlink in rawlinks:
            t = urlparse.urlparse(rawlink)
            # DON'T DISCARD THE FRAGMENT! Instead, include
            # it in the tuples which are returned. See Checker.dopage().
            fragment = t[-1]
            t = t[:-1] + ('',)
            rawlink = urlparse.urlunparse(t)
            link = urlparse.urljoin(base, rawlink)
            infos.append((link, rawlink, fragment))

        return infos


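# An in-memory "response" mimicking the file-like objects that urllib
# returns, providing the info() and geturl() methods that
# checkforhtml() and safeclose() expect.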
class MyStringIO(StringIO.StringIO):

    def __init__(self, url, info):
        self.__url = url
        self.__info = info
        StringIO.StringIO.__init__(self)

    def info(self):
        return self.__info

    def geturl(self):
        return self.__url


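# A FancyURLopener that identifies itself as Python-webchecker, treats
# 401 responses as errors instead of prompting for a password, and
# implements the "file:" directory extension described in the module
# docstring.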
class MyURLopener(urllib.FancyURLopener):

    http_error_default = urllib.URLopener.http_error_default

    def __init__(*args):
        self = args[0]
        apply(urllib.FancyURLopener.__init__, args)
        self.addheaders = [
            ('User-agent', 'Python-webchecker/%s' % __version__),
            ]

    def http_error_401(self, url, fp, errcode, errmsg, headers):
        return None

    def open_file(self, url):
        path = urllib.url2pathname(urllib.unquote(url))
        if os.path.isdir(path):
            if path[-1] != os.sep:
                url = url + '/'
            indexpath = os.path.join(path, "index.html")
            if os.path.exists(indexpath):
                return self.open_file(url + "index.html")
            try:
                names = os.listdir(path)
            except os.error, msg:
                exc_type, exc_value, exc_tb = sys.exc_info()
                raise IOError, msg, exc_tb
            names.sort()
            s = MyStringIO("file:"+url, {'content-type': 'text/html'})
            s.write('<BASE HREF="file:%s">\n' %
                    urllib.quote(os.path.join(path, "")))
            for name in names:
                q = urllib.quote(name)
                s.write('<A HREF="%s">%s</A>\n' % (q, q))
            s.seek(0)
            return s
        return urllib.FancyURLopener.open_file(self, url)


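# The parser records outgoing links from the tags handled below in a
# dictionary (used as a set, to drop duplicates) and collects every
# NAME/id attribute seen, so fragment references can be checked against
# the page's actual anchors.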
class MyHTMLParser(sgmllib.SGMLParser):

    def __init__(self, url, verbose=VERBOSE, checker=None):
        self.myverbose = verbose # now unused
        self.checker = checker
        self.base = None
        self.links = {}
        self.names = []
        self.url = url
        sgmllib.SGMLParser.__init__(self)

    def check_name_id(self, attributes):
        """ Check the name or id attributes on an element.
        """
        # We must rescue the NAME or id (name is deprecated in XHTML)
        # attributes from the anchor, in order to
        # cache the internal anchors which are made
        # available in the page.
        for name, value in attributes:
            if name == "name" or name == "id":
                if value in self.names:
                    self.checker.message("WARNING: duplicate ID name %s in %s",
                                         value, self.url)
                else: self.names.append(value)
                break

    def unknown_starttag(self, tag, attributes):
        """ In XHTML, you can have id attributes on any element.
        """
        self.check_name_id(attributes)

    def start_a(self, attributes):
        self.link_attr(attributes, 'href')
        self.check_name_id(attributes)

    def end_a(self): pass

    def do_area(self, attributes):
        self.link_attr(attributes, 'href')
        self.check_name_id(attributes)

    def do_body(self, attributes):
        self.link_attr(attributes, 'background', 'bgsound')
        self.check_name_id(attributes)

    def do_img(self, attributes):
        self.link_attr(attributes, 'src', 'lowsrc')
        self.check_name_id(attributes)

    def do_frame(self, attributes):
        self.link_attr(attributes, 'src', 'longdesc')
        self.check_name_id(attributes)

    def do_iframe(self, attributes):
        self.link_attr(attributes, 'src', 'longdesc')
        self.check_name_id(attributes)

    def do_link(self, attributes):
        for name, value in attributes:
            if name == "rel":
                parts = value.lower().split()
                if (parts == ["stylesheet"]
                    or parts == ["alternate", "stylesheet"]):
                    self.link_attr(attributes, "href")
                break
        self.check_name_id(attributes)

    def do_object(self, attributes):
        self.link_attr(attributes, 'data', 'usemap')
        self.check_name_id(attributes)

    def do_script(self, attributes):
        self.link_attr(attributes, 'src')
        self.check_name_id(attributes)

    def do_table(self, attributes):
        self.link_attr(attributes, 'background')
        self.check_name_id(attributes)

    def do_td(self, attributes):
        self.link_attr(attributes, 'background')
        self.check_name_id(attributes)

    def do_th(self, attributes):
        self.link_attr(attributes, 'background')
        self.check_name_id(attributes)

    def do_tr(self, attributes):
        self.link_attr(attributes, 'background')
        self.check_name_id(attributes)

    def link_attr(self, attributes, *args):
        for name, value in attributes:
            if name in args:
                if value: value = value.strip()
                if value: self.links[value] = None

    def do_base(self, attributes):
        for name, value in attributes:
            if name == 'href':
                if value: value = value.strip()
                if value:
                    if self.checker:
                        self.checker.note(1, " Base %s", value)
                    self.base = value
        self.check_name_id(attributes)

    def getlinks(self):
        return self.links.keys()

    def getbase(self):
        return self.base


if __name__ == '__main__':
    main()