]> git.proxmox.com Git - rustc.git/blob - src/etc/htmldocck.py
New upstream version 1.31.0~beta.4+dfsg1
[rustc.git] / src / etc / htmldocck.py
1 # Copyright 2015 The Rust Project Developers. See the COPYRIGHT
2 # file at the top-level directory of this distribution and at
3 # http://rust-lang.org/COPYRIGHT.
4 #
5 # Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 # http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 # <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 # option. This file may not be copied, modified, or distributed
9 # except according to those terms.
10
11 r"""
12 htmldocck.py is a custom checker script for Rustdoc HTML outputs.
13
14 # How and why?
15
16 The principle is simple: This script receives a path to generated HTML
17 documentation and a "template" script, which has a series of check
18 commands like `@has` or `@matches`. Each command is used to check if
19 some pattern is present or not present in the particular file or in
20 a particular node of the HTML tree. In many cases, the template script
21 happens to be the source code given to rustdoc.
22
23 While it indeed is possible to test in smaller portions, it has been
24 hard to construct tests in this fashion and major rendering errors were
25 discovered much later. This script is designed to make black-box and
26 regression testing of Rustdoc easy. This does not preclude the needs for
27 unit testing, but can be used to complement related tests by quickly
28 showing the expected renderings.
29
30 In order to avoid one-off dependencies for this task, this script uses
31 a reasonably working HTML parser and the existing XPath implementation
32 from Python's standard library. Hopefully, we won't render
33 non-well-formed HTML.
34
35 # Commands
36
37 Commands start with an `@` followed by a command name (letters and
38 hyphens), and zero or more arguments separated by one or more whitespace
39 characters and optionally delimited with single or double quotes. The `@`
40 mark cannot be preceded by a non-whitespace character. Other lines
41 (including every text up to the first `@`) are ignored, but it is
42 recommended to avoid the use of `@` in the template file.
43
44 There are a number of supported commands:
45
46 * `@has PATH` checks for the existence of the given file.
47
48 `PATH` is relative to the output directory. It can be given as `-`
49 which repeats the most recently used `PATH`.
50
51 * `@has PATH PATTERN` and `@matches PATH PATTERN` checks for
52 the occurrence of the given pattern `PATTERN` in the specified file.
53 Only one occurrence of the pattern is enough.
54
55 For `@has`, `PATTERN` is a whitespace-normalized (every consecutive
56 whitespace being replaced by one single space character) string.
57 The entire file is also whitespace-normalized including newlines.
58
59 For `@matches`, `PATTERN` is a Python-supported regular expression.
60 The file remains intact but the regexp is matched without the `MULTILINE`
61 and `IGNORECASE` options. You can still use a prefix `(?m)` or `(?i)`
62 to override them, and `\A` and `\Z` for definitely matching
63 the beginning and end of the file.
64
65 (The same distinction goes to other variants of these commands.)
66
67 * `@has PATH XPATH PATTERN` and `@matches PATH XPATH PATTERN` checks for
68 the presence of the given XPath `XPATH` in the specified HTML file,
69 and also the occurrence of the given pattern `PATTERN` in the matching
70 node or attribute. Only one occurrence of the pattern in the match
71 is enough.
72
73 `PATH` should be a valid and well-formed HTML file. It does *not*
74 accept arbitrary HTML5; it should have matching open and close tags
75 and correct entity references at least.
76
77 `XPATH` is an XPath expression to match. The XPath is fairly limited:
78 `tag`, `*`, `.`, `//`, `..`, `[@attr]`, `[@attr='value']`, `[tag]`,
79 `[POS]` (element located in given `POS`), `[last()-POS]`, `text()`
80 and `@attr` (both as the last segment) are supported. Some examples:
81
82 - `//pre` or `.//pre` matches any element with a name `pre`.
83 - `//a[@href]` matches any element with an `href` attribute.
84 - `//*[@class="impl"]//code` matches any element with a name `code`,
85 which is a descendant of some element whose `class` attr is `impl`.
86 - `//h1[@class="fqn"]/span[1]/a[last()]/@class` matches a value of
87 `class` attribute in the last `a` element (can be followed by more
88 elements that are not `a`) inside the first `span` in the `h1` with
89 a class of `fqn`. Note that there cannot be any additional elements
90 between them due to the use of `/` instead of `//`.
91
92 Do not try to use non-absolute paths, it won't work due to the flawed
93 ElementTree implementation. The script rejects them.
94
95 For the text matches (i.e. paths not ending with `@attr`), any
96 subelements are flattened into one string; this is handy for ignoring
97 highlights for example. If you want to simply check for the presence of
98 a given node or attribute, use an empty string (`""`) as a `PATTERN`.
99
100 * `@count PATH XPATH COUNT` checks for the occurrence of the given XPath
101 in the specified file. The number of occurrences must match the given
102 count.
103
104 * `@has-dir PATH` checks for the existence of the given directory.
105
106 All conditions can be negated with `!`. `@!has foo/type.NoSuch.html`
107 checks if the given file does not exist, for example.
108
109 """
110
111 from __future__ import print_function
112 import sys
113 import os.path
114 import re
115 import shlex
116 from collections import namedtuple
117 try:
118 from html.parser import HTMLParser
119 except ImportError:
120 from HTMLParser import HTMLParser
121 from xml.etree import cElementTree as ET
122
123 # &larrb;/&rarrb; are not in HTML 4 but are in HTML 5
try:
    from html.entities import entitydefs
except ImportError:
    from htmlentitydefs import entitydefs
# Register the extra entities in one step: the two arrows that only exist in
# HTML 5, plus `nbsp` overridden to a plain space.
entitydefs.update({
    'larrb': u'\u21e4',
    'rarrb': u'\u21e5',
    'nbsp': ' ',
})

# "void elements" (no closing tag) from the HTML Standard section 12.1.2
VOID_ELEMENTS = {
    'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen',
    'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
}

# Python 2 -> 3 compatibility: Python 3 has no `unichr`, `chr` replaces it
try:
    unichr
except NameError:
    unichr = chr
141
class CustomHTMLParser(HTMLParser):
    """Minimal HTML parser that forwards parse events to a tree builder.

    This is sufficient because rustdoc emits very regular HTML; the only
    irregularities handled are i) void elements, which are closed right
    after being opened, and ii) attributes without a value, which are
    stored as empty strings.
    """

    def __init__(self, target=None):
        HTMLParser.__init__(self)
        # Events go to `target` when one is given, else to a new TreeBuilder.
        self.__builder = target or ET.TreeBuilder()

    def __start_element(self, tag, attrs):
        """Open `tag` on the builder, mapping valueless attributes to ''."""
        normalized = {}
        for key, value in attrs:
            normalized[key] = value or ''
        self.__builder.start(tag, normalized)

    def handle_starttag(self, tag, attrs):
        self.__start_element(tag, attrs)
        # a void element never gets an end tag, so close it immediately
        if tag in VOID_ELEMENTS:
            self.__builder.end(tag)

    def handle_endtag(self, tag):
        self.__builder.end(tag)

    def handle_startendtag(self, tag, attrs):
        # self-closing syntax (`<tag/>`): open and close in one go
        self.__start_element(tag, attrs)
        self.__builder.end(tag)

    def handle_data(self, data):
        self.__builder.data(data)

    def handle_entityref(self, name):
        # named reference; looked up in the module-level entity table
        self.__builder.data(entitydefs[name])

    def handle_charref(self, name):
        # numeric reference: hexadecimal when prefixed with `x`/`X`,
        # decimal otherwise; the character is passed on UTF-8 encoded
        if name.startswith(('x', 'X')):
            code = int(name[1:], 16)
        else:
            code = int(name, 10)
        self.__builder.data(unichr(code).encode('utf-8'))

    def close(self):
        """Finish parsing and return whatever the builder's `close` returns."""
        HTMLParser.close(self)
        return self.__builder.close()
179
# One parsed template command: whether it was negated with `!`, the command
# name, its list of arguments, the line number it came from and the text of
# that line.
Command = namedtuple('Command', ['negated', 'cmd', 'args', 'lineno', 'context'])
181
class FailedCheck(Exception):
    """Raised when a check command does not hold for the generated output."""
184
class InvalidCheck(Exception):
    """Raised when a check command itself is malformed or unsupported."""
187
def concat_multi_lines(f):
    """Yield logical lines from `f`, joining backslash-continued lines.

    `f` is any iterable of lines (e.g. an open text file). A line ending
    with a backslash is continued by the following line. When joining, the
    backslash and the line ending are removed, and so are the prefix the
    following line shares with the first line of the group and any
    whitespace right after that prefix.

    Each item is a `(lineno, text)` pair, where `lineno` is the line number
    (starting from 0) of the first line that went into `text`.

    A backslash on the very last line is reported through `print_err`.
    """
    lastline = None  # first line of the current group, minus its backslash
    firstlineno = None
    catenated = ''
    for lineno, line in enumerate(f):
        line = line.rstrip('\r\n')

        # strip the common prefix from the current line if needed
        if lastline is not None:
            common_prefix = os.path.commonprefix([line, lastline])
            line = line[len(common_prefix):].lstrip()

        # Compare against None explicitly: `firstlineno or lineno` would
        # throw away a group that legitimately starts on line 0.
        if firstlineno is None:
            firstlineno = lineno
        if line.endswith('\\'):
            if lastline is None:
                lastline = line[:-1]
            catenated += line[:-1]
        else:
            yield firstlineno, catenated + line
            lastline = None
            firstlineno = None
            catenated = ''

    if lastline is not None:
        # Errors are shown with 1-based line numbers, like command errors.
        print_err(lineno + 1, line, 'Trailing backslash at the end of the file')
218
# Finds a command on a template line: an `@` that is not preceded by a
# non-whitespace character, an optional `!` negation, the command name
# (letters, optionally joined by hyphens) and the rest of the line as the
# still unsplit argument string.
LINE_PATTERN = re.compile(
    r'''
    (?<=(?<!\S)@)(?P<negated>!?)
    (?P<cmd>[A-Za-z]+(?:-[A-Za-z]+)*)
    (?P<args>.*)$
''',
    flags=re.VERBOSE)
224
225
def get_commands(template):
    """Yield a `Command` for every check command found in `template`.

    `template` is the path of the template file. Its lines are first joined
    by `concat_multi_lines`; lines that do not match `LINE_PATTERN` are
    skipped. The argument string is split with `shlex.split`, so arguments
    may be quoted. The `lineno` of each yielded command is 1-based.

    A command whose name is not followed by whitespace, or whose arguments
    cannot be split (e.g. an unclosed quote), is reported through
    `print_err` and skipped.
    """
    # The 'U' flag is only needed for universal newlines on Python 2.
    # Python 3 translates newlines in text mode by default and rejects
    # the flag from version 3.11 on.
    mode = 'rU' if sys.version_info[0] < 3 else 'r'
    with open(template, mode) as f:
        for lineno, line in concat_multi_lines(f):
            m = LINE_PATTERN.search(line)
            if not m:
                continue

            negated = (m.group('negated') == '!')
            cmd = m.group('cmd')
            args = m.group('args')
            if args and not args[:1].isspace():
                # 1-based, to agree with the line numbers of the commands
                print_err(lineno + 1, line, 'Invalid template syntax')
                continue
            try:
                args = shlex.split(args)
            except ValueError as err:
                # e.g. "No closing quotation": report it instead of crashing
                print_err(lineno + 1, line, str(err), 'Invalid template syntax')
                continue
            yield Command(negated=negated, cmd=cmd, args=args, lineno=lineno+1, context=line)
241
242
def _flatten(node, acc):
    """Append to `acc`, in document order, every piece of text in `node`.

    That is the node's own text and then, for each child, the child's
    flattened text followed by the tail text after that child. The tail of
    `node` itself is not included.
    """
    own_text = node.text
    if own_text:
        acc.append(own_text)
    for child in node:
        _flatten(child, acc)
        trailing = child.tail
        if trailing:
            acc.append(trailing)


def flatten(node):
    """Return the text of `node` and all its subelements as one string."""
    pieces = []
    _flatten(node, pieces)
    return ''.join(pieces)
256
257
def normalize_xpath(path):
    """Return `path` in the `.//...` form.

    A path starting with `//` gets a `.` prepended and a path already
    starting with `.//` is returned as is. Any other path raises
    `InvalidCheck`.
    """
    if path.startswith('.//'):
        return path
    if path.startswith('//'):
        return '.' + path  # avoid warnings
    raise InvalidCheck('Non-absolute XPath is not supported due to implementation issues')
265
266
class CachedFiles(object):
    """Loads files below a root directory on demand and caches them.

    File contents and parsed trees are cached separately, each keyed by the
    normalized relative path. The most recently given path is remembered so
    that `-` can stand for it.
    """

    def __init__(self, root):
        self.root = root
        self.files = {}  # normalized relative path -> file contents
        self.trees = {}  # normalized relative path -> parsed tree
        self.last_path = None  # last explicit path, substituted for `-`

    def resolve_path(self, path):
        """Return the normalized `path`, or the previous path for `-`.

        Raises `InvalidCheck` when `-` is used before any explicit path.
        """
        if path == '-':
            if self.last_path is None:
                raise InvalidCheck('Tried to use the previous path in the first command')
            return self.last_path

        self.last_path = os.path.normpath(path)
        return self.last_path

    def get_file(self, path):
        """Return the contents of the file at `path` (relative to the root).

        Raises `FailedCheck` when no such regular file exists.
        """
        path = self.resolve_path(path)
        if path not in self.files:
            abspath = os.path.join(self.root, path)
            if not os.path.isfile(abspath):
                raise FailedCheck('File does not exist {!r}'.format(path))

            with open(abspath) as f:
                self.files[path] = f.read()
        return self.files[path]

    def get_tree(self, path):
        """Return the parsed tree of the HTML file at `path`.

        Raises `FailedCheck` when no such regular file exists and
        `RuntimeError` when the file cannot be parsed.
        """
        path = self.resolve_path(path)
        if path not in self.trees:
            abspath = os.path.join(self.root, path)
            if not os.path.isfile(abspath):
                raise FailedCheck('File does not exist {!r}'.format(path))

            with open(abspath) as f:
                try:
                    parsed = ET.parse(f, CustomHTMLParser())
                except Exception as e:
                    raise RuntimeError('Cannot parse an HTML file {!r}: {}'.format(path, e))
                self.trees[path] = parsed
        return self.trees[path]

    def get_dir(self, path):
        """Raise `FailedCheck` unless `path` is an existing directory."""
        path = self.resolve_path(path)
        if not os.path.isdir(os.path.join(self.root, path)):
            raise FailedCheck('Directory does not exist {!r}'.format(path))
320
321
def check_string(data, pat, regexp):
    """Return whether the pattern `pat` occurs in `data`.

    An empty `pat` always matches. With `regexp` set, `pat` is searched for
    as a regular expression in the unmodified `data`. Otherwise every run
    of whitespace in both `pat` and `data` is collapsed to a single space
    and a plain substring test is done.
    """
    if not pat:
        return True  # special case a presence testing
    if regexp:
        return re.search(pat, data) is not None

    def squeeze(text):
        return ' '.join(text.split())

    return squeeze(pat) in squeeze(data)
331
332
def check_tree_attr(tree, path, attr, pat, regexp):
    """Return whether some node at `path` has a matching `attr` value.

    Nodes found by the XPath `path` that lack the attribute `attr` are
    skipped. The result is True as soon as one attribute value satisfies
    `check_string` with `pat` and `regexp`, and False when none does.
    """
    candidates = tree.findall(normalize_xpath(path))
    return any(
        check_string(node.attrib[attr], pat, regexp)
        for node in candidates
        if attr in node.attrib
    )
346
347
def check_tree_text(tree, path, pat, regexp):
    """Return whether the text of some node at `path` matches `pat`.

    Each node found by the XPath `path` is flattened into a single string
    (subelements included) and tested with `check_string`. The result is
    True as soon as one node matches, and False when none does.

    Any exception raised while searching is re-raised after the offending
    path has been reported on standard error.
    """
    path = normalize_xpath(path)
    ret = False
    try:
        for node in tree.findall(path):
            try:
                value = flatten(node)
            except KeyError:
                continue
            else:
                ret = check_string(value, pat, regexp)
                if ret:
                    break
    except Exception:
        # Diagnostics belong on standard error, like all other messages of
        # this script.
        stderr('Failed to get path "{}"'.format(path))
        # A bare `raise` keeps the original traceback; `raise e` would
        # replace it on Python 2.
        raise
    return ret
365
366
def get_tree_count(tree, path):
    """Return how many nodes of `tree` the XPath `path` matches."""
    matches = tree.findall(normalize_xpath(path))
    return len(matches)
370
def stderr(*args):
    """Print `args` to standard error, the way `print` formats them."""
    print(*args, file=sys.stderr)


# Number of errors reported through `print_err` so far.
ERR_COUNT = 0


def print_err(lineno, context, err, message=None):
    """Report one error on standard error and count it in `ERR_COUNT`.

    The first output line is `lineno` followed by `message` if given, else
    by `err`. When both `message` and `err` are given, `err` follows on an
    indented line of its own. A non-empty `context` is printed last, also
    indented.
    """
    global ERR_COUNT
    ERR_COUNT += 1

    headline = message or err
    stderr("{}: {}".format(lineno, headline))
    if message and err:
        stderr("\t{}".format(err))
    if context:
        stderr("\t{}".format(context))
385
def check_command(c, cache):
    """Run the single check command `c` against the files behind `cache`.

    `c` is a `Command` and `cache` a `CachedFiles`. Supported commands are
    `has`, `matches`, `count` and `has-dir`. The outcome of the check is
    compared with `c.negated`: a plain command fails when the check does
    not hold, a negated one fails when it does.

    Nothing is returned. Failed and invalid checks (wrong number of
    arguments, a non-integer count, unknown or unimplemented commands) are
    reported through `print_err` instead of being raised.
    """
    try:
        cerr = ""
        if c.cmd == 'has' or c.cmd == 'matches':  # string test
            regexp = (c.cmd == 'matches')
            if len(c.args) == 1 and not regexp:  # @has <path> = file existence
                try:
                    cache.get_file(c.args[0])
                    ret = True
                except FailedCheck as err:
                    # a missing file is a regular negative result here, so
                    # that `@!has <path>` can succeed
                    cerr = str(err)
                    ret = False
            elif len(c.args) == 2:  # @has/matches <path> <pat> = string test
                cerr = "`PATTERN` did not match"
                ret = check_string(cache.get_file(c.args[0]), c.args[1], regexp)
            elif len(c.args) == 3:  # @has/matches <path> <pat> <match> = XML tree test
                cerr = "`XPATH PATTERN` did not match"
                # fetched once and shared by both branches below
                tree = cache.get_tree(c.args[0])
                pat, sep, attr = c.args[1].partition('/@')
                if sep:  # attribute
                    ret = check_tree_attr(tree, pat, attr, c.args[2], regexp)
                else:  # normalized text
                    pat = c.args[1]
                    if pat.endswith('/text()'):
                        pat = pat[:-7]  # drop the trailing `/text()`
                    ret = check_tree_text(tree, pat, c.args[2], regexp)
            else:
                raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))

        elif c.cmd == 'count':  # count test
            if len(c.args) == 3:  # @count <path> <pat> <count> = count test
                try:
                    expected = int(c.args[2])
                except ValueError:
                    # report a malformed count like any other invalid
                    # command instead of aborting with a traceback
                    raise InvalidCheck(
                        'Invalid @count argument: COUNT must be an integer, got {!r}'
                        .format(c.args[2]))
                found = get_tree_count(cache.get_tree(c.args[0]), c.args[1])
                cerr = "Expected {} occurrences but found {}".format(expected, found)
                ret = expected == found
            else:
                raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))
        elif c.cmd == 'has-dir':  # has-dir test
            if len(c.args) == 1:  # @has-dir <path> = has-dir test
                try:
                    cache.get_dir(c.args[0])
                    ret = True
                except FailedCheck as err:
                    cerr = str(err)
                    ret = False
            else:
                raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))
        elif c.cmd == 'valid-html':
            raise InvalidCheck('Unimplemented @valid-html')

        elif c.cmd == 'valid-links':
            raise InvalidCheck('Unimplemented @valid-links')
        else:
            raise InvalidCheck('Unrecognized @{}'.format(c.cmd))

        # the check fails when its outcome equals the negation flag
        if ret == c.negated:
            raise FailedCheck(cerr)

    except FailedCheck as err:
        message = '@{}{} check failed'.format('!' if c.negated else '', c.cmd)
        print_err(c.lineno, c.context, str(err), message)
    except InvalidCheck as err:
        print_err(c.lineno, c.context, str(err))
450
def check(target, commands):
    """Run every command in `commands` against the output under `target`.

    All commands share one `CachedFiles` rooted at `target`.
    """
    shared_cache = CachedFiles(target)
    for command in commands:
        check_command(command, shared_cache)
455
if __name__ == '__main__':
    # Exactly two arguments are expected: the doc directory and the template.
    if len(sys.argv) != 3:
        stderr('Usage: {} <doc dir> <template>'.format(sys.argv[0]))
        sys.exit(1)

    doc_dir, template = sys.argv[1:]
    check(doc_dir, get_commands(template))
    # any reported error makes the whole run fail
    if ERR_COUNT:
        stderr("\nEncountered {} errors".format(ERR_COUNT))
        sys.exit(1)