]>
git.proxmox.com Git - rustc.git/blob - src/etc/htmldocck.py
1 # Copyright 2015 The Rust Project Developers. See the COPYRIGHT
2 # file at the top-level directory of this distribution and at
3 # http://rust-lang.org/COPYRIGHT.
5 # Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 # http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 # <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 # option. This file may not be copied, modified, or distributed
9 # except according to those terms.
12 htmldocck.py is a custom checker script for Rustdoc HTML outputs.
16 The principle is simple: This script receives a path to generated HTML
17 documentation and a "template" script, which has a series of check
18 commands like `@has` or `@matches`. Each command is used to check if
19 some pattern is present or not present in the particular file or in
20 a particular node of the HTML tree. In many cases, the template script
21 happens to be the source code given to rustdoc.
23 While it indeed is possible to test in smaller portions, it has been
24 hard to construct tests in this fashion and major rendering errors were
25 discovered much later. This script is designed to make black-box and
26 regression testing of Rustdoc easy. This does not preclude the need for
27 unit testing, but can be used to complement related tests by quickly
28 showing the expected renderings.
30 In order to avoid one-off dependencies for this task, this script uses
31 a reasonably working HTML parser and the existing XPath implementation
32 from Python's standard library. Hopefully, we won't render
37 Commands start with an `@` followed by a command name (letters and
38 hyphens), and zero or more arguments separated by one or more whitespace
39 characters and optionally delimited with single or double quotes. The `@`
40 mark cannot be preceded by a non-whitespace character. Other lines
41 (including every text up to the first `@`) are ignored, but it is
42 recommended to avoid the use of `@` in the template file.
44 There are a number of supported commands:
46 * `@has PATH` checks for the existence of the given file.
48 `PATH` is relative to the output directory. It can be given as `-`
49 which repeats the most recently used `PATH`.
51 * `@has PATH PATTERN` and `@matches PATH PATTERN` check for
52 the occurrence of the given pattern `PATTERN` in the specified file.
53 Only one occurrence of the pattern is enough.
55 For `@has`, `PATTERN` is a whitespace-normalized (every consecutive
56 whitespace being replaced by one single space character) string.
57 The entire file is also whitespace-normalized including newlines.
59 For `@matches`, `PATTERN` is a Python-supported regular expression.
60 The file remains intact but the regexp is matched without the `MULTILINE`
61 and `IGNORECASE` options. You can still use a prefix `(?m)` or `(?i)`
62 to override them, and `\A` and `\Z` for definitely matching
63 the beginning and end of the file.
65 (The same distinction applies to the other variants of these commands.)
67 * `@has PATH XPATH PATTERN` and `@matches PATH XPATH PATTERN` check for
68 the presence of the given XPath `XPATH` in the specified HTML file,
69 and also the occurrence of the given pattern `PATTERN` in the matching
70 node or attribute. Only one occurrence of the pattern in the match
73 `PATH` should be a valid and well-formed HTML file. It does *not*
74 accept arbitrary HTML5; it should have matching open and close tags
75 and correct entity references at least.
77 `XPATH` is an XPath expression to match. The XPath is fairly limited:
78 `tag`, `*`, `.`, `//`, `..`, `[@attr]`, `[@attr='value']`, `[tag]`,
79 `[POS]` (element located in given `POS`), `[last()-POS]`, `text()`
80 and `@attr` (both as the last segment) are supported. Some examples:
82 - `//pre` or `.//pre` matches any element with a name `pre`.
83 - `//a[@href]` matches any element with an `href` attribute.
84 - `//*[@class="impl"]//code` matches any element with a name `code`,
85 which is a descendant of some element whose `class` attribute is `impl`.
86 - `//h1[@class="fqn"]/span[1]/a[last()]/@class` matches a value of
87 `class` attribute in the last `a` element (can be followed by more
88 elements that are not `a`) inside the first `span` in the `h1` with
89 a class of `fqn`. Note that there cannot be any additional elements
90 between them due to the use of `/` instead of `//`.
92 Do not try to use non-absolute paths, it won't work due to the flawed
93 ElementTree implementation. The script rejects them.
95 For the text matches (i.e. paths not ending with `@attr`), any
96 subelements are flattened into one string; this is handy for ignoring
97 highlights for example. If you want to simply check for the presence of
98 a given node or attribute, use an empty string (`""`) as a `PATTERN`.
100 * `@count PATH XPATH COUNT` checks for the occurrence of the given XPath
101 in the specified file. The number of occurrences must match the given
104 * `@has-dir PATH` checks for the existence of the given directory.
106 All conditions can be negated with `!`. `@!has foo/type.NoSuch.html`
107 checks if the given file does not exist, for example.
111 from __future__
import print_function
116 from collections
import namedtuple
118 from html
.parser
import HTMLParser
120 from HTMLParser
import HTMLParser
121 from xml
.etree
import cElementTree
as ET
123 # ⇤/⇥ are not in HTML 4 but are in HTML 5
125 from html
.entities
import entitydefs
127 from htmlentitydefs
import entitydefs
# Extend the imported entity table in place: the two HTML 5 arrow entities
# are added, and `nbsp` is mapped to a plain space.
entitydefs.update({
    'larrb': u'\u21e4',
    'rarrb': u'\u21e5',
    'nbsp': ' ',
})
132 # "void elements" (no closing tag) from the HTML Standard section 12.1.2
VOID_ELEMENTS = {
    'area', 'base', 'br', 'col',
    'embed', 'hr', 'img', 'input',
    'keygen', 'link', 'menuitem', 'meta',
    'param', 'source', 'track', 'wbr',
}
136 # Python 2 -> 3 compatibility
142 class CustomHTMLParser(HTMLParser
):
143 """simplified HTML parser.
145 this is possible because we are dealing with very regular HTML from
146 rustdoc; we only have to deal with i) void elements and ii) empty
148 def __init__(self
, target
=None):
149 HTMLParser
.__init
__(self
)
150 self
.__builder
= target
or ET
.TreeBuilder()
152 def handle_starttag(self
, tag
, attrs
):
153 attrs
= dict((k
, v
or '') for k
, v
in attrs
)
154 self
.__builder
.start(tag
, attrs
)
155 if tag
in VOID_ELEMENTS
:
156 self
.__builder
.end(tag
)
158 def handle_endtag(self
, tag
):
159 self
.__builder
.end(tag
)
161 def handle_startendtag(self
, tag
, attrs
):
162 attrs
= dict((k
, v
or '') for k
, v
in attrs
)
163 self
.__builder
.start(tag
, attrs
)
164 self
.__builder
.end(tag
)
166 def handle_data(self
, data
):
167 self
.__builder
.data(data
)
169 def handle_entityref(self
, name
):
170 self
.__builder
.data(entitydefs
[name
])
172 def handle_charref(self
, name
):
173 code
= int(name
[1:], 16) if name
.startswith(('x', 'X')) else int(name
, 10)
174 self
.__builder
.data(unichr(code
).encode('utf-8'))
177 HTMLParser
.close(self
)
178 return self
.__builder
.close()
# Record type for one template command; the field names are exactly those
# listed in the specification string below.
Command = namedtuple(
    typename='Command',
    field_names='negated cmd args lineno context',
)
182 class FailedCheck(Exception):
185 class InvalidCheck(Exception):
188 def concat_multi_lines(f
):
189 """returns a generator out of the file object, which
190 - removes `\\` then `\n` then a shared prefix with the previous line then
192 - keeps a line number (starting from 0) of the first line being
194 lastline
= None # set to the last line when the last line has a backslash
197 for lineno
, line
in enumerate(f
):
198 line
= line
.rstrip('\r\n')
200 # strip the common prefix from the current line if needed
201 if lastline
is not None:
202 common_prefix
= os
.path
.commonprefix([line
, lastline
])
203 line
= line
[len(common_prefix
):].lstrip()
205 firstlineno
= firstlineno
or lineno
206 if line
.endswith('\\'):
209 catenated
+= line
[:-1]
211 yield firstlineno
, catenated
+ line
216 if lastline
is not None:
217 print_err(lineno
, line
, 'Trailing backslash at the end of the file')
219 LINE_PATTERN
= re
.compile(r
'''
220 (?<=(?<!\S)@)(?P<negated>!?)
221 (?P<cmd>[A-Za-z]+(?:-[A-Za-z]+)*)
226 def get_commands(template
):
227 with
open(template
, 'rU') as f
:
228 for lineno
, line
in concat_multi_lines(f
):
229 m
= LINE_PATTERN
.search(line
)
233 negated
= (m
.group('negated') == '!')
235 args
= m
.group('args')
236 if args
and not args
[:1].isspace():
237 print_err(lineno
, line
, 'Invalid template syntax')
239 args
= shlex
.split(args
)
240 yield Command(negated
=negated
, cmd
=cmd
, args
=args
, lineno
=lineno
+1, context
=line
)
243 def _flatten(node
, acc
):
245 acc
.append(node
.text
)
258 def normalize_xpath(path
):
259 if path
.startswith('//'):
260 return '.' + path
# avoid warnings
261 elif path
.startswith('.//'):
264 raise InvalidCheck('Non-absolute XPath is not supported due to implementation issues')
267 class CachedFiles(object):
268 def __init__(self
, root
):
272 self
.last_path
= None
274 def resolve_path(self
, path
):
276 path
= os
.path
.normpath(path
)
277 self
.last_path
= path
279 elif self
.last_path
is None:
280 raise InvalidCheck('Tried to use the previous path in the first command')
282 return self
.last_path
284 def get_file(self
, path
):
285 path
= self
.resolve_path(path
)
286 if path
in self
.files
:
287 return self
.files
[path
]
289 abspath
= os
.path
.join(self
.root
, path
)
290 if not(os
.path
.exists(abspath
) and os
.path
.isfile(abspath
)):
291 raise FailedCheck('File does not exist {!r}'.format(path
))
293 with
open(abspath
) as f
:
295 self
.files
[path
] = data
298 def get_tree(self
, path
):
299 path
= self
.resolve_path(path
)
300 if path
in self
.trees
:
301 return self
.trees
[path
]
303 abspath
= os
.path
.join(self
.root
, path
)
304 if not(os
.path
.exists(abspath
) and os
.path
.isfile(abspath
)):
305 raise FailedCheck('File does not exist {!r}'.format(path
))
307 with
open(abspath
) as f
:
309 tree
= ET
.parse(f
, CustomHTMLParser())
310 except Exception as e
:
311 raise RuntimeError('Cannot parse an HTML file {!r}: {}'.format(path
, e
))
312 self
.trees
[path
] = tree
313 return self
.trees
[path
]
315 def get_dir(self
, path
):
316 path
= self
.resolve_path(path
)
317 abspath
= os
.path
.join(self
.root
, path
)
318 if not(os
.path
.exists(abspath
) and os
.path
.isdir(abspath
)):
319 raise FailedCheck('Directory does not exist {!r}'.format(path
))
322 def check_string(data
, pat
, regexp
):
324 return True # special case a presence testing
326 return re
.search(pat
, data
) is not None
328 data
= ' '.join(data
.split())
329 pat
= ' '.join(pat
.split())
333 def check_tree_attr(tree
, path
, attr
, pat
, regexp
):
334 path
= normalize_xpath(path
)
336 for e
in tree
.findall(path
):
338 value
= e
.attrib
[attr
]
342 ret
= check_string(value
, pat
, regexp
)
348 def check_tree_text(tree
, path
, pat
, regexp
):
349 path
= normalize_xpath(path
)
352 for e
in tree
.findall(path
):
358 ret
= check_string(value
, pat
, regexp
)
361 except Exception as e
:
362 print('Failed to get path "{}"'.format(path
))
def get_tree_count(tree, path):
    """Return the number of nodes in `tree` matched by the XPath `path`.

    `path` is passed through `normalize_xpath` before the lookup.
    """
    matches = tree.findall(normalize_xpath(path))
    return len(matches)
372 print(*args
, file=sys
.stderr
)
374 def print_err(lineno
, context
, err
, message
=None):
377 stderr("{}: {}".format(lineno
, message
or err
))
379 stderr("\t{}".format(err
))
382 stderr("\t{}".format(context
))
386 def check_command(c
, cache
):
389 if c
.cmd
== 'has' or c
.cmd
== 'matches': # string test
390 regexp
= (c
.cmd
== 'matches')
391 if len(c
.args
) == 1 and not regexp
: # @has <path> = file existence
393 cache
.get_file(c
.args
[0])
395 except FailedCheck
as err
:
398 elif len(c
.args
) == 2: # @has/matches <path> <pat> = string test
399 cerr
= "`PATTERN` did not match"
400 ret
= check_string(cache
.get_file(c
.args
[0]), c
.args
[1], regexp
)
401 elif len(c
.args
) == 3: # @has/matches <path> <pat> <match> = XML tree test
402 cerr
= "`XPATH PATTERN` did not match"
403 tree
= cache
.get_tree(c
.args
[0])
404 pat
, sep
, attr
= c
.args
[1].partition('/@')
406 tree
= cache
.get_tree(c
.args
[0])
407 ret
= check_tree_attr(tree
, pat
, attr
, c
.args
[2], regexp
)
408 else: # normalized text
410 if pat
.endswith('/text()'):
412 ret
= check_tree_text(cache
.get_tree(c
.args
[0]), pat
, c
.args
[2], regexp
)
414 raise InvalidCheck('Invalid number of @{} arguments'.format(c
.cmd
))
416 elif c
.cmd
== 'count': # count test
417 if len(c
.args
) == 3: # @count <path> <pat> <count> = count test
418 expected
= int(c
.args
[2])
419 found
= get_tree_count(cache
.get_tree(c
.args
[0]), c
.args
[1])
420 cerr
= "Expected {} occurrences but found {}".format(expected
, found
)
421 ret
= expected
== found
423 raise InvalidCheck('Invalid number of @{} arguments'.format(c
.cmd
))
424 elif c
.cmd
== 'has-dir': # has-dir test
425 if len(c
.args
) == 1: # @has-dir <path> = has-dir test
427 cache
.get_dir(c
.args
[0])
429 except FailedCheck
as err
:
433 raise InvalidCheck('Invalid number of @{} arguments'.format(c
.cmd
))
434 elif c
.cmd
== 'valid-html':
435 raise InvalidCheck('Unimplemented @valid-html')
437 elif c
.cmd
== 'valid-links':
438 raise InvalidCheck('Unimplemented @valid-links')
440 raise InvalidCheck('Unrecognized @{}'.format(c
.cmd
))
443 raise FailedCheck(cerr
)
445 except FailedCheck
as err
:
446 message
= '@{}{} check failed'.format('!' if c
.negated
else '', c
.cmd
)
447 print_err(c
.lineno
, c
.context
, str(err
), message
)
448 except InvalidCheck
as err
:
449 print_err(c
.lineno
, c
.context
, str(err
))
451 def check(target
, commands
):
452 cache
= CachedFiles(target
)
454 check_command(c
, cache
)
456 if __name__
== '__main__':
457 if len(sys
.argv
) != 3:
458 stderr('Usage: {} <doc dir> <template>'.format(sys
.argv
[0]))
461 check(sys
.argv
[1], get_commands(sys
.argv
[2]))
463 stderr("\nEncountered {} errors".format(ERR_COUNT
))