]> git.proxmox.com Git - rustc.git/blame - src/etc/htmldocck.py
Imported Upstream version 1.0.0~beta.3
[rustc.git] / src / etc / htmldocck.py
CommitLineData
85aaf69f
SL
1# Copyright 2015 The Rust Project Developers. See the COPYRIGHT
2# file at the top-level directory of this distribution and at
3# http://rust-lang.org/COPYRIGHT.
4#
5# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8# option. This file may not be copied, modified, or distributed
9# except according to those terms.
10
11r"""
12htmldocck.py is a custom checker script for Rustdoc HTML outputs.
13
14# How and why?
15
16The principle is simple: This script receives a path to generated HTML
17documentation and a "template" script, which has a series of check
18commands like `@has` or `@matches`. Each command can be used to check if
19some pattern is present or not present in the particular file or in
20the particular node of HTML tree. In many cases, the template script
21happens to be a source code given to rustdoc.
22
23While it indeed is possible to test in smaller portions, it has been
24hard to construct tests in this fashion and major rendering errors were
25discovered much later. This script is designed for making the black-box
26and regression testing of Rustdoc easy. This does not preclude the needs
27for unit testing, but can be used to complement related tests by quickly
28showing the expected renderings.
29
30In order to avoid one-off dependencies for this task, this script uses
31a reasonably working HTML parser and the existing XPath implementation
32from Python 2's standard library. Hopefully we won't render
33non-well-formed HTML.
34
35# Commands
36
37Commands start with an `@` followed by a command name (letters and
38hyphens), and zero or more arguments separated by one or more whitespace
39and optionally delimited with single or double quotes. The `@` mark
40cannot be preceded by a non-whitespace character. Other lines (including
41every text up to the first `@`) are ignored, but it is recommended to
42avoid the use of `@` in the template file.
43
44There are a number of supported commands:
45
46* `@has PATH` checks for the existence of given file.
47
48 `PATH` is relative to the output directory. It can be given as `-`
49 which repeats the most recently used `PATH`.
50
51* `@has PATH PATTERN` and `@matches PATH PATTERN` checks for
52 the occurrence of given `PATTERN` in the given file. Only one
53 occurrence of given pattern is enough.
54
55 For `@has`, `PATTERN` is a whitespace-normalized (every consecutive
56 whitespace being replaced by one single space character) string.
57 The entire file is also whitespace-normalized including newlines.
58
59 For `@matches`, `PATTERN` is a Python-supported regular expression.
60 The file remains intact but the regexp is matched with no `MULTILINE`
61 and `IGNORECASE` option. You can still use a prefix `(?m)` or `(?i)`
62 to override them, and `\A` and `\Z` for definitely matching
63 the beginning and end of the file.
64
65 (The same distinction goes to other variants of these commands.)
66
67* `@has PATH XPATH PATTERN` and `@matches PATH XPATH PATTERN` checks for
68 the presence of given `XPATH` in the given HTML file, and also
69 the occurrence of given `PATTERN` in the matching node or attribute.
70 Only one occurrence of given pattern in the match is enough.
71
72 `PATH` should be a valid and well-formed HTML file. It does *not*
73 accept arbitrary HTML5; it should have matching open and close tags
74 and correct entity references at least.
75
76 `XPATH` is an XPath expression to match. This is fairly limited:
77 `tag`, `*`, `.`, `//`, `..`, `[@attr]`, `[@attr='value']`, `[tag]`,
78 `[POS]` (element located in given `POS`), `[last()-POS]`, `text()`
79 and `@attr` (both as the last segment) are supported. Some examples:
80
81 - `//pre` or `.//pre` matches any element with a name `pre`.
82 - `//a[@href]` matches any element with an `href` attribute.
83 - `//*[@class="impl"]//code` matches any element with a name `code`,
 which is a descendant of some element whose `class` attr is `impl`.
85 - `//h1[@class="fqn"]/span[1]/a[last()]/@class` matches a value of
86 `class` attribute in the last `a` element (can be followed by more
87 elements that are not `a`) inside the first `span` in the `h1` with
 a class of `fqn`. Note that there cannot be any additional elements
89 between them due to the use of `/` instead of `//`.
90
91 Do not try to use non-absolute paths, it won't work due to the flawed
92 ElementTree implementation. The script rejects them.
93
94 For the text matches (i.e. paths not ending with `@attr`), any
95 subelements are flattened into one string; this is handy for ignoring
96 highlights for example. If you want to simply check the presence of
97 given node or attribute, use an empty string (`""`) as a `PATTERN`.
98
c34b1796
AL
* `@count PATH XPATH COUNT` checks for the occurrence of given XPath
100 in the given file. The number of occurrences must match the given count.
101
85aaf69f
SL
102All conditions can be negated with `!`. `@!has foo/type.NoSuch.html`
103checks if the given file does not exist, for example.
104
105"""
106
107import sys
108import os.path
109import re
110import shlex
111from collections import namedtuple
112from HTMLParser import HTMLParser
113from xml.etree import cElementTree as ET
114
115# &larrb;/&rarrb; are not in HTML 4 but are in HTML 5
116from htmlentitydefs import entitydefs
# Teach the entity table the two arrow-to-bar entities used in the HTML.
entitydefs.update({'larrb': u'\u21e4', 'rarrb': u'\u21e5'})
119
120# "void elements" (no closing tag) from the HTML Standard section 12.1.2
VOID_ELEMENTS = {
    'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
    'keygen', 'link', 'menuitem', 'meta', 'param', 'source',
    'track', 'wbr',
}
123
124
class CustomHTMLParser(HTMLParser):
    """Minimal HTML parser that feeds an ElementTree-style tree builder.

    The HTML produced by rustdoc is very regular, so the only quirks
    handled here are i) void elements, which are closed as soon as they
    are opened, and ii) empty attributes, which are stored as ''."""

    def __init__(self, target=None):
        HTMLParser.__init__(self)
        self.__builder = target or ET.TreeBuilder()

    def __start_element(self, tag, attrs):
        # falsy attribute values (e.g. an empty attribute) are stored as ''
        self.__builder.start(tag, {name: value or '' for name, value in attrs})

    def handle_starttag(self, tag, attrs):
        self.__start_element(tag, attrs)
        if tag in VOID_ELEMENTS:
            # a void element has no closing tag, so close it immediately
            self.__builder.end(tag)

    def handle_endtag(self, tag):
        self.__builder.end(tag)

    def handle_startendtag(self, tag, attrs):
        self.__start_element(tag, attrs)
        self.__builder.end(tag)

    def handle_data(self, data):
        self.__builder.data(data)

    def handle_entityref(self, name):
        self.__builder.data(entitydefs[name])

    def handle_charref(self, name):
        # a leading `x`/`X` marks a hexadecimal reference, otherwise decimal
        if name.startswith(('x', 'X')):
            code = int(name[1:], 16)
        else:
            code = int(name, 10)
        self.__builder.data(unichr(code).encode('utf-8'))

    def close(self):
        """Finish parsing and return whatever the builder's `close()` returns."""
        HTMLParser.close(self)
        return self.__builder.close()
162
# A single parsed template command: the negation flag, the command name,
# the list of arguments and the line number it was found at.
Command = namedtuple('Command', ['negated', 'cmd', 'args', 'lineno'])
164
165
def concat_multi_lines(f):
    """Yield `(lineno, text)` pairs with backslash-continued lines joined.

    A line ending in a backslash is joined with the line(s) following it:
    the backslash and the line break are dropped and, on every following
    line of the group, the prefix shared with the first line of the group
    plus any whitespace right after that prefix are dropped as well.

    `f` is any iterable of lines, such as an open file. `lineno` is the
    zero-based index of the first physical line of each yielded group.

    Raises `RuntimeError` when the input ends in the middle of a
    continuation."""
    lastline = None  # first line of the current group, minus its backslash
    firstlineno = None
    catenated = ''
    for lineno, line in enumerate(f):
        line = line.rstrip('\r\n')

        # strip the common prefix from the current line if needed
        if lastline is not None:
            maxprefix = 0
            for current, previous in zip(line, lastline):
                if current != previous:
                    break
                maxprefix += 1
            line = line[maxprefix:].lstrip()

        # Compare with None explicitly: `firstlineno or lineno` would throw
        # away a legitimate first line number of 0.
        if firstlineno is None:
            firstlineno = lineno

        if line.endswith('\\'):
            if lastline is None:
                lastline = line[:-1]
            catenated += line[:-1]
        else:
            yield firstlineno, catenated + line
            lastline = None
            firstlineno = None
            catenated = ''

    if lastline is not None:
        raise RuntimeError('Trailing backslash in the end of file')
200
# Finds a command in a template line: an `@` that is not preceded by a
# non-whitespace character, then an optional `!`, a command name made of
# letters and hyphens, and the rest of the line as the raw argument text.
LINE_PATTERN = re.compile(r'''
 (?<=(?<!\S)@)(?P<negated>!?)
 (?P<cmd>[A-Za-z]+(?:-[A-Za-z]+)*)
 (?P<args>.*)$
''', flags=re.VERBOSE)
206
207
def get_commands(template):
    """Yield a `Command` for every command found in the `template` file.

    Lines are first joined by `concat_multi_lines`; joined lines that do
    not match `LINE_PATTERN` are skipped. Reported line numbers are
    one-based.

    Raises `RuntimeError` when a command name is directly followed by
    something other than whitespace."""
    with open(template, 'rUb') as source:
        for index, text in concat_multi_lines(source):
            found = LINE_PATTERN.search(text)
            if found is None:
                continue

            where = index + 1
            raw_args = found.group('args')
            if raw_args and not raw_args[:1].isspace():
                raise RuntimeError('Invalid template syntax at line {}'.format(where))

            yield Command(negated=(found.group('negated') == '!'),
                          cmd=found.group('cmd'),
                          args=shlex.split(raw_args),
                          lineno=where)
222
223
224def _flatten(node, acc):
225 if node.text:
226 acc.append(node.text)
227 for e in node:
228 _flatten(e, acc)
229 if e.tail:
230 acc.append(e.tail)
231
232
def flatten(node):
    """Return the text of `node` and all of its subelements as one string."""
    fragments = []
    _flatten(node, fragments)
    return ''.join(fragments)
237
238
def normalize_xpath(path):
    """Return `path` rewritten so that it starts with `.//`.

    A path starting with `//` gets a `.` prepended; a path that already
    starts with `.//` is returned unchanged.

    Raises `RuntimeError` for any other path."""
    if path.startswith('//'):
        return '.' + path  # avoid warnings
    if path.startswith('.//'):
        return path
    # Adjacent string literals are used instead of a backslash continuation
    # inside the literal, which leaked source indentation into the message.
    raise RuntimeError('Non-absolute XPath is not supported due to '
                       'the implementation issue.')
247
248
class CachedFiles(object):
    """Loads files below a root directory and remembers what it loaded.

    Raw file contents and parsed trees are cached separately, keyed by the
    normalized relative path. The path `-` stands for the most recently
    used path."""

    def __init__(self, root):
        self.root = root
        self.files = {}        # normalized path -> raw file contents
        self.trees = {}        # normalized path -> parsed tree
        self.last_path = None  # what `-` currently refers to

    def resolve_path(self, path):
        """Normalize `path`, or substitute the previous path for `-`.

        Raises `RuntimeError` when `-` is used before any real path."""
        if path == '-':
            if self.last_path is None:
                raise RuntimeError('Tried to use the previous path in the first command')
            return self.last_path
        self.last_path = os.path.normpath(path)
        return self.last_path

    def get_file(self, path):
        """Return the contents of `path`, reading the file on first use.

        Raises `RuntimeError` when the file cannot be opened or read."""
        path = self.resolve_path(path)
        if path in self.files:
            return self.files[path]
        try:
            with open(os.path.join(self.root, path)) as handle:
                contents = handle.read()
        except Exception as err:
            raise RuntimeError('Cannot open file {!r}: {}'.format(path, err))
        self.files[path] = contents
        return contents

    def get_tree(self, path):
        """Return the parsed tree of `path`, parsing the file on first use.

        Raises `RuntimeError` when the file cannot be opened or parsed."""
        path = self.resolve_path(path)
        if path in self.trees:
            return self.trees[path]
        try:
            handle = open(os.path.join(self.root, path))
        except Exception as err:
            raise RuntimeError('Cannot open file {!r}: {}'.format(path, err))
        try:
            with handle:
                parsed = ET.parse(handle, CustomHTMLParser())
        except Exception as err:
            raise RuntimeError('Cannot parse an HTML file {!r}: {}'.format(path, err))
        self.trees[path] = parsed
        return parsed
297
298
def check_string(data, pat, regexp):
    """Tell whether `pat` occurs in `data`.

    An empty `pat` always succeeds (pure presence test). With `regexp`
    set, `pat` is searched for as a regular expression in the untouched
    `data`; otherwise both strings are whitespace-normalized and `pat`
    must be a substring of `data`."""
    def squeeze(text):
        # collapse every run of whitespace into one space, trimming the ends
        return ' '.join(text.split())

    if not pat:
        return True  # special case a presence testing
    if regexp:
        return re.search(pat, data) is not None
    haystack = squeeze(data)
    needle = squeeze(pat)
    return needle in haystack
308
309
def check_tree_attr(tree, path, attr, pat, regexp):
    """Tell whether any node matching `path` has an `attr` value matching `pat`.

    Matching nodes without the attribute are skipped, and the search stops
    at the first value accepted by `check_string`."""
    candidates = tree.findall(normalize_xpath(path))
    return any(check_string(node.attrib[attr], pat, regexp)
               for node in candidates
               if attr in node.attrib)
323
324
def check_tree_text(tree, path, pat, regexp):
    """Tell whether any node matching `path` has flattened text matching `pat`.

    The search stops at the first node whose text is accepted by
    `check_string`."""
    for node in tree.findall(normalize_xpath(path)):
        try:
            text = flatten(node)
        except KeyError:
            # NOTE(review): `flatten` does not look like it can raise this;
            # the handler is kept so behavior stays exactly the same.
            continue
        if check_string(text, pat, regexp):
            return True
    return False
338
339
c34b1796
AL
def check_tree_count(tree, path, count):
    """Tell whether exactly `count` nodes of `tree` match `path`."""
    matches = tree.findall(normalize_xpath(path))
    return len(matches) == count
343
344
85aaf69f
SL
def _bad_arg_count(c):
    """Build the error for a command given an unsupported number of arguments."""
    # Built with a single-line literal: the previous backslash continuation
    # inside the string leaked source indentation into the message.
    return RuntimeError('Invalid number of @{} arguments at line {}'.format(c.cmd, c.lineno))


def _check_content(cache, c):
    """Evaluate one `@has` or `@matches` command and return its boolean result."""
    regexp = (c.cmd == 'matches')
    nargs = len(c.args)

    if nargs == 1 and not regexp:  # @has <path> = file existence
        try:
            cache.get_file(c.args[0])
        except RuntimeError:
            return False
        return True

    if nargs == 2:  # @has/matches <path> <pat> = string test
        return check_string(cache.get_file(c.args[0]), c.args[1], regexp)

    if nargs == 3:  # @has/matches <path> <xpath> <pat> = XML tree test
        tree = cache.get_tree(c.args[0])
        xpath, sep, attr = c.args[1].partition('/@')
        if sep:  # attribute
            return check_tree_attr(tree, xpath, attr, c.args[2], regexp)
        # normalized text; an explicit trailing `/text()` segment is dropped
        if xpath.endswith('/text()'):
            xpath = xpath[:-len('/text()')]
        return check_tree_text(tree, xpath, c.args[2], regexp)

    raise _bad_arg_count(c)


def _check_count(cache, c):
    """Evaluate one `@count` command and return its boolean result."""
    if len(c.args) != 3:  # @count <path> <xpath> <count> = count test
        raise _bad_arg_count(c)
    return check_tree_count(cache.get_tree(c.args[0]), c.args[1], int(c.args[2]))


def check(target, commands):
    """Run every command in `commands` against the files below `target`.

    `target` is the root directory handed to `CachedFiles`; `commands` is
    an iterable of `Command` tuples. Nothing is returned.

    Raises `RuntimeError` on the first command that fails (taking its
    negation into account), has a wrong number of arguments, or is
    unimplemented or unrecognized."""
    cache = CachedFiles(target)
    for c in commands:
        if c.cmd in ('has', 'matches'):  # string test
            ret = _check_content(cache, c)
        elif c.cmd == 'count':  # count test
            ret = _check_count(cache, c)
        elif c.cmd in ('valid-html', 'valid-links'):
            raise RuntimeError('Unimplemented @{} at line {}'.format(c.cmd, c.lineno))
        else:
            raise RuntimeError('Unrecognized @{} at line {}'.format(c.cmd, c.lineno))

        # a plain command must succeed, a negated one must fail
        if ret == c.negated:
            raise RuntimeError('@{}{} check failed at line {}'.format('!' if c.negated else '',
                                                                      c.cmd, c.lineno))
391
if __name__ == '__main__':
    # expects two arguments: the documentation directory and the template
    if len(sys.argv) >= 3:
        check(sys.argv[1], get_commands(sys.argv[2]))
    else:
        sys.stderr.write('Usage: {} <doc dir> <template>'.format(sys.argv[0]) + '\n')
        sys.exit(1)