git.proxmox.com Git - mirror_edk2.git/blame - AppPkg/Applications/Python/Python-2.7.2/Lib/test/test_htmlparser.py
EmbeddedPkg: Extend NvVarStoreFormattedLib LIBRARY_CLASS
[mirror_edk2.git] / AppPkg / Applications / Python / Python-2.7.2 / Lib / test / test_htmlparser.py
CommitLineData
4710c53d 1"""Tests for HTMLParser.py."""\r
2\r
3import HTMLParser\r
4import pprint\r
5import unittest\r
6from test import test_support\r
7\r
8\r
class EventCollector(HTMLParser.HTMLParser):
    """HTMLParser subclass that records each handler callback as a tuple.

    Every ``handle_*`` hook (and ``unknown_decl``) appends a tuple of the
    form ``(event_name, callback_arguments...)`` to ``self.events``.  Use
    ``get_events()`` to obtain the list with adjacent "data" events merged.
    """

    def __init__(self):
        self.events = []
        # Shortcut used by all handlers below.  It is bound to this exact
        # list object, so ``self.events`` must only be mutated in place and
        # never rebound, otherwise later events would go to a stale list.
        self.append = self.events.append
        HTMLParser.HTMLParser.__init__(self)

    def get_events(self):
        """Return the recorded events with runs of "data" events merged.

        Normalizes the list of events so that buffer artefacts don't
        separate runs of contiguous characters: consecutive ``("data", s)``
        entries are collapsed into a single entry whose payload is the
        concatenation of the individual payloads.

        The normalized result is stored back into ``self.events`` *in
        place* and that same list is returned, so the ``self.append``
        shortcut created in ``__init__`` keeps recording into the list the
        caller sees, even when more input is fed after this call.
        """
        merged = []
        prev_kind = None
        for event in self.events:
            kind = event[0]
            if kind == prev_kind == "data":
                merged[-1] = ("data", merged[-1][1] + event[1])
            else:
                merged.append(event)
            prev_kind = kind
        # Slice assignment keeps the list identity (see __init__).
        self.events[:] = merged
        return self.events

    # structure markup

    def handle_starttag(self, tag, attrs):
        """Record ``("starttag", tag, attrs)``."""
        self.append(("starttag", tag, attrs))

    def handle_startendtag(self, tag, attrs):
        """Record ``("startendtag", tag, attrs)``."""
        self.append(("startendtag", tag, attrs))

    def handle_endtag(self, tag):
        """Record ``("endtag", tag)``."""
        self.append(("endtag", tag))

    # all other markup

    def handle_comment(self, data):
        """Record ``("comment", data)``."""
        self.append(("comment", data))

    def handle_charref(self, data):
        """Record ``("charref", data)``."""
        self.append(("charref", data))

    def handle_data(self, data):
        """Record ``("data", data)``."""
        self.append(("data", data))

    def handle_decl(self, data):
        """Record ``("decl", data)``."""
        self.append(("decl", data))

    def handle_entityref(self, data):
        """Record ``("entityref", data)``."""
        self.append(("entityref", data))

    def handle_pi(self, data):
        """Record ``("pi", data)``."""
        self.append(("pi", data))

    def unknown_decl(self, decl):
        """Record ``("unknown decl", decl)``."""
        self.append(("unknown decl", decl))
64\r
65\r
class EventCollectorExtra(EventCollector):
    """EventCollector that additionally records each start tag's text."""

    def handle_starttag(self, tag, attrs):
        """Record the start tag, then a ``("starttag_text", text)`` event.

        ``text`` is whatever ``self.get_starttag_text()`` returns at the
        time the start tag is handled.
        """
        EventCollector.handle_starttag(self, tag, attrs)
        starttag_text = self.get_starttag_text()
        self.append(("starttag_text", starttag_text))
71\r
72\r
class TestCaseBase(unittest.TestCase):
    """Shared helpers for driving a parser and checking what it reports."""

    def _run_check(self, source, expected_events, collector=EventCollector):
        """Parse *source* with a fresh *collector* and compare its events.

        *source* is iterated and every item is passed to ``feed()`` on its
        own, so a plain string is fed one character at a time while a list
        of strings is fed chunk by chunk.  The test fails with a
        pretty-printed expected/received dump when the normalized events
        differ from *expected_events*.
        """
        parser = collector()
        for piece in source:
            parser.feed(piece)
        parser.close()
        received = parser.get_events()
        if received != expected_events:
            report = ("received events did not match expected events\n"
                      "Expected:\n%s\nReceived:\n%s"
                      % (pprint.pformat(expected_events),
                         pprint.pformat(received)))
            self.fail(report)

    def _run_check_extra(self, source, events):
        """Like ``_run_check`` but collecting with EventCollectorExtra."""
        self._run_check(source, events, collector=EventCollectorExtra)

    def _parse_error(self, source):
        """Assert that parsing *source* raises HTMLParser.HTMLParseError.

        The whole string is fed in one call to a plain HTMLParser, which is
        then closed; the error may come from either step.
        """
        with self.assertRaises(HTMLParser.HTMLParseError):
            parser = HTMLParser.HTMLParser()
            parser.feed(source)
            parser.close()
95\r
96\r
class HTMLParserTestCase(TestCaseBase):
    """Event-level checks of HTMLParser against literal markup snippets."""

    def test_processing_instruction_only(self):
        """A lone processing instruction yields exactly one "pi" event."""
        cases = [
            ("<?processing instruction>", "processing instruction"),
            ("<?processing instruction ?>", "processing instruction ?"),
        ]
        for markup, pi_body in cases:
            self._run_check(markup, [("pi", pi_body)])

    def test_simple_html(self):
        """A small document mixing declarations, tags, refs and comments."""
        markup = """
<!DOCTYPE html PUBLIC 'foo'>
<HTML>&entity;&#32;
<!--comment1a
-></foo><bar>&lt;<?pi?></foo<bar
comment1b-->
<Img sRc='Bar' isMAP>sample
text
&#x201C;
<!--comment2a-- --comment2b--><!>
</Html>
"""
        expected = [
            ("data", "\n"),
            ("decl", "DOCTYPE html PUBLIC 'foo'"),
            ("data", "\n"),
            ("starttag", "html", []),
            ("entityref", "entity"),
            ("charref", "32"),
            ("data", "\n"),
            ("comment", "comment1a\n-></foo><bar>&lt;<?pi?></foo<bar\ncomment1b"),
            ("data", "\n"),
            ("starttag", "img", [("src", "Bar"), ("ismap", None)]),
            ("data", "sample\ntext\n"),
            ("charref", "x201C"),
            ("data", "\n"),
            ("comment", "comment2a-- --comment2b"),
            ("data", "\n"),
            ("endtag", "html"),
            ("data", "\n"),
        ]
        self._run_check(markup, expected)

    def test_unclosed_entityref(self):
        """An entity reference without ';' is still reported as one."""
        expected = [
            ("entityref", "entityref"),
            ("data", " foo"),
        ]
        self._run_check("&entityref foo", expected)

    def test_doctype_decl(self):
        """A DOCTYPE with an internal subset comes back as one "decl"."""
        inside = """\
DOCTYPE html [
 <!ELEMENT html - O EMPTY>
 <!ATTLIST html
 version CDATA #IMPLIED
 profile CDATA 'DublinCore'>
 <!NOTATION datatype SYSTEM 'http://xml.python.org/notations/python-module'>
 <!ENTITY myEntity 'internal parsed entity'>
 <!ENTITY anEntity SYSTEM 'http://xml.python.org/entities/something.xml'>
 <!ENTITY % paramEntity 'name|name|name'>
 %paramEntity;
 <!-- comment -->
]"""
        markup = "<!%s>" % inside
        self._run_check(markup, [("decl", inside)])

    def test_bad_nesting(self):
        """Overlapping elements are reported in source order."""
        # Strangely, this *is* supposed to test that overlapping
        # elements are allowed.  HTMLParser is more geared toward
        # lexing the input than parsing the structure.
        expected = [
            ("starttag", "a", []),
            ("starttag", "b", []),
            ("endtag", "a"),
            ("endtag", "b"),
        ]
        self._run_check("<a><b></a></b>", expected)

    def test_bare_ampersands(self):
        """Ampersands that start no reference stay in the data."""
        text = "this text & contains & ampersands &"
        self._run_check(text, [("data", "this text & contains & ampersands &")])

    def test_bare_pointy_brackets(self):
        """Angle brackets that start no tag stay in the data."""
        text = "this < text > contains < bare>pointy< brackets"
        self._run_check(text, [
            ("data", "this < text > contains < bare>pointy< brackets"),
        ])

    def test_attr_syntax(self):
        """Quoting style and whitespace around '=' do not change attrs."""
        expected = [
            ("starttag", "a", [("b", "v"), ("c", "v"), ("d", "v"), ("e", None)])
        ]
        variants = (
            """<a b='v' c="v" d=v e>""",
            """<a b = 'v' c = "v" d = v e>""",
            """<a\nb\n=\n'v'\nc\n=\n"v"\nd\n=\nv\ne>""",
            """<a\tb\t=\t'v'\tc\t=\t"v"\td\t=\tv\te>""",
        )
        for markup in variants:
            self._run_check(markup, expected)

    def test_attr_values(self):
        """Attribute values keep embedded whitespace and odd characters."""
        self._run_check(
            """<a b='xxx\n\txxx' c="yyy\t\nyyy" d='\txyz\n'>""",
            [
                ("starttag", "a", [
                    ("b", "xxx\n\txxx"),
                    ("c", "yyy\t\nyyy"),
                    ("d", "\txyz\n"),
                ]),
            ])
        self._run_check(
            """<a b='' c="">""",
            [("starttag", "a", [("b", ""), ("c", "")])])
        # Regression test for SF patch #669683.
        self._run_check(
            "<e a=rgb(1,2,3)>",
            [("starttag", "e", [("a", "rgb(1,2,3)")])])
        # Regression test for SF bug #921657.
        self._run_check(
            "<a href=mailto:xyz@example.com>",
            [("starttag", "a", [("href", "mailto:xyz@example.com")])])

    def test_attr_nonascii(self):
        """Non-ASCII attribute values survive, quoted or not (issue 7311)."""
        self._run_check(
            u"<img src=/foo/bar.png alt=\u4e2d\u6587>",
            [
                ("starttag", "img", [("src", "/foo/bar.png"),
                                     ("alt", u"\u4e2d\u6587")]),
            ])
        single_quoted = (u"<a title='\u30c6\u30b9\u30c8' "
                         u"href='\u30c6\u30b9\u30c8.html'>")
        self._run_check(single_quoted, [
            ("starttag", "a", [("title", u"\u30c6\u30b9\u30c8"),
                               ("href", u"\u30c6\u30b9\u30c8.html")]),
        ])
        double_quoted = (u'<a title="\u30c6\u30b9\u30c8" '
                         u'href="\u30c6\u30b9\u30c8.html">')
        self._run_check(double_quoted, [
            ("starttag", "a", [("title", u"\u30c6\u30b9\u30c8"),
                               ("href", u"\u30c6\u30b9\u30c8.html")]),
        ])

    def test_attr_entity_replacement(self):
        """Known entity references inside attribute values are replaced."""
        markup = """<a b='&amp;&gt;&lt;&quot;&apos;'>"""
        self._run_check(markup, [("starttag", "a", [("b", "&><\"'")])])

    def test_attr_funky_names(self):
        """Attribute names may contain '.', ':' and '-'."""
        markup = """<a a.b='v' c:d=v e-f=v>"""
        self._run_check(markup, [
            ("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]),
        ])

    def test_illegal_declarations(self):
        """An unrecognised '<!' declaration is a parse error."""
        self._parse_error('<!spacer type="block" height="25">')

    def test_starttag_end_boundary(self):
        """A quoted '<' or '>' in a value does not end the start tag."""
        for bracket in ("<", ">"):
            markup = {"<": """<a b='<'>""", ">": """<a b='>'>"""}[bracket]
            self._run_check(markup, [("starttag", "a", [("b", bracket)])])

    def test_buffer_artefacts(self):
        """Results do not depend on where the input is split between feeds."""
        expected = [("starttag", "a", [("b", "<")])]
        for chunks in (
                ["<a b='<'>"],
                ["<a ", "b='<'>"],
                ["<a b", "='<'>"],
                ["<a b=", "'<'>"],
                ["<a b='<", "'>"],
                ["<a b='<'", ">"],
        ):
            self._run_check(chunks, expected)

        expected = [("starttag", "a", [("b", ">")])]
        for chunks in (
                ["<a b='>'>"],
                ["<a ", "b='>'>"],
                ["<a b", "='>'>"],
                ["<a b=", "'>'>"],
                ["<a b='>", "'>"],
                ["<a b='>'", ">"],
        ):
            self._run_check(chunks, expected)

        expected = [("comment", "abc")]
        for chunks in (
                ["", "<!--abc-->"],
                ["<", "!--abc-->"],
                ["<!", "--abc-->"],
                ["<!-", "-abc-->"],
                ["<!--", "abc-->"],
                ["<!--a", "bc-->"],
                ["<!--ab", "c-->"],
                ["<!--abc", "-->"],
                ["<!--abc-", "->"],
                ["<!--abc--", ">"],
                ["<!--abc-->", ""],
        ):
            self._run_check(chunks, expected)

    def test_starttag_junk_chars(self):
        """Truncated or junk-laden tags are parse errors."""
        broken_sources = (
            "</>",
            "</$>",
            "</",
            "</a",
            "<a<a>",
            "</a<a>",
            "<!",
            "<a $>",
            "<a",
            "<a foo='bar'",
            "<a foo='bar",
            "<a foo='>'",
            "<a foo='>",
            "<a foo=>",
        )
        for markup in broken_sources:
            self._parse_error(markup)

    def test_declaration_junk_chars(self):
        """A junk character inside a DOCTYPE is a parse error."""
        self._parse_error("<!DOCTYPE foo $ >")

    def test_startendtag(self):
        """'<x/>' is a "startendtag"; '<x></x>' is a start/end pair."""
        self._run_check("<p/>", [("startendtag", "p", [])])
        self._run_check("<p></p>", [
            ("starttag", "p", []),
            ("endtag", "p"),
        ])
        self._run_check("<p><img src='foo' /></p>", [
            ("starttag", "p", []),
            ("startendtag", "img", [("src", "foo")]),
            ("endtag", "p"),
        ])

    def test_get_starttag_text(self):
        """get_starttag_text() gives back the start tag exactly as fed."""
        s = """<foo:bar \n one="1"\ttwo=2 >"""
        expected = [
            ("starttag", "foo:bar", [("one", "1"), ("two", "2")]),
            ("starttag_text", s),
        ]
        self._run_check_extra(s, expected)

    def test_cdata_content(self):
        """Markup-looking text inside <script> is reported as plain data."""
        cases = [
            ("""<script> <!-- not a comment --> &not-an-entity-ref; </script>""",
             " <!-- not a comment --> &not-an-entity-ref; "),
            ("""<script> <not a='start tag'> </script>""",
             " <not a='start tag'> "),
        ]
        for markup, script_body in cases:
            self._run_check(markup, [
                ("starttag", "script", []),
                ("data", script_body),
                ("endtag", "script"),
            ])

    def test_entityrefs_in_attributes(self):
        """Known refs in attribute values are replaced; unknown ones kept."""
        markup = "<html foo='&euro;&amp;&#97;&#x61;&unsupported;'>"
        self._run_check(markup, [
            ("starttag", "html", [("foo", u"\u20AC&aa&unsupported;")])
        ])

    def test_malformatted_charref(self):
        """A malformed character reference is passed through as data."""
        self._run_check("<p>&#bad;</p>", [
            ("starttag", "p", []),
            ("data", "&#bad;"),
            ("endtag", "p"),
        ])

    def test_unescape_function(self):
        """unescape() leaves a bad charref alone and decodes a valid one."""
        parser = HTMLParser.HTMLParser()
        for escaped, plain in (('&#bad;', '&#bad;'), ('&#0038;', '&')):
            self.assertEqual(parser.unescape(escaped), plain)
344\r
345\r
def test_main():
    """Run HTMLParserTestCase through ``test_support.run_unittest``."""
    test_support.run_unittest(HTMLParserTestCase)


# Allow running this test module directly as a script.
if __name__ == "__main__":
    test_main()