+++ /dev/null
-"""Tests for HTMLParser.py."""\r
-\r
-import HTMLParser\r
-import pprint\r
-import unittest\r
-from test import test_support\r
-\r
-\r
-class EventCollector(HTMLParser.HTMLParser):\r
-\r
- def __init__(self):\r
- self.events = []\r
- self.append = self.events.append\r
- HTMLParser.HTMLParser.__init__(self)\r
-\r
- def get_events(self):\r
- # Normalize the list of events so that buffer artefacts don't\r
- # separate runs of contiguous characters.\r
- L = []\r
- prevtype = None\r
- for event in self.events:\r
- type = event[0]\r
- if type == prevtype == "data":\r
- L[-1] = ("data", L[-1][1] + event[1])\r
- else:\r
- L.append(event)\r
- prevtype = type\r
- self.events = L\r
- return L\r
-\r
- # structure markup\r
-\r
- def handle_starttag(self, tag, attrs):\r
- self.append(("starttag", tag, attrs))\r
-\r
- def handle_startendtag(self, tag, attrs):\r
- self.append(("startendtag", tag, attrs))\r
-\r
- def handle_endtag(self, tag):\r
- self.append(("endtag", tag))\r
-\r
- # all other markup\r
-\r
- def handle_comment(self, data):\r
- self.append(("comment", data))\r
-\r
- def handle_charref(self, data):\r
- self.append(("charref", data))\r
-\r
- def handle_data(self, data):\r
- self.append(("data", data))\r
-\r
- def handle_decl(self, data):\r
- self.append(("decl", data))\r
-\r
- def handle_entityref(self, data):\r
- self.append(("entityref", data))\r
-\r
- def handle_pi(self, data):\r
- self.append(("pi", data))\r
-\r
- def unknown_decl(self, decl):\r
- self.append(("unknown decl", decl))\r
-\r
-\r
-class EventCollectorExtra(EventCollector):\r
-\r
- def handle_starttag(self, tag, attrs):\r
- EventCollector.handle_starttag(self, tag, attrs)\r
- self.append(("starttag_text", self.get_starttag_text()))\r
-\r
-\r
-class TestCaseBase(unittest.TestCase):\r
-\r
- def _run_check(self, source, expected_events, collector=EventCollector):\r
- parser = collector()\r
- for s in source:\r
- parser.feed(s)\r
- parser.close()\r
- events = parser.get_events()\r
- if events != expected_events:\r
- self.fail("received events did not match expected events\n"\r
- "Expected:\n" + pprint.pformat(expected_events) +\r
- "\nReceived:\n" + pprint.pformat(events))\r
-\r
- def _run_check_extra(self, source, events):\r
- self._run_check(source, events, EventCollectorExtra)\r
-\r
- def _parse_error(self, source):\r
- def parse(source=source):\r
- parser = HTMLParser.HTMLParser()\r
- parser.feed(source)\r
- parser.close()\r
- self.assertRaises(HTMLParser.HTMLParseError, parse)\r
-\r
-\r
-class HTMLParserTestCase(TestCaseBase):\r
-\r
- def test_processing_instruction_only(self):\r
- self._run_check("<?processing instruction>", [\r
- ("pi", "processing instruction"),\r
- ])\r
- self._run_check("<?processing instruction ?>", [\r
- ("pi", "processing instruction ?"),\r
- ])\r
-\r
- def test_simple_html(self):\r
- self._run_check("""\r
-<!DOCTYPE html PUBLIC 'foo'>\r
-<HTML>&entity; \r
-<!--comment1a\r
--></foo><bar><<?pi?></foo<bar\r
-comment1b-->\r
-<Img sRc='Bar' isMAP>sample\r
-text\r
-“\r
-<!--comment2a-- --comment2b--><!>\r
-</Html>\r
-""", [\r
- ("data", "\n"),\r
- ("decl", "DOCTYPE html PUBLIC 'foo'"),\r
- ("data", "\n"),\r
- ("starttag", "html", []),\r
- ("entityref", "entity"),\r
- ("charref", "32"),\r
- ("data", "\n"),\r
- ("comment", "comment1a\n-></foo><bar><<?pi?></foo<bar\ncomment1b"),\r
- ("data", "\n"),\r
- ("starttag", "img", [("src", "Bar"), ("ismap", None)]),\r
- ("data", "sample\ntext\n"),\r
- ("charref", "x201C"),\r
- ("data", "\n"),\r
- ("comment", "comment2a-- --comment2b"),\r
- ("data", "\n"),\r
- ("endtag", "html"),\r
- ("data", "\n"),\r
- ])\r
-\r
- def test_unclosed_entityref(self):\r
- self._run_check("&entityref foo", [\r
- ("entityref", "entityref"),\r
- ("data", " foo"),\r
- ])\r
-\r
- def test_doctype_decl(self):\r
- inside = """\\r
-DOCTYPE html [\r
- <!ELEMENT html - O EMPTY>\r
- <!ATTLIST html\r
- version CDATA #IMPLIED\r
- profile CDATA 'DublinCore'>\r
- <!NOTATION datatype SYSTEM 'http://xml.python.org/notations/python-module'>\r
- <!ENTITY myEntity 'internal parsed entity'>\r
- <!ENTITY anEntity SYSTEM 'http://xml.python.org/entities/something.xml'>\r
- <!ENTITY % paramEntity 'name|name|name'>\r
- %paramEntity;\r
- <!-- comment -->\r
-]"""\r
- self._run_check("<!%s>" % inside, [\r
- ("decl", inside),\r
- ])\r
-\r
- def test_bad_nesting(self):\r
- # Strangely, this *is* supposed to test that overlapping\r
- # elements are allowed. HTMLParser is more geared toward\r
- # lexing the input that parsing the structure.\r
- self._run_check("<a><b></a></b>", [\r
- ("starttag", "a", []),\r
- ("starttag", "b", []),\r
- ("endtag", "a"),\r
- ("endtag", "b"),\r
- ])\r
-\r
- def test_bare_ampersands(self):\r
- self._run_check("this text & contains & ampersands &", [\r
- ("data", "this text & contains & ampersands &"),\r
- ])\r
-\r
- def test_bare_pointy_brackets(self):\r
- self._run_check("this < text > contains < bare>pointy< brackets", [\r
- ("data", "this < text > contains < bare>pointy< brackets"),\r
- ])\r
-\r
- def test_attr_syntax(self):\r
- output = [\r
- ("starttag", "a", [("b", "v"), ("c", "v"), ("d", "v"), ("e", None)])\r
- ]\r
- self._run_check("""<a b='v' c="v" d=v e>""", output)\r
- self._run_check("""<a b = 'v' c = "v" d = v e>""", output)\r
- self._run_check("""<a\nb\n=\n'v'\nc\n=\n"v"\nd\n=\nv\ne>""", output)\r
- self._run_check("""<a\tb\t=\t'v'\tc\t=\t"v"\td\t=\tv\te>""", output)\r
-\r
- def test_attr_values(self):\r
- self._run_check("""<a b='xxx\n\txxx' c="yyy\t\nyyy" d='\txyz\n'>""",\r
- [("starttag", "a", [("b", "xxx\n\txxx"),\r
- ("c", "yyy\t\nyyy"),\r
- ("d", "\txyz\n")])\r
- ])\r
- self._run_check("""<a b='' c="">""", [\r
- ("starttag", "a", [("b", ""), ("c", "")]),\r
- ])\r
- # Regression test for SF patch #669683.\r
- self._run_check("<e a=rgb(1,2,3)>", [\r
- ("starttag", "e", [("a", "rgb(1,2,3)")]),\r
- ])\r
- # Regression test for SF bug #921657.\r
- self._run_check("<a href=mailto:xyz@example.com>", [\r
- ("starttag", "a", [("href", "mailto:xyz@example.com")]),\r
- ])\r
-\r
- def test_attr_nonascii(self):\r
- # see issue 7311\r
- self._run_check(u"<img src=/foo/bar.png alt=\u4e2d\u6587>", [\r
- ("starttag", "img", [("src", "/foo/bar.png"),\r
- ("alt", u"\u4e2d\u6587")]),\r
- ])\r
- self._run_check(u"<a title='\u30c6\u30b9\u30c8' "\r
- u"href='\u30c6\u30b9\u30c8.html'>", [\r
- ("starttag", "a", [("title", u"\u30c6\u30b9\u30c8"),\r
- ("href", u"\u30c6\u30b9\u30c8.html")]),\r
- ])\r
- self._run_check(u'<a title="\u30c6\u30b9\u30c8" '\r
- u'href="\u30c6\u30b9\u30c8.html">', [\r
- ("starttag", "a", [("title", u"\u30c6\u30b9\u30c8"),\r
- ("href", u"\u30c6\u30b9\u30c8.html")]),\r
- ])\r
-\r
- def test_attr_entity_replacement(self):\r
- self._run_check("""<a b='&><"''>""", [\r
- ("starttag", "a", [("b", "&><\"'")]),\r
- ])\r
-\r
- def test_attr_funky_names(self):\r
- self._run_check("""<a a.b='v' c:d=v e-f=v>""", [\r
- ("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]),\r
- ])\r
-\r
- def test_illegal_declarations(self):\r
- self._parse_error('<!spacer type="block" height="25">')\r
-\r
- def test_starttag_end_boundary(self):\r
- self._run_check("""<a b='<'>""", [("starttag", "a", [("b", "<")])])\r
- self._run_check("""<a b='>'>""", [("starttag", "a", [("b", ">")])])\r
-\r
- def test_buffer_artefacts(self):\r
- output = [("starttag", "a", [("b", "<")])]\r
- self._run_check(["<a b='<'>"], output)\r
- self._run_check(["<a ", "b='<'>"], output)\r
- self._run_check(["<a b", "='<'>"], output)\r
- self._run_check(["<a b=", "'<'>"], output)\r
- self._run_check(["<a b='<", "'>"], output)\r
- self._run_check(["<a b='<'", ">"], output)\r
-\r
- output = [("starttag", "a", [("b", ">")])]\r
- self._run_check(["<a b='>'>"], output)\r
- self._run_check(["<a ", "b='>'>"], output)\r
- self._run_check(["<a b", "='>'>"], output)\r
- self._run_check(["<a b=", "'>'>"], output)\r
- self._run_check(["<a b='>", "'>"], output)\r
- self._run_check(["<a b='>'", ">"], output)\r
-\r
- output = [("comment", "abc")]\r
- self._run_check(["", "<!--abc-->"], output)\r
- self._run_check(["<", "!--abc-->"], output)\r
- self._run_check(["<!", "--abc-->"], output)\r
- self._run_check(["<!-", "-abc-->"], output)\r
- self._run_check(["<!--", "abc-->"], output)\r
- self._run_check(["<!--a", "bc-->"], output)\r
- self._run_check(["<!--ab", "c-->"], output)\r
- self._run_check(["<!--abc", "-->"], output)\r
- self._run_check(["<!--abc-", "->"], output)\r
- self._run_check(["<!--abc--", ">"], output)\r
- self._run_check(["<!--abc-->", ""], output)\r
-\r
- def test_starttag_junk_chars(self):\r
- self._parse_error("</>")\r
- self._parse_error("</$>")\r
- self._parse_error("</")\r
- self._parse_error("</a")\r
- self._parse_error("<a<a>")\r
- self._parse_error("</a<a>")\r
- self._parse_error("<!")\r
- self._parse_error("<a $>")\r
- self._parse_error("<a")\r
- self._parse_error("<a foo='bar'")\r
- self._parse_error("<a foo='bar")\r
- self._parse_error("<a foo='>'")\r
- self._parse_error("<a foo='>")\r
- self._parse_error("<a foo=>")\r
-\r
- def test_declaration_junk_chars(self):\r
- self._parse_error("<!DOCTYPE foo $ >")\r
-\r
- def test_startendtag(self):\r
- self._run_check("<p/>", [\r
- ("startendtag", "p", []),\r
- ])\r
- self._run_check("<p></p>", [\r
- ("starttag", "p", []),\r
- ("endtag", "p"),\r
- ])\r
- self._run_check("<p><img src='foo' /></p>", [\r
- ("starttag", "p", []),\r
- ("startendtag", "img", [("src", "foo")]),\r
- ("endtag", "p"),\r
- ])\r
-\r
- def test_get_starttag_text(self):\r
- s = """<foo:bar \n one="1"\ttwo=2 >"""\r
- self._run_check_extra(s, [\r
- ("starttag", "foo:bar", [("one", "1"), ("two", "2")]),\r
- ("starttag_text", s)])\r
-\r
- def test_cdata_content(self):\r
- s = """<script> <!-- not a comment --> ¬-an-entity-ref; </script>"""\r
- self._run_check(s, [\r
- ("starttag", "script", []),\r
- ("data", " <!-- not a comment --> ¬-an-entity-ref; "),\r
- ("endtag", "script"),\r
- ])\r
- s = """<script> <not a='start tag'> </script>"""\r
- self._run_check(s, [\r
- ("starttag", "script", []),\r
- ("data", " <not a='start tag'> "),\r
- ("endtag", "script"),\r
- ])\r
-\r
- def test_entityrefs_in_attributes(self):\r
- self._run_check("<html foo='€&aa&unsupported;'>", [\r
- ("starttag", "html", [("foo", u"\u20AC&aa&unsupported;")])\r
- ])\r
-\r
- def test_malformatted_charref(self):\r
- self._run_check("<p>&#bad;</p>", [\r
- ("starttag", "p", []),\r
- ("data", "&#bad;"),\r
- ("endtag", "p"),\r
- ])\r
-\r
- def test_unescape_function(self):\r
- parser = HTMLParser.HTMLParser()\r
- self.assertEqual(parser.unescape('&#bad;'),'&#bad;')\r
- self.assertEqual(parser.unescape('&'),'&')\r
-\r
-\r
-def test_main():\r
- test_support.run_unittest(HTMLParserTestCase)\r
-\r
-\r
-if __name__ == "__main__":\r
- test_main()\r