]>
Commit | Line | Data |
---|---|---|
4710c53d | 1 | #!/usr/bin/env python\r |
2 | """ Utility for parsing HTML entity definitions available from:\r | |
3 | \r | |
4 | http://www.w3.org/ as e.g.\r | |
5 | http://www.w3.org/TR/REC-html40/HTMLlat1.ent\r | |
6 | \r | |
7 | Input is read from stdin, output is written to stdout in form of a\r | |
8 | Python snippet defining a dictionary "entitydefs" mapping literal\r | |
9 | entity name to character or numeric entity.\r | |
10 | \r | |
11 | Marc-Andre Lemburg, mal@lemburg.com, 1999.\r | |
12 | Use as you like. NO WARRANTIES.\r | |
13 | \r | |
14 | """\r | |
15 | import re,sys\r | |
16 | import TextTools\r | |
17 | \r | |
# Matches one SGML entity declaration of the form
#   <!ENTITY name CDATA "value" -- comment -->
# Group 1 is the entity name, group 2 the quoted value and group 3 the
# comment text, which may span several lines.
entityRE = re.compile(r'<!ENTITY +(\w+) +CDATA +"([^"]+)" +-- +((?:.|\n)+?) *-->')

def parse(text,pos=0,endpos=None):
    """ Collect all entity declarations found in text[pos:endpos].

        text -- string holding the entity definitions
        pos -- index at which to start searching (default: 0)
        endpos -- index at which to stop searching (default: len(text))

        Returns a dictionary mapping each entity name to a tuple
        (charcode, comment). If a name is declared more than once,
        the last declaration wins.

    """
    if endpos is None:
        endpos = len(text)
    d = {}
    # finditer yields successive non-overlapping matches, each search
    # starting where the previous match ended.
    for m in entityRE.finditer(text,pos,endpos):
        name,charcode,comment = m.groups()
        d[name] = charcode,comment
    return d
34 | \r | |
def writefile(f,defs):
    """ Write defs to f as a Python snippet defining "entitydefs".

        f -- file-like object providing a write() method
        defs -- dictionary mapping entity name to (charcode, comment),
                as returned by parse()

        Entries are written sorted by name. A numeric character
        reference ("&#NNN;") below 256 is written as an octal string
        escape; every other charcode is written as its repr().

    """
    f.write("entitydefs = {\n")
    # sorted() works whether items() returns a list or a view.
    for name,(charcode,comment) in sorted(defs.items()):
        if charcode[:2] == '&#':
            # Strip the leading "&#" and the trailing ";".
            code = int(charcode[2:-1])
            if code < 256:
                charcode = "'\\%o'" % code
            else:
                charcode = repr(charcode)
        else:
            charcode = repr(charcode)
        # Fold line breaks and whitespace runs into single spaces so the
        # comment fits on one line (stands in for TextTools.collapse;
        # assumed equivalent for this input -- confirm if exact spacing
        # matters).
        comment = ' '.join(comment.split())
        f.write(" '%s':\t%s, \t# %s\n" % (name,charcode,comment))
    f.write('\n}\n')
52 | \r | |
if __name__ == '__main__':
    # Usage: script [infile [outfile]]
    # Without arguments, input comes from stdin and output goes to stdout.
    if len(sys.argv) > 1:
        infile = open(sys.argv[1])
    else:
        infile = sys.stdin
    try:
        text = infile.read()
    finally:
        # Only close what was opened here; leave stdin alone.
        if infile is not sys.stdin:
            infile.close()
    defs = parse(text)
    # Open the output only after the input has been read, so a failed
    # read does not leave a truncated output file behind.
    if len(sys.argv) > 2:
        outfile = open(sys.argv[2],'w')
    else:
        outfile = sys.stdout
    try:
        writefile(outfile,defs)
    finally:
        if outfile is not sys.stdout:
            outfile.close()