+++ /dev/null
-#!/usr/bin/env python\r
-""" Utility for parsing HTML entity definitions available from:\r
-\r
- http://www.w3.org/ as e.g.\r
- http://www.w3.org/TR/REC-html40/HTMLlat1.ent\r
-\r
- Input is read from stdin, output is written to stdout in form of a\r
- Python snippet defining a dictionary "entitydefs" mapping literal\r
- entity name to character or numeric entity.\r
-\r
- Marc-Andre Lemburg, mal@lemburg.com, 1999.\r
- Use as you like. NO WARRANTIES.\r
-\r
-"""\r
-import re,sys\r
-import TextTools\r
-\r
# Matches one entity declaration of the shape
#     <!ENTITY name CDATA "value" -- comment -->
# Groups: (1) entity name, (2) the quoted value, (3) the comment text, which
# may span several lines; spaces directly before "-->" are not captured.
# A raw string keeps the regex escapes out of Python's own escape processing.
entityRE = re.compile(r'<!ENTITY +(\w+) +CDATA +"([^"]+)" +-- +((?:.|\n)+?) *-->')


def parse(text, pos=0, endpos=None):
    """Collect the entity declarations found in text.

    text   -- string holding the entity definitions
    pos    -- index at which scanning starts (default 0)
    endpos -- index at which scanning stops; None means len(text)

    Returns a dictionary mapping each entity name to a tuple
    (charcode, comment) taken from the matching declaration.  If a name
    is declared more than once, the last declaration wins.
    """
    # The caller's pos is honoured; it must not be reset to 0 here.
    if endpos is None:
        endpos = len(text)
    defs = {}
    # The pattern never matches an empty string, so finditer visits the
    # same non-overlapping matches as repeatedly searching from m.end().
    for match in entityRE.finditer(text, pos, endpos):
        name, charcode, comment = match.groups()
        defs[name] = (charcode, comment)
    return defs
-\r
def writefile(f, defs):
    """Write defs to f as Python source defining a dictionary "entitydefs".

    f    -- object with a write() method accepting strings
    defs -- dictionary mapping entity name to a (charcode, comment) tuple

    Entries are written one per line, sorted by entity name.  A charcode
    of the form "&#NNN;" whose number is below 256 is written as a string
    literal containing an octal escape; every other charcode is written
    as repr(charcode).  The comment follows as a trailing "#" comment.

    int() raises ValueError if the text between "&#" and the final
    character of such a charcode is not a decimal number.
    """
    f.write("entitydefs = {\n")
    # sorted() works on both lists and dictionary views; names are unique
    # keys, so the order is determined by the entity name alone.
    for name, (charcode, comment) in sorted(defs.items()):
        literal = repr(charcode)
        if charcode[:2] == '&#':
            code = int(charcode[2:-1])
            if code < 256:
                # Emits a quote, a backslash and the octal digits, so the
                # generated source contains an escape such as '\240'.
                literal = "'\\%o'" % code
        # Join whitespace-separated pieces with single spaces so that a
        # multi-line comment fits on one output line.
        # NOTE(review): stands in for TextTools.collapse(comment), which is
        # understood to do the same -- confirm against the mxTextTools docs
        # if exact parity on leading/trailing whitespace matters.
        comment = " ".join(comment.split())
        f.write(" '%s':\t%s, \t# %s\n" % (name, literal, comment))
    f.write('\n}\n')
-\r
if __name__ == '__main__':
    # Usage: script [infile [outfile]]
    # A missing infile means stdin, a missing outfile means stdout.
    if len(sys.argv) > 1:
        # Close the input file as soon as its contents have been read.
        with open(sys.argv[1]) as infile:
            text = infile.read()
    else:
        text = sys.stdin.read()
    defs = parse(text)
    if len(sys.argv) > 2:
        # Open the output only once the definitions exist, so a failure
        # while reading or parsing does not leave a truncated file behind.
        with open(sys.argv[2], 'w') as outfile:
            writefile(outfile, defs)
    else:
        writefile(sys.stdout, defs)