]>
Commit | Line | Data |
---|---|---|
4710c53d | 1 | # Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.\r |
2 | # Licensed to PSF under a Contributor Agreement.\r | |
3 | \r | |
4 | """Convert graminit.[ch] spit out by pgen to Python code.\r | |
5 | \r | |
6 | Pgen is the Python parser generator. It is useful to quickly create a\r | |
7 | parser from a grammar file in Python's grammar notation. But I don't\r | |
8 | want my parsers to be written in C (yet), so I'm translating the\r | |
9 | parsing tables to Python data structures and writing a Python parse\r | |
10 | engine.\r | |
11 | \r | |
12 | Note that the token numbers are constants determined by the standard\r | |
13 | Python tokenizer. The standard token module defines these numbers and\r | |
14 | their names (the names are not used much). The token numbers are\r | |
15 | hardcoded into the Python tokenizer and into pgen. A Python\r | |
16 | implementation of the Python tokenizer is also available, in the\r | |
17 | standard tokenize module.\r | |
18 | \r | |
19 | On the other hand, symbol numbers (representing the grammar's\r | |
20 | non-terminals) are assigned by pgen based on the actual grammar\r | |
21 | input.\r | |
22 | \r | |
23 | Note: this module is pretty much obsolete; the pgen module generates\r | |
24 | equivalent grammar tables directly from the Grammar.txt input file\r | |
25 | without having to invoke the Python pgen C program.\r | |
26 | \r | |
27 | """\r | |
28 | \r | |
29 | # Python imports\r | |
30 | import re\r | |
31 | \r | |
32 | # Local imports\r | |
33 | from pgen2 import grammar, token\r | |
34 | \r | |
35 | \r | |
class Converter(grammar.Grammar):
    """Grammar subclass that reads classic pgen output files.

    The run() method reads the tables as produced by the pgen parser
    generator, typically contained in two C files, graminit.h and
    graminit.c.  The other methods are for internal use only.

    See the base class for more documentation.

    """

    def run(self, graminit_h, graminit_c):
        """Load the grammar tables from the text files written by pgen.

        graminit_h -- path to the C header file (#define lines).
        graminit_c -- path to the C source file (state/dfa/label tables).
        """
        self.parse_graminit_h(graminit_h)
        self.parse_graminit_c(graminit_c)
        self.finish_off()

    def parse_graminit_h(self, filename):
        """Parse the .h file written by pgen.  (Internal)

        This file is a sequence of #define statements defining the
        nonterminals of the grammar as numbers.  We build two tables
        mapping the numbers to names and back.

        Returns True on success, False if the file could not be opened.
        """
        try:
            f = open(filename)
        except IOError as err:
            print("Can't open %s: %s" % (filename, err))
            return False
        self.symbol2number = {}
        self.number2symbol = {}
        lineno = 0
        with f:
            for line in f:
                lineno += 1
                mo = re.match(r"^#define\s+(\w+)\s+(\d+)$", line)
                if mo:
                    symbol, number = mo.groups()
                    number = int(number)
                    # Symbol numbers must be a bijection; duplicates would
                    # silently corrupt the tables.
                    assert symbol not in self.symbol2number
                    assert number not in self.number2symbol
                    self.symbol2number[symbol] = number
                    self.number2symbol[number] = symbol
                elif line.strip():
                    # Non-blank line that is not a #define: report it but
                    # keep going.  (Blank lines are skipped silently; the
                    # previous code crashed on them by calling mo.groups()
                    # with mo == None.)
                    print("%s(%s): can't parse %s" % (filename, lineno,
                                                      line.strip()))
        return True

    def parse_graminit_c(self, filename):
        """Parse the .c file written by pgen.  (Internal)

        The file looks as follows.  The first two lines are always this:

        #include "pgenheaders.h"
        #include "grammar.h"

        After that come four blocks:

        1) one or more state definitions
        2) a table defining dfas
        3) a table defining labels
        4) a struct defining the grammar

        A state definition has the following form:
        - one or more arc arrays, each of the form:
          static arc arcs_<n>_<m>[<k>] = {
            {<i>, <j>},
            ...
          };
        - followed by a state array, of the form:
          static state states_<s>[<t>] = {
            {<k>, arcs_<n>_<m>},
            ...
          };

        Returns False if the file could not be opened; any deviation
        from the expected file format raises AssertionError.
        """
        try:
            f = open(filename)
        except IOError as err:
            print("Can't open %s: %s" % (filename, err))
            return False
        # The code below essentially uses f's iterator-ness!
        lineno = 0

        with f:
            # Expect the two #include lines
            lineno, line = lineno+1, next(f)
            assert line == '#include "pgenheaders.h"\n', (lineno, line)
            lineno, line = lineno+1, next(f)
            assert line == '#include "grammar.h"\n', (lineno, line)

            # Parse the state definitions
            lineno, line = lineno+1, next(f)
            allarcs = {}
            states = []
            while line.startswith("static arc "):
                while line.startswith("static arc "):
                    mo = re.match(r"static arc arcs_(\d+)_(\d+)\[(\d+)\] = {$",
                                  line)
                    assert mo, (lineno, line)
                    n, m, k = map(int, mo.groups())
                    arcs = []
                    for _ in range(k):
                        lineno, line = lineno+1, next(f)
                        mo = re.match(r"\s+{(\d+), (\d+)},$", line)
                        assert mo, (lineno, line)
                        i, j = map(int, mo.groups())
                        arcs.append((i, j))
                    lineno, line = lineno+1, next(f)
                    assert line == "};\n", (lineno, line)
                    allarcs[(n, m)] = arcs
                    lineno, line = lineno+1, next(f)
                mo = re.match(r"static state states_(\d+)\[(\d+)\] = {$", line)
                assert mo, (lineno, line)
                s, t = map(int, mo.groups())
                # States are emitted in order; index s is the next slot.
                assert s == len(states), (lineno, line)
                state = []
                for _ in range(t):
                    lineno, line = lineno+1, next(f)
                    mo = re.match(r"\s+{(\d+), arcs_(\d+)_(\d+)},$", line)
                    assert mo, (lineno, line)
                    k, n, m = map(int, mo.groups())
                    arcs = allarcs[n, m]
                    assert k == len(arcs), (lineno, line)
                    state.append(arcs)
                states.append(state)
                lineno, line = lineno+1, next(f)
                assert line == "};\n", (lineno, line)
                lineno, line = lineno+1, next(f)
            self.states = states

            # Parse the dfas
            dfas = {}
            mo = re.match(r"static dfa dfas\[(\d+)\] = {$", line)
            assert mo, (lineno, line)
            ndfas = int(mo.group(1))
            for i in range(ndfas):
                lineno, line = lineno+1, next(f)
                mo = re.match(r'\s+{(\d+), "(\w+)", (\d+), (\d+), states_(\d+),$',
                              line)
                assert mo, (lineno, line)
                symbol = mo.group(2)
                number, x, y, z = map(int, mo.group(1, 3, 4, 5))
                # Cross-check against the tables built from graminit.h.
                assert self.symbol2number[symbol] == number, (lineno, line)
                assert self.number2symbol[number] == symbol, (lineno, line)
                assert x == 0, (lineno, line)
                state = states[z]
                assert y == len(state), (lineno, line)
                lineno, line = lineno+1, next(f)
                mo = re.match(r'\s+("(?:\\\d\d\d)*")},$', line)
                assert mo, (lineno, line)
                first = {}
                # The first-set bitmap is a C string literal made only of
                # octal escapes (regex above guarantees this); eval() is
                # acceptable solely because graminit.c is trusted,
                # generated output -- never feed this untrusted data.
                rawbitset = eval(mo.group(1))
                for i, c in enumerate(rawbitset):
                    byte = ord(c)
                    for j in range(8):
                        if byte & (1<<j):
                            first[i*8 + j] = 1
                dfas[number] = (state, first)
            lineno, line = lineno+1, next(f)
            assert line == "};\n", (lineno, line)
            self.dfas = dfas

            # Parse the labels
            labels = []
            lineno, line = lineno+1, next(f)
            mo = re.match(r"static label labels\[(\d+)\] = {$", line)
            assert mo, (lineno, line)
            nlabels = int(mo.group(1))
            for i in range(nlabels):
                lineno, line = lineno+1, next(f)
                mo = re.match(r'\s+{(\d+), (0|"\w+")},$', line)
                assert mo, (lineno, line)
                x, y = mo.groups()
                x = int(x)
                if y == "0":
                    y = None
                else:
                    y = eval(y)
                labels.append((x, y))
            lineno, line = lineno+1, next(f)
            assert line == "};\n", (lineno, line)
            self.labels = labels

            # Parse the grammar struct
            lineno, line = lineno+1, next(f)
            assert line == "grammar _PyParser_Grammar = {\n", (lineno, line)
            lineno, line = lineno+1, next(f)
            mo = re.match(r"\s+(\d+),$", line)
            assert mo, (lineno, line)
            ndfas = int(mo.group(1))
            assert ndfas == len(self.dfas)
            lineno, line = lineno+1, next(f)
            assert line == "\tdfas,\n", (lineno, line)
            lineno, line = lineno+1, next(f)
            mo = re.match(r"\s+{(\d+), labels},$", line)
            assert mo, (lineno, line)
            nlabels = int(mo.group(1))
            assert nlabels == len(self.labels), (lineno, line)
            lineno, line = lineno+1, next(f)
            mo = re.match(r"\s+(\d+)$", line)
            assert mo, (lineno, line)
            start = int(mo.group(1))
            assert start in self.number2symbol, (lineno, line)
            self.start = start
            lineno, line = lineno+1, next(f)
            assert line == "};\n", (lineno, line)
            # Nothing may follow the grammar struct.
            try:
                lineno, line = lineno+1, next(f)
            except StopIteration:
                pass
            else:
                assert 0, (lineno, line)

    def finish_off(self):
        """Create additional useful structures.  (Internal)."""
        self.keywords = {}  # map from keyword strings to arc labels
        self.tokens = {}    # map from numeric token values to arc labels
        for ilabel, (type, value) in enumerate(self.labels):
            if type == token.NAME and value is not None:
                # A NAME label with a value is a keyword.
                self.keywords[value] = ilabel
            elif value is None:
                # A valueless label stands for a plain token type.
                self.tokens[type] = ilabel