]>
Commit | Line | Data |
---|---|---|
4710c53d | 1 | "Utility functions used by the btm_matcher module"\r |
2 | \r | |
3 | from . import pytree\r | |
4 | from .pgen2 import grammar, token\r | |
5 | from .pygram import pattern_symbols, python_symbols\r | |
6 | \r | |
7 | syms = pattern_symbols\r | |
8 | pysyms = python_symbols\r | |
9 | tokens = grammar.opmap\r | |
10 | token_labels = token\r | |
11 | \r | |
12 | TYPE_ANY = -1\r | |
13 | TYPE_ALTERNATIVES = -2\r | |
14 | TYPE_GROUP = -3\r | |
15 | \r | |
16 | class MinNode(object):\r | |
17 | """This class serves as an intermediate representation of the\r | |
18 | pattern tree during the conversion to sets of leaf-to-root\r | |
19 | subpatterns"""\r | |
20 | \r | |
21 | def __init__(self, type=None, name=None):\r | |
22 | self.type = type\r | |
23 | self.name = name\r | |
24 | self.children = []\r | |
25 | self.leaf = False\r | |
26 | self.parent = None\r | |
27 | self.alternatives = []\r | |
28 | self.group = []\r | |
29 | \r | |
30 | def __repr__(self):\r | |
31 | return str(self.type) + ' ' + str(self.name)\r | |
32 | \r | |
33 | def leaf_to_root(self):\r | |
34 | """Internal method. Returns a characteristic path of the\r | |
35 | pattern tree. This method must be run for all leaves until the\r | |
36 | linear subpatterns are merged into a single"""\r | |
37 | node = self\r | |
38 | subp = []\r | |
39 | while node:\r | |
40 | if node.type == TYPE_ALTERNATIVES:\r | |
41 | node.alternatives.append(subp)\r | |
42 | if len(node.alternatives) == len(node.children):\r | |
43 | #last alternative\r | |
44 | subp = [tuple(node.alternatives)]\r | |
45 | node.alternatives = []\r | |
46 | node = node.parent\r | |
47 | continue\r | |
48 | else:\r | |
49 | node = node.parent\r | |
50 | subp = None\r | |
51 | break\r | |
52 | \r | |
53 | if node.type == TYPE_GROUP:\r | |
54 | node.group.append(subp)\r | |
55 | #probably should check the number of leaves\r | |
56 | if len(node.group) == len(node.children):\r | |
57 | subp = get_characteristic_subpattern(node.group)\r | |
58 | node.group = []\r | |
59 | node = node.parent\r | |
60 | continue\r | |
61 | else:\r | |
62 | node = node.parent\r | |
63 | subp = None\r | |
64 | break\r | |
65 | \r | |
66 | if node.type == token_labels.NAME and node.name:\r | |
67 | #in case of type=name, use the name instead\r | |
68 | subp.append(node.name)\r | |
69 | else:\r | |
70 | subp.append(node.type)\r | |
71 | \r | |
72 | node = node.parent\r | |
73 | return subp\r | |
74 | \r | |
75 | def get_linear_subpattern(self):\r | |
76 | """Drives the leaf_to_root method. The reason that\r | |
77 | leaf_to_root must be run multiple times is because we need to\r | |
78 | reject 'group' matches; for example the alternative form\r | |
79 | (a | b c) creates a group [b c] that needs to be matched. Since\r | |
80 | matching multiple linear patterns overcomes the automaton's\r | |
81 | capabilities, leaf_to_root merges each group into a single\r | |
82 | choice based on 'characteristic'ity,\r | |
83 | \r | |
84 | i.e. (a|b c) -> (a|b) if b more characteristic than c\r | |
85 | \r | |
86 | Returns: The most 'characteristic'(as defined by\r | |
87 | get_characteristic_subpattern) path for the compiled pattern\r | |
88 | tree.\r | |
89 | """\r | |
90 | \r | |
91 | for l in self.leaves():\r | |
92 | subp = l.leaf_to_root()\r | |
93 | if subp:\r | |
94 | return subp\r | |
95 | \r | |
96 | def leaves(self):\r | |
97 | "Generator that returns the leaves of the tree"\r | |
98 | for child in self.children:\r | |
99 | for x in child.leaves():\r | |
100 | yield x\r | |
101 | if not self.children:\r | |
102 | yield self\r | |
103 | \r | |
104 | def reduce_tree(node, parent=None):\r | |
105 | """\r | |
106 | Internal function. Reduces a compiled pattern tree to an\r | |
107 | intermediate representation suitable for feeding the\r | |
108 | automaton. This also trims off any optional pattern elements(like\r | |
109 | [a], a*).\r | |
110 | """\r | |
111 | \r | |
112 | new_node = None\r | |
113 | #switch on the node type\r | |
114 | if node.type == syms.Matcher:\r | |
115 | #skip\r | |
116 | node = node.children[0]\r | |
117 | \r | |
118 | if node.type == syms.Alternatives :\r | |
119 | #2 cases\r | |
120 | if len(node.children) <= 2:\r | |
121 | #just a single 'Alternative', skip this node\r | |
122 | new_node = reduce_tree(node.children[0], parent)\r | |
123 | else:\r | |
124 | #real alternatives\r | |
125 | new_node = MinNode(type=TYPE_ALTERNATIVES)\r | |
126 | #skip odd children('|' tokens)\r | |
127 | for child in node.children:\r | |
128 | if node.children.index(child)%2:\r | |
129 | continue\r | |
130 | reduced = reduce_tree(child, new_node)\r | |
131 | if reduced is not None:\r | |
132 | new_node.children.append(reduced)\r | |
133 | elif node.type == syms.Alternative:\r | |
134 | if len(node.children) > 1:\r | |
135 | \r | |
136 | new_node = MinNode(type=TYPE_GROUP)\r | |
137 | for child in node.children:\r | |
138 | reduced = reduce_tree(child, new_node)\r | |
139 | if reduced:\r | |
140 | new_node.children.append(reduced)\r | |
141 | if not new_node.children:\r | |
142 | # delete the group if all of the children were reduced to None\r | |
143 | new_node = None\r | |
144 | \r | |
145 | else:\r | |
146 | new_node = reduce_tree(node.children[0], parent)\r | |
147 | \r | |
148 | elif node.type == syms.Unit:\r | |
149 | if (isinstance(node.children[0], pytree.Leaf) and\r | |
150 | node.children[0].value == '('):\r | |
151 | #skip parentheses\r | |
152 | return reduce_tree(node.children[1], parent)\r | |
153 | if ((isinstance(node.children[0], pytree.Leaf) and\r | |
154 | node.children[0].value == '[')\r | |
155 | or\r | |
156 | (len(node.children)>1 and\r | |
157 | hasattr(node.children[1], "value") and\r | |
158 | node.children[1].value == '[')):\r | |
159 | #skip whole unit if its optional\r | |
160 | return None\r | |
161 | \r | |
162 | leaf = True\r | |
163 | details_node = None\r | |
164 | alternatives_node = None\r | |
165 | has_repeater = False\r | |
166 | repeater_node = None\r | |
167 | has_variable_name = False\r | |
168 | \r | |
169 | for child in node.children:\r | |
170 | if child.type == syms.Details:\r | |
171 | leaf = False\r | |
172 | details_node = child\r | |
173 | elif child.type == syms.Repeater:\r | |
174 | has_repeater = True\r | |
175 | repeater_node = child\r | |
176 | elif child.type == syms.Alternatives:\r | |
177 | alternatives_node = child\r | |
178 | if hasattr(child, 'value') and child.value == '=': # variable name\r | |
179 | has_variable_name = True\r | |
180 | \r | |
181 | #skip variable name\r | |
182 | if has_variable_name:\r | |
183 | #skip variable name, '='\r | |
184 | name_leaf = node.children[2]\r | |
185 | if hasattr(name_leaf, 'value') and name_leaf.value == '(':\r | |
186 | # skip parenthesis\r | |
187 | name_leaf = node.children[3]\r | |
188 | else:\r | |
189 | name_leaf = node.children[0]\r | |
190 | \r | |
191 | #set node type\r | |
192 | if name_leaf.type == token_labels.NAME:\r | |
193 | #(python) non-name or wildcard\r | |
194 | if name_leaf.value == 'any':\r | |
195 | new_node = MinNode(type=TYPE_ANY)\r | |
196 | else:\r | |
197 | if hasattr(token_labels, name_leaf.value):\r | |
198 | new_node = MinNode(type=getattr(token_labels, name_leaf.value))\r | |
199 | else:\r | |
200 | new_node = MinNode(type=getattr(pysyms, name_leaf.value))\r | |
201 | \r | |
202 | elif name_leaf.type == token_labels.STRING:\r | |
203 | #(python) name or character; remove the apostrophes from\r | |
204 | #the string value\r | |
205 | name = name_leaf.value.strip("'")\r | |
206 | if name in tokens:\r | |
207 | new_node = MinNode(type=tokens[name])\r | |
208 | else:\r | |
209 | new_node = MinNode(type=token_labels.NAME, name=name)\r | |
210 | elif name_leaf.type == syms.Alternatives:\r | |
211 | new_node = reduce_tree(alternatives_node, parent)\r | |
212 | \r | |
213 | #handle repeaters\r | |
214 | if has_repeater:\r | |
215 | if repeater_node.children[0].value == '*':\r | |
216 | #reduce to None\r | |
217 | new_node = None\r | |
218 | elif repeater_node.children[0].value == '+':\r | |
219 | #reduce to a single occurence i.e. do nothing\r | |
220 | pass\r | |
221 | else:\r | |
222 | #TODO: handle {min, max} repeaters\r | |
223 | raise NotImplementedError\r | |
224 | pass\r | |
225 | \r | |
226 | #add children\r | |
227 | if details_node and new_node is not None:\r | |
228 | for child in details_node.children[1:-1]:\r | |
229 | #skip '<', '>' markers\r | |
230 | reduced = reduce_tree(child, new_node)\r | |
231 | if reduced is not None:\r | |
232 | new_node.children.append(reduced)\r | |
233 | if new_node:\r | |
234 | new_node.parent = parent\r | |
235 | return new_node\r | |
236 | \r | |
237 | \r | |
238 | def get_characteristic_subpattern(subpatterns):\r | |
239 | """Picks the most characteristic from a list of linear patterns\r | |
240 | Current order used is:\r | |
241 | names > common_names > common_chars\r | |
242 | """\r | |
243 | if not isinstance(subpatterns, list):\r | |
244 | return subpatterns\r | |
245 | if len(subpatterns)==1:\r | |
246 | return subpatterns[0]\r | |
247 | \r | |
248 | # first pick out the ones containing variable names\r | |
249 | subpatterns_with_names = []\r | |
250 | subpatterns_with_common_names = []\r | |
251 | common_names = ['in', 'for', 'if' , 'not', 'None']\r | |
252 | subpatterns_with_common_chars = []\r | |
253 | common_chars = "[]().,:"\r | |
254 | for subpattern in subpatterns:\r | |
255 | if any(rec_test(subpattern, lambda x: type(x) is str)):\r | |
256 | if any(rec_test(subpattern,\r | |
257 | lambda x: isinstance(x, str) and x in common_chars)):\r | |
258 | subpatterns_with_common_chars.append(subpattern)\r | |
259 | elif any(rec_test(subpattern,\r | |
260 | lambda x: isinstance(x, str) and x in common_names)):\r | |
261 | subpatterns_with_common_names.append(subpattern)\r | |
262 | \r | |
263 | else:\r | |
264 | subpatterns_with_names.append(subpattern)\r | |
265 | \r | |
266 | if subpatterns_with_names:\r | |
267 | subpatterns = subpatterns_with_names\r | |
268 | elif subpatterns_with_common_names:\r | |
269 | subpatterns = subpatterns_with_common_names\r | |
270 | elif subpatterns_with_common_chars:\r | |
271 | subpatterns = subpatterns_with_common_chars\r | |
272 | # of the remaining subpatterns pick out the longest one\r | |
273 | return max(subpatterns, key=len)\r | |
274 | \r | |
275 | def rec_test(sequence, test_func):\r | |
276 | """Tests test_func on all items of sequence and items of included\r | |
277 | sub-iterables"""\r | |
278 | for x in sequence:\r | |
279 | if isinstance(x, (list, tuple)):\r | |
280 | for y in rec_test(x, test_func):\r | |
281 | yield y\r | |
282 | else:\r | |
283 | yield test_func(x)\r |