]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | /* |
2 | * | |
3 | * Copyright (c) 1998-2002 | |
4 | * John Maddock | |
5 | * | |
6 | * Use, modification and distribution are subject to the | |
7 | * Boost Software License, Version 1.0. (See accompanying file | |
8 | * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) | |
9 | * | |
10 | */ | |
11 | ||
12 | /* | |
13 | * LOCATION: see http://www.boost.org for most recent version. | |
14 | * FILE states.cpp | |
15 | * VERSION see <boost/version.hpp> | |
16 | * DESCRIPTION: Declares internal state machine structures. | |
17 | */ | |
18 | ||
19 | #ifndef BOOST_REGEX_V4_STATES_HPP | |
20 | #define BOOST_REGEX_V4_STATES_HPP | |
21 | ||
22 | #ifdef BOOST_MSVC | |
23 | #pragma warning(push) | |
24 | #pragma warning(disable: 4103) | |
25 | #endif | |
26 | #ifdef BOOST_HAS_ABI_HEADERS | |
27 | # include BOOST_ABI_PREFIX | |
28 | #endif | |
29 | #ifdef BOOST_MSVC | |
30 | #pragma warning(pop) | |
31 | #endif | |
32 | ||
33 | namespace boost{ | |
34 | namespace BOOST_REGEX_DETAIL_NS{ | |
35 | ||
36 | /*** mask_type ******************************************************* | |
37 | Whenever we have a choice of two alternatives, we use an array of bytes | |
38 | to indicate which of the two alternatives it is possible to take for any | |
39 | given input character. If mask_take is set, then we can take the next | |
40 | state, and if mask_skip is set then we can take the alternative. | |
41 | ***********************************************************************/ | |
42 | enum mask_type | |
43 | { | |
44 | mask_take = 1, | |
45 | mask_skip = 2, | |
46 | mask_init = 4, | |
47 | mask_any = mask_skip | mask_take, | |
48 | mask_all = mask_any | |
49 | }; | |
50 | ||
51 | /*** helpers ********************************************************** | |
52 | These helpers let us use function overload resolution to detect whether | |
53 | we have narrow or wide character strings: | |
54 | ***********************************************************************/ | |
55 | struct _narrow_type{}; | |
56 | struct _wide_type{}; | |
57 | template <class charT> struct is_byte; | |
58 | template<> struct is_byte<char> { typedef _narrow_type width_type; }; | |
59 | template<> struct is_byte<unsigned char>{ typedef _narrow_type width_type; }; | |
60 | template<> struct is_byte<signed char> { typedef _narrow_type width_type; }; | |
61 | template <class charT> struct is_byte { typedef _wide_type width_type; }; | |
62 | ||
63 | /*** enum syntax_element_type ****************************************** | |
64 | Every record in the state machine falls into one of the following types: | |
65 | ***********************************************************************/ | |
66 | enum syntax_element_type | |
67 | { | |
68 | // start of a marked sub-expression, or perl-style (?...) extension | |
69 | syntax_element_startmark = 0, | |
70 | // end of a marked sub-expression, or perl-style (?...) extension | |
71 | syntax_element_endmark = syntax_element_startmark + 1, | |
72 | // any sequence of literal characters | |
73 | syntax_element_literal = syntax_element_endmark + 1, | |
74 | // start of line assertion: ^ | |
75 | syntax_element_start_line = syntax_element_literal + 1, | |
76 | // end of line assertion $ | |
77 | syntax_element_end_line = syntax_element_start_line + 1, | |
78 | // match any character: . | |
79 | syntax_element_wild = syntax_element_end_line + 1, | |
80 | // end of expression: we have a match when we get here | |
81 | syntax_element_match = syntax_element_wild + 1, | |
82 | // perl style word boundary: \b | |
83 | syntax_element_word_boundary = syntax_element_match + 1, | |
84 | // perl style within word boundary: \B | |
85 | syntax_element_within_word = syntax_element_word_boundary + 1, | |
86 | // start of word assertion: \< | |
87 | syntax_element_word_start = syntax_element_within_word + 1, | |
88 | // end of word assertion: \> | |
89 | syntax_element_word_end = syntax_element_word_start + 1, | |
90 | // start of buffer assertion: \` | |
91 | syntax_element_buffer_start = syntax_element_word_end + 1, | |
92 | // end of buffer assertion: \' | |
93 | syntax_element_buffer_end = syntax_element_buffer_start + 1, | |
94 | // backreference to previously matched sub-expression | |
95 | syntax_element_backref = syntax_element_buffer_end + 1, | |
96 | // either a wide character set [..] or one with multicharacter collating elements: | |
97 | syntax_element_long_set = syntax_element_backref + 1, | |
98 | // narrow character set: [...] | |
99 | syntax_element_set = syntax_element_long_set + 1, | |
100 | // jump to a new state in the machine: | |
101 | syntax_element_jump = syntax_element_set + 1, | |
102 | // choose between two production states: | |
103 | syntax_element_alt = syntax_element_jump + 1, | |
104 | // a repeat | |
105 | syntax_element_rep = syntax_element_alt + 1, | |
106 | // match a combining character sequence | |
107 | syntax_element_combining = syntax_element_rep + 1, | |
108 | // perl style soft buffer end: \z | |
109 | syntax_element_soft_buffer_end = syntax_element_combining + 1, | |
110 | // perl style continuation: \G | |
111 | syntax_element_restart_continue = syntax_element_soft_buffer_end + 1, | |
112 | // single character repeats: | |
113 | syntax_element_dot_rep = syntax_element_restart_continue + 1, | |
114 | syntax_element_char_rep = syntax_element_dot_rep + 1, | |
115 | syntax_element_short_set_rep = syntax_element_char_rep + 1, | |
116 | syntax_element_long_set_rep = syntax_element_short_set_rep + 1, | |
117 | // a backstep for lookbehind repeats: | |
118 | syntax_element_backstep = syntax_element_long_set_rep + 1, | |
119 | // an assertion that a mark was matched: | |
120 | syntax_element_assert_backref = syntax_element_backstep + 1, | |
121 | syntax_element_toggle_case = syntax_element_assert_backref + 1, | |
122 | // a recursive expression: | |
123 | syntax_element_recurse = syntax_element_toggle_case + 1, | |
124 | // Verbs: | |
125 | syntax_element_fail = syntax_element_recurse + 1, | |
126 | syntax_element_accept = syntax_element_fail + 1, | |
127 | syntax_element_commit = syntax_element_accept + 1, | |
128 | syntax_element_then = syntax_element_commit + 1 | |
129 | }; | |
130 | ||
131 | #ifdef BOOST_REGEX_DEBUG | |
132 | // dwa 09/26/00 - This is needed to suppress warnings about an ambiguous conversion | |
133 | std::ostream& operator<<(std::ostream&, syntax_element_type); | |
134 | #endif | |
135 | ||
136 | struct re_syntax_base; | |
137 | ||
138 | /*** union offset_type ************************************************ | |
139 | Points to another state in the machine. During machine construction | |
140 | we use integral offsets, but these are converted to pointers before | |
141 | execution of the machine. | |
142 | ***********************************************************************/ | |
143 | union offset_type | |
144 | { | |
145 | re_syntax_base* p; | |
146 | std::ptrdiff_t i; | |
147 | }; | |
148 | ||
149 | /*** struct re_syntax_base ******************************************** | |
150 | Base class for all states in the machine. | |
151 | ***********************************************************************/ | |
152 | struct re_syntax_base | |
153 | { | |
154 | syntax_element_type type; // what kind of state this is | |
155 | offset_type next; // next state in the machine | |
156 | }; | |
157 | ||
158 | /*** struct re_brace ************************************************** | |
159 | A marked parenthesis. | |
160 | ***********************************************************************/ | |
161 | struct re_brace : public re_syntax_base | |
162 | { | |
163 | // The index to match, can be zero (don't mark the sub-expression) | |
164 | // or negative (for perl style (?...) extentions): | |
165 | int index; | |
166 | bool icase; | |
167 | }; | |
168 | ||
169 | /*** struct re_dot ************************************************** | |
170 | Match anything. | |
171 | ***********************************************************************/ | |
172 | enum | |
173 | { | |
174 | dont_care = 1, | |
175 | force_not_newline = 0, | |
176 | force_newline = 2, | |
177 | ||
178 | test_not_newline = 2, | |
179 | test_newline = 3 | |
180 | }; | |
181 | struct re_dot : public re_syntax_base | |
182 | { | |
183 | unsigned char mask; | |
184 | }; | |
185 | ||
186 | /*** struct re_literal ************************************************ | |
187 | A string of literals, following this structure will be an | |
188 | array of characters: charT[length] | |
189 | ***********************************************************************/ | |
190 | struct re_literal : public re_syntax_base | |
191 | { | |
192 | unsigned int length; | |
193 | }; | |
194 | ||
195 | /*** struct re_case ************************************************ | |
196 | Indicates whether we are moving to a case insensive block or not | |
197 | ***********************************************************************/ | |
198 | struct re_case : public re_syntax_base | |
199 | { | |
200 | bool icase; | |
201 | }; | |
202 | ||
203 | /*** struct re_set_long *********************************************** | |
204 | A wide character set of characters, following this structure will be | |
205 | an array of type charT: | |
206 | First csingles null-terminated strings | |
207 | Then 2 * cranges NULL terminated strings | |
208 | Then cequivalents NULL terminated strings | |
209 | ***********************************************************************/ | |
210 | template <class mask_type> | |
211 | struct re_set_long : public re_syntax_base | |
212 | { | |
213 | unsigned int csingles, cranges, cequivalents; | |
214 | mask_type cclasses; | |
215 | mask_type cnclasses; | |
216 | bool isnot; | |
217 | bool singleton; | |
218 | }; | |
219 | ||
220 | /*** struct re_set **************************************************** | |
221 | A set of narrow-characters, matches any of _map which is none-zero | |
222 | ***********************************************************************/ | |
223 | struct re_set : public re_syntax_base | |
224 | { | |
225 | unsigned char _map[1 << CHAR_BIT]; | |
226 | }; | |
227 | ||
228 | /*** struct re_jump *************************************************** | |
229 | Jump to a new location in the machine (not next). | |
230 | ***********************************************************************/ | |
231 | struct re_jump : public re_syntax_base | |
232 | { | |
233 | offset_type alt; // location to jump to | |
234 | }; | |
235 | ||
236 | /*** struct re_alt *************************************************** | |
237 | Jump to a new location in the machine (possibly next). | |
238 | ***********************************************************************/ | |
239 | struct re_alt : public re_jump | |
240 | { | |
241 | unsigned char _map[1 << CHAR_BIT]; // which characters can take the jump | |
242 | unsigned int can_be_null; // true if we match a NULL string | |
243 | }; | |
244 | ||
245 | /*** struct re_repeat ************************************************* | |
246 | Repeat a section of the machine | |
247 | ***********************************************************************/ | |
248 | struct re_repeat : public re_alt | |
249 | { | |
250 | std::size_t min, max; // min and max allowable repeats | |
251 | int state_id; // Unique identifier for this repeat | |
252 | bool leading; // True if this repeat is at the start of the machine (lets us optimize some searches) | |
253 | bool greedy; // True if this is a greedy repeat | |
254 | }; | |
255 | ||
256 | /*** struct re_recurse ************************************************ | |
257 | Recurse to a particular subexpression. | |
258 | **********************************************************************/ | |
259 | struct re_recurse : public re_jump | |
260 | { | |
261 | int state_id; // identifier of first nested repeat within the recursion. | |
262 | }; | |
263 | ||
264 | /*** struct re_commit ************************************************* | |
265 | Used for the PRUNE, SKIP and COMMIT verbs which basically differ only in what happens | |
266 | if no match is found and we start searching forward. | |
267 | **********************************************************************/ | |
268 | enum commit_type | |
269 | { | |
270 | commit_prune, | |
271 | commit_skip, | |
272 | commit_commit | |
273 | }; | |
274 | struct re_commit : public re_syntax_base | |
275 | { | |
276 | commit_type action; | |
277 | }; | |
278 | ||
279 | /*** enum re_jump_size_type ******************************************* | |
280 | Provides compiled size of re_jump structure (allowing for trailing alignment). | |
281 | We provide this so we know how manybytes to insert when constructing the machine | |
282 | (The value of padding_mask is defined in regex_raw_buffer.hpp). | |
283 | ***********************************************************************/ | |
284 | enum re_jump_size_type | |
285 | { | |
286 | re_jump_size = (sizeof(re_jump) + padding_mask) & ~(padding_mask), | |
287 | re_repeater_size = (sizeof(re_repeat) + padding_mask) & ~(padding_mask), | |
288 | re_alt_size = (sizeof(re_alt) + padding_mask) & ~(padding_mask) | |
289 | }; | |
290 | ||
291 | /*** proc re_is_set_member ********************************************* | |
292 | Forward declaration: we'll need this one later... | |
293 | ***********************************************************************/ | |
294 | ||
295 | template<class charT, class traits> | |
296 | struct regex_data; | |
297 | ||
298 | template <class iterator, class charT, class traits_type, class char_classT> | |
299 | iterator BOOST_REGEX_CALL re_is_set_member(iterator next, | |
300 | iterator last, | |
301 | const re_set_long<char_classT>* set_, | |
302 | const regex_data<charT, traits_type>& e, bool icase); | |
303 | ||
304 | } // namespace BOOST_REGEX_DETAIL_NS | |
305 | ||
306 | } // namespace boost | |
307 | ||
308 | #ifdef BOOST_MSVC | |
309 | #pragma warning(push) | |
310 | #pragma warning(disable: 4103) | |
311 | #endif | |
312 | #ifdef BOOST_HAS_ABI_HEADERS | |
313 | # include BOOST_ABI_SUFFIX | |
314 | #endif | |
315 | #ifdef BOOST_MSVC | |
316 | #pragma warning(pop) | |
317 | #endif | |
318 | ||
319 | #endif | |
320 | ||
321 |