]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // string_token.hpp |
2 | // Copyright (c) 2007-2009 Ben Hanson (http://www.benhanson.net/) | |
3 | // | |
4 | // Distributed under the Boost Software License, Version 1.0. (See accompanying | |
5 | // file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) | |
6 | #ifndef BOOST_LEXER_STRING_TOKEN_HPP | |
7 | #define BOOST_LEXER_STRING_TOKEN_HPP | |
8 | ||
9 | #include <algorithm> | |
10 | #include "size_t.hpp" | |
11 | #include "consts.hpp" // num_chars, num_wchar_ts | |
12 | #include <string> | |
13 | #include <limits> | |
14 | ||
15 | namespace boost | |
16 | { | |
17 | namespace lexer | |
18 | { | |
19 | template<typename CharT> | |
20 | struct basic_string_token | |
21 | { | |
22 | typedef std::basic_string<CharT> string; | |
23 | ||
24 | bool _negated; | |
25 | string _charset; | |
26 | ||
27 | basic_string_token () : | |
28 | _negated (false) | |
29 | { | |
30 | } | |
31 | ||
32 | basic_string_token (const bool negated_, const string &charset_) : | |
33 | _negated (negated_), | |
34 | _charset (charset_) | |
35 | { | |
36 | } | |
37 | ||
38 | void remove_duplicates () | |
39 | { | |
40 | const CharT *start_ = _charset.c_str (); | |
41 | const CharT *end_ = start_ + _charset.size (); | |
42 | ||
43 | // Optimisation for very large charsets: | |
44 | // sorting via pointers is much quicker than | |
45 | // via iterators... | |
46 | std::sort (const_cast<CharT *> (start_), const_cast<CharT *> (end_)); | |
47 | _charset.erase (std::unique (_charset.begin (), _charset.end ()), | |
48 | _charset.end ()); | |
49 | } | |
50 | ||
51 | void normalise () | |
52 | { | |
53 | const std::size_t max_chars_ = sizeof (CharT) == 1 ? | |
54 | num_chars : num_wchar_ts; | |
55 | ||
56 | if (_charset.length () == max_chars_) | |
57 | { | |
58 | _negated = !_negated; | |
59 | _charset.clear (); | |
60 | } | |
61 | else if (_charset.length () > max_chars_ / 2) | |
62 | { | |
63 | negate (); | |
64 | } | |
65 | } | |
66 | ||
67 | void negate () | |
68 | { | |
69 | const std::size_t max_chars_ = sizeof (CharT) == 1 ? | |
70 | num_chars : num_wchar_ts; | |
71 | CharT curr_char_ = (std::numeric_limits<CharT>::min)(); | |
72 | string temp_; | |
73 | const CharT *curr_ = _charset.c_str (); | |
74 | const CharT *chars_end_ = curr_ + _charset.size (); | |
75 | ||
76 | _negated = !_negated; | |
77 | temp_.resize (max_chars_ - _charset.size ()); | |
78 | ||
79 | CharT *ptr_ = const_cast<CharT *> (temp_.c_str ()); | |
80 | std::size_t i_ = 0; | |
81 | ||
82 | while (curr_ < chars_end_) | |
83 | { | |
84 | while (*curr_ > curr_char_) | |
85 | { | |
86 | *ptr_ = curr_char_; | |
87 | ++ptr_; | |
88 | ++curr_char_; | |
89 | ++i_; | |
90 | } | |
91 | ||
92 | ++curr_char_; | |
93 | ++curr_; | |
94 | ++i_; | |
95 | } | |
96 | ||
97 | for (; i_ < max_chars_; ++i_) | |
98 | { | |
99 | *ptr_ = curr_char_; | |
100 | ++ptr_; | |
101 | ++curr_char_; | |
102 | } | |
103 | ||
104 | _charset = temp_; | |
105 | } | |
106 | ||
107 | bool operator < (const basic_string_token &rhs_) const | |
108 | { | |
109 | return _negated < rhs_._negated || | |
110 | (_negated == rhs_._negated && _charset < rhs_._charset); | |
111 | } | |
112 | ||
113 | bool empty () const | |
114 | { | |
115 | return _charset.empty () && !_negated; | |
116 | } | |
117 | ||
118 | bool any () const | |
119 | { | |
120 | return _charset.empty () && _negated; | |
121 | } | |
122 | ||
123 | void clear () | |
124 | { | |
125 | _negated = false; | |
126 | _charset.clear (); | |
127 | } | |
128 | ||
129 | void intersect (basic_string_token &rhs_, basic_string_token &overlap_) | |
130 | { | |
131 | if ((any () && rhs_.any ()) || (_negated == rhs_._negated && | |
132 | !any () && !rhs_.any ())) | |
133 | { | |
134 | intersect_same_types (rhs_, overlap_); | |
135 | } | |
136 | else | |
137 | { | |
138 | intersect_diff_types (rhs_, overlap_); | |
139 | } | |
140 | } | |
141 | ||
142 | static void escape_char (const CharT ch_, string &out_) | |
143 | { | |
144 | switch (ch_) | |
145 | { | |
146 | case '\0': | |
147 | out_ += '\\'; | |
148 | out_ += '0'; | |
149 | break; | |
150 | case '\a': | |
151 | out_ += '\\'; | |
152 | out_ += 'a'; | |
153 | break; | |
154 | case '\b': | |
155 | out_ += '\\'; | |
156 | out_ += 'b'; | |
157 | break; | |
158 | case 27: | |
159 | out_ += '\\'; | |
160 | out_ += 'x'; | |
161 | out_ += '1'; | |
162 | out_ += 'b'; | |
163 | break; | |
164 | case '\f': | |
165 | out_ += '\\'; | |
166 | out_ += 'f'; | |
167 | break; | |
168 | case '\n': | |
169 | out_ += '\\'; | |
170 | out_ += 'n'; | |
171 | break; | |
172 | case '\r': | |
173 | out_ += '\\'; | |
174 | out_ += 'r'; | |
175 | break; | |
176 | case '\t': | |
177 | out_ += '\\'; | |
178 | out_ += 't'; | |
179 | break; | |
180 | case '\v': | |
181 | out_ += '\\'; | |
182 | out_ += 'v'; | |
183 | break; | |
184 | case '\\': | |
185 | out_ += '\\'; | |
186 | out_ += '\\'; | |
187 | break; | |
188 | case '"': | |
189 | out_ += '\\'; | |
190 | out_ += '"'; | |
191 | break; | |
192 | case '\'': | |
193 | out_ += '\\'; | |
194 | out_ += '\''; | |
195 | break; | |
196 | default: | |
197 | { | |
198 | if (ch_ < 32 && ch_ >= 0) | |
199 | { | |
200 | std::basic_stringstream<CharT> ss_; | |
201 | ||
202 | out_ += '\\'; | |
203 | out_ += 'x'; | |
204 | ss_ << std::hex << | |
205 | static_cast<std::size_t> (ch_); | |
206 | out_ += ss_.str (); | |
207 | } | |
208 | else | |
209 | { | |
210 | out_ += ch_; | |
211 | } | |
212 | ||
213 | break; | |
214 | } | |
215 | } | |
216 | } | |
217 | ||
218 | private: | |
219 | void intersect_same_types (basic_string_token &rhs_, | |
220 | basic_string_token &overlap_) | |
221 | { | |
222 | if (any ()) | |
223 | { | |
224 | clear (); | |
225 | overlap_._negated = true; | |
226 | rhs_.clear (); | |
227 | } | |
228 | else | |
229 | { | |
230 | typename string::iterator iter_ = _charset.begin (); | |
231 | typename string::iterator end_ = _charset.end (); | |
232 | typename string::iterator rhs_iter_ = rhs_._charset.begin (); | |
233 | typename string::iterator rhs_end_ = rhs_._charset.end (); | |
234 | ||
235 | overlap_._negated = _negated; | |
236 | ||
237 | while (iter_ != end_ && rhs_iter_ != rhs_end_) | |
238 | { | |
239 | if (*iter_ < *rhs_iter_) | |
240 | { | |
241 | ++iter_; | |
242 | } | |
243 | else if (*iter_ > *rhs_iter_) | |
244 | { | |
245 | ++rhs_iter_; | |
246 | } | |
247 | else | |
248 | { | |
249 | overlap_._charset += *iter_; | |
250 | iter_ = _charset.erase (iter_); | |
251 | end_ = _charset.end (); | |
252 | rhs_iter_ = rhs_._charset.erase (rhs_iter_); | |
253 | rhs_end_ = rhs_._charset.end (); | |
254 | } | |
255 | } | |
256 | ||
257 | if (_negated) | |
258 | { | |
259 | // duplicates already merged, so safe to merge | |
260 | // using std lib. | |
261 | ||
262 | // src, dest | |
263 | merge (_charset, overlap_._charset); | |
264 | // duplicates already merged, so safe to merge | |
265 | // using std lib. | |
266 | ||
267 | // src, dest | |
268 | merge (rhs_._charset, overlap_._charset); | |
269 | _negated = false; | |
270 | rhs_._negated = false; | |
271 | std::swap (_charset, rhs_._charset); | |
272 | normalise (); | |
273 | overlap_.normalise (); | |
274 | rhs_.normalise (); | |
275 | } | |
276 | else if (!overlap_._charset.empty ()) | |
277 | { | |
278 | normalise (); | |
279 | overlap_.normalise (); | |
280 | rhs_.normalise (); | |
281 | } | |
282 | } | |
283 | } | |
284 | ||
285 | void intersect_diff_types (basic_string_token &rhs_, | |
286 | basic_string_token &overlap_) | |
287 | { | |
288 | if (any ()) | |
289 | { | |
290 | intersect_any (rhs_, overlap_); | |
291 | } | |
292 | else if (_negated) | |
293 | { | |
294 | intersect_negated (rhs_, overlap_); | |
295 | } | |
296 | else // _negated == false | |
297 | { | |
298 | intersect_charset (rhs_, overlap_); | |
299 | } | |
300 | } | |
301 | ||
302 | void intersect_any (basic_string_token &rhs_, basic_string_token &overlap_) | |
303 | { | |
304 | if (rhs_._negated) | |
305 | { | |
306 | rhs_.intersect_negated (*this, overlap_); | |
307 | } | |
308 | else // rhs._negated == false | |
309 | { | |
310 | rhs_.intersect_charset (*this, overlap_); | |
311 | } | |
312 | } | |
313 | ||
314 | void intersect_negated (basic_string_token &rhs_, | |
315 | basic_string_token &overlap_) | |
316 | { | |
317 | if (rhs_.any ()) | |
318 | { | |
319 | overlap_._negated = true; | |
320 | overlap_._charset = _charset; | |
321 | rhs_._negated = false; | |
322 | rhs_._charset = _charset; | |
323 | clear (); | |
324 | } | |
325 | else // rhs._negated == false | |
326 | { | |
327 | rhs_.intersect_charset (*this, overlap_); | |
328 | } | |
329 | } | |
330 | ||
331 | void intersect_charset (basic_string_token &rhs_, | |
332 | basic_string_token &overlap_) | |
333 | { | |
334 | if (rhs_.any ()) | |
335 | { | |
336 | overlap_._charset = _charset; | |
337 | rhs_._negated = true; | |
338 | rhs_._charset = _charset; | |
339 | clear (); | |
340 | } | |
341 | else // rhs_._negated == true | |
342 | { | |
343 | typename string::iterator iter_ = _charset.begin (); | |
344 | typename string::iterator end_ = _charset.end (); | |
345 | typename string::iterator rhs_iter_ = rhs_._charset.begin (); | |
346 | typename string::iterator rhs_end_ = rhs_._charset.end (); | |
347 | ||
348 | while (iter_ != end_ && rhs_iter_ != rhs_end_) | |
349 | { | |
350 | if (*iter_ < *rhs_iter_) | |
351 | { | |
352 | overlap_._charset += *iter_; | |
353 | rhs_iter_ = rhs_._charset.insert (rhs_iter_, *iter_); | |
354 | ++rhs_iter_; | |
355 | rhs_end_ = rhs_._charset.end (); | |
356 | iter_ = _charset.erase (iter_); | |
357 | end_ = _charset.end (); | |
358 | } | |
359 | else if (*iter_ > *rhs_iter_) | |
360 | { | |
361 | ++rhs_iter_; | |
362 | } | |
363 | else | |
364 | { | |
365 | ++iter_; | |
366 | ++rhs_iter_; | |
367 | } | |
368 | } | |
369 | ||
370 | if (iter_ != end_) | |
371 | { | |
372 | // nothing bigger in rhs_ than iter_, | |
373 | // so safe to merge using std lib. | |
374 | string temp_ (iter_, end_); | |
375 | ||
376 | // src, dest | |
377 | merge (temp_, overlap_._charset); | |
378 | _charset.erase (iter_, end_); | |
379 | } | |
380 | ||
381 | if (!overlap_._charset.empty ()) | |
382 | { | |
383 | merge (overlap_._charset, rhs_._charset); | |
384 | // possible duplicates, so check for any and erase. | |
385 | rhs_._charset.erase (std::unique (rhs_._charset.begin (), | |
386 | rhs_._charset.end ()), rhs_._charset.end ()); | |
387 | normalise (); | |
388 | overlap_.normalise (); | |
389 | rhs_.normalise (); | |
390 | } | |
391 | } | |
392 | } | |
393 | ||
394 | void merge (string &src_, string &dest_) | |
395 | { | |
396 | string tmp_ (src_.size () + dest_.size (), 0); | |
397 | ||
398 | std::merge (src_.begin (), src_.end (), dest_.begin (), dest_.end (), | |
399 | tmp_.begin ()); | |
400 | dest_ = tmp_; | |
401 | } | |
402 | }; | |
403 | } | |
404 | } | |
405 | ||
406 | #endif |