]> git.proxmox.com Git - rustc.git/blob - vendor/regex-1.4.6/src/input.rs
New upstream version 1.55.0+dfsg1
[rustc.git] / vendor / regex-1.4.6 / src / input.rs
1 use std::char;
2 use std::cmp::Ordering;
3 use std::fmt;
4 use std::ops;
5 use std::u32;
6
7 use syntax;
8
9 use literal::LiteralSearcher;
10 use prog::InstEmptyLook;
11 use utf8::{decode_last_utf8, decode_utf8};
12
13 /// Represents a location in the input.
14 #[derive(Clone, Copy, Debug)]
15 pub struct InputAt {
16 pos: usize,
17 c: Char,
18 byte: Option<u8>,
19 len: usize,
20 }
21
22 impl InputAt {
23 /// Returns true iff this position is at the beginning of the input.
24 pub fn is_start(&self) -> bool {
25 self.pos == 0
26 }
27
28 /// Returns true iff this position is past the end of the input.
29 pub fn is_end(&self) -> bool {
30 self.c.is_none() && self.byte.is_none()
31 }
32
33 /// Returns the character at this position.
34 ///
35 /// If this position is just before or after the input, then an absent
36 /// character is returned.
37 pub fn char(&self) -> Char {
38 self.c
39 }
40
41 /// Returns the byte at this position.
42 pub fn byte(&self) -> Option<u8> {
43 self.byte
44 }
45
46 /// Returns the UTF-8 width of the character at this position.
47 pub fn len(&self) -> usize {
48 self.len
49 }
50
51 /// Returns whether the UTF-8 width of the character at this position
52 /// is zero.
53 pub fn is_empty(&self) -> bool {
54 self.len == 0
55 }
56
57 /// Returns the byte offset of this position.
58 pub fn pos(&self) -> usize {
59 self.pos
60 }
61
62 /// Returns the byte offset of the next position in the input.
63 pub fn next_pos(&self) -> usize {
64 self.pos + self.len
65 }
66 }
67
68 /// An abstraction over input used in the matching engines.
69 pub trait Input: fmt::Debug {
70 /// Return an encoding of the position at byte offset `i`.
71 fn at(&self, i: usize) -> InputAt;
72
73 /// Return the Unicode character occurring next to `at`.
74 ///
75 /// If no such character could be decoded, then `Char` is absent.
76 fn next_char(&self, at: InputAt) -> Char;
77
78 /// Return the Unicode character occurring previous to `at`.
79 ///
80 /// If no such character could be decoded, then `Char` is absent.
81 fn previous_char(&self, at: InputAt) -> Char;
82
83 /// Return true if the given empty width instruction matches at the
84 /// input position given.
85 fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool;
86
87 /// Scan the input for a matching prefix.
88 fn prefix_at(
89 &self,
90 prefixes: &LiteralSearcher,
91 at: InputAt,
92 ) -> Option<InputAt>;
93
94 /// The number of bytes in the input.
95 fn len(&self) -> usize;
96
97 /// Whether the input is empty.
98 fn is_empty(&self) -> bool {
99 self.len() == 0
100 }
101
102 /// Return the given input as a sequence of bytes.
103 fn as_bytes(&self) -> &[u8];
104 }
105
106 impl<'a, T: Input> Input for &'a T {
107 fn at(&self, i: usize) -> InputAt {
108 (**self).at(i)
109 }
110
111 fn next_char(&self, at: InputAt) -> Char {
112 (**self).next_char(at)
113 }
114
115 fn previous_char(&self, at: InputAt) -> Char {
116 (**self).previous_char(at)
117 }
118
119 fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
120 (**self).is_empty_match(at, empty)
121 }
122
123 fn prefix_at(
124 &self,
125 prefixes: &LiteralSearcher,
126 at: InputAt,
127 ) -> Option<InputAt> {
128 (**self).prefix_at(prefixes, at)
129 }
130
131 fn len(&self) -> usize {
132 (**self).len()
133 }
134
135 fn as_bytes(&self) -> &[u8] {
136 (**self).as_bytes()
137 }
138 }
139
140 /// An input reader over characters.
141 #[derive(Clone, Copy, Debug)]
142 pub struct CharInput<'t>(&'t [u8]);
143
144 impl<'t> CharInput<'t> {
145 /// Return a new character input reader for the given string.
146 pub fn new(s: &'t [u8]) -> CharInput<'t> {
147 CharInput(s)
148 }
149 }
150
151 impl<'t> ops::Deref for CharInput<'t> {
152 type Target = [u8];
153
154 fn deref(&self) -> &[u8] {
155 self.0
156 }
157 }
158
159 impl<'t> Input for CharInput<'t> {
160 fn at(&self, i: usize) -> InputAt {
161 if i >= self.len() {
162 InputAt { pos: self.len(), c: None.into(), byte: None, len: 0 }
163 } else {
164 let c = decode_utf8(&self[i..]).map(|(c, _)| c).into();
165 InputAt { pos: i, c: c, byte: None, len: c.len_utf8() }
166 }
167 }
168
169 fn next_char(&self, at: InputAt) -> Char {
170 at.char()
171 }
172
173 fn previous_char(&self, at: InputAt) -> Char {
174 decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into()
175 }
176
177 fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
178 use prog::EmptyLook::*;
179 match empty.look {
180 StartLine => {
181 let c = self.previous_char(at);
182 at.pos() == 0 || c == '\n'
183 }
184 EndLine => {
185 let c = self.next_char(at);
186 at.pos() == self.len() || c == '\n'
187 }
188 StartText => at.pos() == 0,
189 EndText => at.pos() == self.len(),
190 WordBoundary => {
191 let (c1, c2) = (self.previous_char(at), self.next_char(at));
192 c1.is_word_char() != c2.is_word_char()
193 }
194 NotWordBoundary => {
195 let (c1, c2) = (self.previous_char(at), self.next_char(at));
196 c1.is_word_char() == c2.is_word_char()
197 }
198 WordBoundaryAscii => {
199 let (c1, c2) = (self.previous_char(at), self.next_char(at));
200 c1.is_word_byte() != c2.is_word_byte()
201 }
202 NotWordBoundaryAscii => {
203 let (c1, c2) = (self.previous_char(at), self.next_char(at));
204 c1.is_word_byte() == c2.is_word_byte()
205 }
206 }
207 }
208
209 fn prefix_at(
210 &self,
211 prefixes: &LiteralSearcher,
212 at: InputAt,
213 ) -> Option<InputAt> {
214 prefixes.find(&self[at.pos()..]).map(|(s, _)| self.at(at.pos() + s))
215 }
216
217 fn len(&self) -> usize {
218 self.0.len()
219 }
220
221 fn as_bytes(&self) -> &[u8] {
222 self.0
223 }
224 }
225
226 /// An input reader over bytes.
227 #[derive(Clone, Copy, Debug)]
228 pub struct ByteInput<'t> {
229 text: &'t [u8],
230 only_utf8: bool,
231 }
232
233 impl<'t> ByteInput<'t> {
234 /// Return a new byte-based input reader for the given string.
235 pub fn new(text: &'t [u8], only_utf8: bool) -> ByteInput<'t> {
236 ByteInput { text: text, only_utf8: only_utf8 }
237 }
238 }
239
240 impl<'t> ops::Deref for ByteInput<'t> {
241 type Target = [u8];
242
243 fn deref(&self) -> &[u8] {
244 self.text
245 }
246 }
247
248 impl<'t> Input for ByteInput<'t> {
249 fn at(&self, i: usize) -> InputAt {
250 if i >= self.len() {
251 InputAt { pos: self.len(), c: None.into(), byte: None, len: 0 }
252 } else {
253 InputAt {
254 pos: i,
255 c: None.into(),
256 byte: self.get(i).cloned(),
257 len: 1,
258 }
259 }
260 }
261
262 fn next_char(&self, at: InputAt) -> Char {
263 decode_utf8(&self[at.pos()..]).map(|(c, _)| c).into()
264 }
265
266 fn previous_char(&self, at: InputAt) -> Char {
267 decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into()
268 }
269
270 fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
271 use prog::EmptyLook::*;
272 match empty.look {
273 StartLine => {
274 let c = self.previous_char(at);
275 at.pos() == 0 || c == '\n'
276 }
277 EndLine => {
278 let c = self.next_char(at);
279 at.pos() == self.len() || c == '\n'
280 }
281 StartText => at.pos() == 0,
282 EndText => at.pos() == self.len(),
283 WordBoundary => {
284 let (c1, c2) = (self.previous_char(at), self.next_char(at));
285 c1.is_word_char() != c2.is_word_char()
286 }
287 NotWordBoundary => {
288 let (c1, c2) = (self.previous_char(at), self.next_char(at));
289 c1.is_word_char() == c2.is_word_char()
290 }
291 WordBoundaryAscii => {
292 let (c1, c2) = (self.previous_char(at), self.next_char(at));
293 if self.only_utf8 {
294 // If we must match UTF-8, then we can't match word
295 // boundaries at invalid UTF-8.
296 if c1.is_none() && !at.is_start() {
297 return false;
298 }
299 if c2.is_none() && !at.is_end() {
300 return false;
301 }
302 }
303 c1.is_word_byte() != c2.is_word_byte()
304 }
305 NotWordBoundaryAscii => {
306 let (c1, c2) = (self.previous_char(at), self.next_char(at));
307 if self.only_utf8 {
308 // If we must match UTF-8, then we can't match word
309 // boundaries at invalid UTF-8.
310 if c1.is_none() && !at.is_start() {
311 return false;
312 }
313 if c2.is_none() && !at.is_end() {
314 return false;
315 }
316 }
317 c1.is_word_byte() == c2.is_word_byte()
318 }
319 }
320 }
321
322 fn prefix_at(
323 &self,
324 prefixes: &LiteralSearcher,
325 at: InputAt,
326 ) -> Option<InputAt> {
327 prefixes.find(&self[at.pos()..]).map(|(s, _)| self.at(at.pos() + s))
328 }
329
330 fn len(&self) -> usize {
331 self.text.len()
332 }
333
334 fn as_bytes(&self) -> &[u8] {
335 self.text
336 }
337 }
338
339 /// An inline representation of `Option<char>`.
340 ///
341 /// This eliminates the need to do case analysis on `Option<char>` to determine
342 /// ordinality with other characters.
343 ///
344 /// (The `Option<char>` is not related to encoding. Instead, it is used in the
345 /// matching engines to represent the beginning and ending boundaries of the
346 /// search text.)
347 #[derive(Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)]
348 pub struct Char(u32);
349
350 impl fmt::Debug for Char {
351 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
352 match char::from_u32(self.0) {
353 None => write!(f, "Empty"),
354 Some(c) => write!(f, "{:?}", c),
355 }
356 }
357 }
358
359 impl Char {
360 /// Returns true iff the character is absent.
361 #[inline]
362 pub fn is_none(self) -> bool {
363 self.0 == u32::MAX
364 }
365
366 /// Returns the length of the character's UTF-8 encoding.
367 ///
368 /// If the character is absent, then `1` is returned.
369 #[inline]
370 pub fn len_utf8(self) -> usize {
371 char::from_u32(self.0).map_or(1, |c| c.len_utf8())
372 }
373
374 /// Returns true iff the character is a word character.
375 ///
376 /// If the character is absent, then false is returned.
377 pub fn is_word_char(self) -> bool {
378 // is_word_character can panic if the Unicode data for \w isn't
379 // available. However, our compiler ensures that if a Unicode word
380 // boundary is used, then the data must also be available. If it isn't,
381 // then the compiler returns an error.
382 char::from_u32(self.0).map_or(false, syntax::is_word_character)
383 }
384
385 /// Returns true iff the byte is a word byte.
386 ///
387 /// If the byte is absent, then false is returned.
388 pub fn is_word_byte(self) -> bool {
389 match char::from_u32(self.0) {
390 Some(c) if c <= '\u{7F}' => syntax::is_word_byte(c as u8),
391 None | Some(_) => false,
392 }
393 }
394 }
395
396 impl From<char> for Char {
397 fn from(c: char) -> Char {
398 Char(c as u32)
399 }
400 }
401
402 impl From<Option<char>> for Char {
403 fn from(c: Option<char>) -> Char {
404 c.map_or(Char(u32::MAX), |c| c.into())
405 }
406 }
407
408 impl PartialEq<char> for Char {
409 #[inline]
410 fn eq(&self, other: &char) -> bool {
411 self.0 == *other as u32
412 }
413 }
414
415 impl PartialEq<Char> for char {
416 #[inline]
417 fn eq(&self, other: &Char) -> bool {
418 *self as u32 == other.0
419 }
420 }
421
422 impl PartialOrd<char> for Char {
423 #[inline]
424 fn partial_cmp(&self, other: &char) -> Option<Ordering> {
425 self.0.partial_cmp(&(*other as u32))
426 }
427 }
428
429 impl PartialOrd<Char> for char {
430 #[inline]
431 fn partial_cmp(&self, other: &Char) -> Option<Ordering> {
432 (*self as u32).partial_cmp(&other.0)
433 }
434 }