2 use std
::cmp
::Ordering
;
9 use literal
::LiteralSearcher
;
10 use prog
::InstEmptyLook
;
11 use utf8
::{decode_last_utf8, decode_utf8}
;
13 /// Represents a location in the input.
14 #[derive(Clone, Copy, Debug)]
23 /// Returns true iff this position is at the beginning of the input.
24 pub fn is_start(&self) -> bool
{
28 /// Returns true iff this position is past the end of the input.
29 pub fn is_end(&self) -> bool
{
30 self.c
.is_none() && self.byte
.is_none()
33 /// Returns the character at this position.
35 /// If this position is just before or after the input, then an absent
36 /// character is returned.
37 pub fn char(&self) -> Char
{
41 /// Returns the byte at this position.
42 pub fn byte(&self) -> Option
<u8> {
46 /// Returns the UTF-8 width of the character at this position.
47 pub fn len(&self) -> usize {
51 /// Returns whether the UTF-8 width of the character at this position
53 pub fn is_empty(&self) -> bool
{
57 /// Returns the byte offset of this position.
58 pub fn pos(&self) -> usize {
62 /// Returns the byte offset of the next position in the input.
63 pub fn next_pos(&self) -> usize {
68 /// An abstraction over input used in the matching engines.
69 pub trait Input
: fmt
::Debug
{
70 /// Return an encoding of the position at byte offset `i`.
71 fn at(&self, i
: usize) -> InputAt
;
73 /// Return the Unicode character occurring next to `at`.
75 /// If no such character could be decoded, then `Char` is absent.
76 fn next_char(&self, at
: InputAt
) -> Char
;
78 /// Return the Unicode character occurring previous to `at`.
80 /// If no such character could be decoded, then `Char` is absent.
81 fn previous_char(&self, at
: InputAt
) -> Char
;
83 /// Return true if the given empty width instruction matches at the
84 /// input position given.
85 fn is_empty_match(&self, at
: InputAt
, empty
: &InstEmptyLook
) -> bool
;
87 /// Scan the input for a matching prefix.
90 prefixes
: &LiteralSearcher
,
94 /// The number of bytes in the input.
95 fn len(&self) -> usize;
97 /// Whether the input is empty.
98 fn is_empty(&self) -> bool
{
102 /// Return the given input as a sequence of bytes.
103 fn as_bytes(&self) -> &[u8];
106 impl<'a
, T
: Input
> Input
for &'a T
{
107 fn at(&self, i
: usize) -> InputAt
{
111 fn next_char(&self, at
: InputAt
) -> Char
{
112 (**self).next_char(at
)
115 fn previous_char(&self, at
: InputAt
) -> Char
{
116 (**self).previous_char(at
)
119 fn is_empty_match(&self, at
: InputAt
, empty
: &InstEmptyLook
) -> bool
{
120 (**self).is_empty_match(at
, empty
)
125 prefixes
: &LiteralSearcher
,
127 ) -> Option
<InputAt
> {
128 (**self).prefix_at(prefixes
, at
)
131 fn len(&self) -> usize {
135 fn as_bytes(&self) -> &[u8] {
/// An input reader over characters.
///
/// This is a thin wrapper around the borrowed bytes of the text; the wrapper
/// itself owns nothing and is cheap to copy.
#[derive(Clone, Copy, Debug)]
pub struct CharInput<'t>(&'t [u8]);
144 impl<'t
> CharInput
<'t
> {
145 /// Return a new character input reader for the given string.
146 pub fn new(s
: &'t
[u8]) -> CharInput
<'t
> {
151 impl<'t
> ops
::Deref
for CharInput
<'t
> {
154 fn deref(&self) -> &[u8] {
159 impl<'t
> Input
for CharInput
<'t
> {
160 fn at(&self, i
: usize) -> InputAt
{
162 InputAt { pos: self.len(), c: None.into(), byte: None, len: 0 }
164 let c
= decode_utf8(&self[i
..]).map(|(c
, _
)| c
).into();
165 InputAt { pos: i, c: c, byte: None, len: c.len_utf8() }
169 fn next_char(&self, at
: InputAt
) -> Char
{
173 fn previous_char(&self, at
: InputAt
) -> Char
{
174 decode_last_utf8(&self[..at
.pos()]).map(|(c
, _
)| c
).into()
177 fn is_empty_match(&self, at
: InputAt
, empty
: &InstEmptyLook
) -> bool
{
178 use prog
::EmptyLook
::*;
181 let c
= self.previous_char(at
);
182 at
.pos() == 0 || c
== '
\n'
185 let c
= self.next_char(at
);
186 at
.pos() == self.len() || c
== '
\n'
188 StartText
=> at
.pos() == 0,
189 EndText
=> at
.pos() == self.len(),
191 let (c1
, c2
) = (self.previous_char(at
), self.next_char(at
));
192 c1
.is_word_char() != c2
.is_word_char()
195 let (c1
, c2
) = (self.previous_char(at
), self.next_char(at
));
196 c1
.is_word_char() == c2
.is_word_char()
198 WordBoundaryAscii
=> {
199 let (c1
, c2
) = (self.previous_char(at
), self.next_char(at
));
200 c1
.is_word_byte() != c2
.is_word_byte()
202 NotWordBoundaryAscii
=> {
203 let (c1
, c2
) = (self.previous_char(at
), self.next_char(at
));
204 c1
.is_word_byte() == c2
.is_word_byte()
211 prefixes
: &LiteralSearcher
,
213 ) -> Option
<InputAt
> {
214 prefixes
.find(&self[at
.pos()..]).map(|(s
, _
)| self.at(at
.pos() + s
))
217 fn len(&self) -> usize {
221 fn as_bytes(&self) -> &[u8] {
226 /// An input reader over bytes.
227 #[derive(Clone, Copy, Debug)]
228 pub struct ByteInput
<'t
> {
233 impl<'t
> ByteInput
<'t
> {
234 /// Return a new byte-based input reader for the given string.
235 pub fn new(text
: &'t
[u8], only_utf8
: bool
) -> ByteInput
<'t
> {
236 ByteInput { text: text, only_utf8: only_utf8 }
240 impl<'t
> ops
::Deref
for ByteInput
<'t
> {
243 fn deref(&self) -> &[u8] {
248 impl<'t
> Input
for ByteInput
<'t
> {
249 fn at(&self, i
: usize) -> InputAt
{
251 InputAt { pos: self.len(), c: None.into(), byte: None, len: 0 }
256 byte
: self.get(i
).cloned(),
262 fn next_char(&self, at
: InputAt
) -> Char
{
263 decode_utf8(&self[at
.pos()..]).map(|(c
, _
)| c
).into()
266 fn previous_char(&self, at
: InputAt
) -> Char
{
267 decode_last_utf8(&self[..at
.pos()]).map(|(c
, _
)| c
).into()
270 fn is_empty_match(&self, at
: InputAt
, empty
: &InstEmptyLook
) -> bool
{
271 use prog
::EmptyLook
::*;
274 let c
= self.previous_char(at
);
275 at
.pos() == 0 || c
== '
\n'
278 let c
= self.next_char(at
);
279 at
.pos() == self.len() || c
== '
\n'
281 StartText
=> at
.pos() == 0,
282 EndText
=> at
.pos() == self.len(),
284 let (c1
, c2
) = (self.previous_char(at
), self.next_char(at
));
285 c1
.is_word_char() != c2
.is_word_char()
288 let (c1
, c2
) = (self.previous_char(at
), self.next_char(at
));
289 c1
.is_word_char() == c2
.is_word_char()
291 WordBoundaryAscii
=> {
292 let (c1
, c2
) = (self.previous_char(at
), self.next_char(at
));
294 // If we must match UTF-8, then we can't match word
295 // boundaries at invalid UTF-8.
296 if c1
.is_none() && !at
.is_start() {
299 if c2
.is_none() && !at
.is_end() {
303 c1
.is_word_byte() != c2
.is_word_byte()
305 NotWordBoundaryAscii
=> {
306 let (c1
, c2
) = (self.previous_char(at
), self.next_char(at
));
308 // If we must match UTF-8, then we can't match word
309 // boundaries at invalid UTF-8.
310 if c1
.is_none() && !at
.is_start() {
313 if c2
.is_none() && !at
.is_end() {
317 c1
.is_word_byte() == c2
.is_word_byte()
324 prefixes
: &LiteralSearcher
,
326 ) -> Option
<InputAt
> {
327 prefixes
.find(&self[at
.pos()..]).map(|(s
, _
)| self.at(at
.pos() + s
))
330 fn len(&self) -> usize {
334 fn as_bytes(&self) -> &[u8] {
/// An inline representation of `Option<char>`.
///
/// This eliminates the need to do case analysis on `Option<char>` to determine
/// ordinality with other characters.
///
/// (The `Option<char>` is not related to encoding. Instead, it is used in the
/// matching engines to represent the boundaries just before the beginning and
/// just after the end of the input, where no character is present.)
#[derive(Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)]
pub struct Char(u32);
350 impl fmt
::Debug
for Char
{
351 fn fmt(&self, f
: &mut fmt
::Formatter
) -> fmt
::Result
{
352 match char::from_u32(self.0) {
353 None
=> write
!(f
, "Empty"),
354 Some(c
) => write
!(f
, "{:?}", c
),
360 /// Returns true iff the character is absent.
362 pub fn is_none(self) -> bool
{
366 /// Returns the length of the character's UTF-8 encoding.
368 /// If the character is absent, then `1` is returned.
370 pub fn len_utf8(self) -> usize {
371 char::from_u32(self.0).map_or(1, |c
| c
.len_utf8())
374 /// Returns true iff the character is a word character.
376 /// If the character is absent, then false is returned.
377 pub fn is_word_char(self) -> bool
{
378 // is_word_character can panic if the Unicode data for \w isn't
379 // available. However, our compiler ensures that if a Unicode word
380 // boundary is used, then the data must also be available. If it isn't,
381 // then the compiler returns an error.
382 char::from_u32(self.0).map_or(false, syntax
::is_word_character
)
385 /// Returns true iff the byte is a word byte.
387 /// If the byte is absent, then false is returned.
388 pub fn is_word_byte(self) -> bool
{
389 match char::from_u32(self.0) {
390 Some(c
) if c
<= '
\u{7F}'
=> syntax
::is_word_byte(c
as u8),
391 None
| Some(_
) => false,
396 impl From
<char> for Char
{
397 fn from(c
: char) -> Char
{
402 impl From
<Option
<char>> for Char
{
403 fn from(c
: Option
<char>) -> Char
{
404 c
.map_or(Char(u32::MAX
), |c
| c
.into())
408 impl PartialEq
<char> for Char
{
410 fn eq(&self, other
: &char) -> bool
{
411 self.0 == *other
as u32
415 impl PartialEq
<Char
> for char {
417 fn eq(&self, other
: &Char
) -> bool
{
418 *self as u32 == other
.0
422 impl PartialOrd
<char> for Char
{
424 fn partial_cmp(&self, other
: &char) -> Option
<Ordering
> {
425 self.0.partial_cmp(&(*other
as u32))
429 impl PartialOrd
<Char
> for char {
431 fn partial_cmp(&self, other
: &Char
) -> Option
<Ordering
> {
432 (*self as u32).partial_cmp(&other
.0)