vendor/regex-1.4.6/src/input.rs

   1 use std::char;
   2 use std::cmp::Ordering;
   3 use std::fmt;
   4 use std::ops;
   5 use std::u32;
   6
   7 use syntax;
   8
   9 use literal::LiteralSearcher;
  10 use prog::InstEmptyLook;
  11 use utf8::{decode_last_utf8, decode_utf8};
  12
  13 /// Represents a location in the input.
  14 #[derive(Clone, Copy, Debug)]
  15 pub struct InputAt {
  16     pos: usize,
  17     c: Char,
  18     byte: Option<u8>,
  19     len: usize,
  20 }
  21
  22 impl InputAt {
  23     /// Returns true iff this position is at the beginning of the input.
  24     pub fn is_start(&self) -> bool {
  25         self.pos == 0
  26     }
  27
  28     /// Returns true iff this position is past the end of the input.
  29     pub fn is_end(&self) -> bool {
  30         self.c.is_none() && self.byte.is_none()
  31     }
  32
  33     /// Returns the character at this position.
  34     ///
  35     /// If this position is just before or after the input, then an absent
  36     /// character is returned.
  37     pub fn char(&self) -> Char {
  38         self.c
  39     }
  40
  41     /// Returns the byte at this position.
  42     pub fn byte(&self) -> Option<u8> {
  43         self.byte
  44     }
  45
  46     /// Returns the UTF-8 width of the character at this position.
  47     pub fn len(&self) -> usize {
  48         self.len
  49     }
  50
  51     /// Returns whether the UTF-8 width of the character at this position
  52     /// is zero.
  53     pub fn is_empty(&self) -> bool {
  54         self.len == 0
  55     }
  56
  57     /// Returns the byte offset of this position.
  58     pub fn pos(&self) -> usize {
  59         self.pos
  60     }
  61
  62     /// Returns the byte offset of the next position in the input.
  63     pub fn next_pos(&self) -> usize {
  64         self.pos + self.len
  65     }
  66 }
  67
  68 /// An abstraction over input used in the matching engines.
  69 pub trait Input: fmt::Debug {
  70     /// Return an encoding of the position at byte offset `i`.
  71     fn at(&self, i: usize) -> InputAt;
  72
  73     /// Return the Unicode character occurring next to `at`.
  74     ///
  75     /// If no such character could be decoded, then `Char` is absent.
  76     fn next_char(&self, at: InputAt) -> Char;
  77
  78     /// Return the Unicode character occurring previous to `at`.
  79     ///
  80     /// If no such character could be decoded, then `Char` is absent.
  81     fn previous_char(&self, at: InputAt) -> Char;
  82
  83     /// Return true if the given empty width instruction matches at the
  84     /// input position given.
  85     fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool;
  86
  87     /// Scan the input for a matching prefix.
  88     fn prefix_at(
  89         &self,
  90         prefixes: &LiteralSearcher,
  91         at: InputAt,
  92     ) -> Option<InputAt>;
  93
  94     /// The number of bytes in the input.
  95     fn len(&self) -> usize;
  96
  97     /// Whether the input is empty.
  98     fn is_empty(&self) -> bool {
  99         self.len() == 0
 100     }
 101
 102     /// Return the given input as a sequence of bytes.
 103     fn as_bytes(&self) -> &[u8];
 104 }
 105
 106 impl<'a, T: Input> Input for &'a T {
 107     fn at(&self, i: usize) -> InputAt {
 108         (**self).at(i)
 109     }
 110
 111     fn next_char(&self, at: InputAt) -> Char {
 112         (**self).next_char(at)
 113     }
 114
 115     fn previous_char(&self, at: InputAt) -> Char {
 116         (**self).previous_char(at)
 117     }
 118
 119     fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
 120         (**self).is_empty_match(at, empty)
 121     }
 122
 123     fn prefix_at(
 124         &self,
 125         prefixes: &LiteralSearcher,
 126         at: InputAt,
 127     ) -> Option<InputAt> {
 128         (**self).prefix_at(prefixes, at)
 129     }
 130
 131     fn len(&self) -> usize {
 132         (**self).len()
 133     }
 134
 135     fn as_bytes(&self) -> &[u8] {
 136         (**self).as_bytes()
 137     }
 138 }
 139
 140 /// An input reader over characters.
 141 #[derive(Clone, Copy, Debug)]
 142 pub struct CharInput<'t>(&'t [u8]);
 143
 144 impl<'t> CharInput<'t> {
 145     /// Return a new character input reader for the given string.
 146     pub fn new(s: &'t [u8]) -> CharInput<'t> {
 147         CharInput(s)
 148     }
 149 }
 150
 151 impl<'t> ops::Deref for CharInput<'t> {
 152     type Target = [u8];
 153
 154     fn deref(&self) -> &[u8] {
 155         self.0
 156     }
 157 }
 158
 159 impl<'t> Input for CharInput<'t> {
 160     fn at(&self, i: usize) -> InputAt {
 161         if i >= self.len() {
 162             InputAt { pos: self.len(), c: None.into(), byte: None, len: 0 }
 163         } else {
 164             let c = decode_utf8(&self[i..]).map(|(c, _)| c).into();
 165             InputAt { pos: i, c: c, byte: None, len: c.len_utf8() }
 166         }
 167     }
 168
 169     fn next_char(&self, at: InputAt) -> Char {
 170         at.char()
 171     }
 172
 173     fn previous_char(&self, at: InputAt) -> Char {
 174         decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into()
 175     }
 176
 177     fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
 178         use prog::EmptyLook::*;
 179         match empty.look {
 180             StartLine => {
 181                 let c = self.previous_char(at);
 182                 at.pos() == 0 || c == '\n'
 183             }
 184             EndLine => {
 185                 let c = self.next_char(at);
 186                 at.pos() == self.len() || c == '\n'
 187             }
 188             StartText => at.pos() == 0,
 189             EndText => at.pos() == self.len(),
 190             WordBoundary => {
 191                 let (c1, c2) = (self.previous_char(at), self.next_char(at));
 192                 c1.is_word_char() != c2.is_word_char()
 193             }
 194             NotWordBoundary => {
 195                 let (c1, c2) = (self.previous_char(at), self.next_char(at));
 196                 c1.is_word_char() == c2.is_word_char()
 197             }
 198             WordBoundaryAscii => {
 199                 let (c1, c2) = (self.previous_char(at), self.next_char(at));
 200                 c1.is_word_byte() != c2.is_word_byte()
 201             }
 202             NotWordBoundaryAscii => {
 203                 let (c1, c2) = (self.previous_char(at), self.next_char(at));
 204                 c1.is_word_byte() == c2.is_word_byte()
 205             }
 206         }
 207     }
 208
 209     fn prefix_at(
 210         &self,
 211         prefixes: &LiteralSearcher,
 212         at: InputAt,
 213     ) -> Option<InputAt> {
 214         prefixes.find(&self[at.pos()..]).map(|(s, _)| self.at(at.pos() + s))
 215     }
 216
 217     fn len(&self) -> usize {
 218         self.0.len()
 219     }
 220
 221     fn as_bytes(&self) -> &[u8] {
 222         self.0
 223     }
 224 }
 225
 226 /// An input reader over bytes.
 227 #[derive(Clone, Copy, Debug)]
 228 pub struct ByteInput<'t> {
 229     text: &'t [u8],
 230     only_utf8: bool,
 231 }
 232
 233 impl<'t> ByteInput<'t> {
 234     /// Return a new byte-based input reader for the given string.
 235     pub fn new(text: &'t [u8], only_utf8: bool) -> ByteInput<'t> {
 236         ByteInput { text: text, only_utf8: only_utf8 }
 237     }
 238 }
 239
 240 impl<'t> ops::Deref for ByteInput<'t> {
 241     type Target = [u8];
 242
 243     fn deref(&self) -> &[u8] {
 244         self.text
 245     }
 246 }
 247
 248 impl<'t> Input for ByteInput<'t> {
 249     fn at(&self, i: usize) -> InputAt {
 250         if i >= self.len() {
 251             InputAt { pos: self.len(), c: None.into(), byte: None, len: 0 }
 252         } else {
 253             InputAt {
 254                 pos: i,
 255                 c: None.into(),
 256                 byte: self.get(i).cloned(),
 257                 len: 1,
 258             }
 259         }
 260     }
 261
 262     fn next_char(&self, at: InputAt) -> Char {
 263         decode_utf8(&self[at.pos()..]).map(|(c, _)| c).into()
 264     }
 265
 266     fn previous_char(&self, at: InputAt) -> Char {
 267         decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into()
 268     }
 269
 270     fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
 271         use prog::EmptyLook::*;
 272         match empty.look {
 273             StartLine => {
 274                 let c = self.previous_char(at);
 275                 at.pos() == 0 || c == '\n'
 276             }
 277             EndLine => {
 278                 let c = self.next_char(at);
 279                 at.pos() == self.len() || c == '\n'
 280             }
 281             StartText => at.pos() == 0,
 282             EndText => at.pos() == self.len(),
 283             WordBoundary => {
 284                 let (c1, c2) = (self.previous_char(at), self.next_char(at));
 285                 c1.is_word_char() != c2.is_word_char()
 286             }
 287             NotWordBoundary => {
 288                 let (c1, c2) = (self.previous_char(at), self.next_char(at));
 289                 c1.is_word_char() == c2.is_word_char()
 290             }
 291             WordBoundaryAscii => {
 292                 let (c1, c2) = (self.previous_char(at), self.next_char(at));
 293                 if self.only_utf8 {
 294                     // If we must match UTF-8, then we can't match word
 295                     // boundaries at invalid UTF-8.
 296                     if c1.is_none() && !at.is_start() {
 297                         return false;
 298                     }
 299                     if c2.is_none() && !at.is_end() {
 300                         return false;
 301                     }
 302                 }
 303                 c1.is_word_byte() != c2.is_word_byte()
 304             }
 305             NotWordBoundaryAscii => {
 306                 let (c1, c2) = (self.previous_char(at), self.next_char(at));
 307                 if self.only_utf8 {
 308                     // If we must match UTF-8, then we can't match word
 309                     // boundaries at invalid UTF-8.
 310                     if c1.is_none() && !at.is_start() {
 311                         return false;
 312                     }
 313                     if c2.is_none() && !at.is_end() {
 314                         return false;
 315                     }
 316                 }
 317                 c1.is_word_byte() == c2.is_word_byte()
 318             }
 319         }
 320     }
 321
 322     fn prefix_at(
 323         &self,
 324         prefixes: &LiteralSearcher,
 325         at: InputAt,
 326     ) -> Option<InputAt> {
 327         prefixes.find(&self[at.pos()..]).map(|(s, _)| self.at(at.pos() + s))
 328     }
 329
 330     fn len(&self) -> usize {
 331         self.text.len()
 332     }
 333
 334     fn as_bytes(&self) -> &[u8] {
 335         self.text
 336     }
 337 }
 338
 339 /// An inline representation of `Option<char>`.
 340 ///
 341 /// This eliminates the need to do case analysis on `Option<char>` to determine
 342 /// ordinality with other characters.
 343 ///
 344 /// (The `Option<char>` is not related to encoding. Instead, it is used in the
 345 /// matching engines to represent the beginning and ending boundaries of the
 346 /// search text.)
 347 #[derive(Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)]
 348 pub struct Char(u32);
 349
 350 impl fmt::Debug for Char {
 351     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
 352         match char::from_u32(self.0) {
 353             None => write!(f, "Empty"),
 354             Some(c) => write!(f, "{:?}", c),
 355         }
 356     }
 357 }
 358
 359 impl Char {
 360     /// Returns true iff the character is absent.
 361     #[inline]
 362     pub fn is_none(self) -> bool {
 363         self.0 == u32::MAX
 364     }
 365
 366     /// Returns the length of the character's UTF-8 encoding.
 367     ///
 368     /// If the character is absent, then `1` is returned.
 369     #[inline]
 370     pub fn len_utf8(self) -> usize {
 371         char::from_u32(self.0).map_or(1, |c| c.len_utf8())
 372     }
 373
 374     /// Returns true iff the character is a word character.
 375     ///
 376     /// If the character is absent, then false is returned.
 377     pub fn is_word_char(self) -> bool {
 378         // is_word_character can panic if the Unicode data for \w isn't
 379         // available. However, our compiler ensures that if a Unicode word
 380         // boundary is used, then the data must also be available. If it isn't,
 381         // then the compiler returns an error.
 382         char::from_u32(self.0).map_or(false, syntax::is_word_character)
 383     }
 384
 385     /// Returns true iff the byte is a word byte.
 386     ///
 387     /// If the byte is absent, then false is returned.
 388     pub fn is_word_byte(self) -> bool {
 389         match char::from_u32(self.0) {
 390             Some(c) if c <= '\u{7F}' => syntax::is_word_byte(c as u8),
 391             None | Some(_) => false,
 392         }
 393     }
 394 }
 395
 396 impl From<char> for Char {
 397     fn from(c: char) -> Char {
 398         Char(c as u32)
 399     }
 400 }
 401
 402 impl From<Option<char>> for Char {
 403     fn from(c: Option<char>) -> Char {
 404         c.map_or(Char(u32::MAX), |c| c.into())
 405     }
 406 }
 407
 408 impl PartialEq<char> for Char {
 409     #[inline]
 410     fn eq(&self, other: &char) -> bool {
 411         self.0 == *other as u32
 412     }
 413 }
 414
 415 impl PartialEq<Char> for char {
 416     #[inline]
 417     fn eq(&self, other: &Char) -> bool {
 418         *self as u32 == other.0
 419     }
 420 }
 421
 422 impl PartialOrd<char> for Char {
 423     #[inline]
 424     fn partial_cmp(&self, other: &char) -> Option<Ordering> {
 425         self.0.partial_cmp(&(*other as u32))
 426     }
 427 }
 428
 429 impl PartialOrd<Char> for char {
 430     #[inline]
 431     fn partial_cmp(&self, other: &Char) -> Option<Ordering> {
 432         (*self as u32).partial_cmp(&other.0)
 433     }
 434 }