vendor/regex-syntax/src/hir/interval.rs

   1 use std::char;
   2 use std::cmp;
   3 use std::fmt::Debug;
   4 use std::slice;
   5 use std::u8;
   6
   7 use crate::unicode;
   8
   9 // This module contains an *internal* implementation of interval sets.
  10 //
  11 // The primary invariant that an interval set guards is canonical ordering.
  12 // That is, every interval set contains an ordered sequence of intervals where
  13 // no two intervals are overlapping or adjacent. While this invariant is
  14 // occasionally broken within the implementation, it should be impossible for
  15 // callers to observe it.
  16 //
  17 // Since case folding (as implemented below) breaks that invariant, we roll
  18 // that into this API even though it is a little out of place in an otherwise
  19 // generic interval set. (Hence the reason why the `unicode` module is imported
  20 // here.)
  21 //
  22 // Some of the implementation complexity here is a result of me wanting to
  23 // preserve the sequential representation without using additional memory.
  24 // In many cases, we do use linear extra memory, but it is at most 2x and it
  25 // is amortized. If we relaxed the memory requirements, this implementation
  26 // could become much simpler. The extra memory is honestly probably OK, but
  27 // character classes (especially of the Unicode variety) can become quite
  28 // large, and it would be nice to keep regex compilation snappy even in debug
  29 // builds. (In the past, I have been careless with this area of code and it has
  30 // caused slow regex compilations in debug mode, so this isn't entirely
  31 // unwarranted.)
  32 //
  33 // Tests on this are relegated to the public API of HIR in src/hir.rs.
  34
/// A set of intervals of type `I`.
///
/// The methods on this type keep `ranges` in canonical form between calls:
/// sorted in ascending order, with no two intervals overlapping or adjacent.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct IntervalSet<I> {
    // The intervals making up the set. Several operations temporarily append
    // their output to the end of this vector and then drain the old prefix.
    ranges: Vec<I>,
}
  39
  40 impl<I: Interval> IntervalSet<I> {
  41     /// Create a new set from a sequence of intervals. Each interval is
  42     /// specified as a pair of bounds, where both bounds are inclusive.
  43     ///
  44     /// The given ranges do not need to be in any specific order, and ranges
  45     /// may overlap.
  46     pub fn new<T: IntoIterator<Item = I>>(intervals: T) -> IntervalSet<I> {
  47         let mut set = IntervalSet { ranges: intervals.into_iter().collect() };
  48         set.canonicalize();
  49         set
  50     }
  51
    /// Add a new interval to this set.
    ///
    /// The interval is appended to the underlying vector and the whole set is
    /// then re-canonicalized, so the new interval may end up merged with
    /// existing ones.
    pub fn push(&mut self, interval: I) {
        // TODO: This could be faster, e.g., by inserting the interval at a
        // position that preserves canonicalization instead of re-running
        // the full canonicalization pass.
        self.ranges.push(interval);
        self.canonicalize();
    }
  59
  60     /// Return an iterator over all intervals in this set.
  61     ///
  62     /// The iterator yields intervals in ascending order.
  63     pub fn iter(&self) -> IntervalSetIter<'_, I> {
  64         IntervalSetIter(self.ranges.iter())
  65     }
  66
  67     /// Return an immutable slice of intervals in this set.
  68     ///
  69     /// The sequence returned is in canonical ordering.
  70     pub fn intervals(&self) -> &[I] {
  71         &self.ranges
  72     }
  73
  74     /// Expand this interval set such that it contains all case folded
  75     /// characters. For example, if this class consists of the range `a-z`,
  76     /// then applying case folding will result in the class containing both the
  77     /// ranges `a-z` and `A-Z`.
  78     ///
  79     /// This returns an error if the necessary case mapping data is not
  80     /// available.
  81     pub fn case_fold_simple(&mut self) -> Result<(), unicode::CaseFoldError> {
  82         let len = self.ranges.len();
  83         for i in 0..len {
  84             let range = self.ranges[i];
  85             if let Err(err) = range.case_fold_simple(&mut self.ranges) {
  86                 self.canonicalize();
  87                 return Err(err);
  88             }
  89         }
  90         self.canonicalize();
  91         Ok(())
  92     }
  93
  94     /// Union this set with the given set, in place.
  95     pub fn union(&mut self, other: &IntervalSet<I>) {
  96         // This could almost certainly be done more efficiently.
  97         self.ranges.extend(&other.ranges);
  98         self.canonicalize();
  99     }
 100
 101     /// Intersect this set with the given set, in place.
 102     pub fn intersect(&mut self, other: &IntervalSet<I>) {
 103         if self.ranges.is_empty() {
 104             return;
 105         }
 106         if other.ranges.is_empty() {
 107             self.ranges.clear();
 108             return;
 109         }
 110
 111         // There should be a way to do this in-place with constant memory,
 112         // but I couldn't figure out a simple way to do it. So just append
 113         // the intersection to the end of this range, and then drain it before
 114         // we're done.
 115         let drain_end = self.ranges.len();
 116
 117         let mut ita = (0..drain_end).into_iter();
 118         let mut itb = (0..other.ranges.len()).into_iter();
 119         let mut a = ita.next().unwrap();
 120         let mut b = itb.next().unwrap();
 121         loop {
 122             if let Some(ab) = self.ranges[a].intersect(&other.ranges[b]) {
 123                 self.ranges.push(ab);
 124             }
 125             let (it, aorb) =
 126                 if self.ranges[a].upper() < other.ranges[b].upper() {
 127                     (&mut ita, &mut a)
 128                 } else {
 129                     (&mut itb, &mut b)
 130                 };
 131             match it.next() {
 132                 Some(v) => *aorb = v,
 133                 None => break,
 134             }
 135         }
 136         self.ranges.drain(..drain_end);
 137     }
 138
 139     /// Subtract the given set from this set, in place.
 140     pub fn difference(&mut self, other: &IntervalSet<I>) {
 141         if self.ranges.is_empty() || other.ranges.is_empty() {
 142             return;
 143         }
 144
 145         // This algorithm is (to me) surprisingly complex. A search of the
 146         // interwebs indicate that this is a potentially interesting problem.
 147         // Folks seem to suggest interval or segment trees, but I'd like to
 148         // avoid the overhead (both runtime and conceptual) of that.
 149         //
 150         // The following is basically my Shitty First Draft. Therefore, in
 151         // order to grok it, you probably need to read each line carefully.
 152         // Simplifications are most welcome!
 153         //
 154         // Remember, we can assume the canonical format invariant here, which
 155         // says that all ranges are sorted, not overlapping and not adjacent in
 156         // each class.
 157         let drain_end = self.ranges.len();
 158         let (mut a, mut b) = (0, 0);
 159         'LOOP: while a < drain_end && b < other.ranges.len() {
 160             // Basically, the easy cases are when neither range overlaps with
 161             // each other. If the `b` range is less than our current `a`
 162             // range, then we can skip it and move on.
 163             if other.ranges[b].upper() < self.ranges[a].lower() {
 164                 b += 1;
 165                 continue;
 166             }
 167             // ... similarly for the `a` range. If it's less than the smallest
 168             // `b` range, then we can add it as-is.
 169             if self.ranges[a].upper() < other.ranges[b].lower() {
 170                 let range = self.ranges[a];
 171                 self.ranges.push(range);
 172                 a += 1;
 173                 continue;
 174             }
 175             // Otherwise, we have overlapping ranges.
 176             assert!(!self.ranges[a].is_intersection_empty(&other.ranges[b]));
 177
 178             // This part is tricky and was non-obvious to me without looking
 179             // at explicit examples (see the tests). The trickiness stems from
 180             // two things: 1) subtracting a range from another range could
 181             // yield two ranges and 2) after subtracting a range, it's possible
 182             // that future ranges can have an impact. The loop below advances
 183             // the `b` ranges until they can't possible impact the current
 184             // range.
 185             //
 186             // For example, if our `a` range is `a-t` and our next three `b`
 187             // ranges are `a-c`, `g-i`, `r-t` and `x-z`, then we need to apply
 188             // subtraction three times before moving on to the next `a` range.
 189             let mut range = self.ranges[a];
 190             while b < other.ranges.len()
 191                 && !range.is_intersection_empty(&other.ranges[b])
 192             {
 193                 let old_range = range;
 194                 range = match range.difference(&other.ranges[b]) {
 195                     (None, None) => {
 196                         // We lost the entire range, so move on to the next
 197                         // without adding this one.
 198                         a += 1;
 199                         continue 'LOOP;
 200                     }
 201                     (Some(range1), None) | (None, Some(range1)) => range1,
 202                     (Some(range1), Some(range2)) => {
 203                         self.ranges.push(range1);
 204                         range2
 205                     }
 206                 };
 207                 // It's possible that the `b` range has more to contribute
 208                 // here. In particular, if it is greater than the original
 209                 // range, then it might impact the next `a` range *and* it
 210                 // has impacted the current `a` range as much as possible,
 211                 // so we can quit. We don't bump `b` so that the next `a`
 212                 // range can apply it.
 213                 if other.ranges[b].upper() > old_range.upper() {
 214                     break;
 215                 }
 216                 // Otherwise, the next `b` range might apply to the current
 217                 // `a` range.
 218                 b += 1;
 219             }
 220             self.ranges.push(range);
 221             a += 1;
 222         }
 223         while a < drain_end {
 224             let range = self.ranges[a];
 225             self.ranges.push(range);
 226             a += 1;
 227         }
 228         self.ranges.drain(..drain_end);
 229     }
 230
 231     /// Compute the symmetric difference of the two sets, in place.
 232     ///
 233     /// This computes the symmetric difference of two interval sets. This
 234     /// removes all elements in this set that are also in the given set,
 235     /// but also adds all elements from the given set that aren't in this
 236     /// set. That is, the set will contain all elements in either set,
 237     /// but will not contain any elements that are in both sets.
 238     pub fn symmetric_difference(&mut self, other: &IntervalSet<I>) {
 239         // TODO(burntsushi): Fix this so that it amortizes allocation.
 240         let mut intersection = self.clone();
 241         intersection.intersect(other);
 242         self.union(other);
 243         self.difference(&intersection);
 244     }
 245
 246     /// Negate this interval set.
 247     ///
 248     /// For all `x` where `x` is any element, if `x` was in this set, then it
 249     /// will not be in this set after negation.
 250     pub fn negate(&mut self) {
 251         if self.ranges.is_empty() {
 252             let (min, max) = (I::Bound::min_value(), I::Bound::max_value());
 253             self.ranges.push(I::create(min, max));
 254             return;
 255         }
 256
 257         // There should be a way to do this in-place with constant memory,
 258         // but I couldn't figure out a simple way to do it. So just append
 259         // the negation to the end of this range, and then drain it before
 260         // we're done.
 261         let drain_end = self.ranges.len();
 262
 263         // We do checked arithmetic below because of the canonical ordering
 264         // invariant.
 265         if self.ranges[0].lower() > I::Bound::min_value() {
 266             let upper = self.ranges[0].lower().decrement();
 267             self.ranges.push(I::create(I::Bound::min_value(), upper));
 268         }
 269         for i in 1..drain_end {
 270             let lower = self.ranges[i - 1].upper().increment();
 271             let upper = self.ranges[i].lower().decrement();
 272             self.ranges.push(I::create(lower, upper));
 273         }
 274         if self.ranges[drain_end - 1].upper() < I::Bound::max_value() {
 275             let lower = self.ranges[drain_end - 1].upper().increment();
 276             self.ranges.push(I::create(lower, I::Bound::max_value()));
 277         }
 278         self.ranges.drain(..drain_end);
 279     }
 280
 281     /// Converts this set into a canonical ordering.
 282     fn canonicalize(&mut self) {
 283         if self.is_canonical() {
 284             return;
 285         }
 286         self.ranges.sort();
 287         assert!(!self.ranges.is_empty());
 288
 289         // Is there a way to do this in-place with constant memory? I couldn't
 290         // figure out a way to do it. So just append the canonicalization to
 291         // the end of this range, and then drain it before we're done.
 292         let drain_end = self.ranges.len();
 293         for oldi in 0..drain_end {
 294             // If we've added at least one new range, then check if we can
 295             // merge this range in the previously added range.
 296             if self.ranges.len() > drain_end {
 297                 let (last, rest) = self.ranges.split_last_mut().unwrap();
 298                 if let Some(union) = last.union(&rest[oldi]) {
 299                     *last = union;
 300                     continue;
 301                 }
 302             }
 303             let range = self.ranges[oldi];
 304             self.ranges.push(range);
 305         }
 306         self.ranges.drain(..drain_end);
 307     }
 308
 309     /// Returns true if and only if this class is in a canonical ordering.
 310     fn is_canonical(&self) -> bool {
 311         for pair in self.ranges.windows(2) {
 312             if pair[0] >= pair[1] {
 313                 return false;
 314             }
 315             if pair[0].is_contiguous(&pair[1]) {
 316                 return false;
 317             }
 318         }
 319         true
 320     }
 321 }
 322
 323 /// An iterator over intervals.
 324 #[derive(Debug)]
 325 pub struct IntervalSetIter<'a, I>(slice::Iter<'a, I>);
 326
 327 impl<'a, I> Iterator for IntervalSetIter<'a, I> {
 328     type Item = &'a I;
 329
 330     fn next(&mut self) -> Option<&'a I> {
 331         self.0.next()
 332     }
 333 }
 334
 335 pub trait Interval:
 336     Clone + Copy + Debug + Default + Eq + PartialEq + PartialOrd + Ord
 337 {
 338     type Bound: Bound;
 339
 340     fn lower(&self) -> Self::Bound;
 341     fn upper(&self) -> Self::Bound;
 342     fn set_lower(&mut self, bound: Self::Bound);
 343     fn set_upper(&mut self, bound: Self::Bound);
 344     fn case_fold_simple(
 345         &self,
 346         intervals: &mut Vec<Self>,
 347     ) -> Result<(), unicode::CaseFoldError>;
 348
 349     /// Create a new interval.
 350     fn create(lower: Self::Bound, upper: Self::Bound) -> Self {
 351         let mut int = Self::default();
 352         if lower <= upper {
 353             int.set_lower(lower);
 354             int.set_upper(upper);
 355         } else {
 356             int.set_lower(upper);
 357             int.set_upper(lower);
 358         }
 359         int
 360     }
 361
 362     /// Union the given overlapping range into this range.
 363     ///
 364     /// If the two ranges aren't contiguous, then this returns `None`.
 365     fn union(&self, other: &Self) -> Option<Self> {
 366         if !self.is_contiguous(other) {
 367             return None;
 368         }
 369         let lower = cmp::min(self.lower(), other.lower());
 370         let upper = cmp::max(self.upper(), other.upper());
 371         Some(Self::create(lower, upper))
 372     }
 373
 374     /// Intersect this range with the given range and return the result.
 375     ///
 376     /// If the intersection is empty, then this returns `None`.
 377     fn intersect(&self, other: &Self) -> Option<Self> {
 378         let lower = cmp::max(self.lower(), other.lower());
 379         let upper = cmp::min(self.upper(), other.upper());
 380         if lower <= upper {
 381             Some(Self::create(lower, upper))
 382         } else {
 383             None
 384         }
 385     }
 386
 387     /// Subtract the given range from this range and return the resulting
 388     /// ranges.
 389     ///
 390     /// If subtraction would result in an empty range, then no ranges are
 391     /// returned.
 392     fn difference(&self, other: &Self) -> (Option<Self>, Option<Self>) {
 393         if self.is_subset(other) {
 394             return (None, None);
 395         }
 396         if self.is_intersection_empty(other) {
 397             return (Some(self.clone()), None);
 398         }
 399         let add_lower = other.lower() > self.lower();
 400         let add_upper = other.upper() < self.upper();
 401         // We know this because !self.is_subset(other) and the ranges have
 402         // a non-empty intersection.
 403         assert!(add_lower || add_upper);
 404         let mut ret = (None, None);
 405         if add_lower {
 406             let upper = other.lower().decrement();
 407             ret.0 = Some(Self::create(self.lower(), upper));
 408         }
 409         if add_upper {
 410             let lower = other.upper().increment();
 411             let range = Self::create(lower, self.upper());
 412             if ret.0.is_none() {
 413                 ret.0 = Some(range);
 414             } else {
 415                 ret.1 = Some(range);
 416             }
 417         }
 418         ret
 419     }
 420
 421     /// Compute the symmetric difference the given range from this range. This
 422     /// returns the union of the two ranges minus its intersection.
 423     fn symmetric_difference(
 424         &self,
 425         other: &Self,
 426     ) -> (Option<Self>, Option<Self>) {
 427         let union = match self.union(other) {
 428             None => return (Some(self.clone()), Some(other.clone())),
 429             Some(union) => union,
 430         };
 431         let intersection = match self.intersect(other) {
 432             None => return (Some(self.clone()), Some(other.clone())),
 433             Some(intersection) => intersection,
 434         };
 435         union.difference(&intersection)
 436     }
 437
 438     /// Returns true if and only if the two ranges are contiguous. Two ranges
 439     /// are contiguous if and only if the ranges are either overlapping or
 440     /// adjacent.
 441     fn is_contiguous(&self, other: &Self) -> bool {
 442         let lower1 = self.lower().as_u32();
 443         let upper1 = self.upper().as_u32();
 444         let lower2 = other.lower().as_u32();
 445         let upper2 = other.upper().as_u32();
 446         cmp::max(lower1, lower2) <= cmp::min(upper1, upper2).saturating_add(1)
 447     }
 448
 449     /// Returns true if and only if the intersection of this range and the
 450     /// other range is empty.
 451     fn is_intersection_empty(&self, other: &Self) -> bool {
 452         let (lower1, upper1) = (self.lower(), self.upper());
 453         let (lower2, upper2) = (other.lower(), other.upper());
 454         cmp::max(lower1, lower2) > cmp::min(upper1, upper2)
 455     }
 456
 457     /// Returns true if and only if this range is a subset of the other range.
 458     fn is_subset(&self, other: &Self) -> bool {
 459         let (lower1, upper1) = (self.lower(), self.upper());
 460         let (lower2, upper2) = (other.lower(), other.upper());
 461         (lower2 <= lower1 && lower1 <= upper2)
 462             && (lower2 <= upper1 && upper1 <= upper2)
 463     }
 464 }
 465
/// An endpoint type usable in an `Interval`: an ordered type with a smallest
/// and largest value and a way to step to the neighbouring value.
pub trait Bound:
    Copy + Clone + Debug + Eq + PartialEq + PartialOrd + Ord
{
    /// Returns the smallest value of this type.
    fn min_value() -> Self;
    /// Returns the largest value of this type.
    fn max_value() -> Self;
    /// Returns this value converted to a `u32`.
    fn as_u32(self) -> u32;
    /// Returns the value immediately following this one.
    ///
    /// The implementations in this file panic when there is no such value.
    fn increment(self) -> Self;
    /// Returns the value immediately preceding this one.
    ///
    /// The implementations in this file panic when there is no such value.
    fn decrement(self) -> Self;
}
 475
 476 impl Bound for u8 {
 477     fn min_value() -> Self {
 478         u8::MIN
 479     }
 480     fn max_value() -> Self {
 481         u8::MAX
 482     }
 483     fn as_u32(self) -> u32 {
 484         self as u32
 485     }
 486     fn increment(self) -> Self {
 487         self.checked_add(1).unwrap()
 488     }
 489     fn decrement(self) -> Self {
 490         self.checked_sub(1).unwrap()
 491     }
 492 }
 493
 494 impl Bound for char {
 495     fn min_value() -> Self {
 496         '\x00'
 497     }
 498     fn max_value() -> Self {
 499         '\u{10FFFF}'
 500     }
 501     fn as_u32(self) -> u32 {
 502         self as u32
 503     }
 504
 505     fn increment(self) -> Self {
 506         match self {
 507             '\u{D7FF}' => '\u{E000}',
 508             c => char::from_u32((c as u32).checked_add(1).unwrap()).unwrap(),
 509         }
 510     }
 511
 512     fn decrement(self) -> Self {
 513         match self {
 514             '\u{E000}' => '\u{D7FF}',
 515             c => char::from_u32((c as u32).checked_sub(1).unwrap()).unwrap(),
 516         }
 517     }
 518 }
 519
 520 // Tests for interval sets are written in src/hir.rs against the public API.