]> git.proxmox.com Git - cargo.git/blob - vendor/regex-syntax/src/hir/interval.rs
New upstream version 0.52.0
[cargo.git] / vendor / regex-syntax / src / hir / interval.rs
1 use std::char;
2 use std::cmp;
3 use std::fmt::Debug;
4 use std::slice;
5 use std::u8;
6
7 use crate::unicode;
8
9 // This module contains an *internal* implementation of interval sets.
10 //
11 // The primary invariant that interval sets guard is canonical ordering. That
12 // is, every interval set contains an ordered sequence of intervals where
13 // no two intervals are overlapping or adjacent. While this invariant is
14 // occasionally broken within the implementation, it should be impossible for
15 // callers to observe it.
16 //
17 // Since case folding (as implemented below) breaks that invariant, we roll
18 // that into this API even though it is a little out of place in an otherwise
19 // generic interval set. (Hence the reason why the `unicode` module is imported
20 // here.)
21 //
22 // Some of the implementation complexity here is a result of me wanting to
23 // preserve the sequential representation without using additional memory.
24 // In many cases, we do use linear extra memory, but it is at most 2x and it
25 // is amortized. If we relaxed the memory requirements, this implementation
26 // could become much simpler. The extra memory is honestly probably OK, but
27 // character classes (especially of the Unicode variety) can become quite
28 // large, and it would be nice to keep regex compilation snappy even in debug
29 // builds. (In the past, I have been careless with this area of code and it has
30 // caused slow regex compilations in debug mode, so this isn't entirely
31 // unwarranted.)
32 //
33 // Tests on this are relegated to the public API of HIR in src/hir.rs.
34
/// A set of intervals, stored as a flat vector.
///
/// The methods on this type keep `ranges` sorted with no two intervals
/// overlapping or adjacent (the "canonical ordering" described at the top of
/// this module). Equality is the derived, element-wise comparison of the
/// underlying vectors.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct IntervalSet<I> {
    /// The intervals making up this set.
    ranges: Vec<I>,
}
39
40 impl<I: Interval> IntervalSet<I> {
41 /// Create a new set from a sequence of intervals. Each interval is
42 /// specified as a pair of bounds, where both bounds are inclusive.
43 ///
44 /// The given ranges do not need to be in any specific order, and ranges
45 /// may overlap.
46 pub fn new<T: IntoIterator<Item = I>>(intervals: T) -> IntervalSet<I> {
47 let mut set = IntervalSet { ranges: intervals.into_iter().collect() };
48 set.canonicalize();
49 set
50 }
51
52 /// Add a new interval to this set.
53 pub fn push(&mut self, interval: I) {
54 // TODO: This could be faster. e.g., Push the interval such that
55 // it preserves canonicalization.
56 self.ranges.push(interval);
57 self.canonicalize();
58 }
59
60 /// Return an iterator over all intervals in this set.
61 ///
62 /// The iterator yields intervals in ascending order.
63 pub fn iter(&self) -> IntervalSetIter<'_, I> {
64 IntervalSetIter(self.ranges.iter())
65 }
66
67 /// Return an immutable slice of intervals in this set.
68 ///
69 /// The sequence returned is in canonical ordering.
70 pub fn intervals(&self) -> &[I] {
71 &self.ranges
72 }
73
74 /// Expand this interval set such that it contains all case folded
75 /// characters. For example, if this class consists of the range `a-z`,
76 /// then applying case folding will result in the class containing both the
77 /// ranges `a-z` and `A-Z`.
78 ///
79 /// This returns an error if the necessary case mapping data is not
80 /// available.
81 pub fn case_fold_simple(&mut self) -> Result<(), unicode::CaseFoldError> {
82 let len = self.ranges.len();
83 for i in 0..len {
84 let range = self.ranges[i];
85 if let Err(err) = range.case_fold_simple(&mut self.ranges) {
86 self.canonicalize();
87 return Err(err);
88 }
89 }
90 self.canonicalize();
91 Ok(())
92 }
93
94 /// Union this set with the given set, in place.
95 pub fn union(&mut self, other: &IntervalSet<I>) {
96 // This could almost certainly be done more efficiently.
97 self.ranges.extend(&other.ranges);
98 self.canonicalize();
99 }
100
101 /// Intersect this set with the given set, in place.
102 pub fn intersect(&mut self, other: &IntervalSet<I>) {
103 if self.ranges.is_empty() {
104 return;
105 }
106 if other.ranges.is_empty() {
107 self.ranges.clear();
108 return;
109 }
110
111 // There should be a way to do this in-place with constant memory,
112 // but I couldn't figure out a simple way to do it. So just append
113 // the intersection to the end of this range, and then drain it before
114 // we're done.
115 let drain_end = self.ranges.len();
116
117 let mut ita = (0..drain_end).into_iter();
118 let mut itb = (0..other.ranges.len()).into_iter();
119 let mut a = ita.next().unwrap();
120 let mut b = itb.next().unwrap();
121 loop {
122 if let Some(ab) = self.ranges[a].intersect(&other.ranges[b]) {
123 self.ranges.push(ab);
124 }
125 let (it, aorb) =
126 if self.ranges[a].upper() < other.ranges[b].upper() {
127 (&mut ita, &mut a)
128 } else {
129 (&mut itb, &mut b)
130 };
131 match it.next() {
132 Some(v) => *aorb = v,
133 None => break,
134 }
135 }
136 self.ranges.drain(..drain_end);
137 }
138
139 /// Subtract the given set from this set, in place.
140 pub fn difference(&mut self, other: &IntervalSet<I>) {
141 if self.ranges.is_empty() || other.ranges.is_empty() {
142 return;
143 }
144
145 // This algorithm is (to me) surprisingly complex. A search of the
146 // interwebs indicate that this is a potentially interesting problem.
147 // Folks seem to suggest interval or segment trees, but I'd like to
148 // avoid the overhead (both runtime and conceptual) of that.
149 //
150 // The following is basically my Shitty First Draft. Therefore, in
151 // order to grok it, you probably need to read each line carefully.
152 // Simplifications are most welcome!
153 //
154 // Remember, we can assume the canonical format invariant here, which
155 // says that all ranges are sorted, not overlapping and not adjacent in
156 // each class.
157 let drain_end = self.ranges.len();
158 let (mut a, mut b) = (0, 0);
159 'LOOP: while a < drain_end && b < other.ranges.len() {
160 // Basically, the easy cases are when neither range overlaps with
161 // each other. If the `b` range is less than our current `a`
162 // range, then we can skip it and move on.
163 if other.ranges[b].upper() < self.ranges[a].lower() {
164 b += 1;
165 continue;
166 }
167 // ... similarly for the `a` range. If it's less than the smallest
168 // `b` range, then we can add it as-is.
169 if self.ranges[a].upper() < other.ranges[b].lower() {
170 let range = self.ranges[a];
171 self.ranges.push(range);
172 a += 1;
173 continue;
174 }
175 // Otherwise, we have overlapping ranges.
176 assert!(!self.ranges[a].is_intersection_empty(&other.ranges[b]));
177
178 // This part is tricky and was non-obvious to me without looking
179 // at explicit examples (see the tests). The trickiness stems from
180 // two things: 1) subtracting a range from another range could
181 // yield two ranges and 2) after subtracting a range, it's possible
182 // that future ranges can have an impact. The loop below advances
183 // the `b` ranges until they can't possible impact the current
184 // range.
185 //
186 // For example, if our `a` range is `a-t` and our next three `b`
187 // ranges are `a-c`, `g-i`, `r-t` and `x-z`, then we need to apply
188 // subtraction three times before moving on to the next `a` range.
189 let mut range = self.ranges[a];
190 while b < other.ranges.len()
191 && !range.is_intersection_empty(&other.ranges[b])
192 {
193 let old_range = range;
194 range = match range.difference(&other.ranges[b]) {
195 (None, None) => {
196 // We lost the entire range, so move on to the next
197 // without adding this one.
198 a += 1;
199 continue 'LOOP;
200 }
201 (Some(range1), None) | (None, Some(range1)) => range1,
202 (Some(range1), Some(range2)) => {
203 self.ranges.push(range1);
204 range2
205 }
206 };
207 // It's possible that the `b` range has more to contribute
208 // here. In particular, if it is greater than the original
209 // range, then it might impact the next `a` range *and* it
210 // has impacted the current `a` range as much as possible,
211 // so we can quit. We don't bump `b` so that the next `a`
212 // range can apply it.
213 if other.ranges[b].upper() > old_range.upper() {
214 break;
215 }
216 // Otherwise, the next `b` range might apply to the current
217 // `a` range.
218 b += 1;
219 }
220 self.ranges.push(range);
221 a += 1;
222 }
223 while a < drain_end {
224 let range = self.ranges[a];
225 self.ranges.push(range);
226 a += 1;
227 }
228 self.ranges.drain(..drain_end);
229 }
230
231 /// Compute the symmetric difference of the two sets, in place.
232 ///
233 /// This computes the symmetric difference of two interval sets. This
234 /// removes all elements in this set that are also in the given set,
235 /// but also adds all elements from the given set that aren't in this
236 /// set. That is, the set will contain all elements in either set,
237 /// but will not contain any elements that are in both sets.
238 pub fn symmetric_difference(&mut self, other: &IntervalSet<I>) {
239 // TODO(burntsushi): Fix this so that it amortizes allocation.
240 let mut intersection = self.clone();
241 intersection.intersect(other);
242 self.union(other);
243 self.difference(&intersection);
244 }
245
246 /// Negate this interval set.
247 ///
248 /// For all `x` where `x` is any element, if `x` was in this set, then it
249 /// will not be in this set after negation.
250 pub fn negate(&mut self) {
251 if self.ranges.is_empty() {
252 let (min, max) = (I::Bound::min_value(), I::Bound::max_value());
253 self.ranges.push(I::create(min, max));
254 return;
255 }
256
257 // There should be a way to do this in-place with constant memory,
258 // but I couldn't figure out a simple way to do it. So just append
259 // the negation to the end of this range, and then drain it before
260 // we're done.
261 let drain_end = self.ranges.len();
262
263 // We do checked arithmetic below because of the canonical ordering
264 // invariant.
265 if self.ranges[0].lower() > I::Bound::min_value() {
266 let upper = self.ranges[0].lower().decrement();
267 self.ranges.push(I::create(I::Bound::min_value(), upper));
268 }
269 for i in 1..drain_end {
270 let lower = self.ranges[i - 1].upper().increment();
271 let upper = self.ranges[i].lower().decrement();
272 self.ranges.push(I::create(lower, upper));
273 }
274 if self.ranges[drain_end - 1].upper() < I::Bound::max_value() {
275 let lower = self.ranges[drain_end - 1].upper().increment();
276 self.ranges.push(I::create(lower, I::Bound::max_value()));
277 }
278 self.ranges.drain(..drain_end);
279 }
280
281 /// Converts this set into a canonical ordering.
282 fn canonicalize(&mut self) {
283 if self.is_canonical() {
284 return;
285 }
286 self.ranges.sort();
287 assert!(!self.ranges.is_empty());
288
289 // Is there a way to do this in-place with constant memory? I couldn't
290 // figure out a way to do it. So just append the canonicalization to
291 // the end of this range, and then drain it before we're done.
292 let drain_end = self.ranges.len();
293 for oldi in 0..drain_end {
294 // If we've added at least one new range, then check if we can
295 // merge this range in the previously added range.
296 if self.ranges.len() > drain_end {
297 let (last, rest) = self.ranges.split_last_mut().unwrap();
298 if let Some(union) = last.union(&rest[oldi]) {
299 *last = union;
300 continue;
301 }
302 }
303 let range = self.ranges[oldi];
304 self.ranges.push(range);
305 }
306 self.ranges.drain(..drain_end);
307 }
308
309 /// Returns true if and only if this class is in a canonical ordering.
310 fn is_canonical(&self) -> bool {
311 for pair in self.ranges.windows(2) {
312 if pair[0] >= pair[1] {
313 return false;
314 }
315 if pair[0].is_contiguous(&pair[1]) {
316 return false;
317 }
318 }
319 true
320 }
321 }
322
/// An iterator over intervals.
///
/// This is a thin wrapper around a slice iterator over the set's intervals.
#[derive(Debug)]
pub struct IntervalSetIter<'a, I>(slice::Iter<'a, I>);

impl<'a, I> Iterator for IntervalSetIter<'a, I> {
    type Item = &'a I;

    fn next(&mut self) -> Option<&'a I> {
        self.0.next()
    }

    /// Forwards the exact bounds of the wrapped slice iterator so that
    /// consumers such as `collect` and `extend` can size their buffers
    /// instead of seeing the default `(0, None)`.
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.0.size_hint()
    }
}
334
335 pub trait Interval:
336 Clone + Copy + Debug + Default + Eq + PartialEq + PartialOrd + Ord
337 {
338 type Bound: Bound;
339
340 fn lower(&self) -> Self::Bound;
341 fn upper(&self) -> Self::Bound;
342 fn set_lower(&mut self, bound: Self::Bound);
343 fn set_upper(&mut self, bound: Self::Bound);
344 fn case_fold_simple(
345 &self,
346 intervals: &mut Vec<Self>,
347 ) -> Result<(), unicode::CaseFoldError>;
348
349 /// Create a new interval.
350 fn create(lower: Self::Bound, upper: Self::Bound) -> Self {
351 let mut int = Self::default();
352 if lower <= upper {
353 int.set_lower(lower);
354 int.set_upper(upper);
355 } else {
356 int.set_lower(upper);
357 int.set_upper(lower);
358 }
359 int
360 }
361
362 /// Union the given overlapping range into this range.
363 ///
364 /// If the two ranges aren't contiguous, then this returns `None`.
365 fn union(&self, other: &Self) -> Option<Self> {
366 if !self.is_contiguous(other) {
367 return None;
368 }
369 let lower = cmp::min(self.lower(), other.lower());
370 let upper = cmp::max(self.upper(), other.upper());
371 Some(Self::create(lower, upper))
372 }
373
374 /// Intersect this range with the given range and return the result.
375 ///
376 /// If the intersection is empty, then this returns `None`.
377 fn intersect(&self, other: &Self) -> Option<Self> {
378 let lower = cmp::max(self.lower(), other.lower());
379 let upper = cmp::min(self.upper(), other.upper());
380 if lower <= upper {
381 Some(Self::create(lower, upper))
382 } else {
383 None
384 }
385 }
386
387 /// Subtract the given range from this range and return the resulting
388 /// ranges.
389 ///
390 /// If subtraction would result in an empty range, then no ranges are
391 /// returned.
392 fn difference(&self, other: &Self) -> (Option<Self>, Option<Self>) {
393 if self.is_subset(other) {
394 return (None, None);
395 }
396 if self.is_intersection_empty(other) {
397 return (Some(self.clone()), None);
398 }
399 let add_lower = other.lower() > self.lower();
400 let add_upper = other.upper() < self.upper();
401 // We know this because !self.is_subset(other) and the ranges have
402 // a non-empty intersection.
403 assert!(add_lower || add_upper);
404 let mut ret = (None, None);
405 if add_lower {
406 let upper = other.lower().decrement();
407 ret.0 = Some(Self::create(self.lower(), upper));
408 }
409 if add_upper {
410 let lower = other.upper().increment();
411 let range = Self::create(lower, self.upper());
412 if ret.0.is_none() {
413 ret.0 = Some(range);
414 } else {
415 ret.1 = Some(range);
416 }
417 }
418 ret
419 }
420
421 /// Compute the symmetric difference the given range from this range. This
422 /// returns the union of the two ranges minus its intersection.
423 fn symmetric_difference(
424 &self,
425 other: &Self,
426 ) -> (Option<Self>, Option<Self>) {
427 let union = match self.union(other) {
428 None => return (Some(self.clone()), Some(other.clone())),
429 Some(union) => union,
430 };
431 let intersection = match self.intersect(other) {
432 None => return (Some(self.clone()), Some(other.clone())),
433 Some(intersection) => intersection,
434 };
435 union.difference(&intersection)
436 }
437
438 /// Returns true if and only if the two ranges are contiguous. Two ranges
439 /// are contiguous if and only if the ranges are either overlapping or
440 /// adjacent.
441 fn is_contiguous(&self, other: &Self) -> bool {
442 let lower1 = self.lower().as_u32();
443 let upper1 = self.upper().as_u32();
444 let lower2 = other.lower().as_u32();
445 let upper2 = other.upper().as_u32();
446 cmp::max(lower1, lower2) <= cmp::min(upper1, upper2).saturating_add(1)
447 }
448
449 /// Returns true if and only if the intersection of this range and the
450 /// other range is empty.
451 fn is_intersection_empty(&self, other: &Self) -> bool {
452 let (lower1, upper1) = (self.lower(), self.upper());
453 let (lower2, upper2) = (other.lower(), other.upper());
454 cmp::max(lower1, lower2) > cmp::min(upper1, upper2)
455 }
456
457 /// Returns true if and only if this range is a subset of the other range.
458 fn is_subset(&self, other: &Self) -> bool {
459 let (lower1, upper1) = (self.lower(), self.upper());
460 let (lower2, upper2) = (other.lower(), other.upper());
461 (lower2 <= lower1 && lower1 <= upper2)
462 && (lower2 <= upper1 && upper1 <= upper2)
463 }
464 }
465
/// An endpoint type for intervals.
///
/// A bound is a totally ordered, copyable value with a smallest and largest
/// element and a way to step to the neighbouring value in either direction.
pub trait Bound:
    Clone + Copy + Debug + PartialEq + Eq + PartialOrd + Ord
{
    /// Returns the smallest value of this bound type.
    fn min_value() -> Self;
    /// Returns the largest value of this bound type.
    fn max_value() -> Self;
    /// Returns this bound as a `u32`.
    fn as_u32(self) -> u32;
    /// Returns the value immediately after this one.
    ///
    /// The implementations in this module panic when no such value exists.
    fn increment(self) -> Self;
    /// Returns the value immediately before this one.
    ///
    /// The implementations in this module panic when no such value exists.
    fn decrement(self) -> Self;
}
475
476 impl Bound for u8 {
477 fn min_value() -> Self {
478 u8::MIN
479 }
480 fn max_value() -> Self {
481 u8::MAX
482 }
483 fn as_u32(self) -> u32 {
484 self as u32
485 }
486 fn increment(self) -> Self {
487 self.checked_add(1).unwrap()
488 }
489 fn decrement(self) -> Self {
490 self.checked_sub(1).unwrap()
491 }
492 }
493
494 impl Bound for char {
495 fn min_value() -> Self {
496 '\x00'
497 }
498 fn max_value() -> Self {
499 '\u{10FFFF}'
500 }
501 fn as_u32(self) -> u32 {
502 self as u32
503 }
504
505 fn increment(self) -> Self {
506 match self {
507 '\u{D7FF}' => '\u{E000}',
508 c => char::from_u32((c as u32).checked_add(1).unwrap()).unwrap(),
509 }
510 }
511
512 fn decrement(self) -> Self {
513 match self {
514 '\u{E000}' => '\u{D7FF}',
515 c => char::from_u32((c as u32).checked_sub(1).unwrap()).unwrap(),
516 }
517 }
518 }
519
520 // Tests for interval sets are written in src/hir.rs against the public API.