vendor/pulldown-cmark/src/scanners.rs

   1 // Copyright 2015 Google Inc. All rights reserved.
   2 //
   3 // Permission is hereby granted, free of charge, to any person obtaining a copy
   4 // of this software and associated documentation files (the "Software"), to deal
   5 // in the Software without restriction, including without limitation the rights
   6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
   7 // copies of the Software, and to permit persons to whom the Software is
   8 // furnished to do so, subject to the following conditions:
   9 //
  10 // The above copyright notice and this permission notice shall be included in
  11 // all copies or substantial portions of the Software.
  12 //
  13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  19 // THE SOFTWARE.
  20
  21 //! Scanners for fragments of CommonMark syntax
  22
  23 use std::char;
  24 use std::convert::TryInto;
  25
  26 use crate::entities;
  27 use crate::parse::{Alignment, HtmlScanGuard, LinkType};
  28 pub use crate::puncttable::{is_ascii_punctuation, is_punctuation};
  29 use crate::strings::CowStr;
  30
  31 use memchr::memchr;
  32
// Lower-case HTML tag names recognized by `is_html_tag`.
//
// The entries must stay sorted in ascending byte order: `is_html_tag` looks
// names up with a binary search, which silently misses entries in an
// unsorted table.
const HTML_TAGS: [&str; 62] = [
    "address",
    "article",
    "aside",
    "base",
    "basefont",
    "blockquote",
    "body",
    "caption",
    "center",
    "col",
    "colgroup",
    "dd",
    "details",
    "dialog",
    "dir",
    "div",
    "dl",
    "dt",
    "fieldset",
    "figcaption",
    "figure",
    "footer",
    "form",
    "frame",
    "frameset",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "head",
    "header",
    "hr",
    "html",
    "iframe",
    "legend",
    "li",
    "link",
    "main",
    "menu",
    "menuitem",
    "nav",
    "noframes",
    "ol",
    "optgroup",
    "option",
    "p",
    "param",
    "section",
    "source",
    "summary",
    "table",
    "tbody",
    "td",
    "tfoot",
    "th",
    "thead",
    "title",
    "tr",
    "track",
    "ul",
];
  98
/// Analysis of the beginning of a line, including indentation and container
/// markers.
///
/// Tabs are expanded to the next multiple of four columns while scanning, so
/// a single tab byte may be consumed over several calls.
#[derive(Clone)]
pub struct LineStart<'a> {
    // the bytes being scanned
    bytes: &'a [u8],
    // byte offset just past the most recently consumed tab (0 before any
    // tab was seen); tab widths are computed relative to this offset
    tab_start: usize,
    // byte offset of the next unscanned byte in `bytes`
    ix: usize,
    // columns of an already consumed tab byte that have not been used up yet
    spaces_remaining: usize,
    // no thematic breaks can occur before this offset.
    // this prevents scanning over and over up to a certain point
    min_hrule_offset: usize,
}
 111
 112 impl<'a> LineStart<'a> {
 113     pub(crate) fn new(bytes: &[u8]) -> LineStart {
 114         LineStart {
 115             bytes,
 116             tab_start: 0,
 117             ix: 0,
 118             spaces_remaining: 0,
 119             min_hrule_offset: 0,
 120         }
 121     }
 122
 123     /// Try to scan a number of spaces.
 124     ///
 125     /// Returns true if all spaces were consumed.
 126     ///
 127     /// Note: consumes some spaces even if not successful.
 128     pub(crate) fn scan_space(&mut self, n_space: usize) -> bool {
 129         self.scan_space_inner(n_space) == 0
 130     }
 131
 132     /// Scan a number of spaces up to a maximum.
 133     ///
 134     /// Returns number of spaces scanned.
 135     pub(crate) fn scan_space_upto(&mut self, n_space: usize) -> usize {
 136         n_space - self.scan_space_inner(n_space)
 137     }
 138
 139     /// Returns unused remainder of spaces.
 140     fn scan_space_inner(&mut self, mut n_space: usize) -> usize {
 141         let n_from_remaining = self.spaces_remaining.min(n_space);
 142         self.spaces_remaining -= n_from_remaining;
 143         n_space -= n_from_remaining;
 144         while n_space > 0 && self.ix < self.bytes.len() {
 145             match self.bytes[self.ix] {
 146                 b' ' => {
 147                     self.ix += 1;
 148                     n_space -= 1;
 149                 }
 150                 b'\t' => {
 151                     let spaces = 4 - (self.ix - self.tab_start) % 4;
 152                     self.ix += 1;
 153                     self.tab_start = self.ix;
 154                     let n = spaces.min(n_space);
 155                     n_space -= n;
 156                     self.spaces_remaining = spaces - n;
 157                 }
 158                 _ => break,
 159             }
 160         }
 161         n_space
 162     }
 163
 164     /// Scan all available ASCII whitespace (not including eol).
 165     pub(crate) fn scan_all_space(&mut self) {
 166         self.spaces_remaining = 0;
 167         self.ix += self.bytes[self.ix..]
 168             .iter()
 169             .take_while(|&&b| b == b' ' || b == b'\t')
 170             .count();
 171     }
 172
 173     /// Determine whether we're at end of line (includes end of file).
 174     pub(crate) fn is_at_eol(&self) -> bool {
 175         self.bytes
 176             .get(self.ix)
 177             .map(|&c| c == b'\r' || c == b'\n')
 178             .unwrap_or(true)
 179     }
 180
 181     fn scan_ch(&mut self, c: u8) -> bool {
 182         if self.ix < self.bytes.len() && self.bytes[self.ix] == c {
 183             self.ix += 1;
 184             true
 185         } else {
 186             false
 187         }
 188     }
 189
 190     pub(crate) fn scan_blockquote_marker(&mut self) -> bool {
 191         let save = self.clone();
 192         let _ = self.scan_space(3);
 193         if self.scan_ch(b'>') {
 194             let _ = self.scan_space(1);
 195             true
 196         } else {
 197             *self = save;
 198             false
 199         }
 200     }
 201
 202     /// Scan a list marker.
 203     ///
 204     /// Return value is the character, the start index, and the indent in spaces.
 205     /// For ordered list markers, the character will be one of b'.' or b')'. For
 206     /// bullet list markers, it will be one of b'-', b'+', or b'*'.
 207     pub(crate) fn scan_list_marker(&mut self) -> Option<(u8, u64, usize)> {
 208         let save = self.clone();
 209         let indent = self.scan_space_upto(3);
 210         if self.ix < self.bytes.len() {
 211             let c = self.bytes[self.ix];
 212             if c == b'-' || c == b'+' || c == b'*' {
 213                 if self.ix >= self.min_hrule_offset {
 214                     // there could be an hrule here
 215                     if let Err(min_offset) = scan_hrule(&self.bytes[self.ix..]) {
 216                         self.min_hrule_offset = min_offset;
 217                     } else {
 218                         *self = save;
 219                         return None;
 220                     }
 221                 }
 222                 self.ix += 1;
 223                 if self.scan_space(1) || self.is_at_eol() {
 224                     return self.finish_list_marker(c, 0, indent + 2);
 225                 }
 226             } else if c >= b'0' && c <= b'9' {
 227                 let start_ix = self.ix;
 228                 let mut ix = self.ix + 1;
 229                 let mut val = u64::from(c - b'0');
 230                 while ix < self.bytes.len() && ix - start_ix < 10 {
 231                     let c = self.bytes[ix];
 232                     ix += 1;
 233                     if c >= b'0' && c <= b'9' {
 234                         val = val * 10 + u64::from(c - b'0');
 235                     } else if c == b')' || c == b'.' {
 236                         self.ix = ix;
 237                         if self.scan_space(1) || self.is_at_eol() {
 238                             return self.finish_list_marker(c, val, indent + self.ix - start_ix);
 239                         } else {
 240                             break;
 241                         }
 242                     } else {
 243                         break;
 244                     }
 245                 }
 246             }
 247         }
 248         *self = save;
 249         None
 250     }
 251
 252     fn finish_list_marker(
 253         &mut self,
 254         c: u8,
 255         start: u64,
 256         mut indent: usize,
 257     ) -> Option<(u8, u64, usize)> {
 258         let save = self.clone();
 259
 260         // skip the rest of the line if it's blank
 261         if scan_blank_line(&self.bytes[self.ix..]).is_some() {
 262             return Some((c, start, indent));
 263         }
 264
 265         let post_indent = self.scan_space_upto(4);
 266         if post_indent < 4 {
 267             indent += post_indent;
 268         } else {
 269             *self = save;
 270         }
 271         Some((c, start, indent))
 272     }
 273
 274     /// Returns Some(is_checked) when a task list marker was found. Resets itself
 275     /// to original state otherwise.
 276     pub(crate) fn scan_task_list_marker(&mut self) -> Option<bool> {
 277         let save = self.clone();
 278         self.scan_space_upto(3);
 279
 280         if !self.scan_ch(b'[') {
 281             *self = save;
 282             return None;
 283         }
 284         let is_checked = match self.bytes.get(self.ix) {
 285             Some(&c) if is_ascii_whitespace_no_nl(c) => {
 286                 self.ix += 1;
 287                 false
 288             }
 289             Some(b'x') | Some(b'X') => {
 290                 self.ix += 1;
 291                 true
 292             }
 293             _ => {
 294                 *self = save;
 295                 return None;
 296             }
 297         };
 298         if !self.scan_ch(b']') {
 299             *self = save;
 300             return None;
 301         }
 302         if !self
 303             .bytes
 304             .get(self.ix)
 305             .map(|&b| is_ascii_whitespace_no_nl(b))
 306             .unwrap_or(false)
 307         {
 308             *self = save;
 309             return None;
 310         }
 311         Some(is_checked)
 312     }
 313
 314     pub(crate) fn bytes_scanned(&self) -> usize {
 315         self.ix
 316     }
 317
 318     pub(crate) fn remaining_space(&self) -> usize {
 319         self.spaces_remaining
 320     }
 321 }
 322
 323 pub(crate) fn is_ascii_whitespace(c: u8) -> bool {
 324     (c >= 0x09 && c <= 0x0d) || c == b' '
 325 }
 326
 327 pub(crate) fn is_ascii_whitespace_no_nl(c: u8) -> bool {
 328     c == b'\t' || c == 0x0b || c == 0x0c || c == b' '
 329 }
 330
 331 fn is_ascii_alpha(c: u8) -> bool {
 332     match c {
 333         b'a'..=b'z' | b'A'..=b'Z' => true,
 334         _ => false,
 335     }
 336 }
 337
 338 fn is_ascii_alphanumeric(c: u8) -> bool {
 339     match c {
 340         b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' => true,
 341         _ => false,
 342     }
 343 }
 344
 345 fn is_ascii_letterdigitdash(c: u8) -> bool {
 346     c == b'-' || is_ascii_alphanumeric(c)
 347 }
 348
 349 fn is_digit(c: u8) -> bool {
 350     b'0' <= c && c <= b'9'
 351 }
 352
 353 fn is_valid_unquoted_attr_value_char(c: u8) -> bool {
 354     match c {
 355         b'\'' | b'"' | b' ' | b'=' | b'>' | b'<' | b'`' | b'\n' | b'\r' => false,
 356         _ => true,
 357     }
 358 }
 359
 360 // scan a single character
 361 pub(crate) fn scan_ch(data: &[u8], c: u8) -> usize {
 362     if !data.is_empty() && data[0] == c {
 363         1
 364     } else {
 365         0
 366     }
 367 }
 368
 369 pub(crate) fn scan_while<F>(data: &[u8], mut f: F) -> usize
 370 where
 371     F: FnMut(u8) -> bool,
 372 {
 373     data.iter().take_while(|&&c| f(c)).count()
 374 }
 375
 376 pub(crate) fn scan_rev_while<F>(data: &[u8], mut f: F) -> usize
 377 where
 378     F: FnMut(u8) -> bool,
 379 {
 380     data.iter().rev().take_while(|&&c| f(c)).count()
 381 }
 382
 383 pub(crate) fn scan_ch_repeat(data: &[u8], c: u8) -> usize {
 384     scan_while(data, |x| x == c)
 385 }
 386
 387 // Note: this scans ASCII whitespace only, for Unicode whitespace use
 388 // a different function.
 389 pub(crate) fn scan_whitespace_no_nl(data: &[u8]) -> usize {
 390     scan_while(data, is_ascii_whitespace_no_nl)
 391 }
 392
 393 fn scan_attr_value_chars(data: &[u8]) -> usize {
 394     scan_while(data, is_valid_unquoted_attr_value_char)
 395 }
 396
 397 pub(crate) fn scan_eol(bytes: &[u8]) -> Option<usize> {
 398     if bytes.is_empty() {
 399         return Some(0);
 400     }
 401     match bytes[0] {
 402         b'\n' => Some(1),
 403         b'\r' => Some(if bytes.get(1) == Some(&b'\n') { 2 } else { 1 }),
 404         _ => None,
 405     }
 406 }
 407
 408 pub(crate) fn scan_blank_line(bytes: &[u8]) -> Option<usize> {
 409     let i = scan_whitespace_no_nl(bytes);
 410     scan_eol(&bytes[i..]).map(|n| i + n)
 411 }
 412
 413 pub(crate) fn scan_nextline(bytes: &[u8]) -> usize {
 414     memchr(b'\n', bytes).map_or(bytes.len(), |x| x + 1)
 415 }
 416
 417 // return: end byte for closing code fence, or None
 418 // if the line is not a closing code fence
 419 pub(crate) fn scan_closing_code_fence(
 420     bytes: &[u8],
 421     fence_char: u8,
 422     n_fence_char: usize,
 423 ) -> Option<usize> {
 424     if bytes.is_empty() {
 425         return Some(0);
 426     }
 427     let mut i = 0;
 428     let num_fence_chars_found = scan_ch_repeat(&bytes[i..], fence_char);
 429     if num_fence_chars_found < n_fence_char {
 430         return None;
 431     }
 432     i += num_fence_chars_found;
 433     let num_trailing_spaces = scan_ch_repeat(&bytes[i..], b' ');
 434     i += num_trailing_spaces;
 435     scan_eol(&bytes[i..]).map(|_| i)
 436 }
 437
 438 // returned pair is (number of bytes, number of spaces)
 439 fn calc_indent(text: &[u8], max: usize) -> (usize, usize) {
 440     let mut spaces = 0;
 441     let mut offset = 0;
 442
 443     for (i, &b) in text.iter().enumerate() {
 444         match b {
 445             b' ' => {
 446                 spaces += 1;
 447                 if spaces == max {
 448                     break;
 449                 }
 450             }
 451             b'\t' => {
 452                 let new_spaces = spaces + 4 - (spaces & 3);
 453                 if new_spaces > max {
 454                     break;
 455                 }
 456                 spaces = new_spaces;
 457             }
 458             _ => break,
 459         }
 460         offset = i;
 461     }
 462
 463     (offset, spaces)
 464 }
 465
 466 /// Scan hrule opening sequence.
 467 ///
 468 /// Returns Ok(x) when it finds an hrule, where x is the
 469 /// size of line containing the hrule, including the trailing newline.
 470 ///
 471 /// Returns Err(x) when it does not find an hrule and x is
 472 /// the offset in data before no hrule can appear.
 473 pub(crate) fn scan_hrule(bytes: &[u8]) -> Result<usize, usize> {
 474     if bytes.len() < 3 {
 475         return Err(0);
 476     }
 477     let c = bytes[0];
 478     if !(c == b'*' || c == b'-' || c == b'_') {
 479         return Err(0);
 480     }
 481     let mut n = 0;
 482     let mut i = 0;
 483
 484     while i < bytes.len() {
 485         match bytes[i] {
 486             b'\n' | b'\r' => {
 487                 i += scan_eol(&bytes[i..]).unwrap_or(0);
 488                 break;
 489             }
 490             c2 if c2 == c => {
 491                 n += 1;
 492             }
 493             b' ' | b'\t' => (),
 494             _ => return Err(i),
 495         }
 496         i += 1;
 497     }
 498     if n >= 3 {
 499         Ok(i)
 500     } else {
 501         Err(i)
 502     }
 503 }
 504
 505 /// Scan an ATX heading opening sequence.
 506 ///
 507 /// Returns number of bytes in prefix and level.
 508 pub(crate) fn scan_atx_heading(data: &[u8]) -> Option<usize> {
 509     let level = scan_ch_repeat(data, b'#');
 510     if level >= 1 && level <= 6 && data.get(level).cloned().map_or(true, is_ascii_whitespace) {
 511         Some(level)
 512     } else {
 513         None
 514     }
 515 }
 516
 517 /// Scan a setext heading underline.
 518 ///
 519 /// Returns number of bytes in line (including trailing newline) and level.
 520 pub(crate) fn scan_setext_heading(data: &[u8]) -> Option<(usize, u32)> {
 521     let c = *data.get(0)?;
 522     if !(c == b'-' || c == b'=') {
 523         return None;
 524     }
 525     let mut i = 1 + scan_ch_repeat(&data[1..], c);
 526     i += scan_blank_line(&data[i..])?;
 527     let level = if c == b'=' { 1 } else { 2 };
 528     Some((i, level))
 529 }
 530
 531 // returns number of bytes in line (including trailing
 532 // newline) and column alignments
 533 pub(crate) fn scan_table_head(data: &[u8]) -> (usize, Vec<Alignment>) {
 534     let (mut i, spaces) = calc_indent(data, 4);
 535     if spaces > 3 || i == data.len() {
 536         return (0, vec![]);
 537     }
 538     let mut cols = vec![];
 539     let mut active_col = Alignment::None;
 540     let mut start_col = true;
 541     if data[i] == b'|' {
 542         i += 1;
 543     }
 544     for c in &data[i..] {
 545         if let Some(n) = scan_eol(&data[i..]) {
 546             i += n;
 547             break;
 548         }
 549         match *c {
 550             b' ' => (),
 551             b':' => {
 552                 active_col = match (start_col, active_col) {
 553                     (true, Alignment::None) => Alignment::Left,
 554                     (false, Alignment::Left) => Alignment::Center,
 555                     (false, Alignment::None) => Alignment::Right,
 556                     _ => active_col,
 557                 };
 558                 start_col = false;
 559             }
 560             b'-' => {
 561                 start_col = false;
 562             }
 563             b'|' => {
 564                 start_col = true;
 565                 cols.push(active_col);
 566                 active_col = Alignment::None;
 567             }
 568             _ => {
 569                 cols = vec![];
 570                 start_col = true;
 571                 break;
 572             }
 573         }
 574         i += 1;
 575     }
 576
 577     if !start_col {
 578         cols.push(active_col);
 579     }
 580
 581     (i, cols)
 582 }
 583
 584 /// Scan code fence.
 585 ///
 586 /// Returns number of bytes scanned and the char that is repeated to make the code fence.
 587 pub(crate) fn scan_code_fence(data: &[u8]) -> Option<(usize, u8)> {
 588     let c = *data.get(0)?;
 589     if !(c == b'`' || c == b'~') {
 590         return None;
 591     }
 592     let i = 1 + scan_ch_repeat(&data[1..], c);
 593     if i >= 3 {
 594         if c == b'`' {
 595             let suffix = &data[i..];
 596             let next_line = i + scan_nextline(suffix);
 597             // FIXME: make sure this is correct
 598             if suffix[..(next_line - i)].iter().any(|&b| b == b'`') {
 599                 return None;
 600             }
 601         }
 602         Some((i, c))
 603     } else {
 604         None
 605     }
 606 }
 607
 608 pub(crate) fn scan_blockquote_start(data: &[u8]) -> Option<usize> {
 609     if data.starts_with(b"> ") {
 610         Some(2)
 611     } else {
 612         None
 613     }
 614 }
 615
 616 /// This already assumes the list item has been scanned.
 617 pub(crate) fn scan_empty_list(data: &[u8]) -> bool {
 618     let mut ix = 0;
 619     for _ in 0..2 {
 620         if let Some(bytes) = scan_blank_line(&data[ix..]) {
 621             ix += bytes;
 622         } else {
 623             return false;
 624         }
 625     }
 626     true
 627 }
 628
 629 // return number of bytes scanned, delimiter, start index, and indent
 630 pub(crate) fn scan_listitem(bytes: &[u8]) -> Option<(usize, u8, usize, usize)> {
 631     let mut c = *bytes.get(0)?;
 632     let (w, start) = match c {
 633         b'-' | b'+' | b'*' => (1, 0),
 634         b'0'..=b'9' => {
 635             let (length, start) = parse_decimal(bytes);
 636             c = *bytes.get(length)?;
 637             if !(c == b'.' || c == b')') {
 638                 return None;
 639             }
 640             (length + 1, start)
 641         }
 642         _ => {
 643             return None;
 644         }
 645     };
 646     // TODO: replace calc_indent with scan_leading_whitespace, for tab correctness
 647     let (mut postn, mut postindent) = calc_indent(&bytes[w..], 5);
 648     if postindent == 0 {
 649         scan_eol(&bytes[w..])?;
 650         postindent += 1;
 651     } else if postindent > 4 {
 652         postn = 1;
 653         postindent = 1;
 654     }
 655     if scan_blank_line(&bytes[w..]).is_some() {
 656         postn = 0;
 657         postindent = 1;
 658     }
 659     Some((w + postn, c, start, w + postindent))
 660 }
 661
 662 // returns (number of bytes, parsed decimal)
 663 fn parse_decimal(bytes: &[u8]) -> (usize, usize) {
 664     match bytes
 665         .iter()
 666         .take_while(|&&b| is_digit(b))
 667         .try_fold((0, 0usize), |(count, acc), c| {
 668             let digit = usize::from(c - b'0');
 669             match acc
 670                 .checked_mul(10)
 671                 .and_then(|ten_acc| ten_acc.checked_add(digit))
 672             {
 673                 Some(number) => Ok((count + 1, number)),
 674                 // stop early on overflow
 675                 None => Err((count, acc)),
 676             }
 677         }) {
 678         Ok(p) | Err(p) => p,
 679     }
 680 }
 681
 682 // returns (number of bytes, parsed hex)
 683 fn parse_hex(bytes: &[u8]) -> (usize, usize) {
 684     match bytes.iter().try_fold((0, 0usize), |(count, acc), c| {
 685         let mut c = *c;
 686         let digit = if c >= b'0' && c <= b'9' {
 687             usize::from(c - b'0')
 688         } else {
 689             // make lower case
 690             c |= 0x20;
 691             if c >= b'a' && c <= b'f' {
 692                 usize::from(c - b'a' + 10)
 693             } else {
 694                 return Err((count, acc));
 695             }
 696         };
 697         match acc
 698             .checked_mul(16)
 699             .and_then(|sixteen_acc| sixteen_acc.checked_add(digit))
 700         {
 701             Some(number) => Ok((count + 1, number)),
 702             // stop early on overflow
 703             None => Err((count, acc)),
 704         }
 705     }) {
 706         Ok(p) | Err(p) => p,
 707     }
 708 }
 709
 710 fn char_from_codepoint(input: usize) -> Option<char> {
 711     let mut codepoint = input.try_into().ok()?;
 712     if codepoint == 0 {
 713         codepoint = 0xFFFD;
 714     }
 715     char::from_u32(codepoint)
 716 }
 717
 718 // doesn't bother to check data[0] == '&'
 719 pub(crate) fn scan_entity(bytes: &[u8]) -> (usize, Option<CowStr<'static>>) {
 720     let mut end = 1;
 721     if scan_ch(&bytes[end..], b'#') == 1 {
 722         end += 1;
 723         let (bytecount, codepoint) = if end < bytes.len() && bytes[end] | 0x20 == b'x' {
 724             end += 1;
 725             parse_hex(&bytes[end..])
 726         } else {
 727             parse_decimal(&bytes[end..])
 728         };
 729         end += bytecount;
 730         return if bytecount == 0 || scan_ch(&bytes[end..], b';') == 0 {
 731             (0, None)
 732         } else if let Some(c) = char_from_codepoint(codepoint) {
 733             (end + 1, Some(c.into()))
 734         } else {
 735             (0, None)
 736         };
 737     }
 738     end += scan_while(&bytes[end..], is_ascii_alphanumeric);
 739     if scan_ch(&bytes[end..], b';') == 1 {
 740         if let Some(value) = entities::get_entity(&bytes[1..end]) {
 741             return (end + 1, Some(value.into()));
 742         }
 743     }
 744     (0, None)
 745 }
 746
 747 // FIXME: we can most likely re-use other scanners
 748 // returns (bytelength, title_str)
 749 pub(crate) fn scan_refdef_title(text: &str) -> Option<(usize, &str)> {
 750     let mut chars = text.chars().peekable();
 751     let closing_delim = match chars.next()? {
 752         '\'' => '\'',
 753         '"' => '"',
 754         '(' => ')',
 755         _ => return None,
 756     };
 757     let mut bytecount = 1;
 758
 759     while let Some(c) = chars.next() {
 760         match c {
 761             '\n' => {
 762                 bytecount += 1;
 763                 let mut next = *chars.peek()?;
 764                 while is_ascii_whitespace_no_nl(next as u8) {
 765                     bytecount += chars.next()?.len_utf8();
 766                     next = *chars.peek()?;
 767                 }
 768                 if *chars.peek()? == '\n' {
 769                     // blank line - not allowed
 770                     return None;
 771                 }
 772             }
 773             '\\' => {
 774                 let next_char = chars.next()?;
 775                 bytecount += 1 + next_char.len_utf8();
 776             }
 777             c if c == closing_delim => {
 778                 return Some((bytecount + 1, &text[1..bytecount]));
 779             }
 780             c => {
 781                 bytecount += c.len_utf8();
 782             }
 783         }
 784     }
 785     None
 786 }
 787
 788 // note: dest returned is raw, still needs to be unescaped
 789 // TODO: check that nested parens are really not allowed for refdefs
 790 // TODO(performance): this func should probably its own unescaping
 791 pub(crate) fn scan_link_dest(
 792     data: &str,
 793     start_ix: usize,
 794     max_next: usize,
 795 ) -> Option<(usize, &str)> {
 796     let bytes = &data.as_bytes()[start_ix..];
 797     let mut i = scan_ch(bytes, b'<');
 798
 799     if i != 0 {
 800         // pointy links
 801         while i < bytes.len() {
 802             match bytes[i] {
 803                 b'\n' | b'\r' | b'<' => return None,
 804                 b'>' => return Some((i + 1, &data[(start_ix + 1)..(start_ix + i)])),
 805                 b'\\' if i + 1 < bytes.len() && is_ascii_punctuation(bytes[i + 1]) => {
 806                     i += 1;
 807                 }
 808                 _ => {}
 809             }
 810             i += 1;
 811         }
 812         None
 813     } else {
 814         // non-pointy links
 815         let mut nest = 0;
 816         while i < bytes.len() {
 817             match bytes[i] {
 818                 0x0..=0x20 => {
 819                     break;
 820                 }
 821                 b'(' => {
 822                     if nest > max_next {
 823                         return None;
 824                     }
 825                     nest += 1;
 826                 }
 827                 b')' => {
 828                     if nest == 0 {
 829                         break;
 830                     }
 831                     nest -= 1;
 832                 }
 833                 b'\\' if i + 1 < bytes.len() && is_ascii_punctuation(bytes[i + 1]) => {
 834                     i += 1;
 835                 }
 836                 _ => {}
 837             }
 838             i += 1;
 839         }
 840         Some((i, &data[start_ix..(start_ix + i)]))
 841     }
 842 }
 843
 844 /// Returns bytes scanned
 845 fn scan_attribute_name(data: &[u8]) -> Option<usize> {
 846     let (&c, tail) = data.split_first()?;
 847     if is_ascii_alpha(c) || c == b'_' || c == b':' {
 848         Some(
 849             1 + scan_while(tail, |c| {
 850                 is_ascii_alphanumeric(c) || c == b'_' || c == b'.' || c == b':' || c == b'-'
 851             }),
 852         )
 853     } else {
 854         None
 855     }
 856 }
 857
 858 /// Returns byte scanned (TODO: should it return new offset?)
 859 // TODO: properly use the newline handler here
 860 fn scan_attribute(data: &[u8], newline_handler: Option<&dyn Fn(&[u8]) -> usize>) -> Option<usize> {
 861     let allow_newline = newline_handler.is_some();
 862     let whitespace_scanner =
 863         |c| is_ascii_whitespace(c) && (allow_newline || c != b'\n' && c != b'\r');
 864     let mut ix = scan_attribute_name(data)?;
 865     let n_whitespace = scan_while(&data[ix..], whitespace_scanner);
 866     ix += n_whitespace;
 867     if scan_ch(&data[ix..], b'=') == 1 {
 868         ix += 1;
 869         ix += scan_while(&data[ix..], whitespace_scanner);
 870         ix += scan_attribute_value(&data[ix..], newline_handler)?;
 871     } else if n_whitespace > 0 {
 872         // Leave whitespace for next attribute.
 873         ix -= 1;
 874     }
 875     Some(ix)
 876 }
 877
 878 fn scan_attribute_value(
 879     data: &[u8],
 880     newline_handler: Option<&dyn Fn(&[u8]) -> usize>,
 881 ) -> Option<usize> {
 882     let mut i = 0;
 883     match *data.get(0)? {
 884         b @ b'"' | b @ b'\'' => {
 885             i += 1;
 886             while i < data.len() {
 887                 if data[i] == b {
 888                     return Some(i + 1);
 889                 }
 890                 if let Some(eol_bytes) = scan_eol(&data[i..]) {
 891                     let handler = newline_handler?;
 892                     i += eol_bytes;
 893                     i += handler(&data[i..]);
 894                 } else {
 895                     i += 1;
 896                 }
 897             }
 898             return None;
 899         }
 900         b' ' | b'=' | b'>' | b'<' | b'`' | b'\n' | b'\r' => {
 901             return None;
 902         }
 903         _ => {
 904             // unquoted attribute value
 905             i += scan_attr_value_chars(&data[i..]);
 906         }
 907     }
 908     Some(i)
 909 }
 910
 911 // Remove backslash escapes and resolve entities
 912 pub(crate) fn unescape(input: &str) -> CowStr<'_> {
 913     let mut result = String::new();
 914     let mut mark = 0;
 915     let mut i = 0;
 916     let bytes = input.as_bytes();
 917     while i < bytes.len() {
 918         match bytes[i] {
 919             b'\\' if i + 1 < bytes.len() && is_ascii_punctuation(bytes[i + 1]) => {
 920                 result.push_str(&input[mark..i]);
 921                 mark = i + 1;
 922                 i += 2;
 923             }
 924             b'&' => match scan_entity(&bytes[i..]) {
 925                 (n, Some(value)) => {
 926                     result.push_str(&input[mark..i]);
 927                     result.push_str(&value);
 928                     i += n;
 929                     mark = i;
 930                 }
 931                 _ => i += 1,
 932             },
 933             b'\r' => {
 934                 result.push_str(&input[mark..i]);
 935                 i += 1;
 936                 mark = i;
 937             }
 938             _ => i += 1,
 939         }
 940     }
 941     if mark == 0 {
 942         input.into()
 943     } else {
 944         result.push_str(&input[mark..]);
 945         result.into()
 946     }
 947 }
 948
 949 /// Assumes `data` is preceded by `<`.
 950 pub(crate) fn scan_html_block_tag(data: &[u8]) -> (usize, &[u8]) {
 951     let i = scan_ch(data, b'/');
 952     let n = scan_while(&data[i..], is_ascii_alphanumeric);
 953     // TODO: scan attributes and >
 954     (i + n, &data[i..i + n])
 955 }
 956
 957 pub(crate) fn is_html_tag(tag: &[u8]) -> bool {
 958     HTML_TAGS
 959         .binary_search_by(|probe| {
 960             let probe_bytes_iter = probe.as_bytes().iter();
 961             let tag_bytes_iter = tag.iter();
 962
 963             probe_bytes_iter
 964                 .zip(tag_bytes_iter)
 965                 .find_map(|(&a, &b)| {
 966                     // We can compare case insensitively because the probes are
 967                     // all lower case alpha strings.
 968                     match a.cmp(&(b | 0x20)) {
 969                         std::cmp::Ordering::Equal => None,
 970                         inequality => Some(inequality),
 971                     }
 972                 })
 973                 .unwrap_or_else(|| probe.len().cmp(&tag.len()))
 974         })
 975         .is_ok()
 976 }
 977
 978 /// Assumes that `data` is preceded by `<`.
 979 pub(crate) fn scan_html_type_7(data: &[u8]) -> Option<usize> {
 980     // Block type html does not allow for newlines, so we
 981     // do not pass a newline handler.
 982     let i = scan_html_block_inner(data, None)?;
 983     scan_blank_line(&data[i..])?;
 984     Some(i)
 985 }
 986
// FIXME: instead of a newline handler, maybe this should receive
// a whitespace handler instead.
// With signature `&dyn Fn(&[u8]) -> Option<usize>`.
// We currently need to implement whitespace handling in all of
// this function's dependencies as well.
/// Scans an open or closing HTML tag whose leading `<` has already been
/// consumed by the caller.
///
/// `newline_handler`, when present, is invoked with the data that follows a
/// line ending inside an open tag; the value it returns is added to the
/// current offset. When it is `None`, any line ending inside the tag makes
/// the scan fail.
///
/// Returns the offset just past the closing `>` on success, `None` otherwise.
pub(crate) fn scan_html_block_inner(
    data: &[u8],
    newline_handler: Option<&dyn Fn(&[u8]) -> usize>,
) -> Option<usize> {
    // Non-zero when the tag starts with `/`, i.e. it is a closing tag.
    let close_tag_bytes = scan_ch(data, b'/');
    // The tag name must begin with at least one ASCII letter.
    let l = scan_while(&data[close_tag_bytes..], is_ascii_alpha);
    if l == 0 {
        return None;
    }
    let mut i = close_tag_bytes + l;
    // Remainder of the tag name.
    i += scan_while(&data[i..], is_ascii_letterdigitdash);

    // Only open tags go through the attribute loop.
    if close_tag_bytes == 0 {
        loop {
            // Remembered to detect whether any whitespace separated this
            // attribute from what came before it.
            let old_i = i;
            // Consume whitespace, crossing line endings only with the help
            // of the newline handler.
            loop {
                i += scan_whitespace_no_nl(&data[i..]);
                if let Some(eol_bytes) = scan_eol(&data[i..]) {
                    // NOTE(review): a zero-length line ending presumably
                    // means end of input — confirm against `scan_eol`.
                    if eol_bytes == 0 {
                        return None;
                    }
                    if let Some(handler) = newline_handler {
                        i += eol_bytes;
                        i += handler(&data[i..]);
                    } else {
                        return None;
                    }
                } else {
                    break;
                }
            }
            // `/` or `>` ends the attribute list.
            if let Some(b'/') | Some(b'>') = data.get(i) {
                break;
            }
            if old_i == i {
                // No whitespace, which is mandatory.
                return None;
            }
            i += scan_attribute(&data[i..], newline_handler)?;
        }
    }

    i += scan_whitespace_no_nl(&data[i..]);

    // An open tag may be self-closing (`/>`).
    if close_tag_bytes == 0 {
        i += scan_ch(&data[i..], b'/');
    }

    // The tag has to be terminated by `>`.
    if scan_ch(&data[i..], b'>') == 0 {
        None
    } else {
        Some(i + 1)
    }
}
1046
1047 /// Returns (next_byte_offset, uri, type)
1048 pub(crate) fn scan_autolink(text: &str, start_ix: usize) -> Option<(usize, CowStr<'_>, LinkType)> {
1049     scan_uri(text, start_ix)
1050         .map(|(bytes, uri)| (bytes, uri, LinkType::Autolink))
1051         .or_else(|| scan_email(text, start_ix).map(|(bytes, uri)| (bytes, uri, LinkType::Email)))
1052 }
1053
1054 /// Returns (next_byte_offset, uri)
1055 fn scan_uri(text: &str, start_ix: usize) -> Option<(usize, CowStr<'_>)> {
1056     let bytes = &text.as_bytes()[start_ix..];
1057
1058     // scheme's first byte must be an ascii letter
1059     if bytes.is_empty() || !is_ascii_alpha(bytes[0]) {
1060         return None;
1061     }
1062
1063     let mut i = 1;
1064
1065     while i < bytes.len() {
1066         let c = bytes[i];
1067         i += 1;
1068         match c {
1069             c if is_ascii_alphanumeric(c) => (),
1070             b'.' | b'-' | b'+' => (),
1071             b':' => break,
1072             _ => return None,
1073         }
1074     }
1075
1076     // scheme length must be between 2 and 32 characters long. scheme
1077     // must be followed by colon
1078     if i < 3 || i > 33 {
1079         return None;
1080     }
1081
1082     while i < bytes.len() {
1083         match bytes[i] {
1084             b'>' => return Some((start_ix + i + 1, text[start_ix..(start_ix + i)].into())),
1085             b'\0'..=b' ' | b'<' => return None,
1086             _ => (),
1087         }
1088         i += 1;
1089     }
1090
1091     None
1092 }
1093
1094 /// Returns (next_byte_offset, email)
1095 fn scan_email(text: &str, start_ix: usize) -> Option<(usize, CowStr<'_>)> {
1096     // using a regex library would be convenient, but doing it by hand is not too bad
1097     let bytes = &text.as_bytes()[start_ix..];
1098     let mut i = 0;
1099
1100     while i < bytes.len() {
1101         let c = bytes[i];
1102         i += 1;
1103         match c {
1104             c if is_ascii_alphanumeric(c) => (),
1105             b'.' | b'!' | b'#' | b'$' | b'%' | b'&' | b'\'' | b'*' | b'+' | b'/' | b'=' | b'?'
1106             | b'^' | b'_' | b'`' | b'{' | b'|' | b'}' | b'~' | b'-' => (),
1107             b'@' => break,
1108             _ => return None,
1109         }
1110     }
1111
1112     loop {
1113         let label_start_ix = i;
1114         let mut fresh_label = true;
1115
1116         while i < bytes.len() {
1117             match bytes[i] {
1118                 c if is_ascii_alphanumeric(c) => (),
1119                 b'-' if fresh_label => {
1120                     return None;
1121                 }
1122                 b'-' => (),
1123                 _ => break,
1124             }
1125             fresh_label = false;
1126             i += 1;
1127         }
1128
1129         if i == label_start_ix || i - label_start_ix > 63 || bytes[i - 1] == b'-' {
1130             return None;
1131         }
1132
1133         if scan_ch(&bytes[i..], b'.') == 0 {
1134             break;
1135         }
1136         i += 1;
1137     }
1138
1139     if scan_ch(&bytes[i..], b'>') == 0 {
1140         return None;
1141     }
1142
1143     Some((start_ix + i + 1, text[start_ix..(start_ix + i)].into()))
1144 }
1145
1146 /// Scan comment, declaration, or CDATA section, with initial "<!" already consumed.
1147 /// Returns byte offset on match.
1148 pub(crate) fn scan_inline_html_comment(
1149     bytes: &[u8],
1150     mut ix: usize,
1151     scan_guard: &mut HtmlScanGuard,
1152 ) -> Option<usize> {
1153     let c = *bytes.get(ix)?;
1154     ix += 1;
1155     match c {
1156         b'-' => {
1157             let dashes = scan_ch_repeat(&bytes[ix..], b'-');
1158             if dashes < 1 {
1159                 return None;
1160             }
1161             // Saw "<!--", scan comment.
1162             ix += dashes;
1163             if scan_ch(&bytes[ix..], b'>') == 1 {
1164                 return None;
1165             }
1166
1167             while let Some(x) = memchr(b'-', &bytes[ix..]) {
1168                 ix += x + 1;
1169                 if scan_ch(&bytes[ix..], b'-') == 1 {
1170                     ix += 1;
1171                     return if scan_ch(&bytes[ix..], b'>') == 1 {
1172                         Some(ix + 1)
1173                     } else {
1174                         None
1175                     };
1176                 }
1177             }
1178             None
1179         }
1180         b'[' if bytes[ix..].starts_with(b"CDATA[") && ix > scan_guard.cdata => {
1181             ix += b"CDATA[".len();
1182             ix = memchr(b']', &bytes[ix..]).map_or(bytes.len(), |x| ix + x);
1183             let close_brackets = scan_ch_repeat(&bytes[ix..], b']');
1184             ix += close_brackets;
1185
1186             if close_brackets == 0 || scan_ch(&bytes[ix..], b'>') == 0 {
1187                 scan_guard.cdata = ix;
1188                 None
1189             } else {
1190                 Some(ix + 1)
1191             }
1192         }
1193         b'A'..=b'Z' if ix > scan_guard.declaration => {
1194             // Scan declaration.
1195             ix += scan_while(&bytes[ix..], |c| c >= b'A' && c <= b'Z');
1196             let whitespace = scan_while(&bytes[ix..], is_ascii_whitespace);
1197             if whitespace == 0 {
1198                 return None;
1199             }
1200             ix += whitespace;
1201             ix = memchr(b'>', &bytes[ix..]).map_or(bytes.len(), |x| ix + x);
1202             if scan_ch(&bytes[ix..], b'>') == 0 {
1203                 scan_guard.declaration = ix;
1204                 None
1205             } else {
1206                 Some(ix + 1)
1207             }
1208         }
1209         _ => None,
1210     }
1211 }
1212
1213 /// Scan processing directive, with initial "<?" already consumed.
1214 /// Returns the next byte offset on success.
1215 pub(crate) fn scan_inline_html_processing(
1216     bytes: &[u8],
1217     mut ix: usize,
1218     scan_guard: &mut HtmlScanGuard,
1219 ) -> Option<usize> {
1220     if ix <= scan_guard.processing {
1221         return None;
1222     }
1223     while let Some(offset) = memchr(b'?', &bytes[ix..]) {
1224         ix += offset + 1;
1225         if scan_ch(&bytes[ix..], b'>') == 1 {
1226             return Some(ix + 1);
1227         }
1228     }
1229     scan_guard.processing = ix;
1230     None
1231 }
1232
#[cfg(test)]
mod test {
    use super::*;

    /// A very long run of digits followed by `!` is not a list item.
    /// Going by the test name, this guards against overflow while the list
    /// number is parsed.
    #[test]
    fn overflow_list() {
        let input: &[u8] = b"4444444444444444444444444444444444444444444444444444444444!";
        assert!(scan_listitem(input).is_none());
    }

    /// A 22-digit number followed by `!` is not a list item.
    /// Going by the test name, this targets overflow in the addition step of
    /// number parsing.
    #[test]
    fn overflow_by_addition() {
        let input: &[u8] = b"1844674407370955161615!";
        assert!(scan_listitem(input).is_none());
    }
}