1 // Copyright 2015 Google Inc. All rights reserved.
3 // Permission is hereby granted, free of charge, to any person obtaining a copy
4 // of this software and associated documentation files (the "Software"), to deal
5 // in the Software without restriction, including without limitation the rights
6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 // copies of the Software, and to permit persons to whom the Software is
8 // furnished to do so, subject to the following conditions:
10 // The above copyright notice and this permission notice shall be included in
11 // all copies or substantial portions of the Software.
13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 //! Scanners for fragments of CommonMark syntax
24 use std
::convert
::TryInto
;
27 use crate::parse
::{Alignment, HtmlScanGuard, LinkType}
;
28 pub use crate::puncttable
::{is_ascii_punctuation, is_punctuation}
;
29 use crate::strings
::CowStr
;
33 // sorted for binary search
34 const HTML_TAGS
: [&str; 62] = [
99 /// Analysis of the beginning of a line, including indentation and container
102 pub struct LineStart
<'a
> {
106 spaces_remaining
: usize,
107 // no thematic breaks can occur before this offset.
108 // this prevents scanning over and over up to a certain point
109 min_hrule_offset
: usize,
112 impl<'a
> LineStart
<'a
> {
113 pub(crate) fn new(bytes
: &[u8]) -> LineStart
{
123 /// Try to scan a number of spaces.
125 /// Returns true if all spaces were consumed.
127 /// Note: consumes some spaces even if not successful.
128 pub(crate) fn scan_space(&mut self, n_space
: usize) -> bool
{
129 self.scan_space_inner(n_space
) == 0
132 /// Scan a number of spaces up to a maximum.
134 /// Returns number of spaces scanned.
135 pub(crate) fn scan_space_upto(&mut self, n_space
: usize) -> usize {
136 n_space
- self.scan_space_inner(n_space
)
139 /// Returns unused remainder of spaces.
140 fn scan_space_inner(&mut self, mut n_space
: usize) -> usize {
141 let n_from_remaining
= self.spaces_remaining
.min(n_space
);
142 self.spaces_remaining
-= n_from_remaining
;
143 n_space
-= n_from_remaining
;
144 while n_space
> 0 && self.ix
< self.bytes
.len() {
145 match self.bytes
[self.ix
] {
151 let spaces
= 4 - (self.ix
- self.tab_start
) % 4;
153 self.tab_start
= self.ix
;
154 let n
= spaces
.min(n_space
);
156 self.spaces_remaining
= spaces
- n
;
164 /// Scan all available ASCII whitespace (not including eol).
165 pub(crate) fn scan_all_space(&mut self) {
166 self.spaces_remaining
= 0;
167 self.ix
+= self.bytes
[self.ix
..]
169 .take_while(|&&b
| b
== b' '
|| b
== b'
\t'
)
173 /// Determine whether we're at end of line (includes end of file).
174 pub(crate) fn is_at_eol(&self) -> bool
{
177 .map(|&c
| c
== b'
\r'
|| c
== b'
\n'
)
181 fn scan_ch(&mut self, c
: u8) -> bool
{
182 if self.ix
< self.bytes
.len() && self.bytes
[self.ix
] == c
{
190 pub(crate) fn scan_blockquote_marker(&mut self) -> bool
{
191 let save
= self.clone();
192 let _
= self.scan_space(3);
193 if self.scan_ch(b'
>'
) {
194 let _
= self.scan_space(1);
202 /// Scan a list marker.
204 /// Return value is the character, the start index, and the indent in spaces.
205 /// For ordered list markers, the character will be one of b'.' or b')'. For
206 /// bullet list markers, it will be one of b'-', b'+', or b'*'.
207 pub(crate) fn scan_list_marker(&mut self) -> Option
<(u8, u64, usize)> {
208 let save
= self.clone();
209 let indent
= self.scan_space_upto(3);
210 if self.ix
< self.bytes
.len() {
211 let c
= self.bytes
[self.ix
];
212 if c
== b'
-'
|| c
== b'
+'
|| c
== b'
*'
{
213 if self.ix
>= self.min_hrule_offset
{
214 // there could be an hrule here
215 if let Err(min_offset
) = scan_hrule(&self.bytes
[self.ix
..]) {
216 self.min_hrule_offset
= min_offset
;
223 if self.scan_space(1) || self.is_at_eol() {
224 return self.finish_list_marker(c
, 0, indent
+ 2);
226 } else if c
>= b'
0'
&& c
<= b'
9'
{
227 let start_ix
= self.ix
;
228 let mut ix
= self.ix
+ 1;
229 let mut val
= u64::from(c
- b'
0'
);
230 while ix
< self.bytes
.len() && ix
- start_ix
< 10 {
231 let c
= self.bytes
[ix
];
233 if c
>= b'
0'
&& c
<= b'
9'
{
234 val
= val
* 10 + u64::from(c
- b'
0'
);
235 } else if c
== b'
)'
|| c
== b'
.'
{
237 if self.scan_space(1) || self.is_at_eol() {
238 return self.finish_list_marker(c
, val
, indent
+ self.ix
- start_ix
);
252 fn finish_list_marker(
257 ) -> Option
<(u8, u64, usize)> {
258 let save
= self.clone();
260 // skip the rest of the line if it's blank
261 if scan_blank_line(&self.bytes
[self.ix
..]).is_some() {
262 return Some((c
, start
, indent
));
265 let post_indent
= self.scan_space_upto(4);
267 indent
+= post_indent
;
271 Some((c
, start
, indent
))
274 /// Returns Some(is_checked) when a task list marker was found. Resets itself
275 /// to original state otherwise.
276 pub(crate) fn scan_task_list_marker(&mut self) -> Option
<bool
> {
277 let save
= self.clone();
278 self.scan_space_upto(3);
280 if !self.scan_ch(b'
['
) {
284 let is_checked
= match self.bytes
.get(self.ix
) {
285 Some(&c
) if is_ascii_whitespace_no_nl(c
) => {
289 Some(b'x'
) | Some(b'X'
) => {
298 if !self.scan_ch(b'
]'
) {
305 .map(|&b
| is_ascii_whitespace_no_nl(b
))
314 pub(crate) fn bytes_scanned(&self) -> usize {
318 pub(crate) fn remaining_space(&self) -> usize {
319 self.spaces_remaining
/// Returns true if `c` is ASCII whitespace: any byte in `0x09..=0x0d`
/// (tab, line feed, vertical tab, form feed, carriage return) or a space.
pub(crate) fn is_ascii_whitespace(c: u8) -> bool {
    match c {
        0x09..=0x0d | b' ' => true,
        _ => false,
    }
}
/// Returns true if `c` is ASCII whitespace other than a line ending:
/// tab, vertical tab (`0x0b`), form feed (`0x0c`) or space.
pub(crate) fn is_ascii_whitespace_no_nl(c: u8) -> bool {
    c == b'\t' || c == 0x0b || c == 0x0c || c == b' '
}
331 fn is_ascii_alpha(c
: u8) -> bool
{
333 b'a'
..=b'z'
| b'A'
..=b'Z'
=> true,
338 fn is_ascii_alphanumeric(c
: u8) -> bool
{
340 b'
0'
..=b'
9'
| b'a'
..=b'z'
| b'A'
..=b'Z'
=> true,
345 fn is_ascii_letterdigitdash(c
: u8) -> bool
{
346 c
== b'
-'
|| is_ascii_alphanumeric(c
)
/// Returns true if `c` is an ASCII decimal digit (`b'0'..=b'9'`).
fn is_digit(c: u8) -> bool {
    c.is_ascii_digit()
}
353 fn is_valid_unquoted_attr_value_char(c
: u8) -> bool
{
355 b'
\''
| b'
"' | b' ' | b'=' | b'>' | b'<' | b'`' | b'\n' | b'\r' => false,
360 // scan a single character
361 pub(crate) fn scan_ch(data: &[u8], c: u8) -> usize {
362 if !data.is_empty() && data[0] == c {
/// Counts how many leading bytes of `data` satisfy the predicate `f`.
///
/// Scanning stops at the first byte for which `f` returns false, so the
/// result is the length of the longest matching prefix (0 for empty input).
pub(crate) fn scan_while<F>(data: &[u8], mut f: F) -> usize
where
    F: FnMut(u8) -> bool,
{
    data.iter().take_while(|&&c| f(c)).count()
}
/// Counts how many trailing bytes of `data` satisfy the predicate `f`.
///
/// Bytes are visited from the end towards the start; scanning stops at the
/// first byte for which `f` returns false (0 for empty input).
pub(crate) fn scan_rev_while<F>(data: &[u8], mut f: F) -> usize
where
    F: FnMut(u8) -> bool,
{
    data.iter().rev().take_while(|&&c| f(c)).count()
}
383 pub(crate) fn scan_ch_repeat(data: &[u8], c: u8) -> usize {
384 scan_while(data, |x| x == c)
387 // Note: this scans ASCII whitespace only, for Unicode whitespace use
388 // a different function.
389 pub(crate) fn scan_whitespace_no_nl(data: &[u8]) -> usize {
390 scan_while(data, is_ascii_whitespace_no_nl)
393 fn scan_attr_value_chars(data: &[u8]) -> usize {
394 scan_while(data, is_valid_unquoted_attr_value_char)
397 pub(crate) fn scan_eol(bytes: &[u8]) -> Option<usize> {
398 if bytes.is_empty() {
403 b'\r' => Some(if bytes.get(1) == Some(&b'\n') { 2 } else { 1 }),
408 pub(crate) fn scan_blank_line(bytes: &[u8]) -> Option<usize> {
409 let i = scan_whitespace_no_nl(bytes);
410 scan_eol(&bytes[i..]).map(|n| i + n)
413 pub(crate) fn scan_nextline(bytes: &[u8]) -> usize {
414 memchr(b'\n', bytes).map_or(bytes.len(), |x| x + 1)
417 // return: end byte for closing code fence, or None
418 // if the line is not a closing code fence
419 pub(crate) fn scan_closing_code_fence(
424 if bytes.is_empty() {
428 let num_fence_chars_found = scan_ch_repeat(&bytes[i..], fence_char);
429 if num_fence_chars_found < n_fence_char {
432 i += num_fence_chars_found;
433 let num_trailing_spaces = scan_ch_repeat(&bytes[i..], b' ');
434 i += num_trailing_spaces;
435 scan_eol(&bytes[i..]).map(|_| i)
438 // returned pair is (number of bytes, number of spaces)
439 fn calc_indent(text: &[u8], max: usize) -> (usize, usize) {
443 for (i, &b) in text.iter().enumerate() {
452 let new_spaces = spaces + 4 - (spaces & 3);
453 if new_spaces > max {
466 /// Scan hrule opening sequence.
468 /// Returns Ok(x) when it finds an hrule, where x is the
469 /// size of line containing the hrule, including the trailing newline.
471 /// Returns Err(x) when it does not find an hrule and x is
472 /// the offset in data before which no hrule can appear.
473 pub(crate) fn scan_hrule(bytes: &[u8]) -> Result<usize, usize> {
478 if !(c == b'*' || c == b'-' || c == b'_') {
484 while i < bytes.len() {
487 i += scan_eol(&bytes[i..]).unwrap_or(0);
505 /// Scan an ATX heading opening sequence.
507 /// Returns number of bytes in prefix and level.
508 pub(crate) fn scan_atx_heading(data: &[u8]) -> Option<usize> {
509 let level = scan_ch_repeat(data, b'#');
510 if level >= 1 && level <= 6 && data.get(level).cloned().map_or(true, is_ascii_whitespace) {
517 /// Scan a setext heading underline.
519 /// Returns number of bytes in line (including trailing newline) and level.
520 pub(crate) fn scan_setext_heading(data: &[u8]) -> Option<(usize, u32)> {
521 let c = *data.get(0)?;
522 if !(c == b'-' || c == b'=') {
525 let mut i = 1 + scan_ch_repeat(&data[1..], c);
526 i += scan_blank_line(&data[i..])?;
527 let level = if c == b'=' { 1 } else { 2 };
531 // returns number of bytes in line (including trailing
532 // newline) and column alignments
533 pub(crate) fn scan_table_head(data: &[u8]) -> (usize, Vec<Alignment>) {
534 let (mut i, spaces) = calc_indent(data, 4);
535 if spaces > 3 || i == data.len() {
538 let mut cols = vec![];
539 let mut active_col = Alignment::None;
540 let mut start_col = true;
544 for c in &data[i..] {
545 if let Some(n) = scan_eol(&data[i..]) {
552 active_col = match (start_col, active_col) {
553 (true, Alignment::None) => Alignment::Left,
554 (false, Alignment::Left) => Alignment::Center,
555 (false, Alignment::None) => Alignment::Right,
565 cols.push(active_col);
566 active_col = Alignment::None;
578 cols.push(active_col);
586 /// Returns number of bytes scanned and the char that is repeated to make the code fence.
587 pub(crate) fn scan_code_fence(data: &[u8]) -> Option<(usize, u8)> {
588 let c = *data.get(0)?;
589 if !(c == b'`' || c == b'~') {
592 let i = 1 + scan_ch_repeat(&data[1..], c);
595 let suffix = &data[i..];
596 let next_line = i + scan_nextline(suffix);
597 // FIXME: make sure this is correct
598 if suffix[..(next_line - i)].iter().any(|&b| b == b'`') {
608 pub(crate) fn scan_blockquote_start(data: &[u8]) -> Option<usize> {
609 if data.starts_with(b"> ") {
616 /// This already assumes the list item has been scanned.
617 pub(crate) fn scan_empty_list(data: &[u8]) -> bool {
620 if let Some(bytes) = scan_blank_line(&data[ix..]) {
629 // return number of bytes scanned, delimiter, start index, and indent
630 pub(crate) fn scan_listitem(bytes: &[u8]) -> Option<(usize, u8, usize, usize)> {
631 let mut c = *bytes.get(0)?;
632 let (w, start) = match c {
633 b'-' | b'+' | b'*' => (1, 0),
635 let (length, start) = parse_decimal(bytes);
636 c = *bytes.get(length)?;
637 if !(c == b'.' || c == b')') {
646 // TODO: replace calc_indent with scan_leading_whitespace, for tab correctness
647 let (mut postn, mut postindent) = calc_indent(&bytes[w..], 5);
649 scan_eol(&bytes[w..])?;
651 } else if postindent > 4 {
655 if scan_blank_line(&bytes[w..]).is_some() {
659 Some((w + postn, c, start, w + postindent))
662 // returns (number of bytes, parsed decimal)
663 fn parse_decimal(bytes: &[u8]) -> (usize, usize) {
666 .take_while(|&&b| is_digit(b))
667 .try_fold((0, 0usize), |(count, acc), c| {
668 let digit = usize::from(c - b'0');
671 .and_then(|ten_acc| ten_acc.checked_add(digit))
673 Some(number) => Ok((count + 1, number)),
674 // stop early on overflow
675 None => Err((count, acc)),
682 // returns (number of bytes, parsed hex)
683 fn parse_hex(bytes: &[u8]) -> (usize, usize) {
684 match bytes.iter().try_fold((0, 0usize), |(count, acc), c| {
686 let digit = if c >= b'0' && c <= b'9' {
687 usize::from(c - b'0')
691 if c >= b'a' && c <= b'f' {
692 usize::from(c - b'a' + 10)
694 return Err((count, acc));
699 .and_then(|sixteen_acc| sixteen_acc.checked_add(digit))
701 Some(number) => Ok((count + 1, number)),
702 // stop early on overflow
703 None => Err((count, acc)),
710 fn char_from_codepoint(input: usize) -> Option<char> {
711 let mut codepoint = input.try_into().ok()?;
715 char::from_u32(codepoint)
718 // doesn't bother to check data[0] == '&'
719 pub(crate) fn scan_entity(bytes: &[u8]) -> (usize, Option<CowStr<'static>>) {
721 if scan_ch(&bytes[end..], b'#') == 1 {
723 let (bytecount, codepoint) = if end < bytes.len() && bytes[end] | 0x20 == b'x' {
725 parse_hex(&bytes[end..])
727 parse_decimal(&bytes[end..])
730 return if bytecount == 0 || scan_ch(&bytes[end..], b';') == 0 {
732 } else if let Some(c) = char_from_codepoint(codepoint) {
733 (end + 1, Some(c.into()))
738 end += scan_while(&bytes[end..], is_ascii_alphanumeric);
739 if scan_ch(&bytes[end..], b';') == 1 {
740 if let Some(value) = entities::get_entity(&bytes[1..end]) {
741 return (end + 1, Some(value.into()));
747 // FIXME: we can most likely re-use other scanners
748 // returns (bytelength, title_str)
749 pub(crate) fn scan_refdef_title(text: &str) -> Option<(usize, &str)> {
750 let mut chars = text.chars().peekable();
751 let closing_delim = match chars.next()? {
757 let mut bytecount = 1;
759 while let Some(c) = chars.next() {
763 let mut next = *chars.peek()?;
764 while is_ascii_whitespace_no_nl(next as u8) {
765 bytecount += chars.next()?.len_utf8();
766 next = *chars.peek()?;
768 if *chars.peek()? == '\n' {
769 // blank line - not allowed
774 let next_char = chars.next()?;
775 bytecount += 1 + next_char.len_utf8();
777 c if c == closing_delim => {
778 return Some((bytecount + 1, &text[1..bytecount]));
781 bytecount += c.len_utf8();
788 // note: dest returned is raw, still needs to be unescaped
789 // TODO: check that nested parens are really not allowed for refdefs
790 // TODO(performance): this func should probably do its own unescaping
791 pub(crate) fn scan_link_dest(
795 ) -> Option<(usize, &str)> {
796 let bytes = &data.as_bytes()[start_ix..];
797 let mut i = scan_ch(bytes, b'<');
801 while i < bytes.len() {
803 b'\n' | b'\r' | b'<' => return None,
804 b'>' => return Some((i + 1, &data[(start_ix + 1)..(start_ix + i)])),
805 b'\\' if i + 1 < bytes.len() && is_ascii_punctuation(bytes[i + 1]) => {
816 while i < bytes.len() {
833 b'\\' if i + 1 < bytes.len() && is_ascii_punctuation(bytes[i + 1]) => {
840 Some((i, &data[start_ix..(start_ix + i)]))
844 /// Returns bytes scanned
845 fn scan_attribute_name(data: &[u8]) -> Option<usize> {
846 let (&c, tail) = data.split_first()?;
847 if is_ascii_alpha(c) || c == b'_' || c == b':' {
849 1 + scan_while(tail, |c| {
850 is_ascii_alphanumeric(c) || c == b'_' || c == b'.' || c == b':' || c == b'-'
858 /// Returns bytes scanned (TODO: should it return new offset?)
859 // TODO: properly use the newline handler here
860 fn scan_attribute(data: &[u8], newline_handler: Option<&dyn Fn(&[u8]) -> usize>) -> Option<usize> {
861 let allow_newline = newline_handler.is_some();
862 let whitespace_scanner =
863 |c| is_ascii_whitespace(c) && (allow_newline || c != b'\n' && c != b'\r');
864 let mut ix = scan_attribute_name(data)?;
865 let n_whitespace = scan_while(&data[ix..], whitespace_scanner);
867 if scan_ch(&data[ix..], b'=') == 1 {
869 ix += scan_while(&data[ix..], whitespace_scanner);
870 ix += scan_attribute_value(&data[ix..], newline_handler)?;
871 } else if n_whitespace > 0 {
872 // Leave whitespace for next attribute.
878 fn scan_attribute_value(
880 newline_handler: Option<&dyn Fn(&[u8]) -> usize>,
883 match *data.get(0)? {
884 b @ b'"'
| b @ b'
\''
=> {
886 while i
< data
.len() {
890 if let Some(eol_bytes
) = scan_eol(&data
[i
..]) {
891 let handler
= newline_handler?
;
893 i
+= handler(&data
[i
..]);
900 b' '
| b'
='
| b'
>'
| b'
<'
| b'`'
| b'
\n'
| b'
\r'
=> {
904 // unquoted attribute value
905 i
+= scan_attr_value_chars(&data
[i
..]);
911 // Remove backslash escapes and resolve entities
912 pub(crate) fn unescape(input
: &str) -> CowStr
<'_
> {
913 let mut result
= String
::new();
916 let bytes
= input
.as_bytes();
917 while i
< bytes
.len() {
919 b'
\\'
if i
+ 1 < bytes
.len() && is_ascii_punctuation(bytes
[i
+ 1]) => {
920 result
.push_str(&input
[mark
..i
]);
924 b'
&'
=> match scan_entity(&bytes
[i
..]) {
925 (n
, Some(value
)) => {
926 result
.push_str(&input
[mark
..i
]);
927 result
.push_str(&value
);
934 result
.push_str(&input
[mark
..i
]);
944 result
.push_str(&input
[mark
..]);
949 /// Assumes `data` is preceded by `<`.
950 pub(crate) fn scan_html_block_tag(data
: &[u8]) -> (usize, &[u8]) {
951 let i
= scan_ch(data
, b'
/'
);
952 let n
= scan_while(&data
[i
..], is_ascii_alphanumeric
);
953 // TODO: scan attributes and >
954 (i
+ n
, &data
[i
..i
+ n
])
957 pub(crate) fn is_html_tag(tag
: &[u8]) -> bool
{
959 .binary_search_by(|probe
| {
960 let probe_bytes_iter
= probe
.as_bytes().iter();
961 let tag_bytes_iter
= tag
.iter();
965 .find_map(|(&a
, &b
)| {
966 // We can compare case insensitively because the probes are
967 // all lower case alpha strings.
968 match a
.cmp(&(b
| 0x20)) {
969 std
::cmp
::Ordering
::Equal
=> None
,
970 inequality
=> Some(inequality
),
973 .unwrap_or_else(|| probe
.len().cmp(&tag
.len()))
978 /// Assumes that `data` is preceded by `<`.
979 pub(crate) fn scan_html_type_7(data
: &[u8]) -> Option
<usize> {
980 // Block type html does not allow for newlines, so we
981 // do not pass a newline handler.
982 let i
= scan_html_block_inner(data
, None
)?
;
983 scan_blank_line(&data
[i
..])?
;
987 // FIXME: instead of a newline handler, maybe this should receive
988 // a whitespace handler instead.
989 // With signature `&dyn Fn(&[u8]) -> Option<usize>`.
990 // We currently need to implement whitespace handling in all of
991 // this function's dependencies as well.
992 pub(crate) fn scan_html_block_inner(
994 newline_handler
: Option
<&dyn Fn(&[u8]) -> usize>,
996 let close_tag_bytes
= scan_ch(data
, b'
/'
);
997 let l
= scan_while(&data
[close_tag_bytes
..], is_ascii_alpha
);
1001 let mut i
= close_tag_bytes
+ l
;
1002 i
+= scan_while(&data
[i
..], is_ascii_letterdigitdash
);
1004 if close_tag_bytes
== 0 {
1008 i
+= scan_whitespace_no_nl(&data
[i
..]);
1009 if let Some(eol_bytes
) = scan_eol(&data
[i
..]) {
1013 if let Some(handler
) = newline_handler
{
1015 i
+= handler(&data
[i
..]);
1023 if let Some(b'
/'
) | Some(b'
>'
) = data
.get(i
) {
1027 // No whitespace, which is mandatory.
1030 i
+= scan_attribute(&data
[i
..], newline_handler
)?
;
1034 i
+= scan_whitespace_no_nl(&data
[i
..]);
1036 if close_tag_bytes
== 0 {
1037 i
+= scan_ch(&data
[i
..], b'
/'
);
1040 if scan_ch(&data
[i
..], b'
>'
) == 0 {
1047 /// Returns (next_byte_offset, uri, type)
1048 pub(crate) fn scan_autolink(text
: &str, start_ix
: usize) -> Option
<(usize, CowStr
<'_
>, LinkType
)> {
1049 scan_uri(text
, start_ix
)
1050 .map(|(bytes
, uri
)| (bytes
, uri
, LinkType
::Autolink
))
1051 .or_else(|| scan_email(text
, start_ix
).map(|(bytes
, uri
)| (bytes
, uri
, LinkType
::Email
)))
1054 /// Returns (next_byte_offset, uri)
1055 fn scan_uri(text
: &str, start_ix
: usize) -> Option
<(usize, CowStr
<'_
>)> {
1056 let bytes
= &text
.as_bytes()[start_ix
..];
1058 // scheme's first byte must be an ascii letter
1059 if bytes
.is_empty() || !is_ascii_alpha(bytes
[0]) {
1065 while i
< bytes
.len() {
1069 c
if is_ascii_alphanumeric(c
) => (),
1070 b'
.'
| b'
-'
| b'
+'
=> (),
1076 // scheme length must be between 2 and 32 characters long. scheme
1077 // must be followed by colon
1078 if i
< 3 || i
> 33 {
1082 while i
< bytes
.len() {
1084 b'
>'
=> return Some((start_ix
+ i
+ 1, text
[start_ix
..(start_ix
+ i
)].into())),
1085 b'
\0'
..=b' '
| b'
<'
=> return None
,
1094 /// Returns (next_byte_offset, email)
1095 fn scan_email(text
: &str, start_ix
: usize) -> Option
<(usize, CowStr
<'_
>)> {
1096 // using a regex library would be convenient, but doing it by hand is not too bad
1097 let bytes
= &text
.as_bytes()[start_ix
..];
1100 while i
< bytes
.len() {
1104 c
if is_ascii_alphanumeric(c
) => (),
1105 b'
.'
| b'
!'
| b'
#' | b'$' | b'%' | b'&' | b'\'' | b'*' | b'+' | b'/' | b'=' | b'?'
1106 | b'
^' | b'_'
| b'`'
| b'{' | b'|' | b'}'
| b'
~'
| b'
-'
=> (),
1113 let label_start_ix
= i
;
1114 let mut fresh_label
= true;
1116 while i
< bytes
.len() {
1118 c
if is_ascii_alphanumeric(c
) => (),
1119 b'
-'
if fresh_label
=> {
1125 fresh_label
= false;
1129 if i
== label_start_ix
|| i
- label_start_ix
> 63 || bytes
[i
- 1] == b'
-'
{
1133 if scan_ch(&bytes
[i
..], b'
.'
) == 0 {
1139 if scan_ch(&bytes
[i
..], b'
>'
) == 0 {
1143 Some((start_ix
+ i
+ 1, text
[start_ix
..(start_ix
+ i
)].into()))
1146 /// Scan comment, declaration, or CDATA section, with initial "<!" already consumed.
1147 /// Returns byte offset on match.
1148 pub(crate) fn scan_inline_html_comment(
1151 scan_guard
: &mut HtmlScanGuard
,
1152 ) -> Option
<usize> {
1153 let c
= *bytes
.get(ix
)?
;
1157 let dashes
= scan_ch_repeat(&bytes
[ix
..], b'
-'
);
1161 // Saw "<!--", scan comment.
1163 if scan_ch(&bytes
[ix
..], b'
>'
) == 1 {
1167 while let Some(x
) = memchr(b'
-'
, &bytes
[ix
..]) {
1169 if scan_ch(&bytes
[ix
..], b'
-'
) == 1 {
1171 return if scan_ch(&bytes
[ix
..], b'
>'
) == 1 {
1180 b'
['
if bytes
[ix
..].starts_with(b
"CDATA[") && ix
> scan_guard
.cdata
=> {
1181 ix
+= b
"CDATA[".len();
1182 ix
= memchr(b'
]'
, &bytes
[ix
..]).map_or(bytes
.len(), |x
| ix
+ x
);
1183 let close_brackets
= scan_ch_repeat(&bytes
[ix
..], b'
]'
);
1184 ix
+= close_brackets
;
1186 if close_brackets
== 0 || scan_ch(&bytes
[ix
..], b'
>'
) == 0 {
1187 scan_guard
.cdata
= ix
;
1193 b'A'
..=b'Z'
if ix
> scan_guard
.declaration
=> {
1194 // Scan declaration.
1195 ix
+= scan_while(&bytes
[ix
..], |c
| c
>= b'A'
&& c
<= b'Z'
);
1196 let whitespace
= scan_while(&bytes
[ix
..], is_ascii_whitespace
);
1197 if whitespace
== 0 {
1201 ix
= memchr(b'
>'
, &bytes
[ix
..]).map_or(bytes
.len(), |x
| ix
+ x
);
1202 if scan_ch(&bytes
[ix
..], b'
>'
) == 0 {
1203 scan_guard
.declaration
= ix
;
1213 /// Scan processing directive, with initial "<?" already consumed.
1214 /// Returns the next byte offset on success.
1215 pub(crate) fn scan_inline_html_processing(
1218 scan_guard
: &mut HtmlScanGuard
,
1219 ) -> Option
<usize> {
1220 if ix
<= scan_guard
.processing
{
1223 while let Some(offset
) = memchr(b'?'
, &bytes
[ix
..]) {
1225 if scan_ch(&bytes
[ix
..], b'
>'
) == 1 {
1226 return Some(ix
+ 1);
1229 scan_guard
.processing
= ix
;
1237 fn overflow_list() {
1239 scan_listitem(b
"4444444444444444444444444444444444444444444444444444444444!").is_none()
1244 fn overflow_by_addition() {
1245 assert
!(scan_listitem(b
"1844674407370955161615!").is_none());