1 // Copyright 2017 Google Inc. All rights reserved.
3 // Permission is hereby granted, free of charge, to any person obtaining a copy
4 // of this software and associated documentation files (the "Software"), to deal
5 // in the Software without restriction, including without limitation the rights
6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 // copies of the Software, and to permit persons to whom the Software is
8 // furnished to do so, subject to the following conditions:
10 // The above copyright notice and this permission notice shall be included in
11 // all copies or substantial portions of the Software.
13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 //! Tree-based two pass parser.
23 use std
::cmp
::{max, min}
;
24 use std
::collections
::{HashMap, VecDeque}
;
25 use std
::ops
::{Index, Range}
;
29 use crate::linklabel
::{scan_link_label_rest, LinkLabel, ReferenceLabel}
;
30 use crate::scanners
::*;
31 use crate::strings
::CowStr
;
32 use crate::tree
::{Tree, TreeIndex}
;
34 // Allowing arbitrary depth nested parentheses inside link destinations
35 // can create denial of service vulnerabilities if we're not careful.
36 // The simplest countermeasure is to limit their depth, which is
37 // explicitly allowed by the spec as long as the limit is at least 3:
38 // https://spec.commonmark.org/0.29/#link-destination
39 const LINK_MAX_NESTED_PARENS
: usize = 5;
42 #[derive(Clone, Debug, PartialEq)]
43 pub enum CodeBlockKind
<'a
> {
45 /// The value contained in the tag describes the language of the code, which may be empty.
49 impl<'a
> CodeBlockKind
<'a
> {
50 pub fn is_indented(&self) -> bool
{
52 CodeBlockKind
::Indented
=> true,
57 pub fn is_fenced(&self) -> bool
{
59 CodeBlockKind
::Fenced(_
) => true,
65 /// Tags for elements that can contain other elements.
66 #[derive(Clone, Debug, PartialEq)]
68 /// A paragraph of text and other inline elements.
71 /// A heading. The field indicates the level of the heading.
76 CodeBlock(CodeBlockKind
<'a
>),
78 /// A list. If the list is ordered the field indicates the number of the first item.
79 /// Contains only list items.
80 List(Option
<u64>), // TODO: add delim and tight for ast (not needed for html)
83 /// A footnote definition. The value contained is the footnote's label by which it can
85 FootnoteDefinition(CowStr
<'a
>),
87 /// A table. Contains a vector describing the text-alignment for each of its columns.
88 Table(Vec
<Alignment
>),
89 /// A table header. Contains only `TableRow`s. Note that the table body starts immediately
90 /// after the closure of the `TableHead` tag. There is no `TableBody` tag.
92 /// A table row. Is used both for header rows as body rows. Contains only `TableCell`s.
101 /// A link. The first field is the link type, the second the destination URL and the third is a title.
102 Link(LinkType
, CowStr
<'a
>, CowStr
<'a
>),
104 /// An image. The first field is the link type, the second the destination URL and the third is a title.
105 Image(LinkType
, CowStr
<'a
>, CowStr
<'a
>),
108 /// Type specifier for inline links. See [the Tag::Link](enum.Tag.html#variant.Link) for more information.
109 #[derive(Clone, Debug, PartialEq, Copy)]
111 /// Inline link like `[foo](bar)`
113 /// Reference link like `[foo][bar]`
115 /// Reference without destination in the document, but resolved by the broken_link_callback
117 /// Collapsed link like `[foo][]`
119 /// Collapsed link without destination in the document, but resolved by the broken_link_callback
121 /// Shortcut link like `[foo]`
123 /// Shortcut without destination in the document, but resolved by the broken_link_callback
125 /// Autolink like `<http://foo.bar/baz>`
127 /// Email address in autolink like `<john@example.org>`
132 fn to_unknown(self) -> Self {
134 LinkType
::Reference
=> LinkType
::ReferenceUnknown
,
135 LinkType
::Collapsed
=> LinkType
::CollapsedUnknown
,
136 LinkType
::Shortcut
=> LinkType
::ShortcutUnknown
,
142 /// Markdown events that are generated in a preorder traversal of the document
143 /// tree, with additional `End` events whenever all of an inner node's children
144 /// have been visited.
145 #[derive(Clone, Debug, PartialEq)]
147 /// Start of a tagged element. Events that are yielded after this event
148 /// and before its corresponding `End` event are inside this element.
149 /// Start and end events are guaranteed to be balanced.
151 /// End of a tagged element.
155 /// An inline code node.
159 /// A reference to a footnote with given label, which may or may not be defined
160 /// by an event with a `Tag::FootnoteDefinition` tag. Definitions and references to them may
161 /// occur in any order.
162 FootnoteReference(CowStr
<'a
>),
163 /// A soft line break.
165 /// A hard line break.
167 /// A horizontal ruler.
169 /// A task list marker, rendered as a checkbox in HTML. Contains a true when it is checked.
170 TaskListMarker(bool
),
173 /// Table column text alignment.
174 #[derive(Copy, Clone, Debug, PartialEq)]
176 /// Default text alignment.
184 /// Option struct containing flags for enabling extra features
185 /// that are not part of the CommonMark spec.
186 pub struct Options
: u32 {
187 const ENABLE_TABLES
= 1 << 1;
188 const ENABLE_FOOTNOTES
= 1 << 2;
189 const ENABLE_STRIKETHROUGH
= 1 << 3;
190 const ENABLE_TASKLISTS
= 1 << 4;
194 #[derive(Debug, Default, Clone, Copy)]
201 #[derive(Debug, PartialEq, Clone, Copy)]
208 // These are possible inline items, need to be resolved in second pass.
210 // repeats, can_open, can_close
211 MaybeEmphasis(usize, bool
, bool
),
212 MaybeCode(usize, bool
), // number of backticks, preceeded by backslash
218 // These are inline items after resolution.
225 FootnoteReference(CowIndex
),
226 TaskListMarker(bool
), // true for checked
229 Heading(u32), // heading level
230 FencedCodeBlock(CowIndex
),
234 List(bool
, u8, u64), // is_tight, list character, list start index
235 ListItem(usize), // indent level
236 SynthesizeText(CowIndex
),
237 FootnoteDefinition(CowIndex
),
240 Table(AlignmentIndex
),
245 // Dummy node at the top of the tree - should not be used otherwise!
250 fn is_inline(&self) -> bool
{
252 ItemBody
::MaybeEmphasis(..)
253 | ItemBody
::MaybeHtml
254 | ItemBody
::MaybeCode(..)
255 | ItemBody
::MaybeLinkOpen
256 | ItemBody
::MaybeLinkClose
257 | ItemBody
::MaybeImage
=> true,
263 impl<'a
> Default
for ItemBody
{
264 fn default() -> Self {
269 /// Scanning modes for `Parser`'s `parse_line` method.
270 #[derive(PartialEq, Eq, Copy, Clone)]
271 enum TableParseMode
{
272 /// Inside a paragraph, scanning for table headers.
276 /// Inside a paragraph, not scanning for table headers.
280 /// State for the first parsing pass.
282 /// The first pass resolves all block structure, generating an AST. Within a block, items
283 /// are in a linear chain with potential inline markup identified.
284 struct FirstPass
<'a
> {
287 begin_list_item
: bool
,
288 last_line_blank
: bool
,
289 allocs
: Allocations
<'a
>,
294 impl<'a
> FirstPass
<'a
> {
295 fn new(text
: &'a
str, options
: Options
) -> FirstPass
{
296 // This is a very naive heuristic for the number of nodes
298 let start_capacity
= max(128, text
.len() / 32);
299 let tree
= Tree
::with_capacity(start_capacity
);
300 let begin_list_item
= false;
301 let last_line_blank
= false;
302 let allocs
= Allocations
::new();
314 fn run(mut self) -> (Tree
<Item
>, Allocations
<'a
>) {
316 while ix
< self.text
.len() {
317 ix
= self.parse_block(ix
);
319 for _
in 0..self.tree
.spine_len() {
322 (self.tree
, self.allocs
)
325 /// Returns offset after block.
326 fn parse_block(&mut self, mut start_ix
: usize) -> usize {
327 let bytes
= self.text
.as_bytes();
328 let mut line_start
= LineStart
::new(&bytes
[start_ix
..]);
330 let i
= scan_containers(&self.tree
, &mut line_start
);
331 for _
in i
..self.tree
.spine_len() {
335 if self.options
.contains(Options
::ENABLE_FOOTNOTES
) {
336 // finish footnote if it's still open and was preceeded by blank line
337 if let Some(node_ix
) = self.tree
.peek_up() {
338 if let ItemBody
::FootnoteDefinition(..) = self.tree
[node_ix
].item
.body
{
339 if self.last_line_blank
{
345 // Footnote definitions of the form
348 let container_start
= start_ix
+ line_start
.bytes_scanned();
349 if let Some(bytecount
) = self.parse_footnote(container_start
) {
350 start_ix
= container_start
+ bytecount
;
351 start_ix
+= scan_blank_line(&bytes
[start_ix
..]).unwrap_or(0);
352 line_start
= LineStart
::new(&bytes
[start_ix
..]);
356 // Process new containers
358 let container_start
= start_ix
+ line_start
.bytes_scanned();
359 if let Some((ch
, index
, indent
)) = line_start
.scan_list_marker() {
360 let after_marker_index
= start_ix
+ line_start
.bytes_scanned();
361 self.continue_list(container_start
, ch
, index
);
362 self.tree
.append(Item
{
363 start
: container_start
,
364 end
: after_marker_index
, // will get updated later if item not empty
365 body
: ItemBody
::ListItem(indent
),
368 if let Some(n
) = scan_blank_line(&bytes
[after_marker_index
..]) {
369 self.begin_list_item
= true;
370 return after_marker_index
+ n
;
372 if self.options
.contains(Options
::ENABLE_TASKLISTS
) {
373 if let Some(is_checked
) = line_start
.scan_task_list_marker() {
374 self.tree
.append(Item
{
375 start
: after_marker_index
,
376 end
: start_ix
+ line_start
.bytes_scanned(),
377 body
: ItemBody
::TaskListMarker(is_checked
),
381 } else if line_start
.scan_blockquote_marker() {
382 self.finish_list(start_ix
);
383 self.tree
.append(Item
{
384 start
: container_start
,
385 end
: 0, // will get set later
386 body
: ItemBody
::BlockQuote
,
394 let ix
= start_ix
+ line_start
.bytes_scanned();
396 if let Some(n
) = scan_blank_line(&bytes
[ix
..]) {
397 if let Some(node_ix
) = self.tree
.peek_up() {
398 match self.tree
[node_ix
].item
.body
{
399 ItemBody
::BlockQuote
=> (),
401 if self.begin_list_item
{
402 // A list item can begin with at most one blank line.
405 self.last_line_blank
= true;
412 self.begin_list_item
= false;
413 self.finish_list(start_ix
);
415 // Save `remaining_space` here to avoid needing to backtrack `line_start` for HTML blocks
416 let remaining_space
= line_start
.remaining_space();
418 let indent
= line_start
.scan_space_upto(4);
420 let ix
= start_ix
+ line_start
.bytes_scanned();
421 let remaining_space
= line_start
.remaining_space();
422 return self.parse_indented_code_block(ix
, remaining_space
);
425 let ix
= start_ix
+ line_start
.bytes_scanned();
428 if bytes
[ix
] == b'
<'
{
429 // Types 1-5 are all detected by one function and all end with the same
431 if let Some(html_end_tag
) = get_html_end_tag(&bytes
[(ix
+ 1)..]) {
432 return self.parse_html_block_type_1_to_5(ix
, html_end_tag
, remaining_space
);
436 let possible_tag
= scan_html_block_tag(&bytes
[(ix
+ 1)..]).1;
437 if is_html_tag(possible_tag
) {
438 return self.parse_html_block_type_6_or_7(ix
, remaining_space
);
442 if let Some(_html_bytes
) = scan_html_type_7(&bytes
[(ix
+ 1)..]) {
443 return self.parse_html_block_type_6_or_7(ix
, remaining_space
);
447 if let Ok(n
) = scan_hrule(&bytes
[ix
..]) {
448 return self.parse_hrule(n
, ix
);
451 if let Some(atx_size
) = scan_atx_heading(&bytes
[ix
..]) {
452 return self.parse_atx_heading(ix
, atx_size
);
456 if let Some((bytecount
, label
, link_def
)) = self.parse_refdef_total(ix
) {
457 self.allocs
.refdefs
.entry(label
).or_insert(link_def
);
458 let ix
= ix
+ bytecount
;
459 // try to read trailing whitespace or it will register as a completely blank line
460 // TODO: shouldn't we do this for all block level items?
461 return ix
+ scan_blank_line(&bytes
[ix
..]).unwrap_or(0);
464 if let Some((n
, fence_ch
)) = scan_code_fence(&bytes
[ix
..]) {
465 return self.parse_fenced_code_block(ix
, indent
, fence_ch
, n
);
467 self.parse_paragraph(ix
)
470 /// Returns the offset of the first line after the table.
471 /// Assumptions: current focus is a table element and the table header
472 /// matches the separator line (same number of columns).
473 fn parse_table(&mut self, table_cols
: usize, head_start
: usize, body_start
: usize) -> usize {
474 // parse header. this shouldn't fail because we made sure the table header is ok
475 let (_sep_start
, thead_ix
) = self.parse_table_row_inner(head_start
, table_cols
);
476 self.tree
[thead_ix
].item
.body
= ItemBody
::TableHead
;
479 let mut ix
= body_start
;
480 while let Some((next_ix
, _row_ix
)) = self.parse_table_row(ix
, table_cols
) {
488 /// Call this when containers are taken care of.
489 /// Returns bytes scanned, row_ix
490 fn parse_table_row_inner(&mut self, mut ix
: usize, row_cells
: usize) -> (usize, TreeIndex
) {
491 let bytes
= self.text
.as_bytes();
493 let mut final_cell_ix
= None
;
495 let row_ix
= self.tree
.append(Item
{
497 end
: 0, // set at end of this function
498 body
: ItemBody
::TableRow
,
503 ix
+= scan_ch(&bytes
[ix
..], b'
|'
);
504 ix
+= scan_whitespace_no_nl(&bytes
[ix
..]);
506 if let Some(eol_bytes
) = scan_eol(&bytes
[ix
..]) {
511 let cell_ix
= self.tree
.append(Item
{
514 body
: ItemBody
::TableCell
,
517 let (next_ix
, _brk
) = self.parse_line(ix
, TableParseMode
::Active
);
518 let trailing_whitespace
= scan_rev_while(&bytes
[..next_ix
], is_ascii_whitespace
);
520 if let Some(cur_ix
) = self.tree
.cur() {
521 self.tree
[cur_ix
].item
.end
-= trailing_whitespace
;
524 self.tree
[cell_ix
].item
.end
= next_ix
- trailing_whitespace
;
530 if cells
== row_cells
{
531 final_cell_ix
= Some(cell_ix
);
535 // fill empty cells if needed
536 // note: this is where GFM and commonmark-extra diverge. we follow
538 for _
in cells
..row_cells
{
539 self.tree
.append(Item
{
542 body
: ItemBody
::TableCell
,
547 if let Some(cell_ix
) = final_cell_ix
{
548 self.tree
[cell_ix
].next
= None
;
556 /// Returns first offset after the row and the tree index of the row.
557 fn parse_table_row(&mut self, mut ix
: usize, row_cells
: usize) -> Option
<(usize, TreeIndex
)> {
558 let bytes
= self.text
.as_bytes();
559 let mut line_start
= LineStart
::new(&bytes
[ix
..]);
560 let containers
= scan_containers(&self.tree
, &mut line_start
);
561 if containers
!= self.tree
.spine_len() {
564 line_start
.scan_all_space();
565 ix
+= line_start
.bytes_scanned();
566 if scan_paragraph_interrupt(&bytes
[ix
..]) {
570 let (ix
, row_ix
) = self.parse_table_row_inner(ix
, row_cells
);
574 /// Returns offset of line start after paragraph.
575 fn parse_paragraph(&mut self, start_ix
: usize) -> usize {
576 let node_ix
= self.tree
.append(Item
{
578 end
: 0, // will get set later
579 body
: ItemBody
::Paragraph
,
582 let bytes
= self.text
.as_bytes();
584 let mut ix
= start_ix
;
586 let scan_mode
= if self.options
.contains(Options
::ENABLE_TABLES
) && ix
== start_ix
{
589 TableParseMode
::Disabled
591 let (next_ix
, brk
) = self.parse_line(ix
, scan_mode
);
593 // break out when we find a table
595 body
: ItemBody
::Table(alignment_ix
),
599 let table_cols
= self.allocs
[alignment_ix
].len();
600 self.tree
[node_ix
].item
.body
= ItemBody
::Table(alignment_ix
);
601 // this clears out any stuff we may have appended - but there may
603 self.tree
[node_ix
].child
= None
;
606 return self.parse_table(table_cols
, ix
, next_ix
);
610 let mut line_start
= LineStart
::new(&bytes
[ix
..]);
611 let n_containers
= scan_containers(&self.tree
, &mut line_start
);
612 if !line_start
.scan_space(4) {
613 let ix_new
= ix
+ line_start
.bytes_scanned();
614 if n_containers
== self.tree
.spine_len() {
615 if let Some(ix_setext
) = self.parse_setext_heading(ix_new
, node_ix
) {
618 body
: ItemBody
::HardBreak
,
622 if bytes
[start
] == b'
\\'
{
623 self.tree
.append_text(start
, start
+ 1);
630 // first check for non-empty lists, then for other interrupts
631 let suffix
= &bytes
[ix_new
..];
632 if self.interrupt_paragraph_by_list(suffix
) || scan_paragraph_interrupt(suffix
) {
636 line_start
.scan_all_space();
637 if line_start
.is_at_eol() {
640 ix
= next_ix
+ line_start
.bytes_scanned();
641 if let Some(item
) = brk
{
642 self.tree
.append(item
);
650 /// Returns end ix of setext_heading on success.
651 fn parse_setext_heading(&mut self, ix
: usize, node_ix
: TreeIndex
) -> Option
<usize> {
652 let bytes
= self.text
.as_bytes();
653 let (n
, level
) = scan_setext_heading(&bytes
[ix
..])?
;
654 self.tree
[node_ix
].item
.body
= ItemBody
::Heading(level
);
656 // strip trailing whitespace
657 if let Some(cur_ix
) = self.tree
.cur() {
658 self.tree
[cur_ix
].item
.end
-= scan_rev_while(
659 &bytes
[..self.tree
[cur_ix
].item
.end
],
660 is_ascii_whitespace_no_nl
,
667 /// Parse a line of input, appending text and items to tree.
669 /// Returns: index after line and an item representing the break.
670 fn parse_line(&mut self, start
: usize, mode
: TableParseMode
) -> (usize, Option
<Item
>) {
671 let bytes
= &self.text
.as_bytes();
673 let mut last_pipe_ix
= start
;
674 let mut begin_text
= start
;
676 let (final_ix
, brk
) = iterate_special_bytes(bytes
, start
, |ix
, byte
| {
679 if let TableParseMode
::Active
= mode
{
680 return LoopInstruction
::BreakAtWith(ix
, None
);
684 let eol_bytes
= scan_eol(&bytes
[ix
..]).unwrap();
685 if mode
== TableParseMode
::Scan
&& pipes
> 0 {
686 // check if we may be parsing a table
687 let next_line_ix
= ix
+ eol_bytes
;
688 let mut line_start
= LineStart
::new(&bytes
[next_line_ix
..]);
689 if scan_containers(&self.tree
, &mut line_start
) == self.tree
.spine_len() {
690 let table_head_ix
= next_line_ix
+ line_start
.bytes_scanned();
691 let (table_head_bytes
, alignment
) =
692 scan_table_head(&bytes
[table_head_ix
..]);
694 if table_head_bytes
> 0 {
695 // computing header count from number of pipes
697 count_header_cols(bytes
, pipes
, start
, last_pipe_ix
);
699 // make sure they match the number of columns we find in separator line
700 if alignment
.len() == header_count
{
701 let alignment_ix
= self.allocs
.allocate_alignment(alignment
);
702 let end_ix
= table_head_ix
+ table_head_bytes
;
703 return LoopInstruction
::BreakAtWith(
707 end
: end_ix
, // must update later
708 body
: ItemBody
::Table(alignment_ix
),
716 let end_ix
= ix
+ eol_bytes
;
717 let trailing_backslashes
= scan_rev_while(&bytes
[..ix
], |b
| b
== b'
\\'
);
718 if trailing_backslashes
% 2 == 1 && end_ix
< self.text
.len() {
720 self.tree
.append_text(begin_text
, i
);
721 return LoopInstruction
::BreakAtWith(
726 body
: ItemBody
::HardBreak
,
730 let trailing_whitespace
=
731 scan_rev_while(&bytes
[..ix
], is_ascii_whitespace_no_nl
);
732 if trailing_whitespace
>= 2 {
733 i
-= trailing_whitespace
;
734 self.tree
.append_text(begin_text
, i
);
735 return LoopInstruction
::BreakAtWith(
740 body
: ItemBody
::HardBreak
,
745 self.tree
.append_text(begin_text
, ix
);
746 LoopInstruction
::BreakAtWith(
751 body
: ItemBody
::SoftBreak
,
756 if ix
+ 1 < self.text
.len() && is_ascii_punctuation(bytes
[ix
+ 1]) {
757 self.tree
.append_text(begin_text
, ix
);
758 if bytes
[ix
+ 1] == b'`'
{
759 let count
= 1 + scan_ch_repeat(&bytes
[(ix
+ 2)..], b'`'
);
760 self.tree
.append(Item
{
763 body
: ItemBody
::MaybeCode(count
, true),
765 begin_text
= ix
+ 1 + count
;
766 LoopInstruction
::ContinueAndSkip(count
)
769 LoopInstruction
::ContinueAndSkip(1)
772 LoopInstruction
::ContinueAndSkip(0)
775 c @ b'
*'
| c @ b'_'
| c @ b'
~'
=> {
776 let string_suffix
= &self.text
[ix
..];
777 let count
= 1 + scan_ch_repeat(&string_suffix
.as_bytes()[1..], c
);
778 let can_open
= delim_run_can_open(self.text
, string_suffix
, count
, ix
);
779 let can_close
= delim_run_can_close(self.text
, string_suffix
, count
, ix
);
780 let is_valid_seq
= c
!= b'
~'
781 || count
== 2 && self.options
.contains(Options
::ENABLE_STRIKETHROUGH
);
783 if (can_open
|| can_close
) && is_valid_seq
{
784 self.tree
.append_text(begin_text
, ix
);
786 self.tree
.append(Item
{
789 body
: ItemBody
::MaybeEmphasis(count
- i
, can_open
, can_close
),
792 begin_text
= ix
+ count
;
794 LoopInstruction
::ContinueAndSkip(count
- 1)
797 self.tree
.append_text(begin_text
, ix
);
798 let count
= 1 + scan_ch_repeat(&bytes
[(ix
+ 1)..], b'`'
);
799 self.tree
.append(Item
{
802 body
: ItemBody
::MaybeCode(count
, false),
804 begin_text
= ix
+ count
;
805 LoopInstruction
::ContinueAndSkip(count
- 1)
808 // Note: could detect some non-HTML cases and early escape here, but not
809 // clear that's a win.
810 self.tree
.append_text(begin_text
, ix
);
811 self.tree
.append(Item
{
814 body
: ItemBody
::MaybeHtml
,
817 LoopInstruction
::ContinueAndSkip(0)
820 if ix
+ 1 < self.text
.len() && bytes
[ix
+ 1] == b'
['
{
821 self.tree
.append_text(begin_text
, ix
);
822 self.tree
.append(Item
{
825 body
: ItemBody
::MaybeImage
,
828 LoopInstruction
::ContinueAndSkip(1)
830 LoopInstruction
::ContinueAndSkip(0)
834 self.tree
.append_text(begin_text
, ix
);
835 self.tree
.append(Item
{
838 body
: ItemBody
::MaybeLinkOpen
,
841 LoopInstruction
::ContinueAndSkip(0)
844 self.tree
.append_text(begin_text
, ix
);
845 self.tree
.append(Item
{
848 body
: ItemBody
::MaybeLinkClose
,
851 LoopInstruction
::ContinueAndSkip(0)
853 b'
&'
=> match scan_entity(&bytes
[ix
..]) {
854 (n
, Some(value
)) => {
855 self.tree
.append_text(begin_text
, ix
);
856 self.tree
.append(Item
{
859 body
: ItemBody
::SynthesizeText(self.allocs
.allocate_cow(value
)),
862 LoopInstruction
::ContinueAndSkip(n
- 1)
864 _
=> LoopInstruction
::ContinueAndSkip(0),
867 if let TableParseMode
::Active
= mode
{
868 LoopInstruction
::BreakAtWith(ix
, None
)
872 LoopInstruction
::ContinueAndSkip(0)
875 _
=> LoopInstruction
::ContinueAndSkip(0),
880 // need to close text at eof
881 self.tree
.append_text(begin_text
, final_ix
);
886 /// Check whether we should allow a paragraph interrupt by lists. Only non-empty
887 /// lists are allowed.
888 fn interrupt_paragraph_by_list(&self, suffix
: &[u8]) -> bool
{
889 scan_listitem(suffix
).map_or(false, |(ix
, delim
, index
, _
)| {
890 self.list_nesting
> 0 ||
891 // we don't allow interruption by either empty lists or
892 // numbered lists starting at an index other than 1
893 !scan_empty_list(&suffix
[ix
..]) && (delim
== b'
*'
|| delim
== b'
-'
|| index
== 1)
897 /// When start_ix is at the beginning of an HTML block of type 1 to 5,
898 /// this will find the end of the block, adding the block itself to the
899 /// tree and also keeping track of the lines of HTML within the block.
901 /// The html_end_tag is the tag that must be found on a line to end the block.
902 fn parse_html_block_type_1_to_5(
906 mut remaining_space
: usize,
908 let bytes
= self.text
.as_bytes();
909 let mut ix
= start_ix
;
911 let line_start_ix
= ix
;
912 ix
+= scan_nextline(&bytes
[ix
..]);
913 self.append_html_line(remaining_space
, line_start_ix
, ix
);
915 let mut line_start
= LineStart
::new(&bytes
[ix
..]);
916 let n_containers
= scan_containers(&self.tree
, &mut line_start
);
917 if n_containers
< self.tree
.spine_len() {
921 if (&self.text
[line_start_ix
..ix
]).contains(html_end_tag
) {
925 let next_line_ix
= ix
+ line_start
.bytes_scanned();
926 if next_line_ix
== self.text
.len() {
930 remaining_space
= line_start
.remaining_space();
935 /// When start_ix is at the beginning of an HTML block of type 6 or 7,
936 /// this will consume lines until there is a blank line and keep track of
937 /// the HTML within the block.
938 fn parse_html_block_type_6_or_7(
941 mut remaining_space
: usize,
943 let bytes
= self.text
.as_bytes();
944 let mut ix
= start_ix
;
946 let line_start_ix
= ix
;
947 ix
+= scan_nextline(&bytes
[ix
..]);
948 self.append_html_line(remaining_space
, line_start_ix
, ix
);
950 let mut line_start
= LineStart
::new(&bytes
[ix
..]);
951 let n_containers
= scan_containers(&self.tree
, &mut line_start
);
952 if n_containers
< self.tree
.spine_len() || line_start
.is_at_eol() {
956 let next_line_ix
= ix
+ line_start
.bytes_scanned();
957 if next_line_ix
== self.text
.len() || scan_blank_line(&bytes
[next_line_ix
..]).is_some()
962 remaining_space
= line_start
.remaining_space();
967 fn parse_indented_code_block(&mut self, start_ix
: usize, mut remaining_space
: usize) -> usize {
968 self.tree
.append(Item
{
970 end
: 0, // will get set later
971 body
: ItemBody
::IndentCodeBlock
,
974 let bytes
= self.text
.as_bytes();
975 let mut last_nonblank_child
= None
;
976 let mut last_nonblank_ix
= 0;
978 let mut last_line_blank
= false;
980 let mut ix
= start_ix
;
982 let line_start_ix
= ix
;
983 ix
+= scan_nextline(&bytes
[ix
..]);
984 self.append_code_text(remaining_space
, line_start_ix
, ix
);
985 // TODO(spec clarification): should we synthesize newline at EOF?
987 if !last_line_blank
{
988 last_nonblank_child
= self.tree
.cur();
989 last_nonblank_ix
= ix
;
993 let mut line_start
= LineStart
::new(&bytes
[ix
..]);
994 let n_containers
= scan_containers(&self.tree
, &mut line_start
);
995 if n_containers
< self.tree
.spine_len()
996 || !(line_start
.scan_space(4) || line_start
.is_at_eol())
1000 let next_line_ix
= ix
+ line_start
.bytes_scanned();
1001 if next_line_ix
== self.text
.len() {
1005 remaining_space
= line_start
.remaining_space();
1006 last_line_blank
= scan_blank_line(&bytes
[ix
..]).is_some();
1009 // Trim trailing blank lines.
1010 if let Some(child
) = last_nonblank_child
{
1011 self.tree
[child
].next
= None
;
1012 self.tree
[child
].item
.end
= last_nonblank_ix
;
1018 fn parse_fenced_code_block(
1023 n_fence_char
: usize,
1025 let bytes
= self.text
.as_bytes();
1026 let mut info_start
= start_ix
+ n_fence_char
;
1027 info_start
+= scan_whitespace_no_nl(&bytes
[info_start
..]);
1028 // TODO: info strings are typically very short. wouldnt it be faster
1029 // to just do a forward scan here?
1030 let mut ix
= info_start
+ scan_nextline(&bytes
[info_start
..]);
1031 let info_end
= ix
- scan_rev_while(&bytes
[info_start
..ix
], is_ascii_whitespace
);
1032 let info_string
= unescape(&self.text
[info_start
..info_end
]);
1033 self.tree
.append(Item
{
1035 end
: 0, // will get set later
1036 body
: ItemBody
::FencedCodeBlock(self.allocs
.allocate_cow(info_string
)),
1040 let mut line_start
= LineStart
::new(&bytes
[ix
..]);
1041 let n_containers
= scan_containers(&self.tree
, &mut line_start
);
1042 if n_containers
< self.tree
.spine_len() {
1045 line_start
.scan_space(indent
);
1046 let mut close_line_start
= line_start
.clone();
1047 if !close_line_start
.scan_space(4) {
1048 let close_ix
= ix
+ close_line_start
.bytes_scanned();
1049 if let Some(n
) = scan_closing_code_fence(&bytes
[close_ix
..], fence_ch
, n_fence_char
)
1055 let remaining_space
= line_start
.remaining_space();
1056 ix
+= line_start
.bytes_scanned();
1057 let next_ix
= ix
+ scan_nextline(&bytes
[ix
..]);
1058 self.append_code_text(remaining_space
, ix
, next_ix
);
1064 // try to read trailing whitespace or it will register as a completely blank line
1065 ix
+ scan_blank_line(&bytes
[ix
..]).unwrap_or(0)
1068 fn append_code_text(&mut self, remaining_space
: usize, start
: usize, end
: usize) {
1069 if remaining_space
> 0 {
1070 let cow_ix
= self.allocs
.allocate_cow(" "[..remaining_space
].into());
1071 self.tree
.append(Item
{
1074 body
: ItemBody
::SynthesizeText(cow_ix
),
1077 if self.text
.as_bytes()[end
- 2] == b'
\r'
{
1078 // Normalize CRLF to LF
1079 self.tree
.append_text(start
, end
- 2);
1080 self.tree
.append_text(end
- 1, end
);
1082 self.tree
.append_text(start
, end
);
1086 /// Appends a line of HTML to the tree.
1087 fn append_html_line(&mut self, remaining_space
: usize, start
: usize, end
: usize) {
1088 if remaining_space
> 0 {
1089 let cow_ix
= self.allocs
.allocate_cow(" "[..remaining_space
].into());
1090 self.tree
.append(Item
{
1093 // TODO: maybe this should synthesize to html rather than text?
1094 body
: ItemBody
::SynthesizeText(cow_ix
),
1097 if self.text
.as_bytes()[end
- 2] == b'
\r'
{
1098 // Normalize CRLF to LF
1099 self.tree
.append(Item
{
1102 body
: ItemBody
::Html
,
1104 self.tree
.append(Item
{
1107 body
: ItemBody
::Html
,
1110 self.tree
.append(Item
{
1113 body
: ItemBody
::Html
,
1118 /// Pop a container, setting its end.
1119 fn pop(&mut self, ix
: usize) {
1120 let cur_ix
= self.tree
.pop().unwrap();
1121 self.tree
[cur_ix
].item
.end
= ix
;
1122 if let ItemBody
::List(true, _
, _
) = self.tree
[cur_ix
].item
.body
{
1123 surgerize_tight_list(&mut self.tree
, cur_ix
);
1127 /// Close a list if it's open. Also set loose if last line was blank
1128 fn finish_list(&mut self, ix
: usize) {
1129 if let Some(node_ix
) = self.tree
.peek_up() {
1130 if let ItemBody
::List(_
, _
, _
) = self.tree
[node_ix
].item
.body
{
1132 self.list_nesting
-= 1;
1135 if self.last_line_blank
{
1136 if let Some(node_ix
) = self.tree
.peek_grandparent() {
1137 if let ItemBody
::List(ref mut is_tight
, _
, _
) = self.tree
[node_ix
].item
.body
{
1141 self.last_line_blank
= false;
1145 /// Continue an existing list or start a new one if there's not an open
1146 /// list that matches.
1147 fn continue_list(&mut self, start
: usize, ch
: u8, index
: u64) {
1148 if let Some(node_ix
) = self.tree
.peek_up() {
1149 if let ItemBody
::List(ref mut is_tight
, existing_ch
, _
) = self.tree
[node_ix
].item
.body
{
1150 if existing_ch
== ch
{
1151 if self.last_line_blank
{
1153 self.last_line_blank
= false;
1158 // TODO: this is not the best choice for end; maybe get end from last list item.
1159 self.finish_list(start
);
1161 self.tree
.append(Item
{
1163 end
: 0, // will get set later
1164 body
: ItemBody
::List(true, ch
, index
),
1166 self.list_nesting
+= 1;
1168 self.last_line_blank
= false;
1171 /// Parse a thematic break.
1173 /// Returns index of start of next line.
1174 fn parse_hrule(&mut self, hrule_size
: usize, ix
: usize) -> usize {
1175 self.tree
.append(Item
{
1177 end
: ix
+ hrule_size
,
1178 body
: ItemBody
::Rule
,
1183 /// Parse an ATX heading.
1185 /// Returns index of start of next line.
1186 fn parse_atx_heading(&mut self, mut ix
: usize, atx_size
: usize) -> usize {
1187 let heading_ix
= self.tree
.append(Item
{
1189 end
: 0, // set later
1190 body
: ItemBody
::Heading(atx_size
as u32),
1193 // next char is space or eol (guaranteed by scan_atx_heading)
1194 let bytes
= self.text
.as_bytes();
1195 if let Some(eol_bytes
) = scan_eol(&bytes
[ix
..]) {
1196 self.tree
[heading_ix
].item
.end
= ix
+ eol_bytes
;
1197 return ix
+ eol_bytes
;
1199 // skip leading spaces
1200 let skip_spaces
= scan_whitespace_no_nl(&bytes
[ix
..]);
1203 // now handle the header text
1204 let header_start
= ix
;
1205 let header_node_idx
= self.tree
.push(); // so that we can set the endpoint later
1206 ix
= self.parse_line(ix
, TableParseMode
::Disabled
).0;
1207 self.tree
[header_node_idx
].item
.end
= ix
;
1209 // remove trailing matter from header text
1210 if let Some(cur_ix
) = self.tree
.cur() {
1211 let header_text
= &bytes
[header_start
..ix
];
1212 let mut limit
= header_text
1214 .rposition(|&b
| !(b
== b'
\n'
|| b
== b'
\r'
|| b
== b' '
))
1215 .map_or(0, |i
| i
+ 1);
1216 let closer
= header_text
[..limit
]
1218 .rposition(|&b
| b
!= b'
#')
1219 .map_or(0, |i
| i
+ 1);
1223 let spaces
= scan_rev_while(&header_text
[..closer
], |b
| b
== b' '
);
1225 limit
= closer
- spaces
;
1228 self.tree
[cur_ix
].item
.end
= limit
+ header_start
;
1235 /// Returns the number of bytes scanned on success.
1236 fn parse_footnote(&mut self, start
: usize) -> Option
<usize> {
1237 let bytes
= &self.text
.as_bytes()[start
..];
1238 if !bytes
.starts_with(b
"[^") {
1241 let (mut i
, label
) = self.parse_refdef_label(start
+ 2)?
;
1243 if scan_ch(&bytes
[i
..], b'
:'
) == 0 {
1247 self.finish_list(start
);
1248 self.tree
.append(Item
{
1250 end
: 0, // will get set later
1251 // TODO: check whether the label here is strictly necessary
1252 body
: ItemBody
::FootnoteDefinition(self.allocs
.allocate_cow(label
)),
1258 /// Tries to parse a reference label, which can be interrupted by new blocks.
1259 /// On success, returns the number of bytes of the label and the label itself.
1260 fn parse_refdef_label(&self, start
: usize) -> Option
<(usize, CowStr
<'a
>)> {
1261 scan_link_label_rest(&self.text
[start
..], &|bytes
| {
1262 let mut line_start
= LineStart
::new(bytes
);
1263 let _
= scan_containers(&self.tree
, &mut line_start
);
1264 let bytes_scanned
= line_start
.bytes_scanned();
1266 let suffix
= &bytes
[bytes_scanned
..];
1267 if self.interrupt_paragraph_by_list(suffix
) || scan_paragraph_interrupt(suffix
) {
1275 /// Returns number of bytes scanned, label and definition on success.
1276 fn parse_refdef_total(&mut self, start
: usize) -> Option
<(usize, LinkLabel
<'a
>, LinkDef
<'a
>)> {
1277 let bytes
= &self.text
.as_bytes()[start
..];
1278 if scan_ch(bytes
, b'
['
) == 0 {
1281 let (mut i
, label
) = self.parse_refdef_label(start
+ 1)?
;
1283 if scan_ch(&bytes
[i
..], b'
:'
) == 0 {
1287 let (bytecount
, link_def
) = self.scan_refdef(start
+ i
)?
;
1288 Some((bytecount
+ i
, UniCase
::new(label
), link_def
))
1291 /// Returns number of bytes and number of newlines
1292 fn scan_refdef_space(&self, bytes
: &[u8], mut i
: usize) -> Option
<(usize, usize)> {
1293 let mut newlines
= 0;
1295 let whitespaces
= scan_whitespace_no_nl(&bytes
[i
..]);
1297 if let Some(eol_bytes
) = scan_eol(&bytes
[i
..]) {
1306 let mut line_start
= LineStart
::new(&bytes
[i
..]);
1307 if self.tree
.spine_len() != scan_containers(&self.tree
, &mut line_start
) {
1310 i
+= line_start
.bytes_scanned();
1315 /// Returns # of bytes and definition.
1316 /// Assumes the label of the reference including colon has already been scanned.
1317 fn scan_refdef(&self, start
: usize) -> Option
<(usize, LinkDef
<'a
>)> {
1318 let bytes
= self.text
.as_bytes();
1320 // whitespace between label and url (including up to one newline)
1321 let (mut i
, _newlines
) = self.scan_refdef_space(bytes
, start
)?
;
1324 let (dest_length
, dest
) = scan_link_dest(self.text
, i
, 1)?
;
1325 if dest_length
== 0 {
1328 let dest
= unescape(dest
);
1332 let mut backup
= (i
- start
, LinkDef { dest, title: None }
);
1334 // scan whitespace between dest and label
1335 let (mut i
, newlines
) =
1336 if let Some((new_i
, mut newlines
)) = self.scan_refdef_space(bytes
, i
) {
1337 if i
== self.text
.len() {
1340 if new_i
== i
&& newlines
== 0 {
1344 return Some(backup
);
1348 return Some(backup
);
1352 // if this fails but newline == 1, return also a refdef without title
1353 if let Some((title_length
, title
)) = scan_refdef_title(&self.text
[i
..]) {
1355 backup
.1.title
= Some(unescape(title
));
1356 } else if newlines
> 0 {
1357 return Some(backup
);
1363 if let Some(bytes
) = scan_blank_line(&bytes
[i
..]) {
1364 backup
.0 = i
+ bytes
- start
;
1366 } else if newlines
> 0 {
1374 /// Returns number of containers scanned.
1375 fn scan_containers(tree
: &Tree
<Item
>, line_start
: &mut LineStart
) -> usize {
1377 for &node_ix
in tree
.walk_spine() {
1378 match tree
[node_ix
].item
.body
{
1379 ItemBody
::BlockQuote
=> {
1380 let save
= line_start
.clone();
1381 if !line_start
.scan_blockquote_marker() {
1386 ItemBody
::ListItem(indent
) => {
1387 let save
= line_start
.clone();
1388 if !line_start
.scan_space(indent
) {
1389 if !line_start
.is_at_eol() {
1402 /// Computes the number of header columns in a table line by computing the number of dividing pipes
1403 /// that aren't followed or preceeded by whitespace.
1404 fn count_header_cols(
1408 last_pipe_ix
: usize,
1410 // was first pipe preceeded by whitespace? if so, subtract one
1411 start
+= scan_whitespace_no_nl(&bytes
[start
..]);
1412 if bytes
[start
] == b'
|'
{
1416 // was last pipe followed by whitespace? if so, sub one
1417 if scan_blank_line(&bytes
[(last_pipe_ix
+ 1)..]).is_some() {
1424 impl<'a
> Tree
<Item
> {
1425 fn append_text(&mut self, start
: usize, end
: usize) {
1427 if let Some(ix
) = self.cur() {
1428 if ItemBody
::Text
== self[ix
].item
.body
&& self[ix
].item
.end
== start
{
1429 self[ix
].item
.end
= end
;
1436 body
: ItemBody
::Text
,
1442 /// Determines whether the delimiter run starting at given index is
1443 /// left-flanking, as defined by the commonmark spec (and isn't intraword
1445 /// suffix is &s[ix..], which is passed in as an optimization, since taking
1446 /// a string subslice is O(n).
1447 fn delim_run_can_open(s
: &str, suffix
: &str, run_len
: usize, ix
: usize) -> bool
{
1448 let next_char
= if let Some(c
) = suffix
.chars().nth(run_len
) {
1453 if next_char
.is_whitespace() {
1459 let delim
= suffix
.chars().next().unwrap();
1460 if delim
== '
*'
&& !is_punctuation(next_char
) {
1464 let prev_char
= s
[..ix
].chars().last().unwrap();
1466 prev_char
.is_whitespace() || is_punctuation(prev_char
)
1469 /// Determines whether the delimiter run starting at given index is
1470 /// left-flanking, as defined by the commonmark spec (and isn't intraword
1472 fn delim_run_can_close(s
: &str, suffix
: &str, run_len
: usize, ix
: usize) -> bool
{
1476 let prev_char
= s
[..ix
].chars().last().unwrap();
1477 if prev_char
.is_whitespace() {
1480 let next_char
= if let Some(c
) = suffix
.chars().nth(run_len
) {
1485 let delim
= suffix
.chars().next().unwrap();
1486 if delim
== '
*'
&& !is_punctuation(prev_char
) {
1490 next_char
.is_whitespace() || is_punctuation(next_char
)
1493 /// Checks whether we should break a paragraph on the given input.
1494 /// Note: lists are dealt with in `interrupt_paragraph_by_list`, because determing
1495 /// whether to break on a list requires additional context.
1496 fn scan_paragraph_interrupt(bytes
: &[u8]) -> bool
{
1497 if scan_eol(bytes
).is_some()
1498 || scan_hrule(bytes
).is_ok()
1499 || scan_atx_heading(bytes
).is_some()
1500 || scan_code_fence(bytes
).is_some()
1501 || scan_blockquote_start(bytes
).is_some()
1505 bytes
.starts_with(b
"<")
1506 && (get_html_end_tag(&bytes
[1..]).is_some()
1507 || is_html_tag(scan_html_block_tag(&bytes
[1..]).1))
1510 /// Assumes `text_bytes` is preceded by `<`.
1511 fn get_html_end_tag(text_bytes
: &[u8]) -> Option
<&'
static str> {
1512 static BEGIN_TAGS
: &[&[u8]; 3] = &[b
"pre", b
"style", b
"script"];
1513 static ST_BEGIN_TAGS
: &[&[u8]; 3] = &[b
"!--", b
"?", b
"![CDATA["];
1515 for (beg_tag
, end_tag
) in BEGIN_TAGS
1517 .zip(["</pre>", "</style>", "</script>"].iter())
1519 let tag_len
= beg_tag
.len();
1521 if text_bytes
.len() < tag_len
{
1522 // begin tags are increasing in size
1526 if !text_bytes
[..tag_len
].eq_ignore_ascii_case(beg_tag
) {
1530 // Must either be the end of the line...
1531 if text_bytes
.len() == tag_len
{
1532 return Some(end_tag
);
1535 // ...or be followed by whitespace, newline, or '>'.
1536 let s
= text_bytes
[tag_len
];
1537 if is_ascii_whitespace(s
) || s
== b'
>'
{
1538 return Some(end_tag
);
1542 for (beg_tag
, end_tag
) in ST_BEGIN_TAGS
.iter().zip(["-->", "?>", "]]>"].iter()) {
1543 if text_bytes
.starts_with(beg_tag
) {
1544 return Some(end_tag
);
1548 if text_bytes
.len() > 1
1549 && text_bytes
[0] == b'
!'
1550 && text_bytes
[1] >= b'A'
1551 && text_bytes
[1] <= b'Z'
1559 #[derive(Copy, Clone, Debug)]
1561 start
: TreeIndex
, // offset of tree node
1563 c
: u8, // b'*' or b'_'
1564 both
: bool
, // can both open and close
1567 #[derive(Debug, Clone, Default)]
1568 struct InlineStack
{
1569 stack
: Vec
<InlineEl
>,
1570 // Lower bounds for matching indices in the stack. For example
1571 // a strikethrough delimiter will never match with any element
1572 // in the stack with index smaller than
1573 // `lower_bounds[InlineStack::TILDES]`.
1574 lower_bounds
: [usize; 7],
1578 /// These are indices into the lower bounds array.
1579 /// Not both refers to the property that the delimiter can not both
1580 /// be opener as a closer.
1581 const UNDERSCORE_NOT_BOTH
: usize = 0;
1582 const ASTERISK_NOT_BOTH
: usize = 1;
1583 const ASTERISK_BASE
: usize = 2;
1584 const TILDES
: usize = 5;
1585 const UNDERSCORE_BOTH
: usize = 6;
1587 fn pop_all(&mut self, tree
: &mut Tree
<Item
>) {
1588 for el
in self.stack
.drain(..) {
1589 for i
in 0..el
.count
{
1590 tree
[el
.start
+ i
].item
.body
= ItemBody
::Text
;
1593 self.lower_bounds
= [0; 7];
1596 fn get_lowerbound(&self, c
: u8, count
: usize, both
: bool
) -> usize {
1599 self.lower_bounds
[InlineStack
::UNDERSCORE_BOTH
]
1601 self.lower_bounds
[InlineStack
::UNDERSCORE_NOT_BOTH
]
1603 } else if c
== b'
*'
{
1604 let mod3_lower
= self.lower_bounds
[InlineStack
::ASTERISK_BASE
+ count
% 3];
1610 self.lower_bounds
[InlineStack
::ASTERISK_NOT_BOTH
],
1614 self.lower_bounds
[InlineStack
::TILDES
]
1618 fn set_lowerbound(&mut self, c
: u8, count
: usize, both
: bool
, new_bound
: usize) {
1621 self.lower_bounds
[InlineStack
::UNDERSCORE_BOTH
] = new_bound
;
1623 self.lower_bounds
[InlineStack
::UNDERSCORE_NOT_BOTH
] = new_bound
;
1625 } else if c
== b'
*'
{
1626 self.lower_bounds
[InlineStack
::ASTERISK_BASE
+ count
% 3] = new_bound
;
1628 self.lower_bounds
[InlineStack
::ASTERISK_NOT_BOTH
] = new_bound
;
1631 self.lower_bounds
[InlineStack
::TILDES
] = new_bound
;
1637 tree
: &mut Tree
<Item
>,
1641 ) -> Option
<InlineEl
> {
1642 let lowerbound
= min(self.stack
.len(), self.get_lowerbound(c
, count
, both
));
1643 let res
= self.stack
[lowerbound
..]
1648 el
.c
== c
&& (!both
&& !el
.both
|| (count
+ el
.count
) % 3 != 0 || count
% 3 == 0)
1651 if let Some((matching_ix
, matching_el
)) = res
{
1652 let matching_ix
= matching_ix
+ lowerbound
;
1653 for el
in &self.stack
[(matching_ix
+ 1)..] {
1654 for i
in 0..el
.count
{
1655 tree
[el
.start
+ i
].item
.body
= ItemBody
::Text
;
1658 self.stack
.truncate(matching_ix
);
1661 self.set_lowerbound(c
, count
, both
, self.stack
.len());
1666 fn push(&mut self, el
: InlineEl
) {
1671 #[derive(Debug, Clone)]
1673 // label, next node index, source ix of label end
1674 LinkLabel(CowStr
<'a
>, Option
<TreeIndex
>, usize),
1675 // contains next node index
1676 Collapsed(Option
<TreeIndex
>),
1680 /// Skips forward within a block to a node which spans (ends inclusive) the given
1681 /// index into the source.
1682 fn scan_nodes_to_ix(
1684 mut node
: Option
<TreeIndex
>,
1686 ) -> Option
<TreeIndex
> {
1687 while let Some(node_ix
) = node
{
1688 if tree
[node_ix
].item
.end
<= ix
{
1689 node
= tree
[node_ix
].next
;
1697 /// Scans an inline link label, which cannot be interrupted.
1698 /// Returns number of bytes (including brackets) and label on success.
1699 fn scan_link_label
<'text
, 'tree
>(
1700 tree
: &'tree Tree
<Item
>,
1702 ) -> Option
<(usize, ReferenceLabel
<'text
>)> {
1703 let bytes
= &text
.as_bytes();
1704 if bytes
.len() < 2 || bytes
[0] != b'
['
{
1707 let linebreak_handler
= |bytes
: &[u8]| {
1708 let mut line_start
= LineStart
::new(bytes
);
1709 let _
= scan_containers(tree
, &mut line_start
);
1710 Some(line_start
.bytes_scanned())
1712 let pair
= if b'
^' == bytes
[1] {
1713 let (byte_index
, cow
) = scan_link_label_rest(&text
[2..], &linebreak_handler
)?
;
1714 (byte_index
+ 2, ReferenceLabel
::Footnote(cow
))
1716 let (byte_index
, cow
) = scan_link_label_rest(&text
[1..], &linebreak_handler
)?
;
1717 (byte_index
+ 1, ReferenceLabel
::Link(cow
))
1722 fn scan_reference
<'a
, 'b
>(
1723 tree
: &'a Tree
<Item
>,
1725 cur
: Option
<TreeIndex
>,
1727 let cur_ix
= match cur
{
1728 None
=> return RefScan
::Failed
,
1729 Some(cur_ix
) => cur_ix
,
1731 let start
= tree
[cur_ix
].item
.start
;
1732 let tail
= &text
.as_bytes()[start
..];
1734 if tail
.starts_with(b
"[]") {
1735 let closing_node
= tree
[cur_ix
].next
.unwrap();
1736 RefScan
::Collapsed(tree
[closing_node
].next
)
1737 } else if let Some((ix
, ReferenceLabel
::Link(label
))) = scan_link_label(tree
, &text
[start
..]) {
1738 let next_node
= scan_nodes_to_ix(tree
, cur
, start
+ ix
);
1739 RefScan
::LinkLabel(label
, next_node
, start
+ ix
)
1745 #[derive(Clone, Default)]
1747 inner
: Vec
<LinkStackEl
>,
1752 fn push(&mut self, el
: LinkStackEl
) {
1753 self.inner
.push(el
);
1756 fn pop(&mut self) -> Option
<LinkStackEl
> {
1757 let el
= self.inner
.pop();
1758 self.disabled_ix
= std
::cmp
::min(self.disabled_ix
, self.inner
.len());
1762 fn clear(&mut self) {
1764 self.disabled_ix
= 0;
1767 fn disable_all_links(&mut self) {
1768 for el
in &mut self.inner
[self.disabled_ix
..] {
1769 if el
.ty
== LinkStackTy
::Link
{
1770 el
.ty
= LinkStackTy
::Disabled
;
1773 self.disabled_ix
= self.inner
.len();
1777 #[derive(Clone, Debug)]
1778 struct LinkStackEl
{
1783 #[derive(PartialEq, Clone, Debug)]
1791 struct LinkDef
<'a
> {
1793 title
: Option
<CowStr
<'a
>>,
1796 /// Tracks tree indices of code span delimiters of each length. It should prevent
1797 /// quadratic scanning behaviours by providing (amortized) constant time lookups.
1799 inner
: HashMap
<usize, VecDeque
<TreeIndex
>>,
1806 inner
: Default
::default(),
1811 fn insert(&mut self, count
: usize, ix
: TreeIndex
) {
1812 if self.seen_first
{
1815 .or_insert_with(Default
::default)
1818 // Skip the first insert, since that delimiter will always
1819 // be an opener and not a closer.
1820 self.seen_first
= true;
1824 fn is_populated(&self) -> bool
{
1825 !self.inner
.is_empty()
1828 fn find(&mut self, open_ix
: TreeIndex
, count
: usize) -> Option
<TreeIndex
> {
1829 while let Some(ix
) = self.inner
.get_mut(&count
)?
.pop_front() {
1837 fn clear(&mut self) {
1839 self.seen_first
= false;
1843 #[derive(Copy, Clone, PartialEq, Eq, Debug)]
1844 struct LinkIndex(usize);
1846 #[derive(Copy, Clone, PartialEq, Eq, Debug)]
1847 struct CowIndex(usize);
1849 #[derive(Copy, Clone, PartialEq, Eq, Debug)]
1850 struct AlignmentIndex(usize);
1853 struct Allocations
<'a
> {
1854 refdefs
: HashMap
<LinkLabel
<'a
>, LinkDef
<'a
>>,
1855 links
: Vec
<(LinkType
, CowStr
<'a
>, CowStr
<'a
>)>,
1856 cows
: Vec
<CowStr
<'a
>>,
1857 alignments
: Vec
<Vec
<Alignment
>>,
1860 impl<'a
> Allocations
<'a
> {
1863 refdefs
: HashMap
::new(),
1864 links
: Vec
::with_capacity(128),
1866 alignments
: Vec
::new(),
1870 fn allocate_cow(&mut self, cow
: CowStr
<'a
>) -> CowIndex
{
1871 let ix
= self.cows
.len();
1872 self.cows
.push(cow
);
1876 fn allocate_link(&mut self, ty
: LinkType
, url
: CowStr
<'a
>, title
: CowStr
<'a
>) -> LinkIndex
{
1877 let ix
= self.links
.len();
1878 self.links
.push((ty
, url
, title
));
1882 fn allocate_alignment(&mut self, alignment
: Vec
<Alignment
>) -> AlignmentIndex
{
1883 let ix
= self.alignments
.len();
1884 self.alignments
.push(alignment
);
1889 impl<'a
> Index
<CowIndex
> for Allocations
<'a
> {
1890 type Output
= CowStr
<'a
>;
1892 fn index(&self, ix
: CowIndex
) -> &Self::Output
{
1893 self.cows
.index(ix
.0)
1897 impl<'a
> Index
<LinkIndex
> for Allocations
<'a
> {
1898 type Output
= (LinkType
, CowStr
<'a
>, CowStr
<'a
>);
1900 fn index(&self, ix
: LinkIndex
) -> &Self::Output
{
1901 self.links
.index(ix
.0)
1905 impl<'a
> Index
<AlignmentIndex
> for Allocations
<'a
> {
1906 type Output
= Vec
<Alignment
>;
1908 fn index(&self, ix
: AlignmentIndex
) -> &Self::Output
{
1909 self.alignments
.index(ix
.0)
1913 /// A struct containing information on the reachability of certain inline HTML
1914 /// elements. In particular, for cdata elements (`<![CDATA[`), processing
1915 /// elements (`<?`) and declarations (`<!DECLARATION`). The respectives usizes
1916 /// represent the indices before which a scan will always fail and can hence
1918 #[derive(Clone, Default)]
1919 pub(crate) struct HtmlScanGuard
{
1921 pub processing
: usize,
1922 pub declaration
: usize,
1925 /// Markdown event iterator.
1927 pub struct Parser
<'a
> {
1930 allocs
: Allocations
<'a
>,
1931 broken_link_callback
: Option
<&'a
dyn Fn(&str, &str) -> Option
<(String
, String
)>>,
1932 html_scan_guard
: HtmlScanGuard
,
1934 // used by inline passes. store them here for reuse
1935 inline_stack
: InlineStack
,
1936 link_stack
: LinkStack
,
1939 impl<'a
> Parser
<'a
> {
1940 /// Creates a new event iterator for a markdown string without any options enabled.
1941 pub fn new(text
: &'a
str) -> Parser
<'a
> {
1942 Parser
::new_ext(text
, Options
::empty())
1945 /// Creates a new event iterator for a markdown string with given options.
1946 pub fn new_ext(text
: &'a
str, options
: Options
) -> Parser
<'a
> {
1947 Parser
::new_with_broken_link_callback(text
, options
, None
)
1950 /// In case the parser encounters any potential links that have a broken
1951 /// reference (e.g `[foo]` when there is no `[foo]: ` entry at the bottom)
1952 /// the provided callback will be called with the reference name,
1953 /// and the returned pair will be used as the link name and title if it is not
1955 pub fn new_with_broken_link_callback(
1958 broken_link_callback
: Option
<&'a
dyn Fn(&str, &str) -> Option
<(String
, String
)>>,
1960 let first_pass
= FirstPass
::new(text
, options
);
1961 let (mut tree
, allocs
) = first_pass
.run();
1963 let inline_stack
= Default
::default();
1964 let link_stack
= Default
::default();
1965 let html_scan_guard
= Default
::default();
1970 broken_link_callback
,
1977 /// Handle inline markup.
1979 /// When the parser encounters any item indicating potential inline markup, all
1980 /// inline markup passes are run on the remainder of the chain.
1982 /// Note: there's some potential for optimization here, but that's future work.
1983 fn handle_inline(&mut self) {
1984 self.handle_inline_pass1();
1985 self.handle_emphasis();
1988 /// Handle inline HTML, code spans, and links.
1990 /// This function handles both inline HTML and code spans, because they have
1991 /// the same precedence. It also handles links, even though they have lower
1992 /// precedence, because the URL of links must not be processed.
1993 fn handle_inline_pass1(&mut self) {
1994 let mut code_delims
= CodeDelims
::new();
1995 let mut cur
= self.tree
.cur();
1996 let mut prev
= None
;
1998 let block_end
= self.tree
[self.tree
.peek_up().unwrap()].item
.end
;
1999 let block_text
= &self.text
[..block_end
];
2001 while let Some(mut cur_ix
) = cur
{
2002 match self.tree
[cur_ix
].item
.body
{
2003 ItemBody
::MaybeHtml
=> {
2004 let next
= self.tree
[cur_ix
].next
;
2005 let autolink
= if let Some(next_ix
) = next
{
2006 scan_autolink(block_text
, self.tree
[next_ix
].item
.start
)
2011 if let Some((ix
, uri
, link_type
)) = autolink
{
2012 let node
= scan_nodes_to_ix(&self.tree
, next
, ix
);
2013 let text_node
= self.tree
.create_node(Item
{
2014 start
: self.tree
[cur_ix
].item
.start
+ 1,
2016 body
: ItemBody
::Text
,
2018 let link_ix
= self.allocs
.allocate_link(link_type
, uri
, "".into());
2019 self.tree
[cur_ix
].item
.body
= ItemBody
::Link(link_ix
);
2020 self.tree
[cur_ix
].item
.end
= ix
;
2021 self.tree
[cur_ix
].next
= node
;
2022 self.tree
[cur_ix
].child
= Some(text_node
);
2025 if let Some(node_ix
) = cur
{
2026 self.tree
[node_ix
].item
.start
= max(self.tree
[node_ix
].item
.start
, ix
);
2030 let inline_html
= if let Some(next_ix
) = next
{
2031 self.scan_inline_html(
2032 block_text
.as_bytes(),
2033 self.tree
[next_ix
].item
.start
,
2038 if let Some(ix
) = inline_html
{
2039 let node
= scan_nodes_to_ix(&self.tree
, next
, ix
);
2040 self.tree
[cur_ix
].item
.body
= ItemBody
::Html
;
2041 self.tree
[cur_ix
].item
.end
= ix
;
2042 self.tree
[cur_ix
].next
= node
;
2045 if let Some(node_ix
) = cur
{
2046 self.tree
[node_ix
].item
.start
=
2047 max(self.tree
[node_ix
].item
.start
, ix
);
2052 self.tree
[cur_ix
].item
.body
= ItemBody
::Text
;
2054 ItemBody
::MaybeCode(mut search_count
, preceded_by_backslash
) => {
2055 if preceded_by_backslash
{
2057 if search_count
== 0 {
2058 self.tree
[cur_ix
].item
.body
= ItemBody
::Text
;
2060 cur
= self.tree
[cur_ix
].next
;
2065 if code_delims
.is_populated() {
2066 // we have previously scanned all codeblock delimiters,
2067 // so we can reuse that work
2068 if let Some(scan_ix
) = code_delims
.find(cur_ix
, search_count
) {
2069 self.make_code_span(cur_ix
, scan_ix
, preceded_by_backslash
);
2071 self.tree
[cur_ix
].item
.body
= ItemBody
::Text
;
2074 // we haven't previously scanned all codeblock delimiters,
2076 let mut scan
= if search_count
> 0 {
2077 self.tree
[cur_ix
].next
2081 while let Some(scan_ix
) = scan
{
2082 if let ItemBody
::MaybeCode(delim_count
, _
) =
2083 self.tree
[scan_ix
].item
.body
2085 if search_count
== delim_count
{
2086 self.make_code_span(cur_ix
, scan_ix
, preceded_by_backslash
);
2087 code_delims
.clear();
2090 code_delims
.insert(delim_count
, scan_ix
);
2093 scan
= self.tree
[scan_ix
].next
;
2096 self.tree
[cur_ix
].item
.body
= ItemBody
::Text
;
2100 ItemBody
::MaybeLinkOpen
=> {
2101 self.tree
[cur_ix
].item
.body
= ItemBody
::Text
;
2102 self.link_stack
.push(LinkStackEl
{
2104 ty
: LinkStackTy
::Link
,
2107 ItemBody
::MaybeImage
=> {
2108 self.tree
[cur_ix
].item
.body
= ItemBody
::Text
;
2109 self.link_stack
.push(LinkStackEl
{
2111 ty
: LinkStackTy
::Image
,
2114 ItemBody
::MaybeLinkClose
=> {
2115 self.tree
[cur_ix
].item
.body
= ItemBody
::Text
;
2116 if let Some(tos
) = self.link_stack
.pop() {
2117 if tos
.ty
== LinkStackTy
::Disabled
{
2120 let next
= self.tree
[cur_ix
].next
;
2121 if let Some((next_ix
, url
, title
)) =
2122 self.scan_inline_link(block_text
, self.tree
[cur_ix
].item
.end
, next
)
2124 let next_node
= scan_nodes_to_ix(&self.tree
, next
, next_ix
);
2125 if let Some(prev_ix
) = prev
{
2126 self.tree
[prev_ix
].next
= None
;
2128 cur
= Some(tos
.node
);
2130 let link_ix
= self.allocs
.allocate_link(LinkType
::Inline
, url
, title
);
2131 self.tree
[cur_ix
].item
.body
= if tos
.ty
== LinkStackTy
::Image
{
2132 ItemBody
::Image(link_ix
)
2134 ItemBody
::Link(link_ix
)
2136 self.tree
[cur_ix
].child
= self.tree
[cur_ix
].next
;
2137 self.tree
[cur_ix
].next
= next_node
;
2138 self.tree
[cur_ix
].item
.end
= next_ix
;
2139 if let Some(next_node_ix
) = next_node
{
2140 self.tree
[next_node_ix
].item
.start
=
2141 max(self.tree
[next_node_ix
].item
.start
, next_ix
);
2144 if tos
.ty
== LinkStackTy
::Link
{
2145 self.link_stack
.disable_all_links();
2148 // ok, so its not an inline link. maybe it is a reference
2149 // to a defined link?
2150 let scan_result
= scan_reference(&self.tree
, block_text
, next
);
2151 let (node_after_link
, link_type
) = match scan_result
{
2152 // [label][reference]
2153 RefScan
::LinkLabel(_
, next_node
, _
) => {
2154 (next_node
, LinkType
::Reference
)
2157 RefScan
::Collapsed(next_node
) => (next_node
, LinkType
::Collapsed
),
2160 // [shortcut]: /blah
2161 RefScan
::Failed
=> (next
, LinkType
::Shortcut
),
2164 // (label, source_ix end)
2165 let label
: Option
<(ReferenceLabel
<'a
>, usize)> = match scan_result
{
2166 RefScan
::LinkLabel(l
, _
, end_ix
) => {
2167 Some((ReferenceLabel
::Link(l
), end_ix
))
2169 RefScan
::Collapsed(..) | RefScan
::Failed
=> {
2170 // No label? maybe it is a shortcut reference
2171 let label_start
= self.tree
[tos
.node
].item
.end
- 1;
2174 &self.text
[label_start
..self.tree
[cur_ix
].item
.end
],
2176 .map(|(ix
, label
)| (label
, label_start
+ ix
))
2180 // see if it's a footnote reference
2181 if let Some((ReferenceLabel
::Footnote(l
), end
)) = label
{
2182 self.tree
[tos
.node
].next
= node_after_link
;
2183 self.tree
[tos
.node
].child
= None
;
2184 self.tree
[tos
.node
].item
.body
=
2185 ItemBody
::FootnoteReference(self.allocs
.allocate_cow(l
));
2186 self.tree
[tos
.node
].item
.end
= end
;
2187 prev
= Some(tos
.node
);
2188 cur
= node_after_link
;
2189 self.link_stack
.clear();
2191 } else if let Some((ReferenceLabel
::Link(link_label
), end
)) = label
{
2192 let type_url_title
= self
2195 .get(&UniCase
::new(link_label
.as_ref().into()))
2196 .map(|matching_def
| {
2197 // found a matching definition!
2198 let title
= matching_def
2202 .unwrap_or_else(|| "".into());
2203 let url
= matching_def
.dest
.clone();
2204 (link_type
, url
, title
)
2207 self.broken_link_callback
2208 .and_then(|callback
| {
2209 // looked for matching definition, but didn't find it. try to fix
2210 // link with callback, if it is defined
2211 callback(link_label
.as_ref(), link_label
.as_ref())
2213 .map(|(url
, title
)| {
2214 (link_type
.to_unknown(), url
.into(), title
.into())
2218 if let Some((def_link_type
, url
, title
)) = type_url_title
{
2220 self.allocs
.allocate_link(def_link_type
, url
, title
);
2221 self.tree
[tos
.node
].item
.body
= if tos
.ty
== LinkStackTy
::Image
2223 ItemBody
::Image(link_ix
)
2225 ItemBody
::Link(link_ix
)
2227 let label_node
= self.tree
[tos
.node
].next
;
2229 // lets do some tree surgery to add the link to the tree
2230 // 1st: skip the label node and close node
2231 self.tree
[tos
.node
].next
= node_after_link
;
2233 // then, if it exists, add the label node as a child to the link node
2234 if label_node
!= cur
{
2235 self.tree
[tos
.node
].child
= label_node
;
2237 // finally: disconnect list of children
2238 if let Some(prev_ix
) = prev
{
2239 self.tree
[prev_ix
].next
= None
;
2243 self.tree
[tos
.node
].item
.end
= end
;
2245 // set up cur so next node will be node_after_link
2246 cur
= Some(tos
.node
);
2249 if tos
.ty
== LinkStackTy
::Link
{
2250 self.link_stack
.disable_all_links();
2260 cur
= self.tree
[cur_ix
].next
;
2262 self.link_stack
.clear();
2265 fn handle_emphasis(&mut self) {
2266 let mut prev
= None
;
2267 let mut prev_ix
: TreeIndex
;
2268 let mut cur
= self.tree
.cur();
2269 while let Some(mut cur_ix
) = cur
{
2270 if let ItemBody
::MaybeEmphasis(mut count
, can_open
, can_close
) =
2271 self.tree
[cur_ix
].item
.body
2273 let c
= self.text
.as_bytes()[self.tree
[cur_ix
].item
.start
];
2274 let both
= can_open
&& can_close
;
2276 while let Some(el
) =
2277 self.inline_stack
.find_match(&mut self.tree
, c
, count
, both
)
2280 if let Some(prev_ix
) = prev
{
2281 self.tree
[prev_ix
].next
= None
;
2283 let match_count
= min(count
, el
.count
);
2284 // start, end are tree node indices
2285 let mut end
= cur_ix
- 1;
2286 let mut start
= el
.start
+ el
.count
;
2288 // work from the inside out
2289 while start
> el
.start
+ el
.count
- match_count
{
2290 let (inc
, ty
) = if c
== b'
~'
{
2291 (2, ItemBody
::Strikethrough
)
2292 } else if start
> el
.start
+ el
.count
- match_count
+ 1 {
2293 (2, ItemBody
::Strong
)
2295 (1, ItemBody
::Emphasis
)
2298 let root
= start
- inc
;
2300 self.tree
[root
].item
.body
= ty
;
2301 self.tree
[root
].item
.end
= self.tree
[end
].item
.end
;
2302 self.tree
[root
].child
= Some(start
);
2303 self.tree
[root
].next
= None
;
2307 // set next for top most emph level
2308 prev_ix
= el
.start
+ el
.count
- match_count
;
2309 prev
= Some(prev_ix
);
2310 cur
= self.tree
[cur_ix
+ match_count
- 1].next
;
2311 self.tree
[prev_ix
].next
= cur
;
2313 if el
.count
> match_count
{
2314 self.inline_stack
.push(InlineEl
{
2316 count
: el
.count
- match_count
,
2321 count
-= match_count
;
2323 cur_ix
= cur
.unwrap();
2331 self.inline_stack
.push(InlineEl
{
2339 self.tree
[cur_ix
+ i
].item
.body
= ItemBody
::Text
;
2342 prev_ix
= cur_ix
+ count
- 1;
2343 prev
= Some(prev_ix
);
2344 cur
= self.tree
[prev_ix
].next
;
2348 cur
= self.tree
[cur_ix
].next
;
2351 self.inline_stack
.pop_all(&mut self.tree
);
2354 /// Returns next byte index, url and title.
2355 fn scan_inline_link(
2357 underlying
: &'a
str,
2359 node
: Option
<TreeIndex
>,
2360 ) -> Option
<(usize, CowStr
<'a
>, CowStr
<'a
>)> {
2361 if scan_ch(&underlying
.as_bytes()[ix
..], b'
('
) == 0 {
2365 ix
+= scan_while(&underlying
.as_bytes()[ix
..], is_ascii_whitespace
);
2367 let (dest_length
, dest
) = scan_link_dest(underlying
, ix
, LINK_MAX_NESTED_PARENS
)?
;
2368 let dest
= unescape(dest
);
2371 ix
+= scan_while(&underlying
.as_bytes()[ix
..], is_ascii_whitespace
);
2373 let title
= if let Some((bytes_scanned
, t
)) = self.scan_link_title(underlying
, ix
, node
) {
2374 ix
+= bytes_scanned
;
2375 ix
+= scan_while(&underlying
.as_bytes()[ix
..], is_ascii_whitespace
);
2380 if scan_ch(&underlying
.as_bytes()[ix
..], b'
)'
) == 0 {
2385 Some((ix
, dest
, title
))
2388 // returns (bytes scanned, title cow)
2393 node
: Option
<TreeIndex
>,
2394 ) -> Option
<(usize, CowStr
<'a
>)> {
2395 let bytes
= text
.as_bytes();
2396 let open
= match bytes
.get(start_ix
) {
2397 Some(b @ b'
\''
) | Some(b @ b'
\"'
) | Some(b @ b'
('
) => *b
,
2400 let close
= if open
== b'
(' { b')' }
else { open }
;
2402 let mut title
= String
::new();
2403 let mut mark
= start_ix
+ 1;
2404 let mut i
= start_ix
+ 1;
2406 while i
< bytes
.len() {
2410 let cow
= if mark
== 1 {
2411 (i
- start_ix
+ 1, text
[mark
..i
].into())
2413 title
.push_str(&text
[mark
..i
]);
2414 (i
- start_ix
+ 1, title
.into())
2423 if c
== b'
\n'
|| c
== b'
\r'
{
2424 if let Some(node_ix
) = scan_nodes_to_ix(&self.tree
, node
, i
+ 1) {
2425 if self.tree
[node_ix
].item
.start
> i
{
2426 title
.push_str(&text
[mark
..i
]);
2428 i
= self.tree
[node_ix
].item
.start
;
2435 if let (n
, Some(value
)) = scan_entity(&bytes
[i
..]) {
2436 title
.push_str(&text
[mark
..i
]);
2437 title
.push_str(&value
);
2443 if c
== b'
\\'
&& i
+ 1 < bytes
.len() && is_ascii_punctuation(bytes
[i
+ 1]) {
2444 title
.push_str(&text
[mark
..i
]);
2455 /// Make a code span.
2457 /// Both `open` and `close` are matching MaybeCode items.
2458 fn make_code_span(&mut self, open
: TreeIndex
, close
: TreeIndex
, preceding_backslash
: bool
) {
2459 let first_ix
= open
+ 1;
2460 let last_ix
= close
- 1;
2461 let bytes
= self.text
.as_bytes();
2462 let mut span_start
= self.tree
[open
].item
.end
;
2463 let mut span_end
= self.tree
[close
].item
.start
;
2464 let mut buf
: Option
<String
> = None
;
2466 // detect all-space sequences, since they are kept as-is as of commonmark 0.29
2467 if !bytes
[span_start
..span_end
].iter().all(|&b
| b
== b' '
) {
2468 let opening
= match bytes
[span_start
] {
2469 b' '
| b'
\r'
| b'
\n'
=> true,
2472 let closing
= match bytes
[span_end
- 1] {
2473 b' '
| b'
\r'
| b'
\n'
=> true,
2476 let drop_enclosing_whitespace
= opening
&& closing
;
2478 if drop_enclosing_whitespace
{
2480 if span_start
< span_end
{
2485 let mut ix
= first_ix
;
2488 if let ItemBody
::HardBreak
| ItemBody
::SoftBreak
= self.tree
[ix
].item
.body
{
2489 if drop_enclosing_whitespace
{
2490 // check whether break should be ignored
2493 span_start
= min(span_end
, self.tree
[ix
].item
.start
);
2495 } else if ix
== last_ix
&& last_ix
> first_ix
{
2501 let end
= bytes
[self.tree
[ix
].item
.start
..]
2503 .position(|&b
| b
== b'
\r'
|| b
== b'
\n'
)
2505 + self.tree
[ix
].item
.start
;
2506 if let Some(ref mut buf
) = buf
{
2507 buf
.push_str(&self.text
[self.tree
[ix
].item
.start
..end
]);
2510 let mut new_buf
= String
::with_capacity(span_end
- span_start
);
2511 new_buf
.push_str(&self.text
[span_start
..end
]);
2513 buf
= Some(new_buf
);
2515 } else if let Some(ref mut buf
) = buf
{
2516 let end
= if ix
== last_ix
{
2519 self.tree
[ix
].item
.end
2521 buf
.push_str(&self.text
[self.tree
[ix
].item
.start
..end
]);
2527 let cow
= if let Some(buf
) = buf
{
2530 self.text
[span_start
..span_end
].into()
2532 if preceding_backslash
{
2533 self.tree
[open
].item
.body
= ItemBody
::Text
;
2534 self.tree
[open
].item
.end
= self.tree
[open
].item
.start
+ 1;
2535 self.tree
[open
].next
= Some(close
);
2536 self.tree
[close
].item
.body
= ItemBody
::Code(self.allocs
.allocate_cow(cow
));
2537 self.tree
[close
].item
.start
= self.tree
[open
].item
.start
+ 1;
2539 self.tree
[open
].item
.body
= ItemBody
::Code(self.allocs
.allocate_cow(cow
));
2540 self.tree
[open
].item
.end
= self.tree
[close
].item
.end
;
2541 self.tree
[open
].next
= self.tree
[close
].next
;
2545 /// Returns the next byte offset on success.
2546 fn scan_inline_html(&mut self, bytes
: &[u8], ix
: usize) -> Option
<usize> {
2547 let c
= *bytes
.get(ix
)?
;
2549 scan_inline_html_comment(bytes
, ix
+ 1, &mut self.html_scan_guard
)
2550 } else if c
== b'?'
{
2551 scan_inline_html_processing(bytes
, ix
+ 1, &mut self.html_scan_guard
)
2553 let i
= scan_html_block_inner(
2556 let mut line_start
= LineStart
::new(bytes
);
2557 let _
= scan_containers(&self.tree
, &mut line_start
);
2558 line_start
.bytes_scanned()
2565 /// Consumes the event iterator and produces an iterator that produces
2566 /// `(Event, Range)` pairs, where the `Range` value maps to the corresponding
2567 /// range in the markdown source.
2568 pub fn into_offset_iter(self) -> OffsetIter
<'a
> {
2569 OffsetIter { inner: self }
2573 pub(crate) enum LoopInstruction
<T
> {
2574 /// Continue looking for more special bytes, but skip next few bytes.
2575 ContinueAndSkip(usize),
2576 /// Break looping immediately, returning with the given index and value.
2577 BreakAtWith(usize, T
),
2580 /// This function walks the byte slices from the given index and
2581 /// calls the callback function on all bytes (and their indices) that are in the following set:
2582 /// `` ` ``, `\`, `&`, `*`, `_`, `~`, `!`, `<`, `[`, `]`, `|`, `\r`, `\n`
2583 /// It is guaranteed not call the callback on other bytes.
2584 /// Whenever `callback(ix, byte)` returns a `ContinueAndSkip(n)` value, the callback
2585 /// will not be called with an index that is less than `ix + n + 1`.
2586 /// When the callback returns a `BreakAtWith(end_ix, opt+val)`, no more callbacks will be
2587 /// called and the function returns immediately with the return value `(end_ix, opt_val)`.
2588 /// If `BreakAtWith(..)` is never returned, this function will return the first
2589 /// index that is outside the byteslice bound and a `None` value.
2590 fn iterate_special_bytes
<F
, T
>(bytes
: &[u8], ix
: usize, callback
: F
) -> (usize, Option
<T
>)
2592 F
: FnMut(usize, u8) -> LoopInstruction
<Option
<T
>>,
2594 #[cfg(all(target_arch = "x86_64", feature = "simd"))]
2596 crate::simd
::iterate_special_bytes(bytes
, ix
, callback
)
2598 #[cfg(not(all(target_arch = "x86_64", feature = "simd")))]
2600 scalar_iterate_special_bytes(bytes
, ix
, callback
)
2604 const fn special_bytes() -> [bool
; 256] {
2605 let mut bytes
= [false; 256];
2606 bytes
[b'
<'
as usize] = true;
2607 bytes
[b'
!'
as usize] = true;
2608 bytes
[b'
['
as usize] = true;
2609 bytes
[b'
~'
as usize] = true;
2610 bytes
[b'`'
as usize] = true;
2611 bytes
[b'
|'
as usize] = true;
2612 bytes
[b'
\\'
as usize] = true;
2613 bytes
[b'
*'
as usize] = true;
2614 bytes
[b'_'
as usize] = true;
2615 bytes
[b'
\r'
as usize] = true;
2616 bytes
[b'
\n'
as usize] = true;
2617 bytes
[b'
]'
as usize] = true;
2618 bytes
[b'
&'
as usize] = true;
2622 pub(crate) fn scalar_iterate_special_bytes
<F
, T
>(
2626 ) -> (usize, Option
<T
>)
2628 F
: FnMut(usize, u8) -> LoopInstruction
<Option
<T
>>,
2630 let special_bytes
= special_bytes();
2632 while ix
< bytes
.len() {
2634 if special_bytes
[b
as usize] {
2635 match callback(ix
, b
) {
2636 LoopInstruction
::ContinueAndSkip(skip
) => {
2639 LoopInstruction
::BreakAtWith(ix
, val
) => {
2650 /// Markdown event and source range iterator.
2652 /// Generates tuples where the first element is the markdown event and the second
2653 /// is a the corresponding range in the source string.
2655 /// Constructed from a `Parser` using its
2656 /// [`into_offset_iter`](struct.Parser.html#method.into_offset_iter) method.
2657 pub struct OffsetIter
<'a
> {
2661 impl<'a
> Iterator
for OffsetIter
<'a
> {
2662 type Item
= (Event
<'a
>, Range
<usize>);
2664 fn next(&mut self) -> Option
<Self::Item
> {
2665 match self.inner
.tree
.cur() {
2667 let ix
= self.inner
.tree
.pop()?
;
2668 let tag
= item_to_tag(&self.inner
.tree
[ix
].item
, &self.inner
.allocs
);
2669 self.inner
.tree
.next_sibling(ix
);
2672 self.inner
.tree
[ix
].item
.start
..self.inner
.tree
[ix
].item
.end
,
2676 if self.inner
.tree
[cur_ix
].item
.body
.is_inline() {
2677 self.inner
.handle_inline();
2680 let node
= self.inner
.tree
[cur_ix
];
2681 let item
= node
.item
;
2682 let event
= item_to_event(item
, self.inner
.text
, &self.inner
.allocs
);
2683 if let Event
::Start(..) = event
{
2684 self.inner
.tree
.push();
2686 self.inner
.tree
.next_sibling(cur_ix
);
2688 Some((event
, item
.start
..item
.end
))
2694 fn item_to_tag
<'a
>(item
: &Item
, allocs
: &Allocations
<'a
>) -> Tag
<'a
> {
2696 ItemBody
::Paragraph
=> Tag
::Paragraph
,
2697 ItemBody
::Emphasis
=> Tag
::Emphasis
,
2698 ItemBody
::Strong
=> Tag
::Strong
,
2699 ItemBody
::Strikethrough
=> Tag
::Strikethrough
,
2700 ItemBody
::Link(link_ix
) => {
2701 let &(ref link_type
, ref url
, ref title
) = allocs
.index(link_ix
);
2702 Tag
::Link(*link_type
, url
.clone(), title
.clone())
2704 ItemBody
::Image(link_ix
) => {
2705 let &(ref link_type
, ref url
, ref title
) = allocs
.index(link_ix
);
2706 Tag
::Image(*link_type
, url
.clone(), title
.clone())
2708 ItemBody
::Heading(level
) => Tag
::Heading(level
),
2709 ItemBody
::FencedCodeBlock(cow_ix
) => {
2710 Tag
::CodeBlock(CodeBlockKind
::Fenced(allocs
[cow_ix
].clone()))
2712 ItemBody
::IndentCodeBlock
=> Tag
::CodeBlock(CodeBlockKind
::Indented
),
2713 ItemBody
::BlockQuote
=> Tag
::BlockQuote
,
2714 ItemBody
::List(_
, c
, listitem_start
) => {
2715 if c
== b'
.'
|| c
== b'
)'
{
2716 Tag
::List(Some(listitem_start
))
2721 ItemBody
::ListItem(_
) => Tag
::Item
,
2722 ItemBody
::TableHead
=> Tag
::TableHead
,
2723 ItemBody
::TableCell
=> Tag
::TableCell
,
2724 ItemBody
::TableRow
=> Tag
::TableRow
,
2725 ItemBody
::Table(alignment_ix
) => Tag
::Table(allocs
[alignment_ix
].clone()),
2726 ItemBody
::FootnoteDefinition(cow_ix
) => Tag
::FootnoteDefinition(allocs
[cow_ix
].clone()),
2727 _
=> panic
!("unexpected item body {:?}", item
.body
),
2731 fn item_to_event
<'a
>(item
: Item
, text
: &'a
str, allocs
: &Allocations
<'a
>) -> Event
<'a
> {
2732 let tag
= match item
.body
{
2733 ItemBody
::Text
=> return Event
::Text(text
[item
.start
..item
.end
].into()),
2734 ItemBody
::Code(cow_ix
) => return Event
::Code(allocs
[cow_ix
].clone()),
2735 ItemBody
::SynthesizeText(cow_ix
) => return Event
::Text(allocs
[cow_ix
].clone()),
2736 ItemBody
::Html
=> return Event
::Html(text
[item
.start
..item
.end
].into()),
2737 ItemBody
::SoftBreak
=> return Event
::SoftBreak
,
2738 ItemBody
::HardBreak
=> return Event
::HardBreak
,
2739 ItemBody
::FootnoteReference(cow_ix
) => {
2740 return Event
::FootnoteReference(allocs
[cow_ix
].clone())
2742 ItemBody
::TaskListMarker(checked
) => return Event
::TaskListMarker(checked
),
2743 ItemBody
::Rule
=> return Event
::Rule
,
2745 ItemBody
::Paragraph
=> Tag
::Paragraph
,
2746 ItemBody
::Emphasis
=> Tag
::Emphasis
,
2747 ItemBody
::Strong
=> Tag
::Strong
,
2748 ItemBody
::Strikethrough
=> Tag
::Strikethrough
,
2749 ItemBody
::Link(link_ix
) => {
2750 let &(ref link_type
, ref url
, ref title
) = allocs
.index(link_ix
);
2751 Tag
::Link(*link_type
, url
.clone(), title
.clone())
2753 ItemBody
::Image(link_ix
) => {
2754 let &(ref link_type
, ref url
, ref title
) = allocs
.index(link_ix
);
2755 Tag
::Image(*link_type
, url
.clone(), title
.clone())
2757 ItemBody
::Heading(level
) => Tag
::Heading(level
),
2758 ItemBody
::FencedCodeBlock(cow_ix
) => {
2759 Tag
::CodeBlock(CodeBlockKind
::Fenced(allocs
[cow_ix
].clone()))
2761 ItemBody
::IndentCodeBlock
=> Tag
::CodeBlock(CodeBlockKind
::Indented
),
2762 ItemBody
::BlockQuote
=> Tag
::BlockQuote
,
2763 ItemBody
::List(_
, c
, listitem_start
) => {
2764 if c
== b'
.'
|| c
== b'
)'
{
2765 Tag
::List(Some(listitem_start
))
2770 ItemBody
::ListItem(_
) => Tag
::Item
,
2771 ItemBody
::TableHead
=> Tag
::TableHead
,
2772 ItemBody
::TableCell
=> Tag
::TableCell
,
2773 ItemBody
::TableRow
=> Tag
::TableRow
,
2774 ItemBody
::Table(alignment_ix
) => Tag
::Table(allocs
[alignment_ix
].clone()),
2775 ItemBody
::FootnoteDefinition(cow_ix
) => Tag
::FootnoteDefinition(allocs
[cow_ix
].clone()),
2776 _
=> panic
!("unexpected item body {:?}", item
.body
),
2782 // https://english.stackexchange.com/a/285573
2783 fn surgerize_tight_list(tree
: &mut Tree
<Item
>, list_ix
: TreeIndex
) {
2784 let mut list_item
= tree
[list_ix
].child
;
2785 while let Some(listitem_ix
) = list_item
{
2786 // first child is special, controls how we repoint list_item.child
2787 let list_item_firstborn
= tree
[listitem_ix
].child
;
2789 // Check that list item has children - this is not necessarily the case!
2790 if let Some(firstborn_ix
) = list_item_firstborn
{
2791 if let ItemBody
::Paragraph
= tree
[firstborn_ix
].item
.body
{
2792 tree
[listitem_ix
].child
= tree
[firstborn_ix
].child
;
2795 let mut list_item_child
= Some(firstborn_ix
);
2796 let mut node_to_repoint
= None
;
2797 while let Some(child_ix
) = list_item_child
{
2798 // surgerize paragraphs
2799 let repoint_ix
= if let ItemBody
::Paragraph
= tree
[child_ix
].item
.body
{
2800 if let Some(child_firstborn
) = tree
[child_ix
].child
{
2801 if let Some(repoint_ix
) = node_to_repoint
{
2802 tree
[repoint_ix
].next
= Some(child_firstborn
);
2804 let mut child_lastborn
= child_firstborn
;
2805 while let Some(lastborn_next_ix
) = tree
[child_lastborn
].next
{
2806 child_lastborn
= lastborn_next_ix
;
2816 node_to_repoint
= Some(repoint_ix
);
2817 tree
[repoint_ix
].next
= tree
[child_ix
].next
;
2818 list_item_child
= tree
[child_ix
].next
;
2822 list_item
= tree
[listitem_ix
].next
;
2826 impl<'a
> Iterator
for Parser
<'a
> {
2827 type Item
= Event
<'a
>;
2829 fn next(&mut self) -> Option
<Event
<'a
>> {
2830 match self.tree
.cur() {
2832 let ix
= self.tree
.pop()?
;
2833 let tag
= item_to_tag(&self.tree
[ix
].item
, &self.allocs
);
2834 self.tree
.next_sibling(ix
);
2835 Some(Event
::End(tag
))
2838 if self.tree
[cur_ix
].item
.body
.is_inline() {
2839 self.handle_inline();
2842 let node
= self.tree
[cur_ix
];
2843 let item
= node
.item
;
2844 let event
= item_to_event(item
, self.text
, &self.allocs
);
2845 if let Event
::Start(..) = event
{
2848 self.tree
.next_sibling(cur_ix
);
2859 use crate::tree
::Node
;
2861 // TODO: move these tests to tests/html.rs?
2863 fn parser_with_extensions(text
: &str) -> Parser
<'_
> {
2864 let mut opts
= Options
::empty();
2865 opts
.insert(Options
::ENABLE_TABLES
);
2866 opts
.insert(Options
::ENABLE_FOOTNOTES
);
2867 opts
.insert(Options
::ENABLE_STRIKETHROUGH
);
2868 opts
.insert(Options
::ENABLE_TASKLISTS
);
2870 Parser
::new_ext(text
, opts
)
2874 #[cfg(target_pointer_width = "64")]
2876 let node_size
= std
::mem
::size_of
::<Node
<Item
>>();
2877 assert_eq
!(48, node_size
);
2881 #[cfg(target_pointer_width = "64")]
2883 let body_size
= std
::mem
::size_of
::<ItemBody
>();
2884 assert_eq
!(16, body_size
);
2888 fn single_open_fish_bracket() {
2890 assert_eq
!(3, Parser
::new("<").count());
2896 assert_eq
!(2, Parser
::new("#").count());
2900 fn lots_of_backslashes() {
2902 Parser
::new("\\\\\r\r").count();
2903 Parser
::new("\\\r\r\\.\\\\\r\r\\.\\").count();
2909 parser_with_extensions(":\r\t> |\r:\r\t> |\r").count();
2915 parser_with_extensions("|\r-]([^|\r-]([^").count();
2916 parser_with_extensions("|\r\r=][^|\r\r=][^car").count();
2922 parser_with_extensions("[^\r\ra]").count();
2923 parser_with_extensions("\r\r]Z[^\x00\r\r]Z[^\x00").count();
2929 parser_with_extensions("*]0[^\r\r*]0[^").count();
2930 parser_with_extensions("[^\r> `][^\r> `][^\r> `][").count();
2936 parser_with_extensions("\\\u{0d}-\u{09}\\\u{0d}-\u{09}").count();
2941 let input
= std
::str::from_utf8(b
"\xf0\x9b\xb2\x9f<td:^\xf0\x9b\xb2\x9f").unwrap();
2943 parser_with_extensions(input
).count();
2949 parser_with_extensions("> - \\\n> - ").count();
2950 parser_with_extensions("- \n\n").count();
2956 parser_with_extensions("*\r_<__*\r_<__*\r_<__*\r_<__").count();
2962 parser_with_extensions("_6**6*_*").count();
2966 fn another_emphasis_panic() {
2967 parser_with_extensions("*__#_#__*").count();
2972 let event_offsets
: Vec
<_
> = Parser
::new("*hello* world")
2974 .map(|(_ev
, range
)| range
)
2976 let expected_offsets
= vec
![(0..13), (0..7), (1..6), (0..7), (7..13), (0..13)];
2977 assert_eq
!(expected_offsets
, event_offsets
);
2981 fn reference_link_offsets() {
2983 Parser
::new("# H1\n[testing][Some reference]\n\n[Some reference]: https://github.com")
2985 .filter_map(|(ev
, range
)| match ev
{
2986 Event
::Start(Tag
::Link(LinkType
::Reference
, ..), ..) => Some(range
),
2991 assert_eq
!(5..30, range
);
2995 fn footnote_offsets() {
2996 let range
= Parser
::new("Testing this[^1] out.\n\n[^1]: Footnote.")
2998 .filter_map(|(ev
, range
)| match ev
{
2999 Event
::FootnoteReference(..) => Some(range
),
3004 assert_eq
!(12..16, range
);
3009 let markdown
= "a\n\nTesting|This|Outtt\n--|:--:|--:\nSome Data|Other data|asdf";
3010 let event_offset
= parser_with_extensions(markdown
)
3012 .map(|(_ev
, range
)| range
)
3015 let expected_offset
= 3..59;
3016 assert_eq
!(expected_offset
, event_offset
);
3020 fn offset_iter_issue_378() {
3021 let event_offsets
: Vec
<_
> = Parser
::new("a [b](c) d")
3023 .map(|(_ev
, range
)| range
)
3025 let expected_offsets
= vec
![(0..10), (0..2), (2..8), (3..4), (2..8), (8..10), (0..10)];
3026 assert_eq
!(expected_offsets
, event_offsets
);
3030 fn offset_iter_issue_404() {
3031 let event_offsets
: Vec
<_
> = Parser
::new("###\n")
3033 .map(|(_ev
, range
)| range
)
3035 let expected_offsets
= vec
![(0..4), (0..4)];
3036 assert_eq
!(expected_offsets
, event_offsets
);
3039 // FIXME: add this one regression suite
3041 fn link_def_at_eof() {
3042 let test_str
= "[My site][world]\n\n[world]: https://vincentprouillet.com";
3043 let expected
= "<p><a href=\"https://vincentprouillet.com\">My site</a></p>\n";
3045 let mut buf
= String
::new();
3046 crate::html
::push_html(&mut buf
, Parser
::new(test_str
));
3047 assert_eq
!(expected
, buf
);
3051 fn ref_def_at_eof() {
3052 let test_str
= "[test]:\\";
3055 let mut buf
= String
::new();
3056 crate::html
::push_html(&mut buf
, Parser
::new(test_str
));
3057 assert_eq
!(expected
, buf
);
3061 fn ref_def_cr_lf() {
3062 let test_str
= "[a]: /u\r\n\n[a]";
3063 let expected
= "<p><a href=\"/u\">a</a></p>\n";
3065 let mut buf
= String
::new();
3066 crate::html
::push_html(&mut buf
, Parser
::new(test_str
));
3067 assert_eq
!(expected
, buf
);
3071 fn no_dest_refdef() {
3072 let test_str
= "[a]:";
3073 let expected
= "<p>[a]:</p>\n";
3075 let mut buf
= String
::new();
3076 crate::html
::push_html(&mut buf
, Parser
::new(test_str
));
3077 assert_eq
!(expected
, buf
);
3081 fn simple_broken_link_callback() {
3082 let test_str
= "This is a link w/o def: [hello][world]";
3083 let parser
= Parser
::new_with_broken_link_callback(
3087 assert_eq
!("world", raw
);
3088 assert_eq
!("world", norm
);
3089 Some(("YOLO".to_owned(), "SWAG".to_owned()))
3092 let mut link_tag_count
= 0;
3093 for (typ
, url
, title
) in parser
.filter_map(|event
| match event
{
3094 Event
::Start(tag
) | Event
::End(tag
) => match tag
{
3095 Tag
::Link(typ
, url
, title
) => Some((typ
, url
, title
)),
3100 link_tag_count
+= 1;
3101 assert_eq
!(typ
, LinkType
::ReferenceUnknown
);
3102 assert_eq
!(url
.as_ref(), "YOLO");
3103 assert_eq
!(title
.as_ref(), "SWAG");
3105 assert
!(link_tag_count
> 0);
3109 fn code_block_kind_check_fenced() {
3110 let parser
= Parser
::new("hello\n```test\ntadam\n```");
3112 for (ev
, _range
) in parser
.into_offset_iter() {
3114 Event
::Start(Tag
::CodeBlock(CodeBlockKind
::Fenced(syntax
))) => {
3115 assert_eq
!(syntax
.as_ref(), "test");
3121 assert_eq
!(found
, 1);
3125 fn code_block_kind_check_indented() {
3126 let parser
= Parser
::new("hello\n\n ```test\n tadam\nhello");
3128 for (ev
, _range
) in parser
.into_offset_iter() {
3130 Event
::Start(Tag
::CodeBlock(CodeBlockKind
::Indented
)) => {
3136 assert_eq
!(found
, 1);