// vendor/pulldown-cmark-0.7.2/src/parse.rs
1 // Copyright 2017 Google Inc. All rights reserved.
2 //
3 // Permission is hereby granted, free of charge, to any person obtaining a copy
4 // of this software and associated documentation files (the "Software"), to deal
5 // in the Software without restriction, including without limitation the rights
6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 // copies of the Software, and to permit persons to whom the Software is
8 // furnished to do so, subject to the following conditions:
9 //
10 // The above copyright notice and this permission notice shall be included in
11 // all copies or substantial portions of the Software.
12 //
13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 // THE SOFTWARE.
20
21 //! Tree-based two pass parser.
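//!
//! A minimal usage sketch of the parser built here (these types are
//! re-exported from the crate root):
//!
//! ```
//! use pulldown_cmark::{Event, Parser, Tag};
//!
//! let events: Vec<Event> = Parser::new("Hello *world*").collect();
//! assert_eq!(events[0], Event::Start(Tag::Paragraph));
//! ```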
22
23 use std::cmp::{max, min};
24 use std::collections::{HashMap, VecDeque};
25 use std::ops::{Index, Range};
26
27 use unicase::UniCase;
28
29 use crate::linklabel::{scan_link_label_rest, LinkLabel, ReferenceLabel};
30 use crate::scanners::*;
31 use crate::strings::CowStr;
32 use crate::tree::{Tree, TreeIndex};
33
34 // Allowing arbitrary depth nested parentheses inside link destinations
35 // can create denial of service vulnerabilities if we're not careful.
36 // The simplest countermeasure is to limit their depth, which is
37 // explicitly allowed by the spec as long as the limit is at least 3:
38 // https://spec.commonmark.org/0.29/#link-destination
39 const LINK_MAX_NESTED_PARENS: usize = 5;
40
41 /// Codeblock kind.
42 #[derive(Clone, Debug, PartialEq)]
43 pub enum CodeBlockKind<'a> {
44 Indented,
45 /// The value contained in the tag describes the language of the code, which may be empty.
46 Fenced(CowStr<'a>),
47 }
48
49 impl<'a> CodeBlockKind<'a> {
50 pub fn is_indented(&self) -> bool {
51 match *self {
52 CodeBlockKind::Indented => true,
53 _ => false,
54 }
55 }
56
57 pub fn is_fenced(&self) -> bool {
58 match *self {
59 CodeBlockKind::Fenced(_) => true,
60 _ => false,
61 }
62 }
63 }
64
65 /// Tags for elements that can contain other elements.
66 #[derive(Clone, Debug, PartialEq)]
67 pub enum Tag<'a> {
68 /// A paragraph of text and other inline elements.
69 Paragraph,
70
71 /// A heading. The field indicates the level of the heading.
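/// For example, `## Section` yields `Heading(2)`.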
72 Heading(u32),
73
74 BlockQuote,
75 /// A code block.
76 CodeBlock(CodeBlockKind<'a>),
77
78 /// A list. If the list is ordered the field indicates the number of the first item.
79 /// Contains only list items.
80 List(Option<u64>), // TODO: add delim and tight for ast (not needed for html)
81 /// A list item.
82 Item,
83 /// A footnote definition. The value contained is the footnote's label by which it can
84 /// be referred to.
85 FootnoteDefinition(CowStr<'a>),
86
87 /// A table. Contains a vector describing the text-alignment for each of its columns.
88 Table(Vec<Alignment>),
89 /// A table header. Contains only `TableRow`s. Note that the table body starts immediately
90 /// after the closure of the `TableHead` tag. There is no `TableBody` tag.
91 TableHead,
92 /// A table row. Used for both header rows and body rows. Contains only `TableCell`s.
93 TableRow,
94 TableCell,
95
96 // span-level tags
97 Emphasis,
98 Strong,
99 Strikethrough,
100
101 /// A link. The first field is the link type, the second the destination URL and the third is a title.
102 Link(LinkType, CowStr<'a>, CowStr<'a>),
103
104 /// An image. The first field is the link type, the second the destination URL and the third is a title.
105 Image(LinkType, CowStr<'a>, CowStr<'a>),
106 }
107
108 /// Type specifier for inline links. See [the `Tag::Link` variant](enum.Tag.html#variant.Link) for more information.
109 #[derive(Clone, Debug, PartialEq, Copy)]
110 pub enum LinkType {
111 /// Inline link like `[foo](bar)`
112 Inline,
113 /// Reference link like `[foo][bar]`
114 Reference,
115 /// Reference without destination in the document, but resolved by the broken_link_callback
116 ReferenceUnknown,
117 /// Collapsed link like `[foo][]`
118 Collapsed,
119 /// Collapsed link without destination in the document, but resolved by the broken_link_callback
120 CollapsedUnknown,
121 /// Shortcut link like `[foo]`
122 Shortcut,
123 /// Shortcut without destination in the document, but resolved by the broken_link_callback
124 ShortcutUnknown,
125 /// Autolink like `<http://foo.bar/baz>`
126 Autolink,
127 /// Email address in autolink like `<john@example.org>`
128 Email,
129 }
130
131 impl LinkType {
132 fn to_unknown(self) -> Self {
133 match self {
134 LinkType::Reference => LinkType::ReferenceUnknown,
135 LinkType::Collapsed => LinkType::CollapsedUnknown,
136 LinkType::Shortcut => LinkType::ShortcutUnknown,
137 _ => unreachable!(),
138 }
139 }
140 }
141
142 /// Markdown events that are generated in a preorder traversal of the document
143 /// tree, with additional `End` events whenever all of an inner node's children
144 /// have been visited.
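///
/// For example, parsing `*hi*` yields `Start(Paragraph)`, `Start(Emphasis)`,
/// `Text("hi")`, `End(Emphasis)`, `End(Paragraph)`.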
145 #[derive(Clone, Debug, PartialEq)]
146 pub enum Event<'a> {
147 /// Start of a tagged element. Events that are yielded after this event
148 /// and before its corresponding `End` event are inside this element.
149 /// Start and end events are guaranteed to be balanced.
150 Start(Tag<'a>),
151 /// End of a tagged element.
152 End(Tag<'a>),
153 /// A text node.
154 Text(CowStr<'a>),
155 /// An inline code node.
156 Code(CowStr<'a>),
157 /// An HTML node.
158 Html(CowStr<'a>),
159 /// A reference to a footnote with given label, which may or may not be defined
160 /// by an event with a `Tag::FootnoteDefinition` tag. Definitions and references to them may
161 /// occur in any order.
162 FootnoteReference(CowStr<'a>),
163 /// A soft line break.
164 SoftBreak,
165 /// A hard line break.
166 HardBreak,
167 /// A horizontal ruler.
168 Rule,
169 /// A task list marker, rendered as a checkbox in HTML. Contains `true` when it is checked.
170 TaskListMarker(bool),
171 }
172
173 /// Table column text alignment.
174 #[derive(Copy, Clone, Debug, PartialEq)]
175 pub enum Alignment {
176 /// Default text alignment.
177 None,
178 Left,
179 Center,
180 Right,
181 }
182
183 bitflags! {
184 /// Option struct containing flags for enabling extra features
185 /// that are not part of the CommonMark spec.
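///
/// A small sketch of enabling an extension before handing the options to
/// `Parser::new_ext`:
///
/// ```
/// use pulldown_cmark::Options;
///
/// let mut options = Options::empty();
/// options.insert(Options::ENABLE_STRIKETHROUGH);
/// assert!(options.contains(Options::ENABLE_STRIKETHROUGH));
/// ```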
186 pub struct Options: u32 {
187 const ENABLE_TABLES = 1 << 1;
188 const ENABLE_FOOTNOTES = 1 << 2;
189 const ENABLE_STRIKETHROUGH = 1 << 3;
190 const ENABLE_TASKLISTS = 1 << 4;
191 }
192 }
193
194 #[derive(Debug, Default, Clone, Copy)]
195 struct Item {
196 start: usize,
197 end: usize,
198 body: ItemBody,
199 }
200
201 #[derive(Debug, PartialEq, Clone, Copy)]
202 enum ItemBody {
203 Paragraph,
204 Text,
205 SoftBreak,
206 HardBreak,
207
208 // These are possible inline items, need to be resolved in second pass.
209
210 // repeats, can_open, can_close
211 MaybeEmphasis(usize, bool, bool),
212 MaybeCode(usize, bool), // number of backticks, preceded by backslash
213 MaybeHtml,
214 MaybeLinkOpen,
215 MaybeLinkClose,
216 MaybeImage,
217
218 // These are inline items after resolution.
219 Emphasis,
220 Strong,
221 Strikethrough,
222 Code(CowIndex),
223 Link(LinkIndex),
224 Image(LinkIndex),
225 FootnoteReference(CowIndex),
226 TaskListMarker(bool), // true for checked
227
228 Rule,
229 Heading(u32), // heading level
230 FencedCodeBlock(CowIndex),
231 IndentCodeBlock,
232 Html,
233 BlockQuote,
234 List(bool, u8, u64), // is_tight, list character, list start index
235 ListItem(usize), // indent level
236 SynthesizeText(CowIndex),
237 FootnoteDefinition(CowIndex),
238
239 // Tables
240 Table(AlignmentIndex),
241 TableHead,
242 TableRow,
243 TableCell,
244
245 // Dummy node at the top of the tree - should not be used otherwise!
246 Root,
247 }
248
249 impl<'a> ItemBody {
250 fn is_inline(&self) -> bool {
251 match *self {
252 ItemBody::MaybeEmphasis(..)
253 | ItemBody::MaybeHtml
254 | ItemBody::MaybeCode(..)
255 | ItemBody::MaybeLinkOpen
256 | ItemBody::MaybeLinkClose
257 | ItemBody::MaybeImage => true,
258 _ => false,
259 }
260 }
261 }
262
263 impl<'a> Default for ItemBody {
264 fn default() -> Self {
265 ItemBody::Root
266 }
267 }
268
269 /// Scanning modes for `Parser`'s `parse_line` method.
270 #[derive(PartialEq, Eq, Copy, Clone)]
271 enum TableParseMode {
272 /// Inside a paragraph, scanning for table headers.
273 Scan,
274 /// Inside a table.
275 Active,
276 /// Inside a paragraph, not scanning for table headers.
277 Disabled,
278 }
279
280 /// State for the first parsing pass.
281 ///
282 /// The first pass resolves all block structure, generating an AST. Within a block, items
283 /// are in a linear chain with potential inline markup identified.
284 struct FirstPass<'a> {
285 text: &'a str,
286 tree: Tree<Item>,
287 begin_list_item: bool,
288 last_line_blank: bool,
289 allocs: Allocations<'a>,
290 options: Options,
291 list_nesting: usize,
292 }
293
294 impl<'a> FirstPass<'a> {
295 fn new(text: &'a str, options: Options) -> FirstPass {
296 // This is a very naive heuristic for the number of nodes
297 // we'll need.
298 let start_capacity = max(128, text.len() / 32);
299 let tree = Tree::with_capacity(start_capacity);
300 let begin_list_item = false;
301 let last_line_blank = false;
302 let allocs = Allocations::new();
303 FirstPass {
304 text,
305 tree,
306 begin_list_item,
307 last_line_blank,
308 allocs,
309 options,
310 list_nesting: 0,
311 }
312 }
313
314 fn run(mut self) -> (Tree<Item>, Allocations<'a>) {
315 let mut ix = 0;
316 while ix < self.text.len() {
317 ix = self.parse_block(ix);
318 }
319 for _ in 0..self.tree.spine_len() {
320 self.pop(ix);
321 }
322 (self.tree, self.allocs)
323 }
324
325 /// Returns offset after block.
326 fn parse_block(&mut self, mut start_ix: usize) -> usize {
327 let bytes = self.text.as_bytes();
328 let mut line_start = LineStart::new(&bytes[start_ix..]);
329
330 let i = scan_containers(&self.tree, &mut line_start);
331 for _ in i..self.tree.spine_len() {
332 self.pop(start_ix);
333 }
334
335 if self.options.contains(Options::ENABLE_FOOTNOTES) {
336 // finish footnote if it's still open and was preceded by a blank line
337 if let Some(node_ix) = self.tree.peek_up() {
338 if let ItemBody::FootnoteDefinition(..) = self.tree[node_ix].item.body {
339 if self.last_line_blank {
340 self.pop(start_ix);
341 }
342 }
343 }
344
345 // Footnote definitions of the form
346 // [^bar]:
347 // * anything really
348 let container_start = start_ix + line_start.bytes_scanned();
349 if let Some(bytecount) = self.parse_footnote(container_start) {
350 start_ix = container_start + bytecount;
351 start_ix += scan_blank_line(&bytes[start_ix..]).unwrap_or(0);
352 line_start = LineStart::new(&bytes[start_ix..]);
353 }
354 }
355
356 // Process new containers
357 loop {
358 let container_start = start_ix + line_start.bytes_scanned();
359 if let Some((ch, index, indent)) = line_start.scan_list_marker() {
360 let after_marker_index = start_ix + line_start.bytes_scanned();
361 self.continue_list(container_start, ch, index);
362 self.tree.append(Item {
363 start: container_start,
364 end: after_marker_index, // will get updated later if item not empty
365 body: ItemBody::ListItem(indent),
366 });
367 self.tree.push();
368 if let Some(n) = scan_blank_line(&bytes[after_marker_index..]) {
369 self.begin_list_item = true;
370 return after_marker_index + n;
371 }
372 if self.options.contains(Options::ENABLE_TASKLISTS) {
373 if let Some(is_checked) = line_start.scan_task_list_marker() {
374 self.tree.append(Item {
375 start: after_marker_index,
376 end: start_ix + line_start.bytes_scanned(),
377 body: ItemBody::TaskListMarker(is_checked),
378 });
379 }
380 }
381 } else if line_start.scan_blockquote_marker() {
382 self.finish_list(start_ix);
383 self.tree.append(Item {
384 start: container_start,
385 end: 0, // will get set later
386 body: ItemBody::BlockQuote,
387 });
388 self.tree.push();
389 } else {
390 break;
391 }
392 }
393
394 let ix = start_ix + line_start.bytes_scanned();
395
396 if let Some(n) = scan_blank_line(&bytes[ix..]) {
397 if let Some(node_ix) = self.tree.peek_up() {
398 match self.tree[node_ix].item.body {
399 ItemBody::BlockQuote => (),
400 _ => {
401 if self.begin_list_item {
402 // A list item can begin with at most one blank line.
403 self.pop(start_ix);
404 }
405 self.last_line_blank = true;
406 }
407 }
408 }
409 return ix + n;
410 }
411
412 self.begin_list_item = false;
413 self.finish_list(start_ix);
414
415 // Save `remaining_space` here to avoid needing to backtrack `line_start` for HTML blocks
416 let remaining_space = line_start.remaining_space();
417
418 let indent = line_start.scan_space_upto(4);
419 if indent == 4 {
420 let ix = start_ix + line_start.bytes_scanned();
421 let remaining_space = line_start.remaining_space();
422 return self.parse_indented_code_block(ix, remaining_space);
423 }
424
425 let ix = start_ix + line_start.bytes_scanned();
426
427 // HTML Blocks
428 if bytes[ix] == b'<' {
429 // Types 1-5 are all detected by one function and all end with the same
430 // pattern
431 if let Some(html_end_tag) = get_html_end_tag(&bytes[(ix + 1)..]) {
432 return self.parse_html_block_type_1_to_5(ix, html_end_tag, remaining_space);
433 }
434
435 // Detect type 6
436 let possible_tag = scan_html_block_tag(&bytes[(ix + 1)..]).1;
437 if is_html_tag(possible_tag) {
438 return self.parse_html_block_type_6_or_7(ix, remaining_space);
439 }
440
441 // Detect type 7
442 if let Some(_html_bytes) = scan_html_type_7(&bytes[(ix + 1)..]) {
443 return self.parse_html_block_type_6_or_7(ix, remaining_space);
444 }
445 }
446
447 if let Ok(n) = scan_hrule(&bytes[ix..]) {
448 return self.parse_hrule(n, ix);
449 }
450
451 if let Some(atx_size) = scan_atx_heading(&bytes[ix..]) {
452 return self.parse_atx_heading(ix, atx_size);
453 }
454
455 // parse refdef
456 if let Some((bytecount, label, link_def)) = self.parse_refdef_total(ix) {
457 self.allocs.refdefs.entry(label).or_insert(link_def);
458 let ix = ix + bytecount;
459 // try to read trailing whitespace or it will register as a completely blank line
460 // TODO: shouldn't we do this for all block level items?
461 return ix + scan_blank_line(&bytes[ix..]).unwrap_or(0);
462 }
463
464 if let Some((n, fence_ch)) = scan_code_fence(&bytes[ix..]) {
465 return self.parse_fenced_code_block(ix, indent, fence_ch, n);
466 }
467 self.parse_paragraph(ix)
468 }
469
470 /// Returns the offset of the first line after the table.
471 /// Assumptions: current focus is a table element and the table header
472 /// matches the separator line (same number of columns).
473 fn parse_table(&mut self, table_cols: usize, head_start: usize, body_start: usize) -> usize {
474 // parse header. this shouldn't fail because we made sure the table header is ok
475 let (_sep_start, thead_ix) = self.parse_table_row_inner(head_start, table_cols);
476 self.tree[thead_ix].item.body = ItemBody::TableHead;
477
478 // parse body
479 let mut ix = body_start;
480 while let Some((next_ix, _row_ix)) = self.parse_table_row(ix, table_cols) {
481 ix = next_ix;
482 }
483
484 self.pop(ix);
485 ix
486 }
487
488 /// Call this when containers are taken care of.
489 /// Returns bytes scanned, row_ix
490 fn parse_table_row_inner(&mut self, mut ix: usize, row_cells: usize) -> (usize, TreeIndex) {
491 let bytes = self.text.as_bytes();
492 let mut cells = 0;
493 let mut final_cell_ix = None;
494
495 let row_ix = self.tree.append(Item {
496 start: ix,
497 end: 0, // set at end of this function
498 body: ItemBody::TableRow,
499 });
500 self.tree.push();
501
502 loop {
503 ix += scan_ch(&bytes[ix..], b'|');
504 ix += scan_whitespace_no_nl(&bytes[ix..]);
505
506 if let Some(eol_bytes) = scan_eol(&bytes[ix..]) {
507 ix += eol_bytes;
508 break;
509 }
510
511 let cell_ix = self.tree.append(Item {
512 start: ix,
513 end: ix,
514 body: ItemBody::TableCell,
515 });
516 self.tree.push();
517 let (next_ix, _brk) = self.parse_line(ix, TableParseMode::Active);
518 let trailing_whitespace = scan_rev_while(&bytes[..next_ix], is_ascii_whitespace);
519
520 if let Some(cur_ix) = self.tree.cur() {
521 self.tree[cur_ix].item.end -= trailing_whitespace;
522 }
523
524 self.tree[cell_ix].item.end = next_ix - trailing_whitespace;
525 self.tree.pop();
526
527 ix = next_ix;
528 cells += 1;
529
530 if cells == row_cells {
531 final_cell_ix = Some(cell_ix);
532 }
533 }
534
535 // fill empty cells if needed
536 // note: this is where GFM and commonmark-extra diverge. we follow
537 // GFM here
538 for _ in cells..row_cells {
539 self.tree.append(Item {
540 start: ix,
541 end: ix,
542 body: ItemBody::TableCell,
543 });
544 }
545
546 // drop excess cells
547 if let Some(cell_ix) = final_cell_ix {
548 self.tree[cell_ix].next = None;
549 }
550
551 self.pop(ix);
552
553 (ix, row_ix)
554 }
555
556 /// Returns first offset after the row and the tree index of the row.
557 fn parse_table_row(&mut self, mut ix: usize, row_cells: usize) -> Option<(usize, TreeIndex)> {
558 let bytes = self.text.as_bytes();
559 let mut line_start = LineStart::new(&bytes[ix..]);
560 let containers = scan_containers(&self.tree, &mut line_start);
561 if containers != self.tree.spine_len() {
562 return None;
563 }
564 line_start.scan_all_space();
565 ix += line_start.bytes_scanned();
566 if scan_paragraph_interrupt(&bytes[ix..]) {
567 return None;
568 }
569
570 let (ix, row_ix) = self.parse_table_row_inner(ix, row_cells);
571 Some((ix, row_ix))
572 }
573
574 /// Returns offset of line start after paragraph.
575 fn parse_paragraph(&mut self, start_ix: usize) -> usize {
576 let node_ix = self.tree.append(Item {
577 start: start_ix,
578 end: 0, // will get set later
579 body: ItemBody::Paragraph,
580 });
581 self.tree.push();
582 let bytes = self.text.as_bytes();
583
584 let mut ix = start_ix;
585 loop {
586 let scan_mode = if self.options.contains(Options::ENABLE_TABLES) && ix == start_ix {
587 TableParseMode::Scan
588 } else {
589 TableParseMode::Disabled
590 };
591 let (next_ix, brk) = self.parse_line(ix, scan_mode);
592
593 // break out when we find a table
594 if let Some(Item {
595 body: ItemBody::Table(alignment_ix),
596 ..
597 }) = brk
598 {
599 let table_cols = self.allocs[alignment_ix].len();
600 self.tree[node_ix].item.body = ItemBody::Table(alignment_ix);
601 // this clears out any stuff we may have appended - but there may
602 // be a cleaner way
603 self.tree[node_ix].child = None;
604 self.tree.pop();
605 self.tree.push();
606 return self.parse_table(table_cols, ix, next_ix);
607 }
608
609 ix = next_ix;
610 let mut line_start = LineStart::new(&bytes[ix..]);
611 let n_containers = scan_containers(&self.tree, &mut line_start);
612 if !line_start.scan_space(4) {
613 let ix_new = ix + line_start.bytes_scanned();
614 if n_containers == self.tree.spine_len() {
615 if let Some(ix_setext) = self.parse_setext_heading(ix_new, node_ix) {
616 if let Some(Item {
617 start,
618 body: ItemBody::HardBreak,
619 ..
620 }) = brk
621 {
622 if bytes[start] == b'\\' {
623 self.tree.append_text(start, start + 1);
624 }
625 }
626 ix = ix_setext;
627 break;
628 }
629 }
630 // first check for non-empty lists, then for other interrupts
631 let suffix = &bytes[ix_new..];
632 if self.interrupt_paragraph_by_list(suffix) || scan_paragraph_interrupt(suffix) {
633 break;
634 }
635 }
636 line_start.scan_all_space();
637 if line_start.is_at_eol() {
638 break;
639 }
640 ix = next_ix + line_start.bytes_scanned();
641 if let Some(item) = brk {
642 self.tree.append(item);
643 }
644 }
645
646 self.pop(ix);
647 ix
648 }
649
650 /// Returns end ix of setext_heading on success.
651 fn parse_setext_heading(&mut self, ix: usize, node_ix: TreeIndex) -> Option<usize> {
652 let bytes = self.text.as_bytes();
653 let (n, level) = scan_setext_heading(&bytes[ix..])?;
654 self.tree[node_ix].item.body = ItemBody::Heading(level);
655
656 // strip trailing whitespace
657 if let Some(cur_ix) = self.tree.cur() {
658 self.tree[cur_ix].item.end -= scan_rev_while(
659 &bytes[..self.tree[cur_ix].item.end],
660 is_ascii_whitespace_no_nl,
661 );
662 }
663
664 Some(ix + n)
665 }
666
667 /// Parse a line of input, appending text and items to tree.
668 ///
669 /// Returns: index after line and an item representing the break.
670 fn parse_line(&mut self, start: usize, mode: TableParseMode) -> (usize, Option<Item>) {
671 let bytes = &self.text.as_bytes();
672 let mut pipes = 0;
673 let mut last_pipe_ix = start;
674 let mut begin_text = start;
675
676 let (final_ix, brk) = iterate_special_bytes(bytes, start, |ix, byte| {
677 match byte {
678 b'\n' | b'\r' => {
679 if let TableParseMode::Active = mode {
680 return LoopInstruction::BreakAtWith(ix, None);
681 }
682
683 let mut i = ix;
684 let eol_bytes = scan_eol(&bytes[ix..]).unwrap();
685 if mode == TableParseMode::Scan && pipes > 0 {
686 // check if we may be parsing a table
687 let next_line_ix = ix + eol_bytes;
688 let mut line_start = LineStart::new(&bytes[next_line_ix..]);
689 if scan_containers(&self.tree, &mut line_start) == self.tree.spine_len() {
690 let table_head_ix = next_line_ix + line_start.bytes_scanned();
691 let (table_head_bytes, alignment) =
692 scan_table_head(&bytes[table_head_ix..]);
693
694 if table_head_bytes > 0 {
695 // computing header count from number of pipes
696 let header_count =
697 count_header_cols(bytes, pipes, start, last_pipe_ix);
698
699 // make sure they match the number of columns we find in separator line
700 if alignment.len() == header_count {
701 let alignment_ix = self.allocs.allocate_alignment(alignment);
702 let end_ix = table_head_ix + table_head_bytes;
703 return LoopInstruction::BreakAtWith(
704 end_ix,
705 Some(Item {
706 start: i,
707 end: end_ix, // must update later
708 body: ItemBody::Table(alignment_ix),
709 }),
710 );
711 }
712 }
713 }
714 }
715
716 let end_ix = ix + eol_bytes;
717 let trailing_backslashes = scan_rev_while(&bytes[..ix], |b| b == b'\\');
718 if trailing_backslashes % 2 == 1 && end_ix < self.text.len() {
719 i -= 1;
720 self.tree.append_text(begin_text, i);
721 return LoopInstruction::BreakAtWith(
722 end_ix,
723 Some(Item {
724 start: i,
725 end: end_ix,
726 body: ItemBody::HardBreak,
727 }),
728 );
729 }
730 let trailing_whitespace =
731 scan_rev_while(&bytes[..ix], is_ascii_whitespace_no_nl);
732 if trailing_whitespace >= 2 {
733 i -= trailing_whitespace;
734 self.tree.append_text(begin_text, i);
735 return LoopInstruction::BreakAtWith(
736 end_ix,
737 Some(Item {
738 start: i,
739 end: end_ix,
740 body: ItemBody::HardBreak,
741 }),
742 );
743 }
744
745 self.tree.append_text(begin_text, ix);
746 LoopInstruction::BreakAtWith(
747 end_ix,
748 Some(Item {
749 start: i,
750 end: end_ix,
751 body: ItemBody::SoftBreak,
752 }),
753 )
754 }
755 b'\\' => {
756 if ix + 1 < self.text.len() && is_ascii_punctuation(bytes[ix + 1]) {
757 self.tree.append_text(begin_text, ix);
758 if bytes[ix + 1] == b'`' {
759 let count = 1 + scan_ch_repeat(&bytes[(ix + 2)..], b'`');
760 self.tree.append(Item {
761 start: ix + 1,
762 end: ix + count + 1,
763 body: ItemBody::MaybeCode(count, true),
764 });
765 begin_text = ix + 1 + count;
766 LoopInstruction::ContinueAndSkip(count)
767 } else {
768 begin_text = ix + 1;
769 LoopInstruction::ContinueAndSkip(1)
770 }
771 } else {
772 LoopInstruction::ContinueAndSkip(0)
773 }
774 }
775 c @ b'*' | c @ b'_' | c @ b'~' => {
776 let string_suffix = &self.text[ix..];
777 let count = 1 + scan_ch_repeat(&string_suffix.as_bytes()[1..], c);
778 let can_open = delim_run_can_open(self.text, string_suffix, count, ix);
779 let can_close = delim_run_can_close(self.text, string_suffix, count, ix);
780 let is_valid_seq = c != b'~'
781 || count == 2 && self.options.contains(Options::ENABLE_STRIKETHROUGH);
782
783 if (can_open || can_close) && is_valid_seq {
784 self.tree.append_text(begin_text, ix);
785 for i in 0..count {
786 self.tree.append(Item {
787 start: ix + i,
788 end: ix + i + 1,
789 body: ItemBody::MaybeEmphasis(count - i, can_open, can_close),
790 });
791 }
792 begin_text = ix + count;
793 }
794 LoopInstruction::ContinueAndSkip(count - 1)
795 }
796 b'`' => {
797 self.tree.append_text(begin_text, ix);
798 let count = 1 + scan_ch_repeat(&bytes[(ix + 1)..], b'`');
799 self.tree.append(Item {
800 start: ix,
801 end: ix + count,
802 body: ItemBody::MaybeCode(count, false),
803 });
804 begin_text = ix + count;
805 LoopInstruction::ContinueAndSkip(count - 1)
806 }
807 b'<' => {
808 // Note: could detect some non-HTML cases and early escape here, but not
809 // clear that's a win.
810 self.tree.append_text(begin_text, ix);
811 self.tree.append(Item {
812 start: ix,
813 end: ix + 1,
814 body: ItemBody::MaybeHtml,
815 });
816 begin_text = ix + 1;
817 LoopInstruction::ContinueAndSkip(0)
818 }
819 b'!' => {
820 if ix + 1 < self.text.len() && bytes[ix + 1] == b'[' {
821 self.tree.append_text(begin_text, ix);
822 self.tree.append(Item {
823 start: ix,
824 end: ix + 2,
825 body: ItemBody::MaybeImage,
826 });
827 begin_text = ix + 2;
828 LoopInstruction::ContinueAndSkip(1)
829 } else {
830 LoopInstruction::ContinueAndSkip(0)
831 }
832 }
833 b'[' => {
834 self.tree.append_text(begin_text, ix);
835 self.tree.append(Item {
836 start: ix,
837 end: ix + 1,
838 body: ItemBody::MaybeLinkOpen,
839 });
840 begin_text = ix + 1;
841 LoopInstruction::ContinueAndSkip(0)
842 }
843 b']' => {
844 self.tree.append_text(begin_text, ix);
845 self.tree.append(Item {
846 start: ix,
847 end: ix + 1,
848 body: ItemBody::MaybeLinkClose,
849 });
850 begin_text = ix + 1;
851 LoopInstruction::ContinueAndSkip(0)
852 }
853 b'&' => match scan_entity(&bytes[ix..]) {
854 (n, Some(value)) => {
855 self.tree.append_text(begin_text, ix);
856 self.tree.append(Item {
857 start: ix,
858 end: ix + n,
859 body: ItemBody::SynthesizeText(self.allocs.allocate_cow(value)),
860 });
861 begin_text = ix + n;
862 LoopInstruction::ContinueAndSkip(n - 1)
863 }
864 _ => LoopInstruction::ContinueAndSkip(0),
865 },
866 b'|' => {
867 if let TableParseMode::Active = mode {
868 LoopInstruction::BreakAtWith(ix, None)
869 } else {
870 last_pipe_ix = ix;
871 pipes += 1;
872 LoopInstruction::ContinueAndSkip(0)
873 }
874 }
875 _ => LoopInstruction::ContinueAndSkip(0),
876 }
877 });
878
879 if brk.is_none() {
880 // need to close text at eof
881 self.tree.append_text(begin_text, final_ix);
882 }
883 (final_ix, brk)
884 }
885
886 /// Check whether we should allow a paragraph interrupt by lists. Only non-empty
887 /// lists are allowed.
888 fn interrupt_paragraph_by_list(&self, suffix: &[u8]) -> bool {
889 scan_listitem(suffix).map_or(false, |(ix, delim, index, _)| {
890 self.list_nesting > 0 ||
891 // we don't allow interruption by either empty lists or
892 // numbered lists starting at an index other than 1
893 !scan_empty_list(&suffix[ix..]) && (delim == b'*' || delim == b'-' || index == 1)
894 })
895 }
896
897 /// When start_ix is at the beginning of an HTML block of type 1 to 5,
898 /// this will find the end of the block, adding the block itself to the
899 /// tree and also keeping track of the lines of HTML within the block.
900 ///
901 /// The html_end_tag is the tag that must be found on a line to end the block.
902 fn parse_html_block_type_1_to_5(
903 &mut self,
904 start_ix: usize,
905 html_end_tag: &str,
906 mut remaining_space: usize,
907 ) -> usize {
908 let bytes = self.text.as_bytes();
909 let mut ix = start_ix;
910 loop {
911 let line_start_ix = ix;
912 ix += scan_nextline(&bytes[ix..]);
913 self.append_html_line(remaining_space, line_start_ix, ix);
914
915 let mut line_start = LineStart::new(&bytes[ix..]);
916 let n_containers = scan_containers(&self.tree, &mut line_start);
917 if n_containers < self.tree.spine_len() {
918 break;
919 }
920
921 if (&self.text[line_start_ix..ix]).contains(html_end_tag) {
922 break;
923 }
924
925 let next_line_ix = ix + line_start.bytes_scanned();
926 if next_line_ix == self.text.len() {
927 break;
928 }
929 ix = next_line_ix;
930 remaining_space = line_start.remaining_space();
931 }
932 ix
933 }
934
935 /// When start_ix is at the beginning of an HTML block of type 6 or 7,
936 /// this will consume lines until there is a blank line and keep track of
937 /// the HTML within the block.
938 fn parse_html_block_type_6_or_7(
939 &mut self,
940 start_ix: usize,
941 mut remaining_space: usize,
942 ) -> usize {
943 let bytes = self.text.as_bytes();
944 let mut ix = start_ix;
945 loop {
946 let line_start_ix = ix;
947 ix += scan_nextline(&bytes[ix..]);
948 self.append_html_line(remaining_space, line_start_ix, ix);
949
950 let mut line_start = LineStart::new(&bytes[ix..]);
951 let n_containers = scan_containers(&self.tree, &mut line_start);
952 if n_containers < self.tree.spine_len() || line_start.is_at_eol() {
953 break;
954 }
955
956 let next_line_ix = ix + line_start.bytes_scanned();
957 if next_line_ix == self.text.len() || scan_blank_line(&bytes[next_line_ix..]).is_some()
958 {
959 break;
960 }
961 ix = next_line_ix;
962 remaining_space = line_start.remaining_space();
963 }
964 ix
965 }
966
967 fn parse_indented_code_block(&mut self, start_ix: usize, mut remaining_space: usize) -> usize {
968 self.tree.append(Item {
969 start: start_ix,
970 end: 0, // will get set later
971 body: ItemBody::IndentCodeBlock,
972 });
973 self.tree.push();
974 let bytes = self.text.as_bytes();
975 let mut last_nonblank_child = None;
976 let mut last_nonblank_ix = 0;
977 let mut end_ix = 0;
978 let mut last_line_blank = false;
979
980 let mut ix = start_ix;
981 loop {
982 let line_start_ix = ix;
983 ix += scan_nextline(&bytes[ix..]);
984 self.append_code_text(remaining_space, line_start_ix, ix);
985 // TODO(spec clarification): should we synthesize newline at EOF?
986
987 if !last_line_blank {
988 last_nonblank_child = self.tree.cur();
989 last_nonblank_ix = ix;
990 end_ix = ix;
991 }
992
993 let mut line_start = LineStart::new(&bytes[ix..]);
994 let n_containers = scan_containers(&self.tree, &mut line_start);
995 if n_containers < self.tree.spine_len()
996 || !(line_start.scan_space(4) || line_start.is_at_eol())
997 {
998 break;
999 }
1000 let next_line_ix = ix + line_start.bytes_scanned();
1001 if next_line_ix == self.text.len() {
1002 break;
1003 }
1004 ix = next_line_ix;
1005 remaining_space = line_start.remaining_space();
1006 last_line_blank = scan_blank_line(&bytes[ix..]).is_some();
1007 }
1008
1009 // Trim trailing blank lines.
1010 if let Some(child) = last_nonblank_child {
1011 self.tree[child].next = None;
1012 self.tree[child].item.end = last_nonblank_ix;
1013 }
1014 self.pop(end_ix);
1015 ix
1016 }
1017
1018 fn parse_fenced_code_block(
1019 &mut self,
1020 start_ix: usize,
1021 indent: usize,
1022 fence_ch: u8,
1023 n_fence_char: usize,
1024 ) -> usize {
1025 let bytes = self.text.as_bytes();
1026 let mut info_start = start_ix + n_fence_char;
1027 info_start += scan_whitespace_no_nl(&bytes[info_start..]);
1028 // TODO: info strings are typically very short. wouldn't it be faster
1029 // to just do a forward scan here?
1030 let mut ix = info_start + scan_nextline(&bytes[info_start..]);
1031 let info_end = ix - scan_rev_while(&bytes[info_start..ix], is_ascii_whitespace);
1032 let info_string = unescape(&self.text[info_start..info_end]);
1033 self.tree.append(Item {
1034 start: start_ix,
1035 end: 0, // will get set later
1036 body: ItemBody::FencedCodeBlock(self.allocs.allocate_cow(info_string)),
1037 });
1038 self.tree.push();
1039 loop {
1040 let mut line_start = LineStart::new(&bytes[ix..]);
1041 let n_containers = scan_containers(&self.tree, &mut line_start);
1042 if n_containers < self.tree.spine_len() {
1043 break;
1044 }
1045 line_start.scan_space(indent);
1046 let mut close_line_start = line_start.clone();
1047 if !close_line_start.scan_space(4) {
1048 let close_ix = ix + close_line_start.bytes_scanned();
1049 if let Some(n) = scan_closing_code_fence(&bytes[close_ix..], fence_ch, n_fence_char)
1050 {
1051 ix = close_ix + n;
1052 break;
1053 }
1054 }
1055 let remaining_space = line_start.remaining_space();
1056 ix += line_start.bytes_scanned();
1057 let next_ix = ix + scan_nextline(&bytes[ix..]);
1058 self.append_code_text(remaining_space, ix, next_ix);
1059 ix = next_ix;
1060 }
1061
1062 self.pop(ix);
1063
1064 // try to read trailing whitespace or it will register as a completely blank line
1065 ix + scan_blank_line(&bytes[ix..]).unwrap_or(0)
1066 }
1067
1068 fn append_code_text(&mut self, remaining_space: usize, start: usize, end: usize) {
1069 if remaining_space > 0 {
1070 let cow_ix = self.allocs.allocate_cow(" "[..remaining_space].into());
1071 self.tree.append(Item {
1072 start,
1073 end: start,
1074 body: ItemBody::SynthesizeText(cow_ix),
1075 });
1076 }
1077 if self.text.as_bytes()[end - 2] == b'\r' {
1078 // Normalize CRLF to LF
1079 self.tree.append_text(start, end - 2);
1080 self.tree.append_text(end - 1, end);
1081 } else {
1082 self.tree.append_text(start, end);
1083 }
1084 }
1085
1086 /// Appends a line of HTML to the tree.
1087 fn append_html_line(&mut self, remaining_space: usize, start: usize, end: usize) {
1088 if remaining_space > 0 {
1089 let cow_ix = self.allocs.allocate_cow(" "[..remaining_space].into());
1090 self.tree.append(Item {
1091 start,
1092 end: start,
1093 // TODO: maybe this should synthesize to html rather than text?
1094 body: ItemBody::SynthesizeText(cow_ix),
1095 });
1096 }
1097 if self.text.as_bytes()[end - 2] == b'\r' {
1098 // Normalize CRLF to LF
1099 self.tree.append(Item {
1100 start,
1101 end: end - 2,
1102 body: ItemBody::Html,
1103 });
1104 self.tree.append(Item {
1105 start: end - 1,
1106 end,
1107 body: ItemBody::Html,
1108 });
1109 } else {
1110 self.tree.append(Item {
1111 start,
1112 end,
1113 body: ItemBody::Html,
1114 });
1115 }
1116 }
1117
1118 /// Pop a container, setting its end.
1119 fn pop(&mut self, ix: usize) {
1120 let cur_ix = self.tree.pop().unwrap();
1121 self.tree[cur_ix].item.end = ix;
1122 if let ItemBody::List(true, _, _) = self.tree[cur_ix].item.body {
1123 surgerize_tight_list(&mut self.tree, cur_ix);
1124 }
1125 }
1126
1127 /// Close a list if it's open. Also mark the containing list as loose if the last line was blank.
1128 fn finish_list(&mut self, ix: usize) {
1129 if let Some(node_ix) = self.tree.peek_up() {
1130 if let ItemBody::List(_, _, _) = self.tree[node_ix].item.body {
1131 self.pop(ix);
1132 self.list_nesting -= 1;
1133 }
1134 }
1135 if self.last_line_blank {
1136 if let Some(node_ix) = self.tree.peek_grandparent() {
1137 if let ItemBody::List(ref mut is_tight, _, _) = self.tree[node_ix].item.body {
1138 *is_tight = false;
1139 }
1140 }
1141 self.last_line_blank = false;
1142 }
1143 }
1144
1145 /// Continue an existing list or start a new one if there's not an open
1146 /// list that matches.
1147 fn continue_list(&mut self, start: usize, ch: u8, index: u64) {
1148 if let Some(node_ix) = self.tree.peek_up() {
1149 if let ItemBody::List(ref mut is_tight, existing_ch, _) = self.tree[node_ix].item.body {
1150 if existing_ch == ch {
1151 if self.last_line_blank {
1152 *is_tight = false;
1153 self.last_line_blank = false;
1154 }
1155 return;
1156 }
1157 }
1158 // TODO: this is not the best choice for end; maybe get end from last list item.
1159 self.finish_list(start);
1160 }
1161 self.tree.append(Item {
1162 start,
1163 end: 0, // will get set later
1164 body: ItemBody::List(true, ch, index),
1165 });
1166 self.list_nesting += 1;
1167 self.tree.push();
1168 self.last_line_blank = false;
1169 }
1170
1171 /// Parse a thematic break.
1172 ///
1173 /// Returns index of start of next line.
1174 fn parse_hrule(&mut self, hrule_size: usize, ix: usize) -> usize {
1175 self.tree.append(Item {
1176 start: ix,
1177 end: ix + hrule_size,
1178 body: ItemBody::Rule,
1179 });
1180 ix + hrule_size
1181 }
1182
1183 /// Parse an ATX heading.
1184 ///
1185 /// Returns index of start of next line.
1186 fn parse_atx_heading(&mut self, mut ix: usize, atx_size: usize) -> usize {
1187 let heading_ix = self.tree.append(Item {
1188 start: ix,
1189 end: 0, // set later
1190 body: ItemBody::Heading(atx_size as u32),
1191 });
1192 ix += atx_size;
1193 // next char is space or eol (guaranteed by scan_atx_heading)
1194 let bytes = self.text.as_bytes();
1195 if let Some(eol_bytes) = scan_eol(&bytes[ix..]) {
1196 self.tree[heading_ix].item.end = ix + eol_bytes;
1197 return ix + eol_bytes;
1198 }
1199 // skip leading spaces
1200 let skip_spaces = scan_whitespace_no_nl(&bytes[ix..]);
1201 ix += skip_spaces;
1202
1203 // now handle the header text
1204 let header_start = ix;
1205 let header_node_idx = self.tree.push(); // so that we can set the endpoint later
1206 ix = self.parse_line(ix, TableParseMode::Disabled).0;
1207 self.tree[header_node_idx].item.end = ix;
1208
1209 // remove trailing matter from header text
1210 if let Some(cur_ix) = self.tree.cur() {
1211 let header_text = &bytes[header_start..ix];
1212 let mut limit = header_text
1213 .iter()
1214 .rposition(|&b| !(b == b'\n' || b == b'\r' || b == b' '))
1215 .map_or(0, |i| i + 1);
1216 let closer = header_text[..limit]
1217 .iter()
1218 .rposition(|&b| b != b'#')
1219 .map_or(0, |i| i + 1);
1220 if closer == 0 {
1221 limit = closer;
1222 } else {
1223 let spaces = scan_rev_while(&header_text[..closer], |b| b == b' ');
1224 if spaces > 0 {
1225 limit = closer - spaces;
1226 }
1227 }
1228 self.tree[cur_ix].item.end = limit + header_start;
1229 }
1230
1231 self.tree.pop();
1232 ix
1233 }
1234
1235 /// Returns the number of bytes scanned on success.
1236 fn parse_footnote(&mut self, start: usize) -> Option<usize> {
1237 let bytes = &self.text.as_bytes()[start..];
1238 if !bytes.starts_with(b"[^") {
1239 return None;
1240 }
1241 let (mut i, label) = self.parse_refdef_label(start + 2)?;
1242 i += 2;
1243 if scan_ch(&bytes[i..], b':') == 0 {
1244 return None;
1245 }
1246 i += 1;
1247 self.finish_list(start);
1248 self.tree.append(Item {
1249 start,
1250 end: 0, // will get set later
1251 // TODO: check whether the label here is strictly necessary
1252 body: ItemBody::FootnoteDefinition(self.allocs.allocate_cow(label)),
1253 });
1254 self.tree.push();
1255 Some(i)
1256 }
1257
1258 /// Tries to parse a reference label, which can be interrupted by new blocks.
1259 /// On success, returns the number of bytes of the label and the label itself.
1260 fn parse_refdef_label(&self, start: usize) -> Option<(usize, CowStr<'a>)> {
1261 scan_link_label_rest(&self.text[start..], &|bytes| {
1262 let mut line_start = LineStart::new(bytes);
1263 let _ = scan_containers(&self.tree, &mut line_start);
1264 let bytes_scanned = line_start.bytes_scanned();
1265
1266 let suffix = &bytes[bytes_scanned..];
1267 if self.interrupt_paragraph_by_list(suffix) || scan_paragraph_interrupt(suffix) {
1268 None
1269 } else {
1270 Some(bytes_scanned)
1271 }
1272 })
1273 }
1274
1275 /// Returns number of bytes scanned, label and definition on success.
1276 fn parse_refdef_total(&mut self, start: usize) -> Option<(usize, LinkLabel<'a>, LinkDef<'a>)> {
1277 let bytes = &self.text.as_bytes()[start..];
1278 if scan_ch(bytes, b'[') == 0 {
1279 return None;
1280 }
1281 let (mut i, label) = self.parse_refdef_label(start + 1)?;
1282 i += 1;
1283 if scan_ch(&bytes[i..], b':') == 0 {
1284 return None;
1285 }
1286 i += 1;
1287 let (bytecount, link_def) = self.scan_refdef(start + i)?;
1288 Some((bytecount + i, UniCase::new(label), link_def))
1289 }
1290
1291 /// Returns number of bytes and number of newlines
1292 fn scan_refdef_space(&self, bytes: &[u8], mut i: usize) -> Option<(usize, usize)> {
1293 let mut newlines = 0;
1294 loop {
1295 let whitespaces = scan_whitespace_no_nl(&bytes[i..]);
1296 i += whitespaces;
1297 if let Some(eol_bytes) = scan_eol(&bytes[i..]) {
1298 i += eol_bytes;
1299 newlines += 1;
1300 if newlines > 1 {
1301 return None;
1302 }
1303 } else {
1304 break;
1305 }
1306 let mut line_start = LineStart::new(&bytes[i..]);
1307 if self.tree.spine_len() != scan_containers(&self.tree, &mut line_start) {
1308 return None;
1309 }
1310 i += line_start.bytes_scanned();
1311 }
1312 Some((i, newlines))
1313 }
1314
1315 /// Returns # of bytes and definition.
1316 /// Assumes the label of the reference including colon has already been scanned.
1317 fn scan_refdef(&self, start: usize) -> Option<(usize, LinkDef<'a>)> {
1318 let bytes = self.text.as_bytes();
1319
1320 // whitespace between label and url (including up to one newline)
1321 let (mut i, _newlines) = self.scan_refdef_space(bytes, start)?;
1322
1323 // scan link dest
1324 let (dest_length, dest) = scan_link_dest(self.text, i, 1)?;
1325 if dest_length == 0 {
1326 return None;
1327 }
1328 let dest = unescape(dest);
1329 i += dest_length;
1330
1331 // no title
1332 let mut backup = (i - start, LinkDef { dest, title: None });
1333
1334 // scan whitespace between dest and label
1335 let (mut i, newlines) =
1336 if let Some((new_i, mut newlines)) = self.scan_refdef_space(bytes, i) {
1337 if i == self.text.len() {
1338 newlines += 1;
1339 }
1340 if new_i == i && newlines == 0 {
1341 return None;
1342 }
1343 if newlines > 1 {
1344 return Some(backup);
1345 };
1346 (new_i, newlines)
1347 } else {
1348 return Some(backup);
1349 };
1350
1351 // scan title
1352 // if this fails but newline == 1, return also a refdef without title
1353 if let Some((title_length, title)) = scan_refdef_title(&self.text[i..]) {
1354 i += title_length;
1355 backup.1.title = Some(unescape(title));
1356 } else if newlines > 0 {
1357 return Some(backup);
1358 } else {
1359 return None;
1360 };
1361
1362 // scan EOL
1363 if let Some(bytes) = scan_blank_line(&bytes[i..]) {
1364 backup.0 = i + bytes - start;
1365 Some(backup)
1366 } else if newlines > 0 {
1367 Some(backup)
1368 } else {
1369 None
1370 }
1371 }
1372 }
1373
1374 /// Returns number of containers scanned.
1375 fn scan_containers(tree: &Tree<Item>, line_start: &mut LineStart) -> usize {
1376 let mut i = 0;
1377 for &node_ix in tree.walk_spine() {
1378 match tree[node_ix].item.body {
1379 ItemBody::BlockQuote => {
1380 let save = line_start.clone();
1381 if !line_start.scan_blockquote_marker() {
1382 *line_start = save;
1383 break;
1384 }
1385 }
1386 ItemBody::ListItem(indent) => {
1387 let save = line_start.clone();
1388 if !line_start.scan_space(indent) {
1389 if !line_start.is_at_eol() {
1390 *line_start = save;
1391 break;
1392 }
1393 }
1394 }
1395 _ => (),
1396 }
1397 i += 1;
1398 }
1399 i
1400 }
1401
1402 /// Computes the number of header columns in a table line by counting the dividing pipes
1403 /// that aren't followed or preceded by whitespace.
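///
/// For example, `| foo | bar |` has two header columns even though it
/// contains three pipes.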
1404 fn count_header_cols(
1405 bytes: &[u8],
1406 mut pipes: usize,
1407 mut start: usize,
1408 last_pipe_ix: usize,
1409 ) -> usize {
1410 // was first pipe preceded by whitespace? if so, subtract one
1411 start += scan_whitespace_no_nl(&bytes[start..]);
1412 if bytes[start] == b'|' {
1413 pipes -= 1;
1414 }
1415
1416 // was last pipe followed by whitespace? if so, sub one
1417 if scan_blank_line(&bytes[(last_pipe_ix + 1)..]).is_some() {
1418 pipes
1419 } else {
1420 pipes + 1
1421 }
1422 }
1423
1424 impl<'a> Tree<Item> {
1425 fn append_text(&mut self, start: usize, end: usize) {
1426 if end > start {
1427 if let Some(ix) = self.cur() {
1428 if ItemBody::Text == self[ix].item.body && self[ix].item.end == start {
1429 self[ix].item.end = end;
1430 return;
1431 }
1432 }
1433 self.append(Item {
1434 start,
1435 end,
1436 body: ItemBody::Text,
1437 });
1438 }
1439 }
1440 }
1441
1442 /// Determines whether the delimiter run starting at given index is
1443 /// left-flanking, as defined by the commonmark spec (and isn't intraword
1444 /// for _ delims).
1445 /// suffix is &s[ix..], which is passed in as an optimization, since taking
1446 /// a string subslice is O(n).
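///
/// For example, in `foo *bar*` the first `*` can open (the next character is
/// not whitespace), while the underscores in `foo_bar_baz` can neither open
/// nor close because the runs are intraword.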
1447 fn delim_run_can_open(s: &str, suffix: &str, run_len: usize, ix: usize) -> bool {
1448 let next_char = if let Some(c) = suffix.chars().nth(run_len) {
1449 c
1450 } else {
1451 return false;
1452 };
1453 if next_char.is_whitespace() {
1454 return false;
1455 }
1456 if ix == 0 {
1457 return true;
1458 }
1459 let delim = suffix.chars().next().unwrap();
1460 if delim == '*' && !is_punctuation(next_char) {
1461 return true;
1462 }
1463
1464 let prev_char = s[..ix].chars().last().unwrap();
1465
1466 prev_char.is_whitespace() || is_punctuation(prev_char)
1467 }
1468
1469 /// Determines whether the delimiter run starting at given index is
1470 /// right-flanking, as defined by the commonmark spec (and isn't intraword
1471 /// for _ delims).
1472 fn delim_run_can_close(s: &str, suffix: &str, run_len: usize, ix: usize) -> bool {
1473 if ix == 0 {
1474 return false;
1475 }
1476 let prev_char = s[..ix].chars().last().unwrap();
1477 if prev_char.is_whitespace() {
1478 return false;
1479 }
1480 let next_char = if let Some(c) = suffix.chars().nth(run_len) {
1481 c
1482 } else {
1483 return true;
1484 };
1485 let delim = suffix.chars().next().unwrap();
1486 if delim == '*' && !is_punctuation(prev_char) {
1487 return true;
1488 }
1489
1490 next_char.is_whitespace() || is_punctuation(next_char)
1491 }
1492
1493 /// Checks whether we should break a paragraph on the given input.
1494 /// Note: lists are dealt with in `interrupt_paragraph_by_list`, because determining
1495 /// whether to break on a list requires additional context.
1496 fn scan_paragraph_interrupt(bytes: &[u8]) -> bool {
1497 if scan_eol(bytes).is_some()
1498 || scan_hrule(bytes).is_ok()
1499 || scan_atx_heading(bytes).is_some()
1500 || scan_code_fence(bytes).is_some()
1501 || scan_blockquote_start(bytes).is_some()
1502 {
1503 return true;
1504 }
1505 bytes.starts_with(b"<")
1506 && (get_html_end_tag(&bytes[1..]).is_some()
1507 || is_html_tag(scan_html_block_tag(&bytes[1..]).1))
1508 }
1509
1510 /// Assumes `text_bytes` is preceded by `<`.
1511 fn get_html_end_tag(text_bytes: &[u8]) -> Option<&'static str> {
1512 static BEGIN_TAGS: &[&[u8]; 3] = &[b"pre", b"style", b"script"];
1513 static ST_BEGIN_TAGS: &[&[u8]; 3] = &[b"!--", b"?", b"![CDATA["];
1514
1515 for (beg_tag, end_tag) in BEGIN_TAGS
1516 .iter()
1517 .zip(["</pre>", "</style>", "</script>"].iter())
1518 {
1519 let tag_len = beg_tag.len();
1520
1521 if text_bytes.len() < tag_len {
1522 // begin tags are increasing in size
1523 break;
1524 }
1525
1526 if !text_bytes[..tag_len].eq_ignore_ascii_case(beg_tag) {
1527 continue;
1528 }
1529
1530 // Must either be the end of the line...
1531 if text_bytes.len() == tag_len {
1532 return Some(end_tag);
1533 }
1534
1535 // ...or be followed by whitespace, newline, or '>'.
1536 let s = text_bytes[tag_len];
1537 if is_ascii_whitespace(s) || s == b'>' {
1538 return Some(end_tag);
1539 }
1540 }
1541
1542 for (beg_tag, end_tag) in ST_BEGIN_TAGS.iter().zip(["-->", "?>", "]]>"].iter()) {
1543 if text_bytes.starts_with(beg_tag) {
1544 return Some(end_tag);
1545 }
1546 }
1547
1548 if text_bytes.len() > 1
1549 && text_bytes[0] == b'!'
1550 && text_bytes[1] >= b'A'
1551 && text_bytes[1] <= b'Z'
1552 {
1553 Some(">")
1554 } else {
1555 None
1556 }
1557 }
1558
1559 #[derive(Copy, Clone, Debug)]
1560 struct InlineEl {
1561 start: TreeIndex, // offset of tree node
1562 count: usize,
1563 c: u8, // b'*' or b'_'
1564 both: bool, // can both open and close
1565 }
1566
1567 #[derive(Debug, Clone, Default)]
1568 struct InlineStack {
1569 stack: Vec<InlineEl>,
1570 // Lower bounds for matching indices in the stack. For example
1571 // a strikethrough delimiter will never match with any element
1572 // in the stack with index smaller than
1573 // `lower_bounds[InlineStack::TILDES]`.
1574 lower_bounds: [usize; 7],
1575 }
1576
1577 impl InlineStack {
1578 /// These are indices into the lower bounds array.
1579 /// "Not both" refers to the property that the delimiter cannot be both
1580 /// an opener and a closer.
1581 const UNDERSCORE_NOT_BOTH: usize = 0;
1582 const ASTERISK_NOT_BOTH: usize = 1;
1583 const ASTERISK_BASE: usize = 2;
1584 const TILDES: usize = 5;
1585 const UNDERSCORE_BOTH: usize = 6;
1586
1587 fn pop_all(&mut self, tree: &mut Tree<Item>) {
1588 for el in self.stack.drain(..) {
1589 for i in 0..el.count {
1590 tree[el.start + i].item.body = ItemBody::Text;
1591 }
1592 }
1593 self.lower_bounds = [0; 7];
1594 }
1595
1596 fn get_lowerbound(&self, c: u8, count: usize, both: bool) -> usize {
1597 if c == b'_' {
1598 if both {
1599 self.lower_bounds[InlineStack::UNDERSCORE_BOTH]
1600 } else {
1601 self.lower_bounds[InlineStack::UNDERSCORE_NOT_BOTH]
1602 }
1603 } else if c == b'*' {
1604 let mod3_lower = self.lower_bounds[InlineStack::ASTERISK_BASE + count % 3];
1605 if both {
1606 mod3_lower
1607 } else {
1608 min(
1609 mod3_lower,
1610 self.lower_bounds[InlineStack::ASTERISK_NOT_BOTH],
1611 )
1612 }
1613 } else {
1614 self.lower_bounds[InlineStack::TILDES]
1615 }
1616 }
1617
1618 fn set_lowerbound(&mut self, c: u8, count: usize, both: bool, new_bound: usize) {
1619 if c == b'_' {
1620 if both {
1621 self.lower_bounds[InlineStack::UNDERSCORE_BOTH] = new_bound;
1622 } else {
1623 self.lower_bounds[InlineStack::UNDERSCORE_NOT_BOTH] = new_bound;
1624 }
1625 } else if c == b'*' {
1626 self.lower_bounds[InlineStack::ASTERISK_BASE + count % 3] = new_bound;
1627 if !both {
1628 self.lower_bounds[InlineStack::ASTERISK_NOT_BOTH] = new_bound;
1629 }
1630 } else {
1631 self.lower_bounds[InlineStack::TILDES] = new_bound;
1632 }
1633 }
1634
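/// Finds a matching opener for a closing delimiter run, searching the stack
/// from the top down and applying CommonMark's "multiple of 3" rule for runs
/// that can both open and close. Delimiters above the match are demoted back
/// to plain text.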
1635 fn find_match(
1636 &mut self,
1637 tree: &mut Tree<Item>,
1638 c: u8,
1639 count: usize,
1640 both: bool,
1641 ) -> Option<InlineEl> {
1642 let lowerbound = min(self.stack.len(), self.get_lowerbound(c, count, both));
1643 let res = self.stack[lowerbound..]
1644 .iter()
1645 .cloned()
1646 .enumerate()
1647 .rfind(|(_, el)| {
1648 el.c == c && (!both && !el.both || (count + el.count) % 3 != 0 || count % 3 == 0)
1649 });
1650
1651 if let Some((matching_ix, matching_el)) = res {
1652 let matching_ix = matching_ix + lowerbound;
1653 for el in &self.stack[(matching_ix + 1)..] {
1654 for i in 0..el.count {
1655 tree[el.start + i].item.body = ItemBody::Text;
1656 }
1657 }
1658 self.stack.truncate(matching_ix);
1659 Some(matching_el)
1660 } else {
1661 self.set_lowerbound(c, count, both, self.stack.len());
1662 None
1663 }
1664 }
1665
1666 fn push(&mut self, el: InlineEl) {
1667 self.stack.push(el)
1668 }
1669 }
1670
1671 #[derive(Debug, Clone)]
1672 enum RefScan<'a> {
1673 // label, next node index, source ix of label end
1674 LinkLabel(CowStr<'a>, Option<TreeIndex>, usize),
1675 // contains next node index
1676 Collapsed(Option<TreeIndex>),
1677 Failed,
1678 }
1679
1680 /// Skips forward within a block to a node which spans (ends inclusive) the given
1681 /// index into the source.
1682 fn scan_nodes_to_ix(
1683 tree: &Tree<Item>,
1684 mut node: Option<TreeIndex>,
1685 ix: usize,
1686 ) -> Option<TreeIndex> {
1687 while let Some(node_ix) = node {
1688 if tree[node_ix].item.end <= ix {
1689 node = tree[node_ix].next;
1690 } else {
1691 break;
1692 }
1693 }
1694 node
1695 }
1696
1697 /// Scans an inline link label, which cannot be interrupted.
1698 /// Returns number of bytes (including brackets) and label on success.
1699 fn scan_link_label<'text, 'tree>(
1700 tree: &'tree Tree<Item>,
1701 text: &'text str,
1702 ) -> Option<(usize, ReferenceLabel<'text>)> {
1703 let bytes = &text.as_bytes();
1704 if bytes.len() < 2 || bytes[0] != b'[' {
1705 return None;
1706 }
1707 let linebreak_handler = |bytes: &[u8]| {
1708 let mut line_start = LineStart::new(bytes);
1709 let _ = scan_containers(tree, &mut line_start);
1710 Some(line_start.bytes_scanned())
1711 };
1712 let pair = if b'^' == bytes[1] {
1713 let (byte_index, cow) = scan_link_label_rest(&text[2..], &linebreak_handler)?;
1714 (byte_index + 2, ReferenceLabel::Footnote(cow))
1715 } else {
1716 let (byte_index, cow) = scan_link_label_rest(&text[1..], &linebreak_handler)?;
1717 (byte_index + 1, ReferenceLabel::Link(cow))
1718 };
1719 Some(pair)
1720 }
1721
1722 fn scan_reference<'a, 'b>(
1723 tree: &'a Tree<Item>,
1724 text: &'b str,
1725 cur: Option<TreeIndex>,
1726 ) -> RefScan<'b> {
1727 let cur_ix = match cur {
1728 None => return RefScan::Failed,
1729 Some(cur_ix) => cur_ix,
1730 };
1731 let start = tree[cur_ix].item.start;
1732 let tail = &text.as_bytes()[start..];
1733
1734 if tail.starts_with(b"[]") {
1735 let closing_node = tree[cur_ix].next.unwrap();
1736 RefScan::Collapsed(tree[closing_node].next)
1737 } else if let Some((ix, ReferenceLabel::Link(label))) = scan_link_label(tree, &text[start..]) {
1738 let next_node = scan_nodes_to_ix(tree, cur, start + ix);
1739 RefScan::LinkLabel(label, next_node, start + ix)
1740 } else {
1741 RefScan::Failed
1742 }
1743 }
1744
1745 #[derive(Clone, Default)]
1746 struct LinkStack {
1747 inner: Vec<LinkStackEl>,
1748 disabled_ix: usize,
1749 }
1750
1751 impl LinkStack {
1752 fn push(&mut self, el: LinkStackEl) {
1753 self.inner.push(el);
1754 }
1755
1756 fn pop(&mut self) -> Option<LinkStackEl> {
1757 let el = self.inner.pop();
1758 self.disabled_ix = std::cmp::min(self.disabled_ix, self.inner.len());
1759 el
1760 }
1761
1762 fn clear(&mut self) {
1763 self.inner.clear();
1764 self.disabled_ix = 0;
1765 }
1766
1767 fn disable_all_links(&mut self) {
1768 for el in &mut self.inner[self.disabled_ix..] {
1769 if el.ty == LinkStackTy::Link {
1770 el.ty = LinkStackTy::Disabled;
1771 }
1772 }
1773 self.disabled_ix = self.inner.len();
1774 }
1775 }
1776
1777 #[derive(Clone, Debug)]
1778 struct LinkStackEl {
1779 node: TreeIndex,
1780 ty: LinkStackTy,
1781 }
1782
1783 #[derive(PartialEq, Clone, Debug)]
1784 enum LinkStackTy {
1785 Link,
1786 Image,
1787 Disabled,
1788 }
1789
1790 #[derive(Clone)]
1791 struct LinkDef<'a> {
1792 dest: CowStr<'a>,
1793 title: Option<CowStr<'a>>,
1794 }
1795
1796 /// Tracks tree indices of code span delimiters of each length. It should prevent
1797 /// quadratic scanning behaviours by providing (amortized) constant time lookups.
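///
/// Backtick runs are bucketed by their length; `find` pops queued indices for
/// a given length until it reaches one past the opening run.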
1798 struct CodeDelims {
1799 inner: HashMap<usize, VecDeque<TreeIndex>>,
1800 seen_first: bool,
1801 }
1802
1803 impl CodeDelims {
1804 fn new() -> Self {
1805 Self {
1806 inner: Default::default(),
1807 seen_first: false,
1808 }
1809 }
1810
1811 fn insert(&mut self, count: usize, ix: TreeIndex) {
1812 if self.seen_first {
1813 self.inner
1814 .entry(count)
1815 .or_insert_with(Default::default)
1816 .push_back(ix);
1817 } else {
1818 // Skip the first insert, since that delimiter will always
1819 // be an opener and not a closer.
1820 self.seen_first = true;
1821 }
1822 }
1823
1824 fn is_populated(&self) -> bool {
1825 !self.inner.is_empty()
1826 }
1827
1828 fn find(&mut self, open_ix: TreeIndex, count: usize) -> Option<TreeIndex> {
1829 while let Some(ix) = self.inner.get_mut(&count)?.pop_front() {
1830 if ix > open_ix {
1831 return Some(ix);
1832 }
1833 }
1834 None
1835 }
1836
1837 fn clear(&mut self) {
1838 self.inner.clear();
1839 self.seen_first = false;
1840 }
1841 }
1842
1843 #[derive(Copy, Clone, PartialEq, Eq, Debug)]
1844 struct LinkIndex(usize);
1845
1846 #[derive(Copy, Clone, PartialEq, Eq, Debug)]
1847 struct CowIndex(usize);
1848
1849 #[derive(Copy, Clone, PartialEq, Eq, Debug)]
1850 struct AlignmentIndex(usize);
1851
1852 #[derive(Clone)]
1853 struct Allocations<'a> {
1854 refdefs: HashMap<LinkLabel<'a>, LinkDef<'a>>,
1855 links: Vec<(LinkType, CowStr<'a>, CowStr<'a>)>,
1856 cows: Vec<CowStr<'a>>,
1857 alignments: Vec<Vec<Alignment>>,
1858 }
1859
1860 impl<'a> Allocations<'a> {
1861 fn new() -> Self {
1862 Self {
1863 refdefs: HashMap::new(),
1864 links: Vec::with_capacity(128),
1865 cows: Vec::new(),
1866 alignments: Vec::new(),
1867 }
1868 }
1869
1870 fn allocate_cow(&mut self, cow: CowStr<'a>) -> CowIndex {
1871 let ix = self.cows.len();
1872 self.cows.push(cow);
1873 CowIndex(ix)
1874 }
1875
1876 fn allocate_link(&mut self, ty: LinkType, url: CowStr<'a>, title: CowStr<'a>) -> LinkIndex {
1877 let ix = self.links.len();
1878 self.links.push((ty, url, title));
1879 LinkIndex(ix)
1880 }
1881
1882 fn allocate_alignment(&mut self, alignment: Vec<Alignment>) -> AlignmentIndex {
1883 let ix = self.alignments.len();
1884 self.alignments.push(alignment);
1885 AlignmentIndex(ix)
1886 }
1887 }
1888
1889 impl<'a> Index<CowIndex> for Allocations<'a> {
1890 type Output = CowStr<'a>;
1891
1892 fn index(&self, ix: CowIndex) -> &Self::Output {
1893 self.cows.index(ix.0)
1894 }
1895 }
1896
1897 impl<'a> Index<LinkIndex> for Allocations<'a> {
1898 type Output = (LinkType, CowStr<'a>, CowStr<'a>);
1899
1900 fn index(&self, ix: LinkIndex) -> &Self::Output {
1901 self.links.index(ix.0)
1902 }
1903 }
1904
1905 impl<'a> Index<AlignmentIndex> for Allocations<'a> {
1906 type Output = Vec<Alignment>;
1907
1908 fn index(&self, ix: AlignmentIndex) -> &Self::Output {
1909 self.alignments.index(ix.0)
1910 }
1911 }
1912
1913 /// A struct containing information on the reachability of certain inline HTML
1914 /// elements. In particular, for cdata elements (`<![CDATA[`), processing
1915 /// instructions (`<?`) and declarations (`<!DECLARATION`). The respective usizes
1916 /// represent the indices before which a scan will always fail and can hence
1917 /// be skipped.
1918 #[derive(Clone, Default)]
1919 pub(crate) struct HtmlScanGuard {
1920 pub cdata: usize,
1921 pub processing: usize,
1922 pub declaration: usize,
1923 }
1924
1925 /// Markdown event iterator.
1926 #[derive(Clone)]
1927 pub struct Parser<'a> {
1928 text: &'a str,
1929 tree: Tree<Item>,
1930 allocs: Allocations<'a>,
1931 broken_link_callback: Option<&'a dyn Fn(&str, &str) -> Option<(String, String)>>,
1932 html_scan_guard: HtmlScanGuard,
1933
1934 // used by inline passes. store them here for reuse
1935 inline_stack: InlineStack,
1936 link_stack: LinkStack,
1937 }
1938
1939 impl<'a> Parser<'a> {
1940 /// Creates a new event iterator for a markdown string without any options enabled.
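///
/// # Example
///
/// A minimal usage sketch, rendering the events to HTML with the crate's
/// `html::push_html` helper:
///
/// ```
/// use pulldown_cmark::{html, Parser};
///
/// let parser = Parser::new("Hello *world*");
/// let mut out = String::new();
/// html::push_html(&mut out, parser);
/// assert_eq!(out, "<p>Hello <em>world</em></p>\n");
/// ```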
1941 pub fn new(text: &'a str) -> Parser<'a> {
1942 Parser::new_ext(text, Options::empty())
1943 }
1944
1945 /// Creates a new event iterator for a markdown string with given options.
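///
/// # Example
///
/// A small sketch enabling the strikethrough extension; the expected output
/// follows this crate's HTML renderer:
///
/// ```
/// use pulldown_cmark::{html, Options, Parser};
///
/// let mut options = Options::empty();
/// options.insert(Options::ENABLE_STRIKETHROUGH);
/// let parser = Parser::new_ext("~~old~~ new", options);
/// let mut out = String::new();
/// html::push_html(&mut out, parser);
/// assert_eq!(out, "<p><del>old</del> new</p>\n");
/// ```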
1946 pub fn new_ext(text: &'a str, options: Options) -> Parser<'a> {
1947 Parser::new_with_broken_link_callback(text, options, None)
1948 }
1949
1950 /// In case the parser encounters any potential links that have a broken
1951 /// reference (e.g. `[foo]` when there is no `[foo]: ` entry at the bottom),
1952 /// the provided callback will be called with the reference name,
1953 /// and the returned pair will be used as the link URL and title if it is not
1954 /// `None`.
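///
/// # Example
///
/// A minimal sketch of resolving a broken reference through the callback; the
/// `example.com` destination is purely illustrative:
///
/// ```
/// use pulldown_cmark::{Event, Options, Parser, Tag};
///
/// let callback = |_normalized: &str, raw: &str| {
///     Some((format!("https://example.com/{}", raw), String::new()))
/// };
/// let parser = Parser::new_with_broken_link_callback(
///     "[missing]",
///     Options::empty(),
///     Some(&callback),
/// );
/// let urls: Vec<_> = parser
///     .filter_map(|event| match event {
///         Event::Start(Tag::Link(_, url, _)) => Some(url),
///         _ => None,
///     })
///     .collect();
/// assert_eq!(urls.len(), 1);
/// assert_eq!(urls[0].as_ref(), "https://example.com/missing");
/// ```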
1955 pub fn new_with_broken_link_callback(
1956 text: &'a str,
1957 options: Options,
1958 broken_link_callback: Option<&'a dyn Fn(&str, &str) -> Option<(String, String)>>,
1959 ) -> Parser<'a> {
1960 let first_pass = FirstPass::new(text, options);
1961 let (mut tree, allocs) = first_pass.run();
1962 tree.reset();
1963 let inline_stack = Default::default();
1964 let link_stack = Default::default();
1965 let html_scan_guard = Default::default();
1966 Parser {
1967 text,
1968 tree,
1969 allocs,
1970 broken_link_callback,
1971 inline_stack,
1972 link_stack,
1973 html_scan_guard,
1974 }
1975 }
1976
1977 /// Handle inline markup.
1978 ///
1979 /// When the parser encounters any item indicating potential inline markup, all
1980 /// inline markup passes are run on the remainder of the chain.
1981 ///
1982 /// Note: there's some potential for optimization here, but that's future work.
1983 fn handle_inline(&mut self) {
1984 self.handle_inline_pass1();
1985 self.handle_emphasis();
1986 }
1987
1988 /// Handle inline HTML, code spans, and links.
1989 ///
1990 /// This function handles both inline HTML and code spans, because they have
1991 /// the same precedence. It also handles links, even though they have lower
1992 /// precedence, because the URL of links must not be processed.
1993 fn handle_inline_pass1(&mut self) {
1994 let mut code_delims = CodeDelims::new();
1995 let mut cur = self.tree.cur();
1996 let mut prev = None;
1997
1998 let block_end = self.tree[self.tree.peek_up().unwrap()].item.end;
1999 let block_text = &self.text[..block_end];
2000
2001 while let Some(mut cur_ix) = cur {
2002 match self.tree[cur_ix].item.body {
2003 ItemBody::MaybeHtml => {
2004 let next = self.tree[cur_ix].next;
2005 let autolink = if let Some(next_ix) = next {
2006 scan_autolink(block_text, self.tree[next_ix].item.start)
2007 } else {
2008 None
2009 };
2010
2011 if let Some((ix, uri, link_type)) = autolink {
2012 let node = scan_nodes_to_ix(&self.tree, next, ix);
2013 let text_node = self.tree.create_node(Item {
2014 start: self.tree[cur_ix].item.start + 1,
2015 end: ix - 1,
2016 body: ItemBody::Text,
2017 });
2018 let link_ix = self.allocs.allocate_link(link_type, uri, "".into());
2019 self.tree[cur_ix].item.body = ItemBody::Link(link_ix);
2020 self.tree[cur_ix].item.end = ix;
2021 self.tree[cur_ix].next = node;
2022 self.tree[cur_ix].child = Some(text_node);
2023 prev = cur;
2024 cur = node;
2025 if let Some(node_ix) = cur {
2026 self.tree[node_ix].item.start = max(self.tree[node_ix].item.start, ix);
2027 }
2028 continue;
2029 } else {
2030 let inline_html = if let Some(next_ix) = next {
2031 self.scan_inline_html(
2032 block_text.as_bytes(),
2033 self.tree[next_ix].item.start,
2034 )
2035 } else {
2036 None
2037 };
2038 if let Some(ix) = inline_html {
2039 let node = scan_nodes_to_ix(&self.tree, next, ix);
2040 self.tree[cur_ix].item.body = ItemBody::Html;
2041 self.tree[cur_ix].item.end = ix;
2042 self.tree[cur_ix].next = node;
2043 prev = cur;
2044 cur = node;
2045 if let Some(node_ix) = cur {
2046 self.tree[node_ix].item.start =
2047 max(self.tree[node_ix].item.start, ix);
2048 }
2049 continue;
2050 }
2051 }
2052 self.tree[cur_ix].item.body = ItemBody::Text;
2053 }
2054 ItemBody::MaybeCode(mut search_count, preceded_by_backslash) => {
2055 if preceded_by_backslash {
2056 search_count -= 1;
2057 if search_count == 0 {
2058 self.tree[cur_ix].item.body = ItemBody::Text;
2059 prev = cur;
2060 cur = self.tree[cur_ix].next;
2061 continue;
2062 }
2063 }
2064
2065 if code_delims.is_populated() {
2066 // we have previously scanned all code span delimiters,
2067 // so we can reuse that work
2068 if let Some(scan_ix) = code_delims.find(cur_ix, search_count) {
2069 self.make_code_span(cur_ix, scan_ix, preceded_by_backslash);
2070 } else {
2071 self.tree[cur_ix].item.body = ItemBody::Text;
2072 }
2073 } else {
2074 // we haven't previously scanned all code span delimiters,
2075 // so walk the AST
2076 let mut scan = if search_count > 0 {
2077 self.tree[cur_ix].next
2078 } else {
2079 None
2080 };
2081 while let Some(scan_ix) = scan {
2082 if let ItemBody::MaybeCode(delim_count, _) =
2083 self.tree[scan_ix].item.body
2084 {
2085 if search_count == delim_count {
2086 self.make_code_span(cur_ix, scan_ix, preceded_by_backslash);
2087 code_delims.clear();
2088 break;
2089 } else {
2090 code_delims.insert(delim_count, scan_ix);
2091 }
2092 }
2093 scan = self.tree[scan_ix].next;
2094 }
2095 if scan.is_none() {
2096 self.tree[cur_ix].item.body = ItemBody::Text;
2097 }
2098 }
2099 }
2100 ItemBody::MaybeLinkOpen => {
2101 self.tree[cur_ix].item.body = ItemBody::Text;
2102 self.link_stack.push(LinkStackEl {
2103 node: cur_ix,
2104 ty: LinkStackTy::Link,
2105 });
2106 }
2107 ItemBody::MaybeImage => {
2108 self.tree[cur_ix].item.body = ItemBody::Text;
2109 self.link_stack.push(LinkStackEl {
2110 node: cur_ix,
2111 ty: LinkStackTy::Image,
2112 });
2113 }
2114 ItemBody::MaybeLinkClose => {
2115 self.tree[cur_ix].item.body = ItemBody::Text;
2116 if let Some(tos) = self.link_stack.pop() {
2117 if tos.ty == LinkStackTy::Disabled {
2118 continue;
2119 }
2120 let next = self.tree[cur_ix].next;
2121 if let Some((next_ix, url, title)) =
2122 self.scan_inline_link(block_text, self.tree[cur_ix].item.end, next)
2123 {
2124 let next_node = scan_nodes_to_ix(&self.tree, next, next_ix);
2125 if let Some(prev_ix) = prev {
2126 self.tree[prev_ix].next = None;
2127 }
2128 cur = Some(tos.node);
2129 cur_ix = tos.node;
2130 let link_ix = self.allocs.allocate_link(LinkType::Inline, url, title);
2131 self.tree[cur_ix].item.body = if tos.ty == LinkStackTy::Image {
2132 ItemBody::Image(link_ix)
2133 } else {
2134 ItemBody::Link(link_ix)
2135 };
2136 self.tree[cur_ix].child = self.tree[cur_ix].next;
2137 self.tree[cur_ix].next = next_node;
2138 self.tree[cur_ix].item.end = next_ix;
2139 if let Some(next_node_ix) = next_node {
2140 self.tree[next_node_ix].item.start =
2141 max(self.tree[next_node_ix].item.start, next_ix);
2142 }
2143
2144 if tos.ty == LinkStackTy::Link {
2145 self.link_stack.disable_all_links();
2146 }
2147 } else {
2148 // ok, so it's not an inline link. maybe it is a reference
2149 // to a defined link?
2150 let scan_result = scan_reference(&self.tree, block_text, next);
2151 let (node_after_link, link_type) = match scan_result {
2152 // [label][reference]
2153 RefScan::LinkLabel(_, next_node, _) => {
2154 (next_node, LinkType::Reference)
2155 }
2156 // []
2157 RefScan::Collapsed(next_node) => (next_node, LinkType::Collapsed),
2158 // [shortcut]
2159 //
2160 // [shortcut]: /blah
2161 RefScan::Failed => (next, LinkType::Shortcut),
2162 };
2163
2164 // (label, end index in the source)
2165 let label: Option<(ReferenceLabel<'a>, usize)> = match scan_result {
2166 RefScan::LinkLabel(l, _, end_ix) => {
2167 Some((ReferenceLabel::Link(l), end_ix))
2168 }
2169 RefScan::Collapsed(..) | RefScan::Failed => {
2170 // No label? maybe it is a shortcut reference
2171 let label_start = self.tree[tos.node].item.end - 1;
2172 scan_link_label(
2173 &self.tree,
2174 &self.text[label_start..self.tree[cur_ix].item.end],
2175 )
2176 .map(|(ix, label)| (label, label_start + ix))
2177 }
2178 };
2179
2180 // see if it's a footnote reference
2181 if let Some((ReferenceLabel::Footnote(l), end)) = label {
2182 self.tree[tos.node].next = node_after_link;
2183 self.tree[tos.node].child = None;
2184 self.tree[tos.node].item.body =
2185 ItemBody::FootnoteReference(self.allocs.allocate_cow(l));
2186 self.tree[tos.node].item.end = end;
2187 prev = Some(tos.node);
2188 cur = node_after_link;
2189 self.link_stack.clear();
2190 continue;
2191 } else if let Some((ReferenceLabel::Link(link_label), end)) = label {
2192 let type_url_title = self
2193 .allocs
2194 .refdefs
2195 .get(&UniCase::new(link_label.as_ref().into()))
2196 .map(|matching_def| {
2197 // found a matching definition!
2198 let title = matching_def
2199 .title
2200 .as_ref()
2201 .cloned()
2202 .unwrap_or_else(|| "".into());
2203 let url = matching_def.dest.clone();
2204 (link_type, url, title)
2205 })
2206 .or_else(|| {
2207 self.broken_link_callback
2208 .and_then(|callback| {
2209 // looked for matching definition, but didn't find it. try to fix
2210 // link with callback, if it is defined
2211 callback(link_label.as_ref(), link_label.as_ref())
2212 })
2213 .map(|(url, title)| {
2214 (link_type.to_unknown(), url.into(), title.into())
2215 })
2216 });
2217
2218 if let Some((def_link_type, url, title)) = type_url_title {
2219 let link_ix =
2220 self.allocs.allocate_link(def_link_type, url, title);
2221 self.tree[tos.node].item.body = if tos.ty == LinkStackTy::Image
2222 {
2223 ItemBody::Image(link_ix)
2224 } else {
2225 ItemBody::Link(link_ix)
2226 };
2227 let label_node = self.tree[tos.node].next;
2228
2229 // let's do some tree surgery to add the link to the tree
2230 // 1st: skip the label node and close node
2231 self.tree[tos.node].next = node_after_link;
2232
2233 // then, if it exists, add the label node as a child to the link node
2234 if label_node != cur {
2235 self.tree[tos.node].child = label_node;
2236
2237 // finally: disconnect list of children
2238 if let Some(prev_ix) = prev {
2239 self.tree[prev_ix].next = None;
2240 }
2241 }
2242
2243 self.tree[tos.node].item.end = end;
2244
2245 // set up cur so next node will be node_after_link
2246 cur = Some(tos.node);
2247 cur_ix = tos.node;
2248
2249 if tos.ty == LinkStackTy::Link {
2250 self.link_stack.disable_all_links();
2251 }
2252 }
2253 }
2254 }
2255 }
2256 }
2257 _ => (),
2258 }
2259 prev = cur;
2260 cur = self.tree[cur_ix].next;
2261 }
2262 self.link_stack.clear();
2263 }
2264
2265 fn handle_emphasis(&mut self) {
2266 let mut prev = None;
2267 let mut prev_ix: TreeIndex;
2268 let mut cur = self.tree.cur();
2269 while let Some(mut cur_ix) = cur {
2270 if let ItemBody::MaybeEmphasis(mut count, can_open, can_close) =
2271 self.tree[cur_ix].item.body
2272 {
2273 let c = self.text.as_bytes()[self.tree[cur_ix].item.start];
2274 let both = can_open && can_close;
2275 if can_close {
2276 while let Some(el) =
2277 self.inline_stack.find_match(&mut self.tree, c, count, both)
2278 {
2279 // have a match!
2280 if let Some(prev_ix) = prev {
2281 self.tree[prev_ix].next = None;
2282 }
2283 let match_count = min(count, el.count);
2284 // start, end are tree node indices
2285 let mut end = cur_ix - 1;
2286 let mut start = el.start + el.count;
2287
2288 // work from the inside out
2289 while start > el.start + el.count - match_count {
2290 let (inc, ty) = if c == b'~' {
2291 (2, ItemBody::Strikethrough)
2292 } else if start > el.start + el.count - match_count + 1 {
2293 (2, ItemBody::Strong)
2294 } else {
2295 (1, ItemBody::Emphasis)
2296 };
2297
2298 let root = start - inc;
2299 end = end + inc;
2300 self.tree[root].item.body = ty;
2301 self.tree[root].item.end = self.tree[end].item.end;
2302 self.tree[root].child = Some(start);
2303 self.tree[root].next = None;
2304 start = root;
2305 }
2306
2307 // set next for topmost emph level
2308 prev_ix = el.start + el.count - match_count;
2309 prev = Some(prev_ix);
2310 cur = self.tree[cur_ix + match_count - 1].next;
2311 self.tree[prev_ix].next = cur;
2312
2313 if el.count > match_count {
2314 self.inline_stack.push(InlineEl {
2315 start: el.start,
2316 count: el.count - match_count,
2317 c: el.c,
2318 both,
2319 })
2320 }
2321 count -= match_count;
2322 if count > 0 {
2323 cur_ix = cur.unwrap();
2324 } else {
2325 break;
2326 }
2327 }
2328 }
2329 if count > 0 {
2330 if can_open {
2331 self.inline_stack.push(InlineEl {
2332 start: cur_ix,
2333 count,
2334 c,
2335 both,
2336 });
2337 } else {
2338 for i in 0..count {
2339 self.tree[cur_ix + i].item.body = ItemBody::Text;
2340 }
2341 }
2342 prev_ix = cur_ix + count - 1;
2343 prev = Some(prev_ix);
2344 cur = self.tree[prev_ix].next;
2345 }
2346 } else {
2347 prev = cur;
2348 cur = self.tree[cur_ix].next;
2349 }
2350 }
2351 self.inline_stack.pop_all(&mut self.tree);
2352 }
2353
2354 /// Returns the next byte index, the URL and the title.
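///
/// For example, given the remaining text `(/url "the title") rest` starting
/// at `ix`, this returns the byte index just past the closing `)`, the
/// destination `/url` and the title `the title`.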
2355 fn scan_inline_link(
2356 &self,
2357 underlying: &'a str,
2358 mut ix: usize,
2359 node: Option<TreeIndex>,
2360 ) -> Option<(usize, CowStr<'a>, CowStr<'a>)> {
2361 if scan_ch(&underlying.as_bytes()[ix..], b'(') == 0 {
2362 return None;
2363 }
2364 ix += 1;
2365 ix += scan_while(&underlying.as_bytes()[ix..], is_ascii_whitespace);
2366
2367 let (dest_length, dest) = scan_link_dest(underlying, ix, LINK_MAX_NESTED_PARENS)?;
2368 let dest = unescape(dest);
2369 ix += dest_length;
2370
2371 ix += scan_while(&underlying.as_bytes()[ix..], is_ascii_whitespace);
2372
2373 let title = if let Some((bytes_scanned, t)) = self.scan_link_title(underlying, ix, node) {
2374 ix += bytes_scanned;
2375 ix += scan_while(&underlying.as_bytes()[ix..], is_ascii_whitespace);
2376 t
2377 } else {
2378 "".into()
2379 };
2380 if scan_ch(&underlying.as_bytes()[ix..], b')') == 0 {
2381 return None;
2382 }
2383 ix += 1;
2384
2385 Some((ix, dest, title))
2386 }
2387
2388 // returns (bytes scanned, title cow)
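// Titles may be delimited by single quotes, double quotes or parentheses;
// entity references and backslash-escaped punctuation are resolved while
// scanning, and line breaks inside the title are normalized to '\n'.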
2389 fn scan_link_title(
2390 &self,
2391 text: &'a str,
2392 start_ix: usize,
2393 node: Option<TreeIndex>,
2394 ) -> Option<(usize, CowStr<'a>)> {
2395 let bytes = text.as_bytes();
2396 let open = match bytes.get(start_ix) {
2397 Some(b @ b'\'') | Some(b @ b'\"') | Some(b @ b'(') => *b,
2398 _ => return None,
2399 };
2400 let close = if open == b'(' { b')' } else { open };
2401
2402 let mut title = String::new();
2403 let mut mark = start_ix + 1;
2404 let mut i = start_ix + 1;
2405
2406 while i < bytes.len() {
2407 let c = bytes[i];
2408
2409 if c == close {
2410 let cow = if mark == start_ix + 1 {
2411 (i - start_ix + 1, text[mark..i].into())
2412 } else {
2413 title.push_str(&text[mark..i]);
2414 (i - start_ix + 1, title.into())
2415 };
2416
2417 return Some(cow);
2418 }
2419 if c == open {
2420 return None;
2421 }
2422
2423 if c == b'\n' || c == b'\r' {
2424 if let Some(node_ix) = scan_nodes_to_ix(&self.tree, node, i + 1) {
2425 if self.tree[node_ix].item.start > i {
2426 title.push_str(&text[mark..i]);
2427 title.push('\n');
2428 i = self.tree[node_ix].item.start;
2429 mark = i;
2430 continue;
2431 }
2432 }
2433 }
2434 if c == b'&' {
2435 if let (n, Some(value)) = scan_entity(&bytes[i..]) {
2436 title.push_str(&text[mark..i]);
2437 title.push_str(&value);
2438 i += n;
2439 mark = i;
2440 continue;
2441 }
2442 }
2443 if c == b'\\' && i + 1 < bytes.len() && is_ascii_punctuation(bytes[i + 1]) {
2444 title.push_str(&text[mark..i]);
2445 i += 1;
2446 mark = i;
2447 }
2448
2449 i += 1;
2450 }
2451
2452 None
2453 }
2454
2455 /// Make a code span.
2456 ///
2457 /// Both `open` and `close` are matching MaybeCode items.
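///
/// Per CommonMark, one leading and one trailing space (or line ending) is
/// stripped when both are present and the span is not made up entirely of
/// spaces; e.g. `` ` foo ` `` yields the code span `foo`.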
2458 fn make_code_span(&mut self, open: TreeIndex, close: TreeIndex, preceding_backslash: bool) {
2459 let first_ix = open + 1;
2460 let last_ix = close - 1;
2461 let bytes = self.text.as_bytes();
2462 let mut span_start = self.tree[open].item.end;
2463 let mut span_end = self.tree[close].item.start;
2464 let mut buf: Option<String> = None;
2465
2466 // detect all-space sequences, since they are kept as-is as of commonmark 0.29
2467 if !bytes[span_start..span_end].iter().all(|&b| b == b' ') {
2468 let opening = match bytes[span_start] {
2469 b' ' | b'\r' | b'\n' => true,
2470 _ => false,
2471 };
2472 let closing = match bytes[span_end - 1] {
2473 b' ' | b'\r' | b'\n' => true,
2474 _ => false,
2475 };
2476 let drop_enclosing_whitespace = opening && closing;
2477
2478 if drop_enclosing_whitespace {
2479 span_start += 1;
2480 if span_start < span_end {
2481 span_end -= 1;
2482 }
2483 }
2484
2485 let mut ix = first_ix;
2486
2487 while ix < close {
2488 if let ItemBody::HardBreak | ItemBody::SoftBreak = self.tree[ix].item.body {
2489 if drop_enclosing_whitespace {
2490 // check whether break should be ignored
2491 if ix == first_ix {
2492 ix = ix + 1;
2493 span_start = min(span_end, self.tree[ix].item.start);
2494 continue;
2495 } else if ix == last_ix && last_ix > first_ix {
2496 ix = ix + 1;
2497 continue;
2498 }
2499 }
2500
2501 let end = bytes[self.tree[ix].item.start..]
2502 .iter()
2503 .position(|&b| b == b'\r' || b == b'\n')
2504 .unwrap()
2505 + self.tree[ix].item.start;
2506 if let Some(ref mut buf) = buf {
2507 buf.push_str(&self.text[self.tree[ix].item.start..end]);
2508 buf.push(' ');
2509 } else {
2510 let mut new_buf = String::with_capacity(span_end - span_start);
2511 new_buf.push_str(&self.text[span_start..end]);
2512 new_buf.push(' ');
2513 buf = Some(new_buf);
2514 }
2515 } else if let Some(ref mut buf) = buf {
2516 let end = if ix == last_ix {
2517 span_end
2518 } else {
2519 self.tree[ix].item.end
2520 };
2521 buf.push_str(&self.text[self.tree[ix].item.start..end]);
2522 }
2523 ix = ix + 1;
2524 }
2525 }
2526
2527 let cow = if let Some(buf) = buf {
2528 buf.into()
2529 } else {
2530 self.text[span_start..span_end].into()
2531 };
2532 if preceding_backslash {
2533 self.tree[open].item.body = ItemBody::Text;
2534 self.tree[open].item.end = self.tree[open].item.start + 1;
2535 self.tree[open].next = Some(close);
2536 self.tree[close].item.body = ItemBody::Code(self.allocs.allocate_cow(cow));
2537 self.tree[close].item.start = self.tree[open].item.start + 1;
2538 } else {
2539 self.tree[open].item.body = ItemBody::Code(self.allocs.allocate_cow(cow));
2540 self.tree[open].item.end = self.tree[close].item.end;
2541 self.tree[open].next = self.tree[close].next;
2542 }
2543 }
2544
2545 /// Returns the next byte offset on success.
2546 fn scan_inline_html(&mut self, bytes: &[u8], ix: usize) -> Option<usize> {
2547 let c = *bytes.get(ix)?;
2548 if c == b'!' {
2549 scan_inline_html_comment(bytes, ix + 1, &mut self.html_scan_guard)
2550 } else if c == b'?' {
2551 scan_inline_html_processing(bytes, ix + 1, &mut self.html_scan_guard)
2552 } else {
2553 let i = scan_html_block_inner(
2554 &bytes[ix..],
2555 Some(&|_bytes| {
2556 let mut line_start = LineStart::new(bytes);
2557 let _ = scan_containers(&self.tree, &mut line_start);
2558 line_start.bytes_scanned()
2559 }),
2560 )?;
2561 Some(i + ix)
2562 }
2563 }
2564
2565 /// Consumes the event iterator and produces an iterator that yields
2566 /// `(Event, Range)` pairs, where the `Range` value maps to the corresponding
2567 /// range in the markdown source.
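///
/// # Example
///
/// A small sketch mapping each event of `*hello* world` back to its source
/// range:
///
/// ```
/// use pulldown_cmark::Parser;
///
/// let ranges: Vec<_> = Parser::new("*hello* world")
///     .into_offset_iter()
///     .map(|(_event, range)| range)
///     .collect();
/// assert_eq!(ranges, vec![0..13, 0..7, 1..6, 0..7, 7..13, 0..13]);
/// ```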
2568 pub fn into_offset_iter(self) -> OffsetIter<'a> {
2569 OffsetIter { inner: self }
2570 }
2571 }
2572
2573 pub(crate) enum LoopInstruction<T> {
2574 /// Continue looking for more special bytes, but skip next few bytes.
2575 ContinueAndSkip(usize),
2576 /// Break looping immediately, returning with the given index and value.
2577 BreakAtWith(usize, T),
2578 }
2579
2580 /// This function walks the byte slice from the given index and
2581 /// calls the callback function on all bytes (and their indices) that are in the following set:
2582 /// `` ` ``, `\`, `&`, `*`, `_`, `~`, `!`, `<`, `[`, `]`, `|`, `\r`, `\n`
2583 /// It is guaranteed not to call the callback on other bytes.
2584 /// Whenever `callback(ix, byte)` returns a `ContinueAndSkip(n)` value, the callback
2585 /// will not be called with an index that is less than `ix + n + 1`.
2586 /// When the callback returns a `BreakAtWith(end_ix, opt_val)`, no more callbacks will be
2587 /// called and the function returns immediately with the return value `(end_ix, opt_val)`.
2588 /// If `BreakAtWith(..)` is never returned, this function will return the first
2589 /// index that is outside the byte slice bounds and a `None` value.
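///
/// A minimal sketch of the callback contract (not compiled; shown for the
/// input `a*b\c`, where `*` and `\` are special bytes):
///
/// ```ignore
/// let (ix, found) = iterate_special_bytes(b"a*b\\c", 0, |ix, byte| match byte {
///     // skip the byte right after each '*'
///     b'*' => LoopInstruction::ContinueAndSkip(1),
///     // stop at the first backslash, reporting its index
///     b'\\' => LoopInstruction::BreakAtWith(ix, Some(ix)),
///     _ => LoopInstruction::ContinueAndSkip(0),
/// });
/// assert_eq!((ix, found), (3, Some(3)));
/// ```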
2590 fn iterate_special_bytes<F, T>(bytes: &[u8], ix: usize, callback: F) -> (usize, Option<T>)
2591 where
2592 F: FnMut(usize, u8) -> LoopInstruction<Option<T>>,
2593 {
2594 #[cfg(all(target_arch = "x86_64", feature = "simd"))]
2595 {
2596 crate::simd::iterate_special_bytes(bytes, ix, callback)
2597 }
2598 #[cfg(not(all(target_arch = "x86_64", feature = "simd")))]
2599 {
2600 scalar_iterate_special_bytes(bytes, ix, callback)
2601 }
2602 }
2603
2604 const fn special_bytes() -> [bool; 256] {
2605 let mut bytes = [false; 256];
2606 bytes[b'<' as usize] = true;
2607 bytes[b'!' as usize] = true;
2608 bytes[b'[' as usize] = true;
2609 bytes[b'~' as usize] = true;
2610 bytes[b'`' as usize] = true;
2611 bytes[b'|' as usize] = true;
2612 bytes[b'\\' as usize] = true;
2613 bytes[b'*' as usize] = true;
2614 bytes[b'_' as usize] = true;
2615 bytes[b'\r' as usize] = true;
2616 bytes[b'\n' as usize] = true;
2617 bytes[b']' as usize] = true;
2618 bytes[b'&' as usize] = true;
2619 bytes
2620 }
2621
2622 pub(crate) fn scalar_iterate_special_bytes<F, T>(
2623 bytes: &[u8],
2624 mut ix: usize,
2625 mut callback: F,
2626 ) -> (usize, Option<T>)
2627 where
2628 F: FnMut(usize, u8) -> LoopInstruction<Option<T>>,
2629 {
2630 let special_bytes = special_bytes();
2631
2632 while ix < bytes.len() {
2633 let b = bytes[ix];
2634 if special_bytes[b as usize] {
2635 match callback(ix, b) {
2636 LoopInstruction::ContinueAndSkip(skip) => {
2637 ix += skip;
2638 }
2639 LoopInstruction::BreakAtWith(ix, val) => {
2640 return (ix, val);
2641 }
2642 }
2643 }
2644 ix += 1;
2645 }
2646
2647 (ix, None)
2648 }
2649
2650 /// Markdown event and source range iterator.
2651 ///
2652 /// Generates tuples where the first element is the markdown event and the second
2653 /// is the corresponding range in the source string.
2654 ///
2655 /// Constructed from a `Parser` using its
2656 /// [`into_offset_iter`](struct.Parser.html#method.into_offset_iter) method.
2657 pub struct OffsetIter<'a> {
2658 inner: Parser<'a>,
2659 }
2660
2661 impl<'a> Iterator for OffsetIter<'a> {
2662 type Item = (Event<'a>, Range<usize>);
2663
2664 fn next(&mut self) -> Option<Self::Item> {
2665 match self.inner.tree.cur() {
2666 None => {
2667 let ix = self.inner.tree.pop()?;
2668 let tag = item_to_tag(&self.inner.tree[ix].item, &self.inner.allocs);
2669 self.inner.tree.next_sibling(ix);
2670 Some((
2671 Event::End(tag),
2672 self.inner.tree[ix].item.start..self.inner.tree[ix].item.end,
2673 ))
2674 }
2675 Some(cur_ix) => {
2676 if self.inner.tree[cur_ix].item.body.is_inline() {
2677 self.inner.handle_inline();
2678 }
2679
2680 let node = self.inner.tree[cur_ix];
2681 let item = node.item;
2682 let event = item_to_event(item, self.inner.text, &self.inner.allocs);
2683 if let Event::Start(..) = event {
2684 self.inner.tree.push();
2685 } else {
2686 self.inner.tree.next_sibling(cur_ix);
2687 }
2688 Some((event, item.start..item.end))
2689 }
2690 }
2691 }
2692 }
2693
2694 fn item_to_tag<'a>(item: &Item, allocs: &Allocations<'a>) -> Tag<'a> {
2695 match item.body {
2696 ItemBody::Paragraph => Tag::Paragraph,
2697 ItemBody::Emphasis => Tag::Emphasis,
2698 ItemBody::Strong => Tag::Strong,
2699 ItemBody::Strikethrough => Tag::Strikethrough,
2700 ItemBody::Link(link_ix) => {
2701 let &(ref link_type, ref url, ref title) = allocs.index(link_ix);
2702 Tag::Link(*link_type, url.clone(), title.clone())
2703 }
2704 ItemBody::Image(link_ix) => {
2705 let &(ref link_type, ref url, ref title) = allocs.index(link_ix);
2706 Tag::Image(*link_type, url.clone(), title.clone())
2707 }
2708 ItemBody::Heading(level) => Tag::Heading(level),
2709 ItemBody::FencedCodeBlock(cow_ix) => {
2710 Tag::CodeBlock(CodeBlockKind::Fenced(allocs[cow_ix].clone()))
2711 }
2712 ItemBody::IndentCodeBlock => Tag::CodeBlock(CodeBlockKind::Indented),
2713 ItemBody::BlockQuote => Tag::BlockQuote,
2714 ItemBody::List(_, c, listitem_start) => {
2715 if c == b'.' || c == b')' {
2716 Tag::List(Some(listitem_start))
2717 } else {
2718 Tag::List(None)
2719 }
2720 }
2721 ItemBody::ListItem(_) => Tag::Item,
2722 ItemBody::TableHead => Tag::TableHead,
2723 ItemBody::TableCell => Tag::TableCell,
2724 ItemBody::TableRow => Tag::TableRow,
2725 ItemBody::Table(alignment_ix) => Tag::Table(allocs[alignment_ix].clone()),
2726 ItemBody::FootnoteDefinition(cow_ix) => Tag::FootnoteDefinition(allocs[cow_ix].clone()),
2727 _ => panic!("unexpected item body {:?}", item.body),
2728 }
2729 }
2730
2731 fn item_to_event<'a>(item: Item, text: &'a str, allocs: &Allocations<'a>) -> Event<'a> {
2732 let tag = match item.body {
2733 ItemBody::Text => return Event::Text(text[item.start..item.end].into()),
2734 ItemBody::Code(cow_ix) => return Event::Code(allocs[cow_ix].clone()),
2735 ItemBody::SynthesizeText(cow_ix) => return Event::Text(allocs[cow_ix].clone()),
2736 ItemBody::Html => return Event::Html(text[item.start..item.end].into()),
2737 ItemBody::SoftBreak => return Event::SoftBreak,
2738 ItemBody::HardBreak => return Event::HardBreak,
2739 ItemBody::FootnoteReference(cow_ix) => {
2740 return Event::FootnoteReference(allocs[cow_ix].clone())
2741 }
2742 ItemBody::TaskListMarker(checked) => return Event::TaskListMarker(checked),
2743 ItemBody::Rule => return Event::Rule,
2744
2745 ItemBody::Paragraph => Tag::Paragraph,
2746 ItemBody::Emphasis => Tag::Emphasis,
2747 ItemBody::Strong => Tag::Strong,
2748 ItemBody::Strikethrough => Tag::Strikethrough,
2749 ItemBody::Link(link_ix) => {
2750 let &(ref link_type, ref url, ref title) = allocs.index(link_ix);
2751 Tag::Link(*link_type, url.clone(), title.clone())
2752 }
2753 ItemBody::Image(link_ix) => {
2754 let &(ref link_type, ref url, ref title) = allocs.index(link_ix);
2755 Tag::Image(*link_type, url.clone(), title.clone())
2756 }
2757 ItemBody::Heading(level) => Tag::Heading(level),
2758 ItemBody::FencedCodeBlock(cow_ix) => {
2759 Tag::CodeBlock(CodeBlockKind::Fenced(allocs[cow_ix].clone()))
2760 }
2761 ItemBody::IndentCodeBlock => Tag::CodeBlock(CodeBlockKind::Indented),
2762 ItemBody::BlockQuote => Tag::BlockQuote,
2763 ItemBody::List(_, c, listitem_start) => {
2764 if c == b'.' || c == b')' {
2765 Tag::List(Some(listitem_start))
2766 } else {
2767 Tag::List(None)
2768 }
2769 }
2770 ItemBody::ListItem(_) => Tag::Item,
2771 ItemBody::TableHead => Tag::TableHead,
2772 ItemBody::TableCell => Tag::TableCell,
2773 ItemBody::TableRow => Tag::TableRow,
2774 ItemBody::Table(alignment_ix) => Tag::Table(allocs[alignment_ix].clone()),
2775 ItemBody::FootnoteDefinition(cow_ix) => Tag::FootnoteDefinition(allocs[cow_ix].clone()),
2776 _ => panic!("unexpected item body {:?}", item.body),
2777 };
2778
2779 Event::Start(tag)
2780 }
2781
2782 // https://english.stackexchange.com/a/285573
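// Removes the `Paragraph` wrappers inside each item of a tight list by splicing
// the paragraphs' children directly into the list item's child chain.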
2783 fn surgerize_tight_list(tree: &mut Tree<Item>, list_ix: TreeIndex) {
2784 let mut list_item = tree[list_ix].child;
2785 while let Some(listitem_ix) = list_item {
2786 // first child is special, controls how we repoint list_item.child
2787 let list_item_firstborn = tree[listitem_ix].child;
2788
2789 // Check that list item has children - this is not necessarily the case!
2790 if let Some(firstborn_ix) = list_item_firstborn {
2791 if let ItemBody::Paragraph = tree[firstborn_ix].item.body {
2792 tree[listitem_ix].child = tree[firstborn_ix].child;
2793 }
2794
2795 let mut list_item_child = Some(firstborn_ix);
2796 let mut node_to_repoint = None;
2797 while let Some(child_ix) = list_item_child {
2798 // surgerize paragraphs
2799 let repoint_ix = if let ItemBody::Paragraph = tree[child_ix].item.body {
2800 if let Some(child_firstborn) = tree[child_ix].child {
2801 if let Some(repoint_ix) = node_to_repoint {
2802 tree[repoint_ix].next = Some(child_firstborn);
2803 }
2804 let mut child_lastborn = child_firstborn;
2805 while let Some(lastborn_next_ix) = tree[child_lastborn].next {
2806 child_lastborn = lastborn_next_ix;
2807 }
2808 child_lastborn
2809 } else {
2810 child_ix
2811 }
2812 } else {
2813 child_ix
2814 };
2815
2816 node_to_repoint = Some(repoint_ix);
2817 tree[repoint_ix].next = tree[child_ix].next;
2818 list_item_child = tree[child_ix].next;
2819 }
2820 }
2821
2822 list_item = tree[listitem_ix].next;
2823 }
2824 }
2825
2826 impl<'a> Iterator for Parser<'a> {
2827 type Item = Event<'a>;
2828
2829 fn next(&mut self) -> Option<Event<'a>> {
2830 match self.tree.cur() {
2831 None => {
2832 let ix = self.tree.pop()?;
2833 let tag = item_to_tag(&self.tree[ix].item, &self.allocs);
2834 self.tree.next_sibling(ix);
2835 Some(Event::End(tag))
2836 }
2837 Some(cur_ix) => {
2838 if self.tree[cur_ix].item.body.is_inline() {
2839 self.handle_inline();
2840 }
2841
2842 let node = self.tree[cur_ix];
2843 let item = node.item;
2844 let event = item_to_event(item, self.text, &self.allocs);
2845 if let Event::Start(..) = event {
2846 self.tree.push();
2847 } else {
2848 self.tree.next_sibling(cur_ix);
2849 }
2850 Some(event)
2851 }
2852 }
2853 }
2854 }
2855
2856 #[cfg(test)]
2857 mod test {
2858 use super::*;
2859 use crate::tree::Node;
2860
2861 // TODO: move these tests to tests/html.rs?
2862
2863 fn parser_with_extensions(text: &str) -> Parser<'_> {
2864 let mut opts = Options::empty();
2865 opts.insert(Options::ENABLE_TABLES);
2866 opts.insert(Options::ENABLE_FOOTNOTES);
2867 opts.insert(Options::ENABLE_STRIKETHROUGH);
2868 opts.insert(Options::ENABLE_TASKLISTS);
2869
2870 Parser::new_ext(text, opts)
2871 }
2872
2873 #[test]
2874 #[cfg(target_pointer_width = "64")]
2875 fn node_size() {
2876 let node_size = std::mem::size_of::<Node<Item>>();
2877 assert_eq!(48, node_size);
2878 }
2879
2880 #[test]
2881 #[cfg(target_pointer_width = "64")]
2882 fn body_size() {
2883 let body_size = std::mem::size_of::<ItemBody>();
2884 assert_eq!(16, body_size);
2885 }
2886
2887 #[test]
2888 fn single_open_fish_bracket() {
2889 // don't crash
2890 assert_eq!(3, Parser::new("<").count());
2891 }
2892
2893 #[test]
2894 fn lone_hashtag() {
2895 // don't crash
2896 assert_eq!(2, Parser::new("#").count());
2897 }
2898
2899 #[test]
2900 fn lots_of_backslashes() {
2901 // don't crash
2902 Parser::new("\\\\\r\r").count();
2903 Parser::new("\\\r\r\\.\\\\\r\r\\.\\").count();
2904 }
2905
2906 #[test]
2907 fn issue_320() {
2908 // don't crash
2909 parser_with_extensions(":\r\t> |\r:\r\t> |\r").count();
2910 }
2911
2912 #[test]
2913 fn issue_319() {
2914 // don't crash
2915 parser_with_extensions("|\r-]([^|\r-]([^").count();
2916 parser_with_extensions("|\r\r=][^|\r\r=][^car").count();
2917 }
2918
2919 #[test]
2920 fn issue_303() {
2921 // don't crash
2922 parser_with_extensions("[^\r\ra]").count();
2923 parser_with_extensions("\r\r]Z[^\x00\r\r]Z[^\x00").count();
2924 }
2925
2926 #[test]
2927 fn issue_313() {
2928 // don't crash
2929 parser_with_extensions("*]0[^\r\r*]0[^").count();
2930 parser_with_extensions("[^\r> `][^\r> `][^\r> `][").count();
2931 }
2932
2933 #[test]
2934 fn issue_311() {
2935 // don't crash
2936 parser_with_extensions("\\\u{0d}-\u{09}\\\u{0d}-\u{09}").count();
2937 }
2938
2939 #[test]
2940 fn issue_283() {
2941 let input = std::str::from_utf8(b"\xf0\x9b\xb2\x9f<td:^\xf0\x9b\xb2\x9f").unwrap();
2942 // don't crash
2943 parser_with_extensions(input).count();
2944 }
2945
2946 #[test]
2947 fn issue_289() {
2948 // don't crash
2949 parser_with_extensions("> - \\\n> - ").count();
2950 parser_with_extensions("- \n\n").count();
2951 }
2952
2953 #[test]
2954 fn issue_306() {
2955 // don't crash
2956 parser_with_extensions("*\r_<__*\r_<__*\r_<__*\r_<__").count();
2957 }
2958
2959 #[test]
2960 fn issue_305() {
2961 // don't crash
2962 parser_with_extensions("_6**6*_*").count();
2963 }
2964
2965 #[test]
2966 fn another_emphasis_panic() {
2967 parser_with_extensions("*__#_#__*").count();
2968 }
2969
2970 #[test]
2971 fn offset_iter() {
2972 let event_offsets: Vec<_> = Parser::new("*hello* world")
2973 .into_offset_iter()
2974 .map(|(_ev, range)| range)
2975 .collect();
2976 let expected_offsets = vec![(0..13), (0..7), (1..6), (0..7), (7..13), (0..13)];
2977 assert_eq!(expected_offsets, event_offsets);
2978 }
2979
2980 #[test]
2981 fn reference_link_offsets() {
2982 let range =
2983 Parser::new("# H1\n[testing][Some reference]\n\n[Some reference]: https://github.com")
2984 .into_offset_iter()
2985 .filter_map(|(ev, range)| match ev {
2986 Event::Start(Tag::Link(LinkType::Reference, ..), ..) => Some(range),
2987 _ => None,
2988 })
2989 .next()
2990 .unwrap();
2991 assert_eq!(5..30, range);
2992 }
2993
2994 #[test]
2995 fn footnote_offsets() {
2996 let range = Parser::new("Testing this[^1] out.\n\n[^1]: Footnote.")
2997 .into_offset_iter()
2998 .filter_map(|(ev, range)| match ev {
2999 Event::FootnoteReference(..) => Some(range),
3000 _ => None,
3001 })
3002 .next()
3003 .unwrap();
3004 assert_eq!(12..16, range);
3005 }
3006
3007 #[test]
3008 fn table_offset() {
3009 let markdown = "a\n\nTesting|This|Outtt\n--|:--:|--:\nSome Data|Other data|asdf";
3010 let event_offset = parser_with_extensions(markdown)
3011 .into_offset_iter()
3012 .map(|(_ev, range)| range)
3013 .nth(3)
3014 .unwrap();
3015 let expected_offset = 3..59;
3016 assert_eq!(expected_offset, event_offset);
3017 }
3018
3019 #[test]
3020 fn offset_iter_issue_378() {
3021 let event_offsets: Vec<_> = Parser::new("a [b](c) d")
3022 .into_offset_iter()
3023 .map(|(_ev, range)| range)
3024 .collect();
3025 let expected_offsets = vec![(0..10), (0..2), (2..8), (3..4), (2..8), (8..10), (0..10)];
3026 assert_eq!(expected_offsets, event_offsets);
3027 }
3028
3029 #[test]
3030 fn offset_iter_issue_404() {
3031 let event_offsets: Vec<_> = Parser::new("###\n")
3032 .into_offset_iter()
3033 .map(|(_ev, range)| range)
3034 .collect();
3035 let expected_offsets = vec![(0..4), (0..4)];
3036 assert_eq!(expected_offsets, event_offsets);
3037 }
3038
3039 // FIXME: add this one to the regression suite
3040 #[test]
3041 fn link_def_at_eof() {
3042 let test_str = "[My site][world]\n\n[world]: https://vincentprouillet.com";
3043 let expected = "<p><a href=\"https://vincentprouillet.com\">My site</a></p>\n";
3044
3045 let mut buf = String::new();
3046 crate::html::push_html(&mut buf, Parser::new(test_str));
3047 assert_eq!(expected, buf);
3048 }
3049
3050 #[test]
3051 fn ref_def_at_eof() {
3052 let test_str = "[test]:\\";
3053 let expected = "";
3054
3055 let mut buf = String::new();
3056 crate::html::push_html(&mut buf, Parser::new(test_str));
3057 assert_eq!(expected, buf);
3058 }
3059
3060 #[test]
3061 fn ref_def_cr_lf() {
3062 let test_str = "[a]: /u\r\n\n[a]";
3063 let expected = "<p><a href=\"/u\">a</a></p>\n";
3064
3065 let mut buf = String::new();
3066 crate::html::push_html(&mut buf, Parser::new(test_str));
3067 assert_eq!(expected, buf);
3068 }
3069
3070 #[test]
3071 fn no_dest_refdef() {
3072 let test_str = "[a]:";
3073 let expected = "<p>[a]:</p>\n";
3074
3075 let mut buf = String::new();
3076 crate::html::push_html(&mut buf, Parser::new(test_str));
3077 assert_eq!(expected, buf);
3078 }
3079
3080 #[test]
3081 fn simple_broken_link_callback() {
3082 let test_str = "This is a link w/o def: [hello][world]";
3083 let parser = Parser::new_with_broken_link_callback(
3084 test_str,
3085 Options::empty(),
3086 Some(&|norm, raw| {
3087 assert_eq!("world", raw);
3088 assert_eq!("world", norm);
3089 Some(("YOLO".to_owned(), "SWAG".to_owned()))
3090 }),
3091 );
3092 let mut link_tag_count = 0;
3093 for (typ, url, title) in parser.filter_map(|event| match event {
3094 Event::Start(tag) | Event::End(tag) => match tag {
3095 Tag::Link(typ, url, title) => Some((typ, url, title)),
3096 _ => None,
3097 },
3098 _ => None,
3099 }) {
3100 link_tag_count += 1;
3101 assert_eq!(typ, LinkType::ReferenceUnknown);
3102 assert_eq!(url.as_ref(), "YOLO");
3103 assert_eq!(title.as_ref(), "SWAG");
3104 }
3105 assert!(link_tag_count > 0);
3106 }
3107
3108 #[test]
3109 fn code_block_kind_check_fenced() {
3110 let parser = Parser::new("hello\n```test\ntadam\n```");
3111 let mut found = 0;
3112 for (ev, _range) in parser.into_offset_iter() {
3113 match ev {
3114 Event::Start(Tag::CodeBlock(CodeBlockKind::Fenced(syntax))) => {
3115 assert_eq!(syntax.as_ref(), "test");
3116 found += 1;
3117 }
3118 _ => {}
3119 }
3120 }
3121 assert_eq!(found, 1);
3122 }
3123
3124 #[test]
3125 fn code_block_kind_check_indented() {
3126 let parser = Parser::new("hello\n\n ```test\n tadam\nhello");
3127 let mut found = 0;
3128 for (ev, _range) in parser.into_offset_iter() {
3129 match ev {
3130 Event::Start(Tag::CodeBlock(CodeBlockKind::Indented)) => {
3131 found += 1;
3132 }
3133 _ => {}
3134 }
3135 }
3136 assert_eq!(found, 1);
3137 }
3138 }