]>
Commit | Line | Data |
---|---|---|
cc61c64b XL |
1 | // Copyright 2015 Google Inc. All rights reserved. |
2 | // | |
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy | |
4 | // of this software and associated documentation files (the "Software"), to deal | |
5 | // in the Software without restriction, including without limitation the rights | |
6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
7 | // copies of the Software, and to permit persons to whom the Software is | |
8 | // furnished to do so, subject to the following conditions: | |
9 | // | |
10 | // The above copyright notice and this permission notice shall be included in | |
11 | // all copies or substantial portions of the Software. | |
12 | // | |
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
19 | // THE SOFTWARE. | |
20 | ||
21 | //! Raw parser, for doing a single pass over input. | |
22 | ||
23 | use scanners::*; | |
24 | use utils; | |
25 | use std::borrow::Cow; | |
26 | use std::borrow::Cow::{Borrowed}; | |
27 | use std::collections::{HashMap, HashSet}; | |
28 | use std::cmp; | |
29 | ||
// Parser state: decides how the next event is produced.
#[derive(PartialEq, Debug)]
enum State {
    // At the start of a block; container markers of the line not yet scanned.
    StartBlock,
    // Container markers of the current line were consumed by `start_block`.
    InContainers,
    // Inside inline content (paragraph, header, table cell, ...).
    Inline,
    TableHead(usize, usize), // limit, next
    // Between table rows.
    TableBody,
    // Inside a table row, between cells.
    TableRow,
    // At the beginning of a line inside a code block.
    CodeLineStart,
    // Inside a code block line.
    Code,
    // NOTE(review): the two states below are not used in the visible part of
    // the file; their handling lives elsewhere.
    InlineCode,
    Literal,
}
43 | ||
// Block containers currently open; kept in `RawParser::containers`,
// outermost first.
#[derive(Copy, Clone, Debug, PartialEq)]
enum Container {
    BlockQuote,
    // (text offset at which the list was opened, marker byte from `scan_listitem`)
    List(usize, u8),
    // Indentation that continuation lines must have to stay inside the item.
    ListItem(usize),
    FootnoteDefinition,
}
51 | ||
/// Single-pass event parser over a borrowed input string.
pub struct RawParser<'a> {
    // Input being parsed.
    text: &'a str,
    // Current byte offset into `text`.
    off: usize,

    opts: Options,
    // Indexed by byte value; a non-zero entry marks a byte that interrupts
    // plain-text scanning in `next_inline`.
    active_tab: [u8; 256],

    state: State,
    // Open tags as (tag, limit, next): `limit` bounds the tag's content and
    // `next` (when non-zero) is the offset to resume at once the tag ends.
    stack: Vec<(Tag<'a>, usize, usize)>,
    // Indentation left over after the container markers of the current line.
    leading_space: usize,

    containers: Vec<Container>,
    // Set when an empty line was seen inside a list; used to detect loose lists.
    last_line_was_empty: bool,

    // state for code fences
    // Fence byte of the open code block; b'\0' means an indented code block.
    fence_char: u8,
    fence_count: usize,
    fence_indent: usize,

    // info, used in second pass
    loose_lists: HashSet<usize>, // offset is at list marker
    // Link reference definitions: normalized label -> (destination, title).
    links: HashMap<String, (Cow<'a, str>, Cow<'a, str>)>,
}
75 | ||
/// Information collected while parsing, handed out by `RawParser::get_info`.
pub struct ParseInfo<'a> {
    /// Offsets (at the list marker) of lists found to be loose.
    pub loose_lists: HashSet<usize>,
    /// Link reference definitions: normalized label -> (destination, title).
    pub links: HashMap<String, (Cow<'a, str>, Cow<'a, str>)>,
}
80 | ||
/// Element kinds that are opened by `Event::Start` and closed by `Event::End`.
#[derive(Clone, Debug)]
pub enum Tag<'a> {
    // block-level tags
    Paragraph,
    Rule,
    /// Header with its level.
    Header(i32),
    BlockQuote,
    /// Code block; carries the info string (empty for indented code).
    CodeBlock(Cow<'a, str>),
    /// List; `Some(start)` when the marker delimiter is `.` or `)`.
    List(Option<usize>), // TODO: add delim and tight for ast (not needed for html)
    Item,
    /// Footnote definition with its name.
    FootnoteDefinition(Cow<'a, str>),

    // tables
    /// Table; carries the column count reported by `scan_table_head`.
    Table(i32),
    TableHead,
    TableRow,
    TableCell,

    // span-level tags
    Emphasis,
    Strong,
    Code,
    // NOTE(review): the payloads are presumably (destination, title), matching
    // the `links` map layout; confirm against the code that builds them.
    Link(Cow<'a, str>, Cow<'a, str>),
    Image(Cow<'a, str>, Cow<'a, str>),
}
106 | ||
/// Items produced by the parser, one per step.
#[derive(Debug)]
pub enum Event<'a> {
    /// A tagged element begins.
    Start(Tag<'a>),
    /// The most recently started element ends.
    End(Tag<'a>),
    /// Literal text.
    Text(Cow<'a, str>),
    /// A block of raw HTML.
    Html(Cow<'a, str>),
    InlineHtml(Cow<'a, str>),
    FootnoteReference(Cow<'a, str>),
    /// Line break inside inline content without two or more trailing spaces.
    SoftBreak,
    /// Line break preceded by two or more trailing whitespace characters.
    HardBreak,
}
118 | ||
bitflags! {
    // Parser configuration flags.
    pub flags Options: u32 {
        // First pass: only '\n' is registered as an active inline byte.
        const OPTION_FIRST_PASS = 1 << 0,
        // Recognize table heads after the first line of a paragraph.
        const OPTION_ENABLE_TABLES = 1 << 1,
        // Recognize footnote definitions and footnote references.
        const OPTION_ENABLE_FOOTNOTES = 1 << 2,
    }
}
126 | ||
// NOTE(review): presumably the maximum nesting depth accepted when scanning
// link labels; its use is outside the visible part of the file, so confirm.
const MAX_LINK_NEST: usize = 10;
128 | ||
129 | impl<'a> RawParser<'a> { | |
130 | pub fn new_with_links(text: &'a str, opts: Options, | |
131 | links: HashMap<String, (Cow<'a, str>, Cow<'a, str>)>) -> RawParser<'a> { | |
132 | let mut ret = RawParser { | |
133 | text: text, | |
134 | off: if text.starts_with("\u{FEFF}") { 3 } else { 0 }, | |
135 | opts: opts, | |
136 | active_tab: [0; 256], | |
137 | state: State::StartBlock, | |
138 | leading_space: 0, | |
139 | stack: Vec::new(), | |
140 | containers: Vec::new(), | |
141 | last_line_was_empty: false, | |
142 | ||
143 | fence_char: 0, | |
144 | fence_count: 0, | |
145 | fence_indent: 0, | |
146 | ||
147 | // info, used in second pass | |
148 | loose_lists: HashSet::new(), | |
149 | links: links, | |
150 | }; | |
151 | ret.init_active(); | |
152 | ret.skip_blank_lines(); | |
153 | ret | |
154 | } | |
155 | ||
156 | pub fn new(text: &'a str, opts: Options) -> RawParser<'a> { | |
157 | RawParser::new_with_links(text, opts, HashMap::new()) | |
158 | } | |
159 | ||
160 | // offset into text representing current parse position, hopefully | |
161 | // useful for building source maps | |
162 | pub fn get_offset(&self) -> usize { | |
163 | self.off | |
164 | } | |
165 | ||
166 | // extract info from parser on finish | |
167 | pub fn get_info(self) -> ParseInfo<'a> { | |
168 | ParseInfo { | |
169 | loose_lists: self.loose_lists, | |
170 | links: self.links, | |
171 | } | |
172 | } | |
173 | ||
174 | fn init_active(&mut self) { | |
175 | if self.opts.contains(OPTION_FIRST_PASS) { | |
176 | self.active_tab[b'\n' as usize] = 1 | |
177 | } else { | |
178 | for &c in b"\x00\t\n\r_\\&*[!`<" { | |
179 | self.active_tab[c as usize] = 1; | |
180 | } | |
181 | } | |
182 | } | |
183 | ||
184 | fn limit(&self) -> usize { | |
185 | match self.stack.last() { | |
186 | Some(&(_, limit, _)) => limit, | |
187 | None => self.text.len() | |
188 | } | |
189 | } | |
190 | ||
191 | // if end is not known, limit should be text.len(), next should be 0 | |
192 | fn start(&mut self, tag: Tag<'a>, limit: usize, next: usize) -> Event<'a> { | |
193 | self.stack.push((tag.clone(), limit, next)); | |
194 | Event::Start(tag) | |
195 | } | |
196 | ||
197 | fn end(&mut self) -> Event<'a> { | |
198 | let (tag, _, next) = self.stack.pop().unwrap(); | |
199 | match tag { | |
200 | // containers | |
201 | Tag::BlockQuote | Tag::List(_) | Tag::Item | Tag::FootnoteDefinition(_) => { | |
202 | let _ = self.containers.pop(); | |
203 | } | |
204 | ||
205 | // block level tags | |
206 | Tag::Paragraph | Tag::Header(_) | Tag::Rule | Tag::CodeBlock(_) | Tag::Table(_) => { | |
207 | self.state = State::StartBlock; | |
208 | // TODO: skip blank lines (for cleaner source maps) | |
209 | } | |
210 | ||
211 | // tables | |
212 | Tag::TableCell => self.state = State::TableRow, | |
213 | Tag::TableRow | Tag::TableHead => self.state = State::TableBody, | |
214 | ||
215 | // inline | |
216 | Tag::Code => self.state = State::Inline, | |
217 | _ => (), | |
218 | } | |
219 | if next != 0 { self.off = next; } | |
220 | ||
221 | /* | |
222 | if self.stack.is_empty() { | |
223 | // TODO maybe: make block ends do this | |
224 | self.state = State::StartBlock; | |
225 | self.skip_blank_lines(); | |
226 | } | |
227 | */ | |
228 | Event::End(tag) | |
229 | } | |
230 | ||
231 | fn skip_leading_whitespace(&mut self) { | |
232 | self.off += scan_whitespace_no_nl(&self.text[self.off .. self.limit()]); | |
233 | } | |
234 | ||
235 | // TODO: this function doesn't respect containers | |
236 | fn skip_blank_lines(&mut self) { | |
237 | loop { | |
238 | let ret = scan_blank_line(&self.text[self.off..]); | |
239 | if ret == 0 { | |
240 | break; | |
241 | } | |
242 | self.off += ret; | |
243 | } | |
244 | } | |
245 | ||
246 | // Scan markers and indentation for current container stack | |
247 | // Return: bytes scanned, whether containers are complete, and remaining space | |
248 | fn scan_containers(&self, text: &str) -> (usize, bool, usize) { | |
249 | let (mut i, mut space) = scan_leading_space(text, 0); | |
250 | for container in self.containers.iter() { | |
251 | match *container { | |
252 | Container::BlockQuote => { | |
253 | if space <= 3 { | |
254 | let n = scan_blockquote_start(&text[i..]); | |
255 | if n > 0 { | |
256 | let (n_sp, next_space) = scan_leading_space(text, i + n); | |
257 | i += n + n_sp; | |
258 | space = next_space; | |
259 | } else { | |
260 | return (i, false, space); | |
261 | } | |
262 | } else { | |
263 | return (i, false, space); | |
264 | } | |
265 | } | |
266 | Container::FootnoteDefinition => (), | |
267 | Container::List(_, _) => (), | |
268 | Container::ListItem(indent) => { | |
269 | if space >= indent { | |
270 | space -= indent; | |
271 | } else if scan_eol(&text[i..]).1 { | |
272 | space = 0; | |
273 | } else { | |
274 | return (i, false, 0); | |
275 | } | |
276 | } | |
277 | } | |
278 | } | |
279 | (i, true, space) | |
280 | } | |
281 | ||
282 | // scans empty lines with current container stack | |
283 | // returns number of bytes scanned, number of empty lines | |
284 | // note: EOF counts as a line ending for counting lines | |
285 | fn scan_empty_lines(&self, text: &str) -> (usize, usize) { | |
286 | let mut i = 0; | |
287 | let mut lines = 0; | |
288 | loop { | |
289 | let (n, scanned, _) = self.scan_containers(&text[i..]); | |
290 | if !scanned { | |
291 | return (i, lines); | |
292 | } | |
293 | if i == text.len() { | |
294 | return (i, lines + 1); | |
295 | } | |
296 | let n_blank = scan_eol(&text[i + n ..]).0; | |
297 | if n_blank == 0 { | |
298 | return (i, lines); | |
299 | } | |
300 | i += n + n_blank; | |
301 | lines += 1; | |
302 | } | |
303 | } | |
304 | ||
305 | // scans whitespace, skipping past containers on newline | |
306 | fn scan_whitespace_inline(&self, text: &str) -> usize { | |
307 | let i = scan_whitespace_no_nl(text); | |
308 | if let (n, true) = scan_eol(&text[i..]) { | |
309 | let (n_containers, _, space) = self.scan_containers(&text[i + n ..]); | |
310 | let j = i + n + n_containers; | |
311 | if !self.is_inline_block_end(&text[j..], space) { | |
312 | return j; | |
313 | } | |
314 | } | |
315 | i | |
316 | } | |
317 | ||
318 | fn at_list(&self, level: usize) -> Option<usize> { | |
319 | let len = self.containers.len(); | |
320 | if len >= level { | |
321 | if let Container::List(offset, _) = self.containers[len - level] { | |
322 | return Some(offset); | |
323 | } | |
324 | } | |
325 | None | |
326 | } | |
327 | ||
// Produces the next block-level event at the current offset: either the end
// of an open tag/container or the start of a new block. Empty lines and link
// reference definitions are consumed without producing an event.
// Returns None once the end of the input is reached.
fn start_block(&mut self) -> Option<Event<'a>> {
    let size = self.text.len();
    while self.off < size {
        // the innermost open tag has a known end and we reached it
        if self.off >= self.limit() {
            return Some(self.end());
        }
        if self.state != State::InContainers {
            // match the open containers against the start of this line; a
            // mismatch closes the innermost tag
            let (n, scanned, space) = self.scan_containers(&self.text[self.off ..]);
            if !scanned {
                return Some(self.end());
            }
            self.leading_space = space;
            self.off += n;
            self.state = State::InContainers;
        }

        let (n, at_eol) = scan_eol(&self.text[self.off ..]);
        if at_eol {
            self.off += n;
            self.state = State::StartBlock;
            // two empty lines closes lists and footnotes
            // (the current line is the first one; `empty_lines` counts the
            // ones that follow it)
            let (n, empty_lines) = self.scan_empty_lines(&self.text[self.off ..]);
            let mut closed = false;
            if empty_lines >= 1 {
                // every stack entry from the outermost list or footnote
                // definition downwards gets closed
                let mut close_tags: Vec<&mut (Tag<'a>, usize, usize)> = self.stack.iter_mut().skip_while(|tag| {
                    match tag.0 {
                        Tag::List(_) | Tag::FootnoteDefinition(_) => false,
                        _ => true,
                    }
                }).collect();
                if close_tags.len() != 0 {
                    for tag in &mut close_tags {
                        tag.1 = self.off; // limit
                        tag.2 = self.off; // next
                    }
                    // the outermost closed tag resumes after the empty lines
                    close_tags[0].2 = self.off + n; // next
                    closed = true;
                }
            }
            if closed {
                return Some(self.end());
            }
            self.off += n;
            // remember the empty line only when the container below the
            // innermost one is a list
            if let Some(_) = self.at_list(2) {
                self.last_line_was_empty = true;
            }
            continue;
        }

        if self.last_line_was_empty {
            if let Some(offset) = self.at_list(2) {
                // list item contains two blocks separated by empty line
                self.loose_lists.insert(offset);
            }
        }

        if self.leading_space >= 4 && !self.at_list(1).is_some() {
            // see below
            if let Some(&Container::List(_, _)) = self.containers.last() {
                return Some(self.end());
            }
            return Some(self.start_indented_code());
        }

        let tail = &self.text[self.off ..];

        // must be before list item because ambiguous
        let n = scan_hrule(tail);
        if n != 0 {
            self.last_line_was_empty = false;
            // see below
            if let Some(&Container::List(_, _)) = self.containers.last() {
                return Some(self.end());
            }
            self.off += n;
            return Some(self.start_hrule());
        }

        let (n, c, start, indent) = scan_listitem(tail);
        if n != 0 {
            if self.last_line_was_empty {
                if let Some(offset) = self.at_list(1) {
                    // two list items separated by empty line
                    self.loose_lists.insert(offset);
                }
            }
            self.last_line_was_empty = false;
            return Some(self.start_listitem(n, c, start, indent));
        }

        // not a list item, so if we're in a list, close it
        if let Some(&Container::List(_, _)) = self.containers.last() {
            return Some(self.end());
        }
        self.last_line_was_empty = false;

        // dispatch on the first byte of the line; arms that do not return
        // fall through to the paragraph case
        let c = tail.as_bytes()[0];
        match c {
            b'#' => {
                let (n, level) = scan_atx_header(tail);
                if n != 0 {
                    self.off += n;
                    return Some(self.start_atx_header(level));
                }
            }
            b'`' | b'~' => {
                let (n, ch) = scan_code_fence(tail);
                if n != 0 {
                    return Some(self.start_code_fence(n, ch, n));
                }
            }
            b'>' => {
                let n = scan_blockquote_start(tail);
                if n != 0 {
                    self.off += n;
                    let (n, space) = scan_leading_space(self.text, self.off);
                    self.off += n;
                    self.leading_space = space;
                    self.containers.push(Container::BlockQuote);
                    return Some(self.start(Tag::BlockQuote, self.text.len(), 0));
                }
            }
            b'<' => {
                if self.is_html_block(tail) {
                    return Some(self.do_html_block());
                }
            }
            b'[' => {
                if self.opts.contains(OPTION_ENABLE_FOOTNOTES) {
                    if let Some((name, n)) = self.parse_footnote_definition(tail) {
                        // a new definition first closes the one still open
                        if self.containers.last() == Some(&Container::FootnoteDefinition) {
                            return Some(self.end());
                        }
                        self.off += n;
                        self.containers.push(Container::FootnoteDefinition);
                        return Some(self.start(Tag::FootnoteDefinition(Cow::Borrowed(name)), self.text.len(), 0));
                    }
                }
                // a link reference definition produces no event
                if self.try_link_reference_definition(tail) {
                    continue;
                }
            }
            _ => ()
        }
        return Some(self.start_paragraph());
    }
    None
}
480 | ||
481 | // can start a paragraph, a setext header, or a table, as they start similarly | |
482 | fn start_paragraph(&mut self) -> Event<'a> { | |
483 | let mut i = self.off + scan_nextline(&self.text[self.off..]); | |
484 | ||
485 | if let (n, true, space) = self.scan_containers(&self.text[i..]) { | |
486 | i += n; | |
487 | if space <= 3 { | |
488 | let (n, level) = scan_setext_header(&self.text[i..]); | |
489 | if n != 0 { | |
490 | let next = i + n; | |
491 | while i > self.off && is_ascii_whitespace(self.text.as_bytes()[i - 1]) { | |
492 | i -= 1; | |
493 | } | |
494 | self.state = State::Inline; | |
495 | return self.start(Tag::Header(level), i, next); | |
496 | } | |
497 | if self.opts.contains(OPTION_ENABLE_TABLES) { | |
498 | let (n, cols) = scan_table_head(&self.text[i..]); | |
499 | if n != 0 { | |
500 | let next = i + n; | |
501 | while i > self.off && is_ascii_whitespace(self.text.as_bytes()[i - 1]) { | |
502 | i -= 1; | |
503 | } | |
504 | self.state = State::TableHead(i, next); | |
505 | return self.start(Tag::Table(cols), self.text.len(), 0); | |
506 | } | |
507 | } | |
508 | } | |
509 | } | |
510 | ||
511 | let size = self.text.len(); | |
512 | self.state = State::Inline; | |
513 | self.start(Tag::Paragraph, size, 0) | |
514 | } | |
515 | ||
516 | fn start_table_head(&mut self) -> Event<'a> { | |
517 | assert!(self.opts.contains(OPTION_ENABLE_TABLES)); | |
518 | if let State::TableHead(limit, next) = self.state { | |
519 | self.state = State::TableRow; | |
520 | return self.start(Tag::TableHead, limit, next); | |
521 | } else { | |
522 | panic!(); | |
523 | } | |
524 | } | |
525 | ||
526 | fn start_table_body(&mut self) -> Event<'a> { | |
527 | assert!(self.opts.contains(OPTION_ENABLE_TABLES)); | |
528 | let (off, _) = match self.scan_containers(&self.text[self.off ..]) { | |
529 | (n, true, space) => (self.off + n, space), | |
530 | _ => { | |
531 | return self.end(); | |
532 | } | |
533 | }; | |
534 | let n = scan_blank_line(&self.text[off..]); | |
535 | if n != 0 { | |
536 | self.off = off + n; | |
537 | return self.end(); | |
538 | } | |
539 | self.state = State::TableRow; | |
540 | self.off = off; | |
541 | return self.start(Tag::TableRow, self.text.len(), 0); | |
542 | } | |
543 | ||
544 | fn start_hrule(&mut self) -> Event<'a> { | |
545 | let limit = self.off; // body of hrule is empty | |
546 | self.state = State::Inline; // handy state for producing correct end tag | |
547 | self.start(Tag::Rule, limit, limit) | |
548 | } | |
549 | ||
550 | fn start_atx_header(&mut self, level: i32) -> Event<'a> { | |
551 | self.skip_leading_whitespace(); | |
552 | let tail = &self.text[self.off..]; | |
553 | let next = scan_nextline(tail); | |
554 | let mut limit = next; | |
555 | while limit > 0 && is_ascii_whitespace(tail.as_bytes()[limit - 1]) { | |
556 | limit -= 1; | |
557 | } | |
558 | let mut end = limit; | |
559 | while end > 0 && tail.as_bytes()[end - 1] == b'#' { | |
560 | end -= 1; | |
561 | } | |
562 | if end == 0 { | |
563 | limit = end; | |
564 | } else if is_ascii_whitespace(tail.as_bytes()[end - 1]) { | |
565 | limit = end - 1; | |
566 | } | |
567 | while limit > 0 && is_ascii_whitespace(tail.as_bytes()[limit - 1]) { | |
568 | limit -= 1; | |
569 | } | |
570 | let limit = limit + self.off; | |
571 | let next = next + self.off; | |
572 | self.state = State::Inline; | |
573 | self.start(Tag::Header(level), limit, next) | |
574 | } | |
575 | ||
576 | fn start_indented_code(&mut self) -> Event<'a> { | |
577 | self.fence_char = b'\0'; | |
578 | self.fence_indent = 4; | |
579 | let size = self.text.len(); | |
580 | self.state = State::Code; | |
581 | self.start(Tag::CodeBlock(Borrowed("")), size, 0) | |
582 | } | |
583 | ||
584 | fn start_listitem(&mut self, n: usize, c: u8, start: usize, indent: usize) -> Event<'a> { | |
585 | let indent = self.leading_space + indent; | |
586 | match self.containers.last() { | |
587 | Some(&Container::List(_, c2)) => { | |
588 | if c != c2 { | |
589 | // mismatched list type or delimeter | |
590 | return self.end(); | |
591 | } | |
592 | self.off += n; | |
593 | let n_blank = scan_blank_line(&self.text[self.off ..]); | |
594 | if n_blank != 0 { | |
595 | self.off += n_blank; | |
596 | self.state = State::StartBlock; | |
597 | } else { | |
598 | // TODO: deal with tab | |
599 | let (n, space) = scan_leading_space(self.text, self.off); | |
600 | self.off += n; | |
601 | self.leading_space = space; | |
602 | } | |
603 | self.containers.push(Container::ListItem(indent)); | |
604 | self.start(Tag::Item, self.text.len(), 0) | |
605 | } | |
606 | _ => { | |
607 | self.containers.push(Container::List(self.off, c)); | |
608 | // arguably this should be done in the scanner, it should return option | |
609 | let startopt = if c == b'.' || c == b')' { Some(start) } else { None }; | |
610 | self.start(Tag::List(startopt), self.text.len(), 0) | |
611 | } | |
612 | } | |
613 | } | |
614 | ||
615 | fn start_code_fence(&mut self, n: usize, ch: u8, count: usize) -> Event<'a> { | |
616 | self.fence_char = ch; | |
617 | self.fence_count = count; | |
618 | self.fence_indent = self.leading_space; | |
619 | let beg_info = self.off + n; | |
620 | let next_line = beg_info + scan_nextline(&self.text[beg_info..]); | |
621 | self.off = next_line; | |
622 | let info = unescape(&self.text[beg_info..next_line].trim()); | |
623 | let size = self.text.len(); | |
624 | self.state = State::CodeLineStart; | |
625 | self.start(Tag::CodeBlock(info), size, 0) | |
626 | } | |
627 | ||
// Handles the beginning of a line inside a code block: ends the block when
// the containers no longer match or the block terminator is found, and
// otherwise produces the next piece of code text.
fn next_code_line_start(&mut self) -> Event<'a> {
    let (off, space) = match self.scan_containers(&self.text[self.off ..]) {
        (n, true, space) => (self.off + n, space),
        _ => {
            return self.end();
        }
    };

    // indented code block: blank lines belong to the block only when more
    // code follows them
    if self.fence_char == b'\0' {
        let n = scan_blank_line(&self.text[off..]);
        if n != 0 {
            // TODO performance: this scanning is O(n^2) in the number of empty lines
            let (n_empty, _lines) = self.scan_empty_lines(&self.text[off + n ..]);
            let next = off + n + n_empty;
            let (n_containers, scanned, nspace) = self.scan_containers(&self.text[next..]);
            // TODO; handle space
            if !scanned || self.is_code_block_end(next + n_containers, nspace) {
                return self.end();
            } else {
                self.off = off;
                self.leading_space = space;
                return self.next_code();
            }
        }
    }

    if self.is_code_block_end(off, space) {
        let ret = self.end();
        // for a fenced block, also skip the line holding the closing fence
        if self.fence_char != b'\0' {
            self.off = off + scan_nextline(&self.text[off..]);
        }
        ret
    } else {
        self.off = off;
        self.state = State::Code;
        self.leading_space = space;
        self.next_code()
    }
}
669 | ||
// Produces the next text event inside a code block: first any indentation
// beyond the block's own indent, then a run of text that ends at a tab, a
// carriage return, a newline (inclusive) or the end of input.
fn next_code(&mut self) -> Event<'a> {
    if self.leading_space > self.fence_indent {
        // TODO: might try to combine spaces in text, for fewer events
        let space = self.leading_space;
        self.leading_space = 0;
        return Event::Text(spaces(space - self.fence_indent));
    }
    let bytes = self.text.as_bytes();
    let mut beg = self.off;
    let mut i = beg;
    loop {
        // jump to the next control byte (anything below the space character)
        match bytes[i..].iter().position(|&c| c < b' ') {
            Some(j) => i += j,
            None => {
                i += bytes[i..].len();
                break;
            }
        }
        match bytes[i] {
            b'\n' => {
                // the newline is part of the emitted text
                i += 1;
                self.state = State::CodeLineStart;
                break;
            }
            b'\t' => {
                // emit the text before the tab first; the tab itself is
                // expanded on the following call
                if i > beg { break; }
                return self.char_tab();
            }
            b'\r' => {
                // just skip it (does not support '\r' only line break)
                if i > beg { break; }
                beg += 1;
            }
            _ => ()
        }
        i += 1;
    }
    self.off = i;
    Event::Text(Borrowed(&self.text[beg..i]))
}
710 | ||
711 | fn is_code_block_end(&self, loc: usize, space: usize) -> bool { | |
712 | let tail = &self.text[loc..]; | |
713 | if self.fence_char == b'\0' { | |
714 | // indented code block | |
715 | space < 4 | |
716 | } else if space <= 3 { | |
717 | let (n, c) = scan_code_fence(tail); | |
718 | if c != self.fence_char || n < self.fence_count { | |
719 | return false; | |
720 | } | |
721 | if n < tail.len() && scan_blank_line(&tail[n..]) == 0 { | |
722 | // Closing code fences cannot have info strings | |
723 | return false; | |
724 | } | |
725 | return true; | |
726 | } else { | |
727 | false | |
728 | } | |
729 | } | |
730 | ||
731 | // # HTML blocks | |
732 | ||
733 | fn scan_html_block_tag(&self, data: &'a str) -> (usize, &'a str) { | |
734 | let mut i = scan_ch(data, b'<'); | |
735 | if i == 0 { return (0, "") } | |
736 | i += scan_ch(&data[i..], b'/'); | |
737 | let n = scan_while(&data[i..], is_ascii_alphanumeric); | |
738 | // TODO: scan attributes and > | |
739 | (i + n, &data[i .. i + n]) | |
740 | } | |
741 | ||
742 | fn is_html_block(&self, data: &str) -> bool { | |
743 | let (n_tag, tag) = self.scan_html_block_tag(data); | |
744 | (n_tag > 0 && is_html_tag(tag)) || | |
745 | data.starts_with("<?") || | |
746 | data.starts_with("<!") | |
747 | } | |
748 | ||
// Consumes an HTML block starting at the current offset and returns it as a
// single Html event. The block runs until the containers stop matching, a
// blank line follows, or the input ends. Container markers and the carriage
// return of "\r\n" line endings are left out of the produced text.
fn do_html_block(&mut self) -> Event<'a> {
    let size = self.text.len();
    let mut out = Borrowed("");
    let mut i = self.off;
    // start of the pending piece of text not yet appended to `out`
    let mut mark = i;
    loop {
        let n = scan_nextline(&self.text[i..]);
        i += n;
        // line ends with "\r\n": append up to the '\r' and resume at the '\n'
        if n >= 2 && self.text.as_bytes()[i - 2] == b'\r' {
            if self.leading_space > 0 {
                out = utils::cow_append(out, spaces(self.leading_space));
                self.leading_space = 0;
            }
            out = utils::cow_append(out, Borrowed(&self.text[mark .. i - 2]));
            mark = i - 1;
        }
        let (n, scanned, space) = self.scan_containers(&self.text[i..]);
        let n_blank = scan_blank_line(&self.text[i + n ..]);
        // flush the pending text whenever container markers have to be
        // skipped or the block ends here
        if n != 0 || !scanned || i + n == size || n_blank != 0 {
            if self.leading_space > 0 {
                out = utils::cow_append(out, spaces(self.leading_space));
            }
            self.leading_space = space;
            out = utils::cow_append(out, Borrowed(&self.text[mark..i]));
            mark = i + n;
        }
        if !scanned || i + n == size || n_blank != 0 {
            self.off = i; // TODO: skip blank lines (cleaner source maps)
            self.state = State::StartBlock;
            return Event::Html(out)
        }
    }
}
782 | ||
783 | // # Link reference definitions | |
784 | ||
785 | fn try_link_reference_definition(&mut self, data: &'a str) -> bool { | |
786 | let (n_link, text_beg, text_end, max_nest) = self.scan_link_label(data); | |
787 | if n_link == 0 || max_nest > 1 { return false; } | |
788 | let n_colon = scan_ch(&data[n_link ..], b':'); | |
789 | if n_colon == 0 { return false; } | |
790 | let mut i = n_link + n_colon; | |
791 | i += self.scan_whitespace_inline(&data[i..]); | |
792 | let linkdest = scan_link_dest(&data[i..]); | |
793 | if linkdest.is_none() { return false; } | |
794 | let (n_dest, raw_dest) = linkdest.unwrap(); | |
795 | if n_dest == 0 { return false; } | |
796 | i += n_dest; | |
797 | i += scan_whitespace_no_nl(&data[i..]); | |
798 | let n_nl = self.scan_whitespace_inline(&data[i..]); | |
799 | let (n_title, title_beg, title_end) = self.scan_link_title(&data[i + n_nl ..]); | |
800 | let title = if n_title == 0 { | |
801 | Borrowed("") | |
802 | } else { | |
803 | let (title_beg, title_end) = (i + n_nl + title_beg, i + n_nl + title_end); | |
804 | i += n_nl + n_title; | |
805 | unescape(&data[title_beg..title_end]) | |
806 | }; | |
807 | i += scan_whitespace_no_nl(&data[i..]); | |
808 | if let (n_eol, true) = scan_eol(&data[i..]) { | |
809 | i += n_eol; | |
810 | } else { | |
811 | return false; | |
812 | } | |
813 | ||
814 | let linktext = self.normalize_link_ref(&data[text_beg..text_end]); | |
815 | if linktext.is_empty() { | |
816 | return false; | |
817 | } | |
818 | if !self.links.contains_key(&linktext) { | |
819 | let dest = unescape(raw_dest); | |
820 | self.links.insert(linktext, (dest, title)); | |
821 | } | |
822 | self.state = State::StartBlock; | |
823 | self.off += i; | |
824 | true | |
825 | } | |
826 | ||
827 | // normalize whitespace and case-fold | |
828 | fn normalize_link_ref(&self, raw: &str) -> String { | |
829 | let mut need_space = false; | |
830 | let mut result = String::new(); | |
831 | let mut i = 0; | |
832 | while i < raw.len() { | |
833 | let n = scan_nextline(&raw[i..]); | |
834 | for c in raw[i.. i + n].chars() { | |
835 | if c.is_whitespace() { | |
836 | need_space = true; | |
837 | } else { | |
838 | if need_space && !result.is_empty() { | |
839 | result.push(' '); | |
840 | } | |
841 | // TODO: Unicode case folding can differ from lowercase (ß) | |
842 | result.extend(c.to_lowercase()); | |
843 | need_space = false; | |
844 | } | |
845 | } | |
846 | i += n; | |
847 | if i == raw.len() { break; } | |
848 | i += self.scan_containers(&raw[i..]).0; | |
849 | need_space = true; | |
850 | } | |
851 | result | |
852 | } | |
853 | ||
854 | // determine whether the line starting at loc ends the block | |
855 | fn is_inline_block_end(&self, data: &str, space: usize) -> bool { | |
856 | data.is_empty() || | |
857 | scan_blank_line(data) != 0 || | |
858 | space <= 3 && (scan_hrule(data) != 0 || | |
859 | scan_atx_header(data).0 != 0 || | |
860 | scan_code_fence(data).0 != 0 || | |
861 | scan_blockquote_start(data) != 0 || | |
862 | scan_listitem(data).0 != 0 || | |
863 | self.is_html_block(data)) | |
864 | } | |
865 | ||
// Produces the next event inside a table row: the start of a cell, or the
// end of the row when no further cell is found.
fn next_table_cell(&mut self) -> Event<'a> {
    assert!(self.opts.contains(OPTION_ENABLE_TABLES));
    let bytes = self.text.as_bytes();
    let mut beg = self.off + scan_whitespace_no_nl(&self.text[self.off ..]);
    let mut i = beg;
    let limit = self.limit();
    // step over the pipe that separates this cell from the previous one
    if i < limit && bytes[i] == b'|' {
        i += 1;
        beg += 1;
        self.off += 1;
    }
    if i >= limit {
        self.off = limit;
        return self.end();
    }
    // `n` ends up as the length of a line ending that terminates the row
    // without any cell content before it, and 0 otherwise
    let mut n = 0;
    while i < limit {
        let c = bytes[i];
        // an escaped pipe does not terminate the cell
        if c == b'\\' && i + 1 < limit && bytes[i + 1] == b'|' {
            i += 2;
            continue;
        } else if c == b'|' {
            n = 0;
            break;
        }
        n = if is_ascii_whitespace(bytes[i]) { scan_blank_line(&self.text[i..]) } else { 0 };
        if n != 0 {
            if i > beg {
                n = 0;
            }
            break;
        }
        i += 1;
    }
    if i > beg {
        self.state = State::Inline;
        self.start(Tag::TableCell, i, i + n)
    } else {
        self.off = i + n;
        self.end()
    }
}
908 | ||
// Produces the next event of inline content: a run of plain text, a soft or
// hard line break, whatever the handler of an active byte returns, or the
// end of the enclosing tag once its limit is reached.
fn next_inline(&mut self) -> Event<'a> {
    let bytes = self.text.as_bytes();
    let beg = self.off;
    let mut i = beg;
    let limit = self.limit();
    while i < limit {
        // jump to the next active byte
        match bytes[i..limit].iter().position(|&c| self.active_tab[c as usize] != 0) {
            Some(pos) => i += pos,
            None => { i = limit; break; }
        }
        let c = bytes[i];
        if c == b'\n' || c == b'\r' {
            // emit the text before the line ending first, without its
            // trailing whitespace; the break is handled on the next call
            let n = scan_trailing_whitespace(&self.text[beg..i]);
            let end = i - n;
            if end > beg {
                self.off = end;
                return Event::Text(Borrowed(&self.text[beg..end]));
            }
            // treat "\r\n" as a single line ending
            if c == b'\r' && i + 1 < limit && self.text.as_bytes()[i + 1] == b'\n' {
                i += 1;
            }
            i += 1;
            let next = i;
            let (n_containers, _, space) = self.scan_containers(&self.text[i..limit]);
            i += n_containers;
            if self.is_inline_block_end(&self.text[i..limit], space) {
                self.off = next;
                return self.end();
            }
            i += scan_whitespace_no_nl(&self.text[i..limit]);
            self.off = i;
            // two or more whitespace bytes before the line ending make it a hard break
            return if n >= 2 { Event::HardBreak } else { Event::SoftBreak };
        }
        self.off = i;
        if i > beg {
            return Event::Text(Borrowed(&self.text[beg..i]));
        }
        if let Some(event) = self.active_char(c) {
            return event;
        }
        // the handler produced nothing: the byte becomes part of the text run
        i = self.off; // let handler advance offset even on None
        i += 1;
    }
    if i > beg {
        self.off = i;
        Event::Text(Borrowed(&self.text[beg..i]))
    } else {
        self.end()
    }
}
959 | ||
960 | fn active_char(&mut self, c: u8) -> Option<Event<'a>> { | |
961 | match c { | |
962 | b'\x00' => Some(self.char_null()), | |
963 | b'\t' => Some(self.char_tab()), | |
964 | b'\\' => self.char_backslash(), | |
965 | b'&' => self.char_entity(), | |
966 | b'_' => self.char_emphasis(), | |
967 | b'*' => self.char_emphasis(), | |
968 | b'[' if self.opts.contains(OPTION_ENABLE_FOOTNOTES) => self.char_link_footnote(), | |
969 | b'[' | b'!' => self.char_link(), | |
970 | b'`' => self.char_backtick(), | |
971 | b'<' => self.char_lt(), | |
972 | _ => None | |
973 | } | |
974 | } | |
975 | ||
976 | fn char_null(&mut self) -> Event<'a> { | |
977 | self.off += 1; | |
978 | Event::Text(Borrowed(&"\u{fffd}")) | |
979 | } | |
980 | ||
981 | // expand tab in content (used for code and inline) | |
982 | // scan backward to find offset, counting unicode code points | |
983 | fn char_tab(&mut self) -> Event<'a> { | |
984 | let count = count_tab(&self.text.as_bytes()[.. self.off]); | |
985 | self.off += 1; | |
986 | Event::Text(Borrowed(&" "[..count])) | |
987 | } | |
988 | ||
989 | fn char_backslash(&mut self) -> Option<Event<'a>> { | |
990 | let limit = self.limit(); | |
991 | if self.off + 1 < limit { | |
992 | if let (_, true) = scan_eol(&self.text[self.off + 1 .. limit]) { | |
993 | let n_white = self.scan_whitespace_inline(&self.text[self.off + 1 .. limit]); | |
994 | let space = 0; // TODO: figure this out | |
995 | if !self.is_inline_block_end(&self.text[self.off + 1 + n_white .. limit], space) { | |
996 | self.off += 1 + n_white; | |
997 | return Some(Event::HardBreak); | |
998 | } | |
999 | } | |
1000 | let c = self.text.as_bytes()[self.off + 1]; | |
1001 | if is_ascii_punctuation(c) { | |
1002 | self.off += 2; | |
1003 | return Some(Event::Text(Borrowed(&self.text[self.off - 1 .. self.off]))); | |
1004 | } | |
1005 | } | |
1006 | None | |
1007 | } | |
1008 | ||
1009 | fn char_entity(&mut self) -> Option<Event<'a>> { | |
1010 | match scan_entity(&self.text[self.off ..]) { | |
1011 | (n, Some(value)) => { | |
1012 | self.off += n; | |
1013 | Some(Event::Text(value)) | |
1014 | } | |
1015 | _ => None | |
1016 | } | |
1017 | } | |
1018 | ||
1019 | fn char_emphasis(&mut self) -> Option<Event<'a>> { | |
1020 | // can see to left for flanking info, but not past limit | |
1021 | let limit = self.limit(); | |
1022 | let data = &self.text[..limit]; | |
1023 | ||
1024 | let c = data.as_bytes()[self.off]; | |
1025 | let (n, can_open, _can_close) = compute_open_close(data, self.off, c); | |
1026 | if !can_open { | |
1027 | return None; | |
1028 | } | |
1029 | let mut stack = vec![n]; // TODO performance: don't allocate | |
1030 | let mut i = self.off + n; | |
1031 | while i < limit { | |
1032 | let c2 = data.as_bytes()[i]; | |
1033 | if c2 == b'\n' && !is_escaped(data, i) { | |
1034 | let space = 0; // TODO: scan containers | |
1035 | if self.is_inline_block_end(&self.text[i + 1 .. limit], space) { | |
1036 | return None | |
1037 | } else { | |
1038 | i += 1; | |
1039 | } | |
1040 | } else if c2 == c && !is_escaped(data, i) { | |
1041 | let (mut n2, can_open, can_close) = compute_open_close(data, i, c); | |
1042 | if can_close { | |
1043 | loop { | |
1044 | let ntos = stack.pop().unwrap(); | |
1045 | if ntos > n2 { | |
1046 | stack.push(ntos - n2); | |
1047 | break; | |
1048 | } | |
1049 | if stack.is_empty() { | |
1050 | let npop = if ntos < n2 { ntos } else { n2 }; | |
1051 | if npop == 1 { | |
1052 | self.off += 1; | |
1053 | return Some(self.start(Tag::Emphasis, i, i + 1)); | |
1054 | } else { | |
1055 | self.off += 2; | |
1056 | let next = i + npop; | |
1057 | return Some(self.start(Tag::Strong, next - 2, next)); | |
1058 | } | |
1059 | } else { | |
1060 | i += ntos; | |
1061 | n2 -= ntos; | |
1062 | } | |
1063 | } | |
1064 | } else if can_open { | |
1065 | stack.push(n2); | |
1066 | } | |
1067 | i += n2; | |
1068 | } else if c2 == b'`' { | |
1069 | let (n, beg, _) = self.scan_inline_code(&self.text[i..limit]); | |
1070 | if n != 0 { | |
1071 | i += n; | |
1072 | } else { | |
1073 | i += beg; | |
1074 | } | |
1075 | } else if c2 == b'<' { | |
1076 | let n = self.scan_autolink_or_html(&self.text[i..limit]); | |
1077 | if n != 0 { | |
1078 | i += n; | |
1079 | } else { | |
1080 | i += 1; | |
1081 | } | |
1082 | } else if c2 == b'[' { | |
1083 | if self.opts.contains(OPTION_ENABLE_FOOTNOTES) { | |
1084 | if let Some((_, n)) = self.parse_footnote(&self.text[i..limit]) { | |
1085 | i += n; | |
1086 | continue; | |
1087 | } | |
1088 | } | |
1089 | if let Some((_, _, _, n)) = self.parse_link(&self.text[i..limit], false) { | |
1090 | i += n; | |
1091 | } else { | |
1092 | i += 1; | |
1093 | } | |
1094 | } else { | |
1095 | i += 1; | |
1096 | } | |
1097 | } | |
1098 | None | |
1099 | } | |
1100 | ||
1101 | // # Links | |
1102 | ||
1103 | // scans a link label, example [link] | |
1104 | // return value is: total bytes, start of text, end of text, max nesting | |
1105 | fn scan_link_label(&self, data: &str) -> (usize, usize, usize, usize) { | |
1106 | let mut i = scan_ch(data, b'['); | |
1107 | if i == 0 { return (0, 0, 0, 0); } | |
1108 | let text_beg = i; | |
1109 | let mut max_nest = 1; | |
1110 | let mut nest = 1; | |
1111 | loop { | |
1112 | if i >= data.len() { return (0, 0, 0, 0); } | |
1113 | match data.as_bytes()[i] { | |
1114 | b'\n' => { | |
1115 | let n = self.scan_whitespace_inline(&data[i..]); | |
1116 | if n == 0 { return (0, 0, 0, 0); } | |
1117 | i += n; | |
1118 | } | |
1119 | b'[' => { | |
1120 | nest += 1; | |
1121 | if nest == MAX_LINK_NEST { return (0, 0, 0, 0); } | |
1122 | max_nest = cmp::max(max_nest, nest); | |
1123 | i += 1; | |
1124 | } | |
1125 | b']' => { | |
1126 | nest -= 1; | |
1127 | if nest == 0 { | |
1128 | break; | |
1129 | } | |
1130 | i += 1; | |
1131 | } | |
1132 | b'\\' => i += 1, | |
1133 | b'<' => { | |
1134 | let n = self.scan_autolink_or_html(&data[i..]); | |
1135 | if n != 0 { | |
1136 | i += n; | |
1137 | } else { | |
1138 | i += 1; | |
1139 | } | |
1140 | } | |
1141 | b'`' => { | |
1142 | let (n, beg, _) = self.scan_inline_code(&data[i..]); | |
1143 | if n != 0 { | |
1144 | i += n; | |
1145 | } else { | |
1146 | i += beg; | |
1147 | } | |
1148 | } | |
1149 | _ => i += 1 | |
1150 | } | |
1151 | } | |
1152 | let text_end = i; | |
1153 | i += 1; // skip closing ] | |
1154 | (i, text_beg, text_end, max_nest) | |
1155 | } | |
1156 | ||
1157 | fn scan_link_title(&self, data: &str) -> (usize, usize, usize) { | |
1158 | let size = data.len(); | |
1159 | if size == 0 { return (0, 0, 0); } | |
1160 | let mut i = 0; | |
1161 | let titleclose = match data.as_bytes()[i] { | |
1162 | b'(' => b')', | |
1163 | b'\'' => b'\'', | |
1164 | b'\"' => b'\"', | |
1165 | _ => return (0, 0, 0) | |
1166 | }; | |
1167 | i += 1; | |
1168 | let title_beg = i; | |
1169 | while i < size { | |
1170 | match data.as_bytes()[i] { | |
1171 | x if x == titleclose => break, | |
1172 | b'\\' => i += 2, // may be > size | |
1173 | b'\n' => { | |
1174 | let n = self.scan_whitespace_inline(&data[i..]); | |
1175 | if n == 0 { return (0, 0, 0); } | |
1176 | i += n; | |
1177 | } | |
1178 | _ => i += 1 | |
1179 | } | |
1180 | } | |
1181 | if i >= size { return (0, 0, 0); } | |
1182 | let title_end = i; | |
1183 | i += 1; | |
1184 | (i, title_beg, title_end) | |
1185 | } | |
1186 | ||
1187 | fn char_link(&mut self) -> Option<Event<'a>> { | |
1188 | self.parse_link(&self.text[self.off .. self.limit()], false).map(|(tag, beg, end, n)| { | |
1189 | let off = self.off; | |
1190 | self.off += beg; | |
1191 | self.start(tag, off + end, off + n) | |
1192 | }) | |
1193 | } | |
1194 | ||
1195 | // return: tag, begin, end, total size | |
1196 | fn parse_link(&self, data: &'a str, recur: bool) -> Option<(Tag<'a>, usize, usize, usize)> { | |
1197 | let size = data.len(); | |
1198 | ||
1199 | // scan link text | |
1200 | let i = scan_ch(data, b'!'); | |
1201 | let is_image = i == 1; | |
1202 | let (n, text_beg, text_end, max_nest) = self.scan_link_label(&data[i..]); | |
1203 | if n == 0 { return None; } | |
1204 | let (text_beg, text_end) = (text_beg + i, text_end + i); | |
1205 | if !is_image && !recur && max_nest > 1 && self.contains_link(&data[text_beg..text_end]) { | |
1206 | // disallow nested links in links (but ok in images) | |
1207 | return None; | |
1208 | } | |
1209 | let mut i = i + n; | |
1210 | ||
1211 | // scan dest | |
1212 | let (dest, title, beg, end, next) = if data[i..].starts_with("(") { | |
1213 | i += 1; | |
1214 | i += self.scan_whitespace_inline(&data[i..]); | |
1215 | if i >= size { return None; } | |
1216 | ||
1217 | let linkdest = scan_link_dest(&data[i..]); | |
1218 | if linkdest.is_none() { return None; } | |
1219 | let (n, raw_dest) = linkdest.unwrap(); | |
1220 | let dest = unescape(raw_dest); | |
1221 | i += n; | |
1222 | ||
1223 | i += self.scan_whitespace_inline(&data[i..]); | |
1224 | if i == size { return None; } | |
1225 | ||
1226 | // scan title | |
1227 | let (n_title, title_beg, title_end) = self.scan_link_title(&data[i..]); | |
1228 | let title = if n_title == 0 { | |
1229 | Borrowed("") | |
1230 | } else { | |
1231 | let (title_beg, title_end) = (i + title_beg, i + title_end); | |
1232 | i += n_title; | |
1233 | // TODO: not just unescape, remove containers from newlines | |
1234 | unescape(&data[title_beg..title_end]) | |
1235 | }; | |
1236 | i += self.scan_whitespace_inline(&data[i..]); | |
1237 | if i == size || data.as_bytes()[i] != b')' { return None; } | |
1238 | i += 1; | |
1239 | (dest, title, text_beg, text_end, i) | |
1240 | } else { | |
1241 | // try link reference | |
1242 | let j = i + self.scan_whitespace_inline(&data[i..]); | |
1243 | let (n_ref, ref_beg, ref_end, _) = self.scan_link_label(&data[j..]); | |
1244 | let (ref_beg, ref_end) = if n_ref == 0 || ref_beg == ref_end { | |
1245 | (text_beg, text_end) | |
1246 | } else { | |
1247 | (j + ref_beg, j + ref_end) | |
1248 | }; | |
1249 | if n_ref != 0 { | |
1250 | i = j + n_ref; | |
1251 | } | |
1252 | let reference = self.normalize_link_ref(&data[ref_beg..ref_end]); | |
1253 | let (dest, title) = match self.links.get(&reference) { | |
1254 | Some(&(ref dest, ref title)) => (dest.clone(), title.clone()), | |
1255 | None => return None | |
1256 | }; | |
1257 | (dest, title, text_beg, text_end, i) | |
1258 | }; | |
1259 | if is_image { | |
1260 | Some((Tag::Image(dest, title), beg, end, next)) | |
1261 | } else { | |
1262 | Some((Tag::Link(dest, title), beg, end, next)) | |
1263 | } | |
1264 | } | |
1265 | ||
1266 | // determine whether there's a link anywhere in the text | |
1267 | // TODO: code duplication with scan_link_label is unpleasant | |
1268 | fn contains_link(&self, data: &str) -> bool { | |
1269 | let mut i = 0; | |
1270 | while i < data.len() { | |
1271 | match data.as_bytes()[i] { | |
1272 | b'\n' => { | |
1273 | let n = self.scan_whitespace_inline(&data[i..]); | |
1274 | if n == 0 { return false; } | |
1275 | i += n; | |
1276 | continue; | |
1277 | } | |
1278 | b'!' => { | |
1279 | if scan_ch(&data[i + 1 ..], b'[') != 0 { | |
1280 | // ok to contain image, skip over opening bracket | |
1281 | i += 1; | |
1282 | } | |
1283 | } | |
1284 | b'[' => { | |
1285 | if self.opts.contains(OPTION_ENABLE_FOOTNOTES) && self.parse_footnote(&data[i..]).is_some() { | |
1286 | return false; | |
1287 | } | |
1288 | if self.parse_link(&data[i..], true).is_some() { return true; } | |
1289 | } | |
1290 | b'\\' => i += 1, | |
1291 | b'<' => { | |
1292 | let n = self.scan_autolink_or_html(&data[i..]); | |
1293 | if n != 0 { | |
1294 | i += n; | |
1295 | } else { | |
1296 | i += 1; | |
1297 | } | |
1298 | } | |
1299 | b'`' => { | |
1300 | let (n, beg, _) = self.scan_inline_code(&data[i..]); | |
1301 | if n != 0 { | |
1302 | i += n; | |
1303 | } else { | |
1304 | i += beg; | |
1305 | } | |
1306 | } | |
1307 | _ => () | |
1308 | } | |
1309 | i += 1; | |
1310 | } | |
1311 | false | |
1312 | } | |
1313 | ||
1314 | // # Footnotes | |
1315 | ||
1316 | fn parse_footnote_definition<'b>(&self, data: &'b str) -> Option<(&'b str, usize)> { | |
1317 | assert!(self.opts.contains(OPTION_ENABLE_FOOTNOTES)); | |
1318 | self.parse_footnote(data).and_then(|(name, len)| { | |
1319 | let n_colon = scan_ch(&data[len ..], b':'); | |
1320 | if n_colon == 0 { | |
1321 | None | |
1322 | } else { | |
1323 | let space = scan_whitespace_no_nl(&data[len + n_colon..]); | |
1324 | Some((name, len + n_colon + space)) | |
1325 | } | |
1326 | }) | |
1327 | } | |
1328 | ||
1329 | fn char_link_footnote(&mut self) -> Option<Event<'a>> { | |
1330 | assert!(self.opts.contains(OPTION_ENABLE_FOOTNOTES)); | |
1331 | if let Some((name, end)) = self.parse_footnote(&self.text[self.off .. self.limit()]) { | |
1332 | self.off += end; | |
1333 | Some(Event::FootnoteReference(Cow::Borrowed(name))) | |
1334 | } else { | |
1335 | self.char_link() | |
1336 | } | |
1337 | } | |
1338 | ||
1339 | fn parse_footnote<'b>(&self, data: &'b str) -> Option<(&'b str, usize)> { | |
1340 | assert!(self.opts.contains(OPTION_ENABLE_FOOTNOTES)); | |
1341 | let (n_footnote, text_beg, text_end) = self.scan_footnote_label(data); | |
1342 | if n_footnote == 0 { return None; } | |
1343 | return Some((&data[text_beg..text_end], n_footnote)); | |
1344 | } | |
1345 | ||
1346 | fn scan_footnote_label(&self, data: &str) -> (usize, usize, usize) { | |
1347 | assert!(self.opts.contains(OPTION_ENABLE_FOOTNOTES)); | |
1348 | let mut i = scan_ch(data, b'['); | |
1349 | if i == 0 { return (0, 0, 0); } | |
1350 | if i >= data.len() || data.as_bytes()[i] != b'^' { return (0, 0, 0); } | |
1351 | i += 1; | |
1352 | let text_beg = i; | |
1353 | loop { | |
1354 | if i >= data.len() { return (0, 0, 0); } | |
1355 | match data.as_bytes()[i] { | |
1356 | b'\n' => { | |
1357 | let n = self.scan_whitespace_inline(&data[i..]); | |
1358 | if n == 0 { return (0, 0, 0); } | |
1359 | i += n; | |
1360 | continue; | |
1361 | } | |
1362 | b']' => break, | |
1363 | b'\\' => i += 1, | |
1364 | _ => () | |
1365 | } | |
1366 | i += 1; | |
1367 | } | |
1368 | let text_end = i; | |
1369 | i += 1; // skip closing ] | |
1370 | (i, text_beg, text_end) | |
1371 | } | |
1372 | ||
1373 | // # Autolinks and inline HTML | |
1374 | ||
1375 | fn char_lt(&mut self) -> Option<Event<'a>> { | |
1376 | let tail = &self.text[self.off .. self.limit()]; | |
1377 | if let Some((n, link)) = scan_autolink(tail) { | |
1378 | let next = self.off + n; | |
1379 | self.off += 1; | |
1380 | self.state = State::Literal; | |
1381 | return Some(self.start(Tag::Link(link, Borrowed("")), next - 1, next)) | |
1382 | } | |
1383 | let n = self.scan_inline_html(tail); | |
1384 | if n != 0 { | |
1385 | return Some(self.inline_html_event(n)) | |
1386 | } | |
1387 | None | |
1388 | } | |
1389 | ||
1390 | fn scan_autolink_or_html(&self, data: &str) -> usize { | |
1391 | if let Some((n, _)) = scan_autolink(data) { | |
1392 | n | |
1393 | } else { | |
1394 | self.scan_inline_html(data) | |
1395 | } | |
1396 | } | |
1397 | ||
1398 | fn scan_inline_html(&self, data: &str) -> usize { | |
1399 | let n = self.scan_html_tag(data); | |
1400 | if n != 0 { return n; } | |
1401 | let n = self.scan_html_comment(data); | |
1402 | if n != 0 { return n; } | |
1403 | let n = self.scan_processing_instruction(data); | |
1404 | if n != 0 { return n; } | |
1405 | let n = self.scan_declaration(data); | |
1406 | if n != 0 { return n; } | |
1407 | let n = self.scan_cdata(data); | |
1408 | if n != 0 { return n; } | |
1409 | 0 | |
1410 | } | |
1411 | ||
1412 | fn scan_html_tag(&self, data: &str) -> usize { | |
1413 | let size = data.len(); | |
1414 | let mut i = 0; | |
1415 | if scan_ch(data, b'<') == 0 { return 0; } | |
1416 | i += 1; | |
1417 | let n_slash = scan_ch(&data[i..], b'/'); | |
1418 | i += n_slash; | |
1419 | if i == size || !is_ascii_alpha(data.as_bytes()[i]) { return 0; } | |
1420 | i += 1; | |
1421 | i += scan_while(&data[i..], is_ascii_alphanumeric); | |
1422 | if n_slash == 0 { | |
1423 | loop { | |
1424 | let n = self.scan_whitespace_inline(&data[i..]); | |
1425 | if n == 0 { break; } | |
1426 | i += n; | |
1427 | let n = scan_attribute_name(&data[i..]); | |
1428 | if n == 0 { break; } | |
1429 | i += n; | |
1430 | let n = self.scan_whitespace_inline(&data[i..]); | |
1431 | if scan_ch(&data[i + n ..], b'=') != 0 { | |
1432 | i += n + 1; | |
1433 | i += self.scan_whitespace_inline(&data[i..]); | |
1434 | let n_attr = self.scan_attribute_value(&data[i..]); | |
1435 | if n_attr == 0 { return 0; } | |
1436 | i += n_attr; | |
1437 | } | |
1438 | } | |
1439 | i += self.scan_whitespace_inline(&data[i..]); | |
1440 | i += scan_ch(&data[i..], b'/'); | |
1441 | } else { | |
1442 | i += self.scan_whitespace_inline(&data[i..]); | |
1443 | } | |
1444 | if scan_ch(&data[i..], b'>') == 0 { return 0; } | |
1445 | i += 1; | |
1446 | i | |
1447 | } | |
1448 | ||
1449 | fn scan_attribute_value(&self, data: &str) -> usize { | |
1450 | let size = data.len(); | |
1451 | if size == 0 { return 0; } | |
1452 | let open = data.as_bytes()[0]; | |
1453 | let quoted = open == b'\'' || open == b'"'; | |
1454 | let mut i = if quoted { 1 } else { 0 }; | |
1455 | while i < size { | |
1456 | let c = data.as_bytes()[i]; | |
1457 | match c { | |
1458 | b'\n' => { | |
1459 | if !quoted { break; } | |
1460 | let n = self.scan_whitespace_inline(&data[i..]); | |
1461 | if n == 0 { return 0; } | |
1462 | i += n; | |
1463 | } | |
1464 | b'\'' | b'"' | b'=' | b'<' | b'>' | b'`' | b'\t' ... b' ' => { | |
1465 | if !quoted || c == open { break; } | |
1466 | i += 1; | |
1467 | } | |
1468 | _ => i += 1 | |
1469 | } | |
1470 | } | |
1471 | if quoted { | |
1472 | if i == size || data.as_bytes()[i] != open { return 0; } | |
1473 | i += 1; | |
1474 | } | |
1475 | i | |
1476 | } | |
1477 | ||
1478 | fn scan_html_comment(&self, data: &str) -> usize { | |
1479 | if !data.starts_with("<!--") { return 0; } | |
1480 | if let Some(n) = data[4..].find("--") { | |
1481 | let text = &data[4..4 + n]; | |
1482 | if !text.starts_with('>') && !text.starts_with("->") && | |
1483 | data[n + 6 ..].starts_with('>') { | |
1484 | return n + 7; | |
1485 | } | |
1486 | } | |
1487 | 0 | |
1488 | } | |
1489 | ||
1490 | fn scan_processing_instruction(&self, data: &str) -> usize { | |
1491 | if !data.starts_with("<?") { return 0; } | |
1492 | if let Some(n) = data[2..].find("?>") { | |
1493 | return n + 4; | |
1494 | } | |
1495 | 0 | |
1496 | } | |
1497 | ||
1498 | fn scan_declaration(&self, data: &str) -> usize { | |
1499 | if !data.starts_with("<!") { return 0; } | |
1500 | let n = scan_while(&data[2..], is_ascii_upper); | |
1501 | if n == 0 { return 0; } | |
1502 | let i = n + 2; | |
1503 | let n = self.scan_whitespace_inline(&data[i..]); | |
1504 | if n == 0 { return 0; } | |
1505 | let mut i = i + n; | |
1506 | while i < data.len() { | |
1507 | match data.as_bytes()[i] { | |
1508 | b'>' => return i + 1, | |
1509 | b'\n' => i += self.scan_whitespace_inline(&data[i..]), | |
1510 | _ => i += 1 | |
1511 | } | |
1512 | } | |
1513 | 0 | |
1514 | } | |
1515 | ||
1516 | fn scan_cdata(&self, data: &str) -> usize { | |
1517 | if !data.starts_with("<![CDATA[") { return 0; } | |
1518 | if let Some(n) = data[9..].find("]]>") { | |
1519 | return n + 12; | |
1520 | } | |
1521 | 0 | |
1522 | } | |
1523 | ||
1524 | fn inline_html_event(&mut self, n: usize) -> Event<'a> { | |
1525 | let data = &self.text[self.off .. self.off + n]; | |
1526 | let size = data.len(); | |
1527 | let mut out = Borrowed(""); | |
1528 | let mut i = 0; | |
1529 | let mut mark = 0; | |
1530 | while i < size { | |
1531 | let n = scan_nextline(&data[i..]); | |
1532 | i += n; | |
1533 | if n >= 2 && data.as_bytes()[i - 2] == b'\r' { | |
1534 | out = utils::cow_append(out, Borrowed(&data[mark .. i - 2])); | |
1535 | mark = i - 1; | |
1536 | } | |
1537 | if i < size { | |
1538 | let (n, _, _) = self.scan_containers(&data[i..]); | |
1539 | if n != 0 { | |
1540 | out = utils::cow_append(out, Borrowed(&data[mark..i])); | |
1541 | mark = i + n; | |
1542 | } | |
1543 | } | |
1544 | } | |
1545 | out = utils::cow_append(out, Borrowed(&data[mark..n])); | |
1546 | self.off += n; | |
1547 | Event::InlineHtml(out) | |
1548 | } | |
1549 | ||
1550 | // link text is literal, with no processing of markup | |
1551 | fn next_literal(&mut self) -> Event<'a> { | |
1552 | self.state = State::Inline; | |
1553 | let beg = self.off; | |
1554 | let end = self.limit(); | |
1555 | self.off = end; | |
1556 | Event::Text(Borrowed(&self.text[beg..end])) | |
1557 | } | |
1558 | ||
1559 | // second return value is number of backticks even if not closed | |
1560 | fn scan_inline_code(&self, data: &str) -> (usize, usize, usize) { | |
1561 | let size = data.len(); | |
1562 | let backtick_len = scan_backticks(data); | |
1563 | let mut i = backtick_len; | |
1564 | while i < size { | |
1565 | match data.as_bytes()[i] { | |
1566 | b'`' => { | |
1567 | let close_len = scan_backticks(&data[i..]); | |
1568 | if close_len == backtick_len { | |
1569 | return (i + backtick_len, backtick_len, i); | |
1570 | } else { | |
1571 | i += close_len; | |
1572 | } | |
1573 | } | |
1574 | b'\n' => { | |
1575 | i += 1; | |
1576 | let (n, _, space) = self.scan_containers(&data[i..]); | |
1577 | i += n; | |
1578 | if self.is_inline_block_end(&data[i..], space) { | |
1579 | return (0, backtick_len, 0); | |
1580 | } | |
1581 | } | |
1582 | // TODO: '<' | |
1583 | _ => i += 1 | |
1584 | } | |
1585 | } | |
1586 | (0, backtick_len, 0) | |
1587 | } | |
1588 | ||
1589 | fn char_backtick(&mut self) -> Option<Event<'a>> { | |
1590 | let beg = self.off; | |
1591 | let limit = self.limit(); | |
1592 | let mut i = beg; | |
1593 | let (n, code_beg, code_end) = self.scan_inline_code(&self.text[i..limit]); | |
1594 | if n == 0 { | |
1595 | self.off += code_beg - 1; | |
1596 | return None; | |
1597 | } | |
1598 | i += code_beg; | |
1599 | let end = beg + code_end; | |
1600 | let next = beg + n; | |
1601 | i += self.scan_whitespace_inline(&self.text[i..limit]); | |
1602 | self.off = i; | |
1603 | self.state = State::InlineCode; | |
1604 | Some(self.start(Tag::Code, end, next)) | |
1605 | } | |
1606 | ||
1607 | fn next_inline_code(&mut self) -> Event<'a> { | |
1608 | let beg = self.off; | |
1609 | let mut i = beg; | |
1610 | let limit = self.limit(); | |
1611 | while i < limit { | |
1612 | let c = self.text.as_bytes()[i]; | |
1613 | if is_ascii_whitespace(c) { | |
1614 | let n = self.scan_whitespace_inline(&self.text[i..limit]); | |
1615 | if i + n == limit || n == 0 { | |
1616 | if i > beg { | |
1617 | break; | |
1618 | } else { | |
1619 | return self.end(); | |
1620 | } | |
1621 | } | |
1622 | if c == b' ' && n == 1 { | |
1623 | // optimization to reduce number of text blocks produced | |
1624 | i += 1; | |
1625 | } else { | |
1626 | if i > beg { | |
1627 | break; | |
1628 | } | |
1629 | i += n; | |
1630 | self.off = i; | |
1631 | return Event::Text(Borrowed(" ")); | |
1632 | } | |
1633 | } else { | |
1634 | i += 1; | |
1635 | } | |
1636 | } | |
1637 | if i > beg { | |
1638 | self.off = i; | |
1639 | Event::Text(Borrowed(&self.text[beg..i])) | |
1640 | } else { | |
1641 | self.end() | |
1642 | } | |
1643 | } | |
1644 | } | |
1645 | ||
1646 | impl<'a> Iterator for RawParser<'a> { | |
1647 | type Item = Event<'a>; | |
1648 | ||
1649 | fn next(&mut self) -> Option<Event<'a>> { | |
1650 | //println!("off {} {:?}, stack {:?} containers {:?}", | |
1651 | // self.off, self.state, self.stack, self.containers); | |
1652 | if self.off < self.text.len() { | |
1653 | match self.state { | |
1654 | State::StartBlock | State::InContainers => { | |
1655 | let ret = self.start_block(); | |
1656 | if ret.is_some() { | |
1657 | return ret; | |
1658 | } | |
1659 | } | |
1660 | State::Inline => return Some(self.next_inline()), | |
1661 | State::TableHead(_, _) => return Some(self.start_table_head()), | |
1662 | State::TableBody => return Some(self.start_table_body()), | |
1663 | State::TableRow => return Some(self.next_table_cell()), | |
1664 | State::CodeLineStart => return Some(self.next_code_line_start()), | |
1665 | State::Code => return Some(self.next_code()), | |
1666 | State::InlineCode => return Some(self.next_inline_code()), | |
1667 | State::Literal => return Some(self.next_literal()), | |
1668 | } | |
1669 | } | |
1670 | match self.stack.pop() { | |
1671 | Some((tag, _, _)) => Some(Event::End(tag)), | |
1672 | None => None | |
1673 | } | |
1674 | } | |
1675 | } |