]>
Commit | Line | Data |
---|---|---|
83c7162d XL |
1 | // Copyright 2014-2017 The html5ever Project Developers. See the |
2 | // COPYRIGHT file at the top-level directory of this distribution. | |
3 | // | |
4 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or | |
5 | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license | |
6 | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your | |
7 | // option. This file may not be copied, modified, or distributed | |
8 | // except according to those terms. | |
9 | ||
10 | //! High-level interface to the parser. | |
11 | ||
3dfed10e XL |
12 | use crate::buffer_queue::BufferQueue; |
13 | use crate::tokenizer::{Tokenizer, TokenizerOpts, TokenizerResult}; | |
14 | use crate::tree_builder::{create_element, TreeBuilder, TreeBuilderOpts, TreeSink}; | |
15 | use crate::{Attribute, QualName}; | |
83c7162d XL |
16 | |
17 | use std::borrow::Cow; | |
18 | ||
3dfed10e XL |
19 | use crate::tendril; |
20 | use crate::tendril::stream::{TendrilSink, Utf8LossyDecoder}; | |
21 | use crate::tendril::StrTendril; | |
83c7162d XL |
22 | |
23 | /// All-encompassing options struct for the parser. | |
24 | #[derive(Clone, Default)] | |
25 | pub struct ParseOpts { | |
26 | /// Tokenizer options. | |
27 | pub tokenizer: TokenizerOpts, | |
28 | ||
29 | /// Tree builder options. | |
30 | pub tree_builder: TreeBuilderOpts, | |
31 | } | |
32 | ||
33 | /// Parse an HTML document | |
34 | /// | |
35 | /// The returned value implements `tendril::TendrilSink` | |
36 | /// so that Unicode input may be provided incrementally, | |
37 | /// or all at once with the `one` method. | |
38 | /// | |
39 | /// If your input is bytes, use `Parser::from_utf8`. | |
dc9dc135 XL |
40 | pub fn parse_document<Sink>(sink: Sink, opts: ParseOpts) -> Parser<Sink> |
41 | where | |
42 | Sink: TreeSink, | |
43 | { | |
83c7162d XL |
44 | let tb = TreeBuilder::new(sink, opts.tree_builder); |
45 | let tok = Tokenizer::new(tb, opts.tokenizer); | |
dc9dc135 XL |
46 | Parser { |
47 | tokenizer: tok, | |
48 | input_buffer: BufferQueue::new(), | |
49 | } | |
83c7162d XL |
50 | } |
51 | ||
52 | /// Parse an HTML fragment | |
53 | /// | |
54 | /// The returned value implements `tendril::TendrilSink` | |
55 | /// so that Unicode input may be provided incrementally, | |
56 | /// or all at once with the `one` method. | |
57 | /// | |
58 | /// If your input is bytes, use `Parser::from_utf8`. | |
dc9dc135 XL |
59 | pub fn parse_fragment<Sink>( |
60 | mut sink: Sink, | |
61 | opts: ParseOpts, | |
62 | context_name: QualName, | |
63 | context_attrs: Vec<Attribute>, | |
64 | ) -> Parser<Sink> | |
65 | where | |
66 | Sink: TreeSink, | |
67 | { | |
83c7162d XL |
68 | let context_elem = create_element(&mut sink, context_name, context_attrs); |
69 | parse_fragment_for_element(sink, opts, context_elem, None) | |
70 | } | |
71 | ||
72 | /// Like `parse_fragment`, but with an existing context element | |
73 | /// and optionally a form element. | |
dc9dc135 XL |
74 | pub fn parse_fragment_for_element<Sink>( |
75 | sink: Sink, | |
76 | opts: ParseOpts, | |
77 | context_element: Sink::Handle, | |
78 | form_element: Option<Sink::Handle>, | |
79 | ) -> Parser<Sink> | |
80 | where | |
81 | Sink: TreeSink, | |
82 | { | |
83c7162d XL |
83 | let tb = TreeBuilder::new_for_fragment(sink, context_element, form_element, opts.tree_builder); |
84 | let tok_opts = TokenizerOpts { | |
85 | initial_state: Some(tb.tokenizer_state_for_context_elem()), | |
dc9dc135 | 86 | ..opts.tokenizer |
83c7162d XL |
87 | }; |
88 | let tok = Tokenizer::new(tb, tok_opts); | |
dc9dc135 XL |
89 | Parser { |
90 | tokenizer: tok, | |
91 | input_buffer: BufferQueue::new(), | |
92 | } | |
83c7162d XL |
93 | } |
94 | ||
95 | /// An HTML parser, | |
96 | /// ready to receive Unicode input through the `tendril::TendrilSink` trait’s methods. | |
dc9dc135 XL |
97 | pub struct Parser<Sink> |
98 | where | |
99 | Sink: TreeSink, | |
100 | { | |
83c7162d XL |
101 | pub tokenizer: Tokenizer<TreeBuilder<Sink::Handle, Sink>>, |
102 | pub input_buffer: BufferQueue, | |
103 | } | |
104 | ||
105 | impl<Sink: TreeSink> TendrilSink<tendril::fmt::UTF8> for Parser<Sink> { | |
106 | fn process(&mut self, t: StrTendril) { | |
107 | self.input_buffer.push_back(t); | |
108 | // FIXME: Properly support </script> somehow. | |
109 | while let TokenizerResult::Script(_) = self.tokenizer.feed(&mut self.input_buffer) {} | |
110 | } | |
111 | ||
112 | // FIXME: Is it too noisy to report every character decoding error? | |
113 | fn error(&mut self, desc: Cow<'static, str>) { | |
114 | self.tokenizer.sink.sink.parse_error(desc) | |
115 | } | |
116 | ||
117 | type Output = Sink::Output; | |
118 | ||
119 | fn finish(mut self) -> Self::Output { | |
120 | // FIXME: Properly support </script> somehow. | |
121 | while let TokenizerResult::Script(_) = self.tokenizer.feed(&mut self.input_buffer) {} | |
122 | assert!(self.input_buffer.is_empty()); | |
123 | self.tokenizer.end(); | |
124 | self.tokenizer.sink.sink.finish() | |
125 | } | |
126 | } | |
127 | ||
128 | impl<Sink: TreeSink> Parser<Sink> { | |
129 | /// Wrap this parser into a `TendrilSink` that accepts UTF-8 bytes. | |
130 | /// | |
131 | /// Use this when your input is bytes that are known to be in the UTF-8 encoding. | |
132 | /// Decoding is lossy, like `String::from_utf8_lossy`. | |
04454e1e | 133 | #[allow(clippy::wrong_self_convention)] |
83c7162d XL |
134 | pub fn from_utf8(self) -> Utf8LossyDecoder<Self> { |
135 | Utf8LossyDecoder::new(self) | |
136 | } | |
137 | } |