]> git.proxmox.com Git - rustc.git/blob - src/vendor/html5ever/src/driver.rs
New upstream version 1.31.0+dfsg1
[rustc.git] / src / vendor / html5ever / src / driver.rs
1 // Copyright 2014-2017 The html5ever Project Developers. See the
2 // COPYRIGHT file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
9
10 //! High-level interface to the parser.
11
12 use {Attribute, QualName};
13 use buffer_queue::BufferQueue;
14 use tokenizer::{Tokenizer, TokenizerOpts, TokenizerResult};
15 use tree_builder::{TreeBuilderOpts, TreeBuilder, TreeSink, create_element};
16
17 use std::borrow::Cow;
18
19 use tendril;
20 use tendril::StrTendril;
21 use tendril::stream::{TendrilSink, Utf8LossyDecoder};
22
/// All-encompassing options struct for the parser.
///
/// Bundles the option sets for the two stages wired together by this module:
/// the tokenizer and the tree builder. The derived `Default` uses the
/// `Default` value of each field.
#[derive(Clone, Default)]
pub struct ParseOpts {
    /// Tokenizer options.
    pub tokenizer: TokenizerOpts,

    /// Tree builder options.
    pub tree_builder: TreeBuilderOpts,
}
32
33 /// Parse an HTML document
34 ///
35 /// The returned value implements `tendril::TendrilSink`
36 /// so that Unicode input may be provided incrementally,
37 /// or all at once with the `one` method.
38 ///
39 /// If your input is bytes, use `Parser::from_utf8`.
40 pub fn parse_document<Sink>(sink: Sink, opts: ParseOpts) -> Parser<Sink> where Sink: TreeSink {
41 let tb = TreeBuilder::new(sink, opts.tree_builder);
42 let tok = Tokenizer::new(tb, opts.tokenizer);
43 Parser { tokenizer: tok, input_buffer: BufferQueue::new() }
44 }
45
46 /// Parse an HTML fragment
47 ///
48 /// The returned value implements `tendril::TendrilSink`
49 /// so that Unicode input may be provided incrementally,
50 /// or all at once with the `one` method.
51 ///
52 /// If your input is bytes, use `Parser::from_utf8`.
53 pub fn parse_fragment<Sink>(mut sink: Sink, opts: ParseOpts,
54 context_name: QualName, context_attrs: Vec<Attribute>)
55 -> Parser<Sink>
56 where Sink: TreeSink {
57 let context_elem = create_element(&mut sink, context_name, context_attrs);
58 parse_fragment_for_element(sink, opts, context_elem, None)
59 }
60
61 /// Like `parse_fragment`, but with an existing context element
62 /// and optionally a form element.
63 pub fn parse_fragment_for_element<Sink>(sink: Sink, opts: ParseOpts,
64 context_element: Sink::Handle,
65 form_element: Option<Sink::Handle>)
66 -> Parser<Sink>
67 where Sink: TreeSink {
68 let tb = TreeBuilder::new_for_fragment(sink, context_element, form_element, opts.tree_builder);
69 let tok_opts = TokenizerOpts {
70 initial_state: Some(tb.tokenizer_state_for_context_elem()),
71 .. opts.tokenizer
72 };
73 let tok = Tokenizer::new(tb, tok_opts);
74 Parser { tokenizer: tok, input_buffer: BufferQueue::new() }
75 }
76
77 /// An HTML parser,
78 /// ready to receive Unicode input through the `tendril::TendrilSink` trait’s methods.
79 pub struct Parser<Sink> where Sink: TreeSink {
80 pub tokenizer: Tokenizer<TreeBuilder<Sink::Handle, Sink>>,
81 pub input_buffer: BufferQueue,
82 }
83
84 impl<Sink: TreeSink> TendrilSink<tendril::fmt::UTF8> for Parser<Sink> {
85 fn process(&mut self, t: StrTendril) {
86 self.input_buffer.push_back(t);
87 // FIXME: Properly support </script> somehow.
88 while let TokenizerResult::Script(_) = self.tokenizer.feed(&mut self.input_buffer) {}
89 }
90
91 // FIXME: Is it too noisy to report every character decoding error?
92 fn error(&mut self, desc: Cow<'static, str>) {
93 self.tokenizer.sink.sink.parse_error(desc)
94 }
95
96 type Output = Sink::Output;
97
98 fn finish(mut self) -> Self::Output {
99 // FIXME: Properly support </script> somehow.
100 while let TokenizerResult::Script(_) = self.tokenizer.feed(&mut self.input_buffer) {}
101 assert!(self.input_buffer.is_empty());
102 self.tokenizer.end();
103 self.tokenizer.sink.sink.finish()
104 }
105 }
106
107 impl<Sink: TreeSink> Parser<Sink> {
108 /// Wrap this parser into a `TendrilSink` that accepts UTF-8 bytes.
109 ///
110 /// Use this when your input is bytes that are known to be in the UTF-8 encoding.
111 /// Decoding is lossy, like `String::from_utf8_lossy`.
112 pub fn from_utf8(self) -> Utf8LossyDecoder<Self> {
113 Utf8LossyDecoder::new(self)
114 }
115 }
116
#[cfg(test)]
mod tests {
    use super::*;
    use rcdom::RcDom;
    use serialize::serialize;
    use tendril::TendrilSink;

    /// Parsing UTF-8 bytes through `from_utf8` yields a document that
    /// serializes to the expected markup.
    #[test]
    fn from_utf8() {
        let input = "<title>Test".as_bytes();
        let byte_parser = parse_document(RcDom::default(), ParseOpts::default()).from_utf8();
        let dom = byte_parser.one(input);

        let mut output_bytes = Vec::new();
        serialize(&mut output_bytes, &dom.document, Default::default()).unwrap();

        // Spaces are stripped from the serialized output before comparing.
        let markup = String::from_utf8(output_bytes).unwrap().replace(" ", "");
        assert_eq!(markup,
                   "<html><head><title>Test</title></head><body></body></html>");
    }
}