1 // Copyright 2014-2017 The html5ever Project Developers. See the
2 // COPYRIGHT file at the top-level directory of this distribution.
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
10 //! High-level interface to the parser.
12 use {Attribute, QualName}
;
13 use buffer_queue
::BufferQueue
;
14 use tokenizer
::{Tokenizer, TokenizerOpts, TokenizerResult}
;
15 use tree_builder
::{TreeBuilderOpts, TreeBuilder, TreeSink, create_element}
;
20 use tendril
::StrTendril
;
21 use tendril
::stream
::{TendrilSink, Utf8LossyDecoder}
;
23 /// All-encompassing options struct for the parser.
24 #[derive(Clone, Default)]
25 pub struct ParseOpts
{
26 /// Tokenizer options.
27 pub tokenizer
: TokenizerOpts
,
29 /// Tree builder options.
30 pub tree_builder
: TreeBuilderOpts
,
33 /// Parse an HTML document
35 /// The returned value implements `tendril::TendrilSink`
36 /// so that Unicode input may be provided incrementally,
37 /// or all at once with the `one` method.
39 /// If your input is bytes, use `Parser::from_utf8`.
40 pub fn parse_document
<Sink
>(sink
: Sink
, opts
: ParseOpts
) -> Parser
<Sink
> where Sink
: TreeSink
{
41 let tb
= TreeBuilder
::new(sink
, opts
.tree_builder
);
42 let tok
= Tokenizer
::new(tb
, opts
.tokenizer
);
43 Parser { tokenizer: tok, input_buffer: BufferQueue::new() }
46 /// Parse an HTML fragment
48 /// The returned value implements `tendril::TendrilSink`
49 /// so that Unicode input may be provided incrementally,
50 /// or all at once with the `one` method.
52 /// If your input is bytes, use `Parser::from_utf8`.
53 pub fn parse_fragment
<Sink
>(mut sink
: Sink
, opts
: ParseOpts
,
54 context_name
: QualName
, context_attrs
: Vec
<Attribute
>)
56 where Sink
: TreeSink
{
57 let context_elem
= create_element(&mut sink
, context_name
, context_attrs
);
58 parse_fragment_for_element(sink
, opts
, context_elem
, None
)
61 /// Like `parse_fragment`, but with an existing context element
62 /// and optionally a form element.
63 pub fn parse_fragment_for_element
<Sink
>(sink
: Sink
, opts
: ParseOpts
,
64 context_element
: Sink
::Handle
,
65 form_element
: Option
<Sink
::Handle
>)
67 where Sink
: TreeSink
{
68 let tb
= TreeBuilder
::new_for_fragment(sink
, context_element
, form_element
, opts
.tree_builder
);
69 let tok_opts
= TokenizerOpts
{
70 initial_state
: Some(tb
.tokenizer_state_for_context_elem()),
73 let tok
= Tokenizer
::new(tb
, tok_opts
);
74 Parser { tokenizer: tok, input_buffer: BufferQueue::new() }
78 /// ready to receive Unicode input through the `tendril::TendrilSink` trait’s methods.
79 pub struct Parser
<Sink
> where Sink
: TreeSink
{
80 pub tokenizer
: Tokenizer
<TreeBuilder
<Sink
::Handle
, Sink
>>,
81 pub input_buffer
: BufferQueue
,
84 impl<Sink
: TreeSink
> TendrilSink
<tendril
::fmt
::UTF8
> for Parser
<Sink
> {
85 fn process(&mut self, t
: StrTendril
) {
86 self.input_buffer
.push_back(t
);
87 // FIXME: Properly support </script> somehow.
88 while let TokenizerResult
::Script(_
) = self.tokenizer
.feed(&mut self.input_buffer
) {}
91 // FIXME: Is it too noisy to report every character decoding error?
92 fn error(&mut self, desc
: Cow
<'
static, str>) {
93 self.tokenizer
.sink
.sink
.parse_error(desc
)
96 type Output
= Sink
::Output
;
98 fn finish(mut self) -> Self::Output
{
99 // FIXME: Properly support </script> somehow.
100 while let TokenizerResult
::Script(_
) = self.tokenizer
.feed(&mut self.input_buffer
) {}
101 assert
!(self.input_buffer
.is_empty());
102 self.tokenizer
.end();
103 self.tokenizer
.sink
.sink
.finish()
107 impl<Sink
: TreeSink
> Parser
<Sink
> {
108 /// Wrap this parser into a `TendrilSink` that accepts UTF-8 bytes.
110 /// Use this when your input is bytes that are known to be in the UTF-8 encoding.
111 /// Decoding is lossy, like `String::from_utf8_lossy`.
112 pub fn from_utf8(self) -> Utf8LossyDecoder
<Self> {
113 Utf8LossyDecoder
::new(self)
120 use serialize
::serialize
;
121 use tendril
::TendrilSink
;
126 let dom
= parse_document(RcDom
::default(), ParseOpts
::default())
128 .one("<title>Test".as_bytes());
129 let mut serialized
= Vec
::new();
130 serialize(&mut serialized
, &dom
.document
, Default
::default()).unwrap();
131 assert_eq
!(String
::from_utf8(serialized
).unwrap().replace(" ", ""),
132 "<html><head><title>Test</title></head><body></body></html>");