2 extern crate html5ever
;
4 extern crate lazy_static
;
6 use html5ever
::serialize
::{serialize, SerializeOpts}
;
7 use html5ever
::{driver as html, QualName}
;
8 use markup5ever_rcdom
::{Handle, NodeData, RcDom, SerializableHandle}
;
9 use pulldown_cmark
::{Options, Parser}
;
12 use std
::collections
::HashSet
;
14 use std
::rc
::{Rc, Weak}
;
15 use tendril
::stream
::TendrilSink
;
20 pub fn test_markdown_html(input
: &str, output
: &str) {
21 let mut s
= String
::new();
23 let mut opts
= Options
::empty();
24 opts
.insert(Options
::ENABLE_TABLES
);
25 opts
.insert(Options
::ENABLE_FOOTNOTES
);
26 opts
.insert(Options
::ENABLE_STRIKETHROUGH
);
27 opts
.insert(Options
::ENABLE_TASKLISTS
);
29 let p
= Parser
::new_ext(input
, opts
);
30 pulldown_cmark
::html
::push_html(&mut s
, p
);
32 assert_eq
!(normalize_html(output
), normalize_html(&s
));
// Matches any run of one or more whitespace characters.
36 static ref WHITESPACE_RE
: Regex
= Regex
::new(r
"\s+").unwrap();
// Matches a whitespace run anchored at the very start of the text (`\A`).
37 static ref LEADING_WHITESPACE_RE
: Regex
= Regex
::new(r
"\A\s+").unwrap();
// Matches a whitespace run anchored at the very end of the text (`\z`).
38 static ref TRAILING_WHITESPACE_RE
: Regex
= Regex
::new(r
"\s+\z").unwrap();
// Tag names that normalize_node treats as block boundaries when it climbs
// towards the root (see its `reached_block` check).
// NOTE(review): the entries of this set are not visible in this excerpt.
39 static ref BLOCK_TAGS
: HashSet
<&'
static str> = [
// Tag names consulted when normalize_node computes `is_pre` for a text node.
94 static ref WHITESPACE_SENSITIVE_TAGS
: HashSet
<&'
static str> =
95 ["pre", "code", "h1", "h2", "h3", "h4", "h5", "h6"]
// Tag names consulted when normalize_node computes `is_in_table`; for text
// whose parent is one of these, whitespace runs are replaced with "" instead
// of a single space.
99 static ref TABLE_TAGS
: HashSet
<&'
static str> = ["table", "thead", "tbody", "tr", "td"]
// Builds an html5ever fragment parser whose output tree is an `RcDom`.
// The visible arguments are the default `ParseOpts` and a context element
// named `div` in the HTML namespace.
// NOTE(review): the remaining arguments to `parse_fragment` are on lines not
// shown in this excerpt.
105 fn make_html_parser() -> html
::Parser
<RcDom
> {
106 html
::parse_fragment(
108 html
::ParseOpts
::default(),
109 QualName
::new(None
, ns
!(html
), local_name
!("div")),
114 fn normalize_html(s
: &str) -> String
{
115 let parser
= make_html_parser();
116 let dom
= parser
.one(s
);
117 let body
: SerializableHandle
= normalize_dom(&dom
).into();
118 let opts
= SerializeOpts
::default();
119 let mut ret_val
= Vec
::new();
120 serialize(&mut ret_val
, &body
, opts
)
121 .expect("Writing to a string shouldn't fail (expect on OOM)");
122 String
::from_utf8(ret_val
).expect("html5ever should always produce UTF8")
// Walks the parsed DOM one level at a time and passes every visited node to
// `normalize_node`, returning a `Handle`.
// NOTE(review): several original lines are missing from this excerpt (the
// binding of `body`, the guard around the sibling removal, the return
// expression), so the comments below describe only the visible statements.
125 fn normalize_dom(dom
: &RcDom
) -> Handle
{
// Borrow the child list of the document root.
127 let children
= dom
.document
.children
.borrow();
// Two work lists: the level being processed now and the level collected for
// the next pass.
130 let mut current_level
= Vec
::new();
131 let mut next_level
= Vec
::new();
// Seed with `body`'s children in reverse so that `pop()` yields them in
// their original order.
132 current_level
.extend(body
.children
.borrow().iter().cloned().rev());
134 while let Some(mut node
) = current_level
.pop() {
// Read the parent pointer by swapping `None` in and immediately writing a
// clone back, so the node's parent link is left as it was.
135 let parent
= node
.parent
.replace(None
);
136 node
.parent
.replace(parent
.clone());
138 .expect("a node in the DOM will have a parent, except the root, which is not processed")
139 .upgrade().expect("a node's parent will be pointed to by its parent (or the root pointer), and will not be dropped");
// `normalize_node` reports whether this node should stay in the tree.
140 let retain
= normalize_node(&parent
, &mut node
);
// Remove `node` from its parent's child list, comparing by pointer identity.
// NOTE(review): presumably guarded by `!retain` on a line not shown — confirm.
142 let mut siblings
= parent
.children
.borrow_mut();
143 siblings
.retain(|s
| !Rc
::ptr_eq(&node
, s
));
// Queue this node's children (reversed) for the next level.
145 next_level
.extend(node
.children
.borrow().iter().cloned().rev());
148 if next_level
.is_empty() {
// Exchange the two work lists.
151 mem
::swap(&mut next_level
, &mut current_level
);
156 // Returns false if node is an empty text node or an empty tbody.
157 // Returns true otherwise.
// Side effects visible below: a text node's contents are rewritten in place
// (whitespace runs replaced, leading/trailing whitespace stripped depending
// on context), and an element's attribute names are lower-cased and the
// attributes sorted by name.
// NOTE(review): many original lines (match/loop headers, else branches,
// closing braces) are missing from this excerpt; the comments below describe
// only the statements that are visible.
158 fn normalize_node(parent
: &Handle
, node
: &mut Handle
) -> bool
{
// Comments, doctypes and processing instructions are always kept.
160 NodeData
::Comment { .. }
161 | NodeData
::Doctype { .. }
163 | NodeData
::ProcessingInstruction { .. }
=> true,
164 NodeData
::Text { ref contents, .. }
=> {
// Mutable access to this text node's contents.
165 let mut contents
= contents
.borrow_mut();
// `is_pre`: checks the enclosing element's tag name against
// WHITESPACE_SENSITIVE_TAGS, then steps to that element's own parent.
// NOTE(review): the surrounding loop structure is not visible here.
167 let mut parent
= parent
.clone();
169 let is_pre
= if let NodeData
::Element { ref name, .. }
= parent
.data
{
170 WHITESPACE_SENSITIVE_TAGS
.contains(&&*name
.local
.to_ascii_lowercase())
// Read the grandparent pointer without disturbing it (replace, then restore).
177 let parent_
= parent
.parent
.replace(None
);
178 parent
.parent
.replace(parent_
.clone());
179 let parent_
= parent_
.as_ref().and_then(Weak
::upgrade
);
180 if let Some(parent_
) = parent_
{
// Climb towards an element listed in BLOCK_TAGS, tracking whether this text
// is the first and/or last node on the path inside that block.
188 let (is_first_in_block
, is_last_in_block
) = {
189 let mut is_first_in_block
= true;
190 let mut is_last_in_block
= true;
191 let mut parent
= parent
.clone();
192 let mut node
= node
.clone();
194 let reached_block
= if let NodeData
::Element { ref name, .. }
= parent
.data
196 BLOCK_TAGS
.contains(&&*name
.local
.to_ascii_lowercase())
// Is the current node the first / last child of the current parent?
200 let (is_first
, is_last
) = {
201 let siblings
= parent
.children
.borrow();
204 siblings
.get(0).map(|s
| Rc
::ptr_eq(s
, n
)).unwrap_or(false),
207 .get(siblings
.len() - 1)
208 .map(|s
| Rc
::ptr_eq(s
, n
))
212 is_first_in_block
= is_first_in_block
&& is_first
;
213 is_last_in_block
= is_last_in_block
&& is_last
;
// Keep climbing only while one of the flags can still be true and no block
// element has been reached.
214 if (is_first_in_block
|| is_last_in_block
) && !reached_block
{
215 node
= parent
.clone();
216 let parent_
= parent
.parent
.replace(None
);
217 parent
.parent
.replace(parent_
.clone());
218 let parent_
= parent_
.as_ref().and_then(Weak
::upgrade
);
219 if let Some(parent_
) = parent_
{
222 break (is_first_in_block
, is_last_in_block
);
225 break (is_first_in_block
, is_last_in_block
);
// `is_preceeded_by_ws`: looks for the text that comes just before this node
// (walking up while the node is a first child, then into the preceding
// sibling) and tests it with TRAILING_WHITESPACE_RE.
229 let is_preceeded_by_ws
= {
230 let mut parent
= parent
.clone();
231 let mut node
= node
.clone();
234 let siblings
= parent
.children
.borrow();
236 siblings
.get(0).map(|s
| Rc
::ptr_eq(s
, n
)).unwrap_or(false)
239 node
= parent
.clone();
240 let parent_
= parent
.parent
.replace(None
);
241 parent
.parent
.replace(parent_
.clone());
242 let parent_
= parent_
.as_ref().and_then(Weak
::upgrade
);
243 if let Some(parent_
) = parent_
{
249 let siblings
= parent
.children
.borrow();
// Locate this node's position among its siblings by pointer identity.
252 'search
: for (i
, s
) in siblings
.iter().enumerate() {
253 if Rc
::ptr_eq(s
, n
) {
260 "The list of node's parent's children shall contain node"
264 "If node is not first, then node's position shall not be zero"
// The sibling immediately before this node.
266 let mut preceeding
= siblings
[pos
- 1].clone();
268 if let NodeData
::Text { .. }
= preceeding
.data
{
// Otherwise look at the preceding node's own last child.
272 let ch
= preceeding
.children
.borrow();
276 if let Some(preceeding_
) = ch
.get(ch
.len() - 1) {
283 if let NodeData
::Text { ref contents, .. }
= preceeding
.data
{
284 break 'ascent TRAILING_WHITESPACE_RE
.is_match(&*contents
.borrow());
// Text whose parent is one of TABLE_TAGS has its whitespace runs removed
// entirely; elsewhere each run becomes a single space.
292 let is_in_table
= if let NodeData
::Element { ref name, .. }
= parent
.data
{
293 TABLE_TAGS
.contains(&&*name
.local
.to_ascii_lowercase())
297 let whitespace_replacement
= if is_in_table { "" }
else { " " }
;
298 *contents
= WHITESPACE_RE
299 .replace_all(&*contents
, whitespace_replacement
)
// Strip leading whitespace when the text starts its block or follows
// whitespace.
303 if is_first_in_block
|| is_preceeded_by_ws
{
304 *contents
= LEADING_WHITESPACE_RE
305 .replace_all(&*contents
, "")
// Strip trailing whitespace when the text ends its block.
309 if is_last_in_block
{
310 *contents
= TRAILING_WHITESPACE_RE
311 .replace_all(&*contents
, "")
315 // TODO: collapse whitespace when adjacent to whitespace.
316 // For example, the whitespace in the span should be collapsed in all of these cases:
318 // " <span> q </span> "
319 // "<b>q </b><span> q</span>"
320 // "<b>q <i></i></b><span> q</span>"
321 // "<b>q <i></i></b><span> q</span>"
322 // "q <b></b><span> q</span>"
// Element handling: lower-case every attribute name, then sort the attributes
// by name.
331 let mut attrs
= attrs
.borrow_mut();
332 for a
in attrs
.iter_mut() {
333 a
.name
.local
= a
.name
.local
.to_ascii_lowercase().into();
335 attrs
.sort_by(|a
: &html5ever
::Attribute
, b
: &html5ever
::Attribute
| {
336 (&*a
.name
.local
).cmp(&*b
.name
.local
)
338 let ascii_name
= &*name
.local
.to_ascii_lowercase();
// Keep every element that is not a `tbody`, and every `tbody` with more than
// one child. For a single text child, keep it only when the text is not all
// whitespace.
// NOTE(review): the other arm of the match on the only child is not visible.
339 // drop empty tbody's
340 ascii_name
!= "tbody"
341 || node
.children
.borrow().len() > 1
347 .map(|only_child
| match only_child
.data
{
348 NodeData
::Text { ref contents, .. }
=> {
349 !contents
.borrow().chars().all(|c
| c
.is_whitespace())
359 fn strip_div_newline() {
360 assert_eq
!("<div></div>", normalize_html("<div>\n</div>"));
364 fn strip_end_newline() {
365 assert_eq
!("test", normalize_html("test\n"));
369 fn strip_double_space() {
370 assert_eq
!("test mess", normalize_html("test mess"));
// Whitespace inside and between inline `<u>` elements: the first literal is
// the expected normalized form of the markup passed to `normalize_html`.
// NOTE(review): the enclosing assertion line is not visible in this excerpt.
374 fn strip_inline_internal_text() {
376 "<u>a </u>b <u>c</u>",
377 normalize_html("<u> a </u> b <u> c </u>")
// Same inline markup as above but with whitespace before the first and after
// the last element; the expected form (first literal) has neither.
// NOTE(review): the enclosing assertion line is not visible in this excerpt.
382 fn strip_inline_block_internal_text() {
384 "<u>a </u>b <u>c</u>",
385 normalize_html(" <u> a </u> b <u> c </u> ")
390 fn leaves_necessary_whitespace_alone() {
391 assert_eq
!("<u>a</u> b <u>c</u>", normalize_html("<u>a</u> b <u>c</u>"))
// Input with a leading space and whitespace placed inside the first `<u>`;
// the expected form (first literal) differs only by the dropped leading space.
// NOTE(review): the enclosing assertion line is not visible in this excerpt.
395 fn leaves_necessary_whitespace_alone_weird() {
397 "<u>a </u>b <u>c</u>",
398 normalize_html(" <u>a </u>b <u>c</u>")
// Four adjacent `<u>` elements that contain only whitespace; the expected
// form (first literal) has all four elements empty.
// NOTE(review): the enclosing assertion line is not visible in this excerpt.
403 fn leaves_necessary_whitespace_all_nested() {
405 "<u></u><u></u><u></u><u></u>",
406 normalize_html("<u> </u><u> </u><u> </u><u> </u>")
// A `tbody` containing only whitespace is absent from the expected output
// (first literal), while the `thead` is unchanged.
// NOTE(review): the enclosing assertion line is not visible in this excerpt.
411 fn drops_empty_tbody() {
413 "<table><thead><tr><td>hi</td></tr></thead></table>",
414 normalize_html("<table><thead><tr><td>hi</td></tr></thead><tbody> </tbody></table>")
419 fn leaves_nonempty_tbody() {
420 let input
= "<table><thead><tr><td>hi</td></tr></thead><tbody><tr></tr></tbody></table>";
421 assert_eq
!(input
, normalize_html(input
))