]>
Commit | Line | Data |
---|---|---|
83c7162d XL |
1 | use std::borrow::Cow; |
2 | use std::collections::{HashMap, HashSet}; | |
3 | use std::path::Path; | |
4 | ||
dc9dc135 | 5 | use elasticlunr::Index; |
83c7162d | 6 | use pulldown_cmark::*; |
83c7162d | 7 | |
dc9dc135 XL |
8 | use crate::book::{Book, BookItem}; |
9 | use crate::config::Search; | |
10 | use crate::errors::*; | |
11 | use crate::theme::searcher; | |
12 | use crate::utils; | |
83c7162d XL |
13 | |
14 | /// Creates all files required for search. | |
15 | pub fn create_files(search_config: &Search, destination: &Path, book: &Book) -> Result<()> { | |
16 | let mut index = Index::new(&["title", "body", "breadcrumbs"]); | |
9fa01778 | 17 | let mut doc_urls = Vec::with_capacity(book.sections.len()); |
83c7162d XL |
18 | |
19 | for item in book.iter() { | |
a2a8927a | 20 | render_item(&mut index, search_config, &mut doc_urls, item)?; |
83c7162d XL |
21 | } |
22 | ||
a2a8927a | 23 | let index = write_to_json(index, search_config, doc_urls)?; |
83c7162d | 24 | debug!("Writing search index ✓"); |
9fa01778 XL |
25 | if index.len() > 10_000_000 { |
26 | warn!("searchindex.json is very large ({} bytes)", index.len()); | |
27 | } | |
83c7162d XL |
28 | |
29 | if search_config.copy_js { | |
9fa01778 XL |
30 | utils::fs::write_file(destination, "searchindex.json", index.as_bytes())?; |
31 | utils::fs::write_file( | |
32 | destination, | |
33 | "searchindex.js", | |
dc9dc135 | 34 | format!("Object.assign(window.search, {});", index).as_bytes(), |
9fa01778 | 35 | )?; |
83c7162d XL |
36 | utils::fs::write_file(destination, "searcher.js", searcher::JS)?; |
37 | utils::fs::write_file(destination, "mark.min.js", searcher::MARK_JS)?; | |
38 | utils::fs::write_file(destination, "elasticlunr.min.js", searcher::ELASTICLUNR_JS)?; | |
39 | debug!("Copying search files ✓"); | |
40 | } | |
41 | ||
42 | Ok(()) | |
43 | } | |
44 | ||
45 | /// Uses the given arguments to construct a search document, then inserts it to the given index. | |
9fa01778 | 46 | fn add_doc( |
83c7162d | 47 | index: &mut Index, |
9fa01778 XL |
48 | doc_urls: &mut Vec<String>, |
49 | anchor_base: &str, | |
83c7162d XL |
50 | section_id: &Option<String>, |
51 | items: &[&str], | |
52 | ) { | |
9fa01778 XL |
53 | let url = if let Some(ref id) = *section_id { |
54 | Cow::Owned(format!("{}#{}", anchor_base, id)) | |
83c7162d | 55 | } else { |
9fa01778 | 56 | Cow::Borrowed(anchor_base) |
83c7162d | 57 | }; |
9fa01778 XL |
58 | let url = utils::collapse_whitespace(url.trim()); |
59 | let doc_ref = doc_urls.len().to_string(); | |
60 | doc_urls.push(url.into()); | |
61 | ||
83c7162d XL |
62 | let items = items.iter().map(|&x| utils::collapse_whitespace(x.trim())); |
63 | index.add_doc(&doc_ref, items); | |
64 | } | |
65 | ||
66 | /// Renders markdown into flat unformatted text and adds it to the search index. | |
67 | fn render_item( | |
68 | index: &mut Index, | |
69 | search_config: &Search, | |
9fa01778 | 70 | doc_urls: &mut Vec<String>, |
83c7162d XL |
71 | item: &BookItem, |
72 | ) -> Result<()> { | |
9fa01778 | 73 | let chapter = match *item { |
f035d41b | 74 | BookItem::Chapter(ref ch) if !ch.is_draft_chapter() => ch, |
83c7162d XL |
75 | _ => return Ok(()), |
76 | }; | |
77 | ||
f035d41b XL |
78 | let chapter_path = chapter |
79 | .path | |
80 | .as_ref() | |
81 | .expect("Checked that path exists above"); | |
82 | let filepath = Path::new(&chapter_path).with_extension("html"); | |
83c7162d XL |
83 | let filepath = filepath |
84 | .to_str() | |
f035d41b | 85 | .with_context(|| "Could not convert HTML path to str")?; |
83c7162d XL |
86 | let anchor_base = utils::fs::normalize_path(filepath); |
87 | ||
a2a8927a | 88 | let mut p = utils::new_cmark_parser(&chapter.content, false).peekable(); |
83c7162d | 89 | |
e74abb32 XL |
90 | let mut in_heading = false; |
91 | let max_section_depth = u32::from(search_config.heading_split_level); | |
83c7162d XL |
92 | let mut section_id = None; |
93 | let mut heading = String::new(); | |
94 | let mut body = String::new(); | |
95 | let mut breadcrumbs = chapter.parent_names.clone(); | |
96 | let mut footnote_numbers = HashMap::new(); | |
97 | ||
5869c6ff XL |
98 | breadcrumbs.push(chapter.name.clone()); |
99 | ||
e74abb32 | 100 | while let Some(event) = p.next() { |
83c7162d | 101 | match event { |
a2a8927a | 102 | Event::Start(Tag::Heading(i, ..)) if i as u32 <= max_section_depth => { |
9fa01778 | 103 | if !heading.is_empty() { |
e74abb32 | 104 | // Section finished, the next heading is following now |
83c7162d XL |
105 | // Write the data to the index, and clear it for the next section |
106 | add_doc( | |
107 | index, | |
9fa01778 | 108 | doc_urls, |
83c7162d XL |
109 | &anchor_base, |
110 | §ion_id, | |
111 | &[&heading, &body, &breadcrumbs.join(" » ")], | |
112 | ); | |
113 | section_id = None; | |
114 | heading.clear(); | |
115 | body.clear(); | |
116 | breadcrumbs.pop(); | |
117 | } | |
118 | ||
e74abb32 | 119 | in_heading = true; |
83c7162d | 120 | } |
a2a8927a | 121 | Event::End(Tag::Heading(i, ..)) if i as u32 <= max_section_depth => { |
e74abb32 | 122 | in_heading = false; |
83c7162d XL |
123 | section_id = Some(utils::id_from_content(&heading)); |
124 | breadcrumbs.push(heading.clone()); | |
125 | } | |
126 | Event::Start(Tag::FootnoteDefinition(name)) => { | |
127 | let number = footnote_numbers.len() + 1; | |
128 | footnote_numbers.entry(name).or_insert(number); | |
129 | } | |
dc9dc135 | 130 | Event::Html(html) => { |
e74abb32 XL |
131 | let mut html_block = html.into_string(); |
132 | ||
133 | // As of pulldown_cmark 0.6, html events are no longer contained | |
134 | // in an HtmlBlock tag. We must collect consecutive Html events | |
135 | // into a block ourselves. | |
136 | while let Some(Event::Html(html)) = p.peek() { | |
a2a8927a | 137 | html_block.push_str(html); |
e74abb32 XL |
138 | p.next(); |
139 | } | |
140 | ||
dc9dc135 | 141 | body.push_str(&clean_html(&html_block)); |
dc9dc135 | 142 | } |
e74abb32 | 143 | Event::Start(_) | Event::End(_) | Event::Rule | Event::SoftBreak | Event::HardBreak => { |
94222f64 | 144 | // Insert spaces where HTML output would usually separate text |
83c7162d | 145 | // to ensure words don't get merged together |
e74abb32 | 146 | if in_heading { |
83c7162d XL |
147 | heading.push(' '); |
148 | } else { | |
149 | body.push(' '); | |
150 | } | |
151 | } | |
dc9dc135 | 152 | Event::Text(text) | Event::Code(text) => { |
e74abb32 | 153 | if in_heading { |
83c7162d XL |
154 | heading.push_str(&text); |
155 | } else { | |
156 | body.push_str(&text); | |
157 | } | |
158 | } | |
83c7162d XL |
159 | Event::FootnoteReference(name) => { |
160 | let len = footnote_numbers.len() + 1; | |
161 | let number = footnote_numbers.entry(name).or_insert(len); | |
162 | body.push_str(&format!(" [{}] ", number)); | |
163 | } | |
dc9dc135 | 164 | Event::TaskListMarker(_checked) => {} |
83c7162d XL |
165 | } |
166 | } | |
167 | ||
a2a8927a XL |
168 | if !body.is_empty() || !heading.is_empty() { |
169 | if heading.is_empty() { | |
170 | if let Some(chapter) = breadcrumbs.first() { | |
171 | heading = chapter.clone(); | |
172 | } | |
173 | } | |
83c7162d XL |
174 | // Make sure the last section is added to the index |
175 | add_doc( | |
176 | index, | |
9fa01778 | 177 | doc_urls, |
83c7162d XL |
178 | &anchor_base, |
179 | §ion_id, | |
180 | &[&heading, &body, &breadcrumbs.join(" » ")], | |
181 | ); | |
182 | } | |
183 | ||
184 | Ok(()) | |
185 | } | |
186 | ||
9fa01778 | 187 | fn write_to_json(index: Index, search_config: &Search, doc_urls: Vec<String>) -> Result<String> { |
dc9dc135 | 188 | use elasticlunr::config::{SearchBool, SearchOptions, SearchOptionsField}; |
9fa01778 | 189 | use std::collections::BTreeMap; |
83c7162d XL |
190 | |
191 | #[derive(Serialize)] | |
192 | struct ResultsOptions { | |
193 | limit_results: u32, | |
194 | teaser_word_count: u32, | |
195 | } | |
196 | ||
197 | #[derive(Serialize)] | |
198 | struct SearchindexJson { | |
199 | /// The options used for displaying search results | |
9fa01778 | 200 | results_options: ResultsOptions, |
83c7162d | 201 | /// The searchoptions for elasticlunr.js |
9fa01778 XL |
202 | search_options: SearchOptions, |
203 | /// Used to lookup a document's URL from an integer document ref. | |
204 | doc_urls: Vec<String>, | |
83c7162d XL |
205 | /// The index for elasticlunr.js |
206 | index: elasticlunr::Index, | |
207 | } | |
208 | ||
209 | let mut fields = BTreeMap::new(); | |
210 | let mut opt = SearchOptionsField::default(); | |
211 | opt.boost = Some(search_config.boost_title); | |
212 | fields.insert("title".into(), opt); | |
213 | opt.boost = Some(search_config.boost_paragraph); | |
214 | fields.insert("body".into(), opt); | |
215 | opt.boost = Some(search_config.boost_hierarchy); | |
216 | fields.insert("breadcrumbs".into(), opt); | |
217 | ||
9fa01778 | 218 | let search_options = SearchOptions { |
83c7162d XL |
219 | bool: if search_config.use_boolean_and { |
220 | SearchBool::And | |
221 | } else { | |
222 | SearchBool::Or | |
223 | }, | |
224 | expand: search_config.expand, | |
225 | fields, | |
226 | }; | |
227 | ||
9fa01778 | 228 | let results_options = ResultsOptions { |
83c7162d XL |
229 | limit_results: search_config.limit_results, |
230 | teaser_word_count: search_config.teaser_word_count, | |
231 | }; | |
232 | ||
233 | let json_contents = SearchindexJson { | |
9fa01778 XL |
234 | results_options, |
235 | search_options, | |
236 | doc_urls, | |
83c7162d XL |
237 | index, |
238 | }; | |
9fa01778 XL |
239 | |
240 | // By converting to serde_json::Value as an intermediary, we use a | |
241 | // BTreeMap internally and can force a stable ordering of map keys. | |
242 | let json_contents = serde_json::to_value(&json_contents)?; | |
83c7162d XL |
243 | let json_contents = serde_json::to_string(&json_contents)?; |
244 | ||
9fa01778 | 245 | Ok(json_contents) |
83c7162d XL |
246 | } |
247 | ||
248 | fn clean_html(html: &str) -> String { | |
249 | lazy_static! { | |
250 | static ref AMMONIA: ammonia::Builder<'static> = { | |
251 | let mut clean_content = HashSet::new(); | |
252 | clean_content.insert("script"); | |
253 | clean_content.insert("style"); | |
254 | let mut builder = ammonia::Builder::new(); | |
255 | builder | |
256 | .tags(HashSet::new()) | |
257 | .tag_attributes(HashMap::new()) | |
258 | .generic_attributes(HashSet::new()) | |
259 | .link_rel(None) | |
260 | .allowed_classes(HashMap::new()) | |
261 | .clean_content_tags(clean_content); | |
262 | builder | |
263 | }; | |
264 | } | |
265 | AMMONIA.clean(html).to_string() | |
266 | } |