]> git.proxmox.com Git - rustc.git/blob - vendor/elasticlunr-rs/src/lib.rs
New upstream version 1.49.0+dfsg1
[rustc.git] / vendor / elasticlunr-rs / src / lib.rs
1 //!# elasticlunr-rs
2 //!
3 //! [![Build Status](https://travis-ci.org/mattico/elasticlunr-rs.svg?branch=master)](https://travis-ci.org/mattico/elasticlunr-rs)
4 //! [![Documentation](https://docs.rs/elasticlunr-rs/badge.svg)](https://docs.rs/elasticlunr-rs)
5 //! [![Crates.io](https://img.shields.io/crates/v/elasticlunr-rs.svg)](https://crates.io/crates/elasticlunr-rs)
6 //!
7 //! A partial port of [elasticlunr](https://github.com/weixsong/elasticlunr.js) to Rust. Intended to
8 //! be used for generating compatible search indices.
9 //!
10 //! Access to all index-generating functionality is provided. Most users will only need to use the
11 //! [`Index`](struct.Index.html) or [`IndexBuilder`](struct.IndexBuilder.html) types.
12 //!
13 //! ## Example
14 //!
15 //! ```
16 //! use std::fs::File;
17 //! use std::io::Write;
18 //! use elasticlunr::Index;
19 //!
20 //! let mut index = Index::new(&["title", "body"]);
21 //! index.add_doc("1", &["This is a title", "This is body text!"]);
22 //! // Add more docs...
23 //! let mut file = File::create("out.json").unwrap();
24 //! file.write_all(index.to_json_pretty().as_bytes());
25 //! ```
26
27 #![cfg_attr(feature = "bench", feature(test))]
28
29 #[macro_use]
30 extern crate lazy_static;
31 extern crate regex;
32 extern crate serde;
33 #[macro_use]
34 extern crate serde_derive;
35 extern crate serde_json;
36 extern crate strum;
37 #[macro_use]
38 extern crate strum_macros;
39
40 #[cfg(feature = "rust-stemmers")]
41 extern crate rust_stemmers;
42
43 #[cfg(test)]
44 #[macro_use]
45 extern crate maplit;
46 #[cfg(feature = "zh")]
47 extern crate jieba_rs;
48 #[cfg(feature = "ja")]
49 extern crate lindera;
50
/// Version string of the elasticlunr.js release whose index format this crate targets.
///
/// The same value is stored in the `version` field of every `Index` built by this crate.
pub const ELASTICLUNR_VERSION: &str = "0.9.5";
53
54 pub mod config;
55 pub mod document_store;
56 pub mod inverted_index;
57 pub mod lang;
58 pub mod pipeline;
59
60 use std::collections::{BTreeMap, BTreeSet};
61
62 use document_store::DocumentStore;
63 use inverted_index::InvertedIndex;
64 pub use lang::Language;
65 pub use pipeline::Pipeline;
66
67 /// A builder for an `Index` with custom parameters.
68 ///
69 /// # Example
70 /// ```
71 /// # use elasticlunr::{Index, IndexBuilder};
72 /// let mut index = IndexBuilder::new()
73 /// .save_docs(false)
74 /// .add_fields(&["title", "subtitle", "body"])
75 /// .set_ref("doc_id")
76 /// .build();
77 /// index.add_doc("doc_a", &["Chapter 1", "Welcome to Copenhagen", "..."]);
78 /// ```
79 pub struct IndexBuilder {
80 save: bool,
81 fields: BTreeSet<String>,
82 ref_field: String,
83 pipeline: Option<Pipeline>,
84 }
85
86 impl Default for IndexBuilder {
87 fn default() -> Self {
88 IndexBuilder {
89 save: true,
90 fields: BTreeSet::new(),
91 ref_field: "id".into(),
92 pipeline: None,
93 }
94 }
95 }
96
97 impl IndexBuilder {
98 pub fn new() -> Self {
99 Default::default()
100 }
101
102 /// Set whether or not documents should be saved in the `Index`'s document store.
103 pub fn save_docs(mut self, save: bool) -> Self {
104 self.save = save;
105 self
106 }
107
108 /// Add a document field to the `Index`.
109 ///
110 /// If the `Index` already contains a field with an identical name, adding it again is a no-op.
111 pub fn add_field(mut self, field: &str) -> Self {
112 self.fields.insert(field.into());
113 self
114 }
115
116 /// Add the document fields to the `Index`.
117 ///
118 /// If the `Index` already contains a field with an identical name, adding it again is a no-op.
119 pub fn add_fields<I>(mut self, fields: I) -> Self
120 where
121 I: IntoIterator,
122 I::Item: AsRef<str>,
123 {
124 self.fields
125 .extend(fields.into_iter().map(|f| f.as_ref().into()));
126 self
127 }
128
129 /// Set the key used to store the document reference field.
130 pub fn set_ref(mut self, ref_field: &str) -> Self {
131 self.ref_field = ref_field.into();
132 self
133 }
134
135 /// Set the pipeline used by the `Index`.
136 pub fn set_pipeline(mut self, pipeline: Pipeline) -> Self {
137 self.pipeline = Some(pipeline);
138 self
139 }
140
141 /// Build an `Index` from this builder.
142 pub fn build(self) -> Index {
143 let index = self
144 .fields
145 .iter()
146 .map(|f| (f.clone(), InvertedIndex::new()))
147 .collect();
148
149 Index {
150 index,
151 fields: self.fields.into_iter().collect(),
152 ref_field: self.ref_field,
153 document_store: DocumentStore::new(self.save),
154 pipeline: self.pipeline.unwrap_or_default(),
155 version: ::ELASTICLUNR_VERSION,
156 lang: Language::English,
157 }
158 }
159 }
160
161 /// An elasticlunr search index.
162 #[derive(Serialize, Deserialize, Debug)]
163 #[serde(rename_all = "camelCase")]
164 pub struct Index {
165 // TODO(3.0): Use a BTreeSet<String>
166 pub fields: Vec<String>,
167 pub pipeline: Pipeline,
168 #[serde(rename = "ref")]
169 pub ref_field: String,
170 pub version: &'static str,
171 index: BTreeMap<String, InvertedIndex>,
172 pub document_store: DocumentStore,
173 lang: Language,
174 }
175
176 impl Index {
177 /// Create a new index with the provided fields.
178 ///
179 /// # Example
180 ///
181 /// ```
182 /// # use elasticlunr::Index;
183 /// let mut index = Index::new(&["title", "body", "breadcrumbs"]);
184 /// index.add_doc("1", &["How to Foo", "First, you need to `bar`.", "Chapter 1 > How to Foo"]);
185 /// ```
186 ///
187 /// # Panics
188 ///
189 /// Panics if multiple given fields are identical.
190 pub fn new<I>(fields: I) -> Self
191 where
192 I: IntoIterator,
193 I::Item: AsRef<str>,
194 {
195 Index::with_language(Language::English, fields)
196 }
197
198 /// Create a new index with the provided fields for the given
199 /// [`Language`](lang/enum.Language.html).
200 ///
201 /// # Example
202 ///
203 /// ```
204 /// # use elasticlunr::{Index, Language};
205 /// let mut index = Index::with_language(Language::English, &["title", "body"]);
206 /// index.add_doc("1", &["this is a title", "this is body text"]);
207 /// ```
208 ///
209 /// # Panics
210 ///
211 /// Panics if multiple given fields are identical.
212 pub fn with_language<I>(lang: Language, fields: I) -> Self
213 where
214 I: IntoIterator,
215 I::Item: AsRef<str>,
216 {
217 let mut indices = BTreeMap::new();
218 let mut field_vec = Vec::new();
219 for field in fields {
220 let field = field.as_ref().to_string();
221 if field_vec.contains(&field) {
222 panic!("The Index already contains the field {}", field);
223 }
224 field_vec.push(field.clone());
225 indices.insert(field, InvertedIndex::new());
226 }
227
228 Index {
229 fields: field_vec,
230 index: indices,
231 pipeline: lang.make_pipeline(),
232 ref_field: "id".into(),
233 version: ::ELASTICLUNR_VERSION,
234 document_store: DocumentStore::new(true),
235 lang: lang,
236 }
237 }
238
239 /// Add the data from a document to the index.
240 ///
241 /// *NOTE: The elements of `data` should be provided in the same order as
242 /// the fields used to create the index.*
243 ///
244 /// # Example
245 /// ```
246 /// # use elasticlunr::Index;
247 /// let mut index = Index::new(&["title", "body"]);
248 /// index.add_doc("1", &["this is a title", "this is body text"]);
249 /// ```
250 pub fn add_doc<I>(&mut self, doc_ref: &str, data: I)
251 where
252 I: IntoIterator,
253 I::Item: AsRef<str>,
254 {
255 let mut doc = BTreeMap::new();
256 doc.insert(self.ref_field.clone(), doc_ref.into());
257 let mut token_freq = BTreeMap::new();
258
259 for (field, value) in self.fields.iter().zip(data) {
260 doc.insert(field.clone(), value.as_ref().to_string());
261
262 if field == &self.ref_field {
263 continue;
264 }
265
266 let raw_tokens: Vec<String>;
267
268 match self.lang {
269 #[cfg(feature = "zh")]
270 Language::Chinese => {
271 raw_tokens = pipeline::tokenize_chinese(value.as_ref());
272 }
273 #[cfg(feature = "ja")]
274 Language::Japanese => {
275 raw_tokens = pipeline::tokenize_japanese(value.as_ref());
276 }
277 _ => {
278 raw_tokens = pipeline::tokenize(value.as_ref());
279 }
280 }
281
282 let tokens = self.pipeline.run(raw_tokens);
283
284 self.document_store
285 .add_field_length(doc_ref, field, tokens.len());
286
287 for token in tokens {
288 *token_freq.entry(token).or_insert(0u64) += 1;
289 }
290
291 for (token, count) in &token_freq {
292 let freq = (*count as f64).sqrt();
293
294 self.index
295 .get_mut(field)
296 .expect(&format!("InvertedIndex does not exist for field {}", field))
297 .add_token(doc_ref, token, freq);
298 }
299 }
300
301 self.document_store.add_doc(doc_ref, doc);
302 }
303
304 pub fn get_fields(&self) -> &[String] {
305 &self.fields
306 }
307
308 /// Returns the index, serialized to pretty-printed JSON.
309 pub fn to_json_pretty(&self) -> String {
310 serde_json::to_string_pretty(&self).unwrap()
311 }
312
313 /// Returns the index, serialized to JSON.
314 pub fn to_json(&self) -> String {
315 serde_json::to_string(&self).unwrap()
316 }
317 }
318
#[cfg(test)]
mod tests {
    use super::*;

    /// A field registered more than once through the builder appears exactly once.
    #[test]
    fn add_field_to_builder() {
        let built = IndexBuilder::new()
            .add_field("foo")
            .add_fields(&["foo", "bar", "baz"])
            .build();

        for expected in &["foo", "bar", "baz"] {
            let occurrences = built
                .get_fields()
                .iter()
                .filter(|name| name.as_str() == *expected)
                .count();
            assert_eq!(occurrences, 1);
        }
    }

    /// Adding a document stores it, together with its reference, in the document store.
    #[test]
    fn adding_document_to_index() {
        let mut index = Index::new(&["body"]);
        index.add_doc("1", &["this is a test"]);

        assert_eq!(index.document_store.len(), 1);

        let stored = index.document_store.get_doc("1").unwrap();
        assert_eq!(
            stored,
            btreemap! {
                "id".into() => "1".into(),
                "body".into() => "this is a test".into(),
            }
        );
    }

    /// An empty field does not disturb the indexing of the fields after it.
    #[test]
    fn adding_document_with_empty_field() {
        let mut index = Index::new(&["title", "body"]);
        index.add_doc("1", &["", "test"]);

        let body = &index.index["body"];
        assert_eq!(body.get_doc_frequency("test"), 1);
        assert_eq!(body.get_docs("test").unwrap()["1"], 1.);
    }

    /// Passing the same field name twice to `Index::new` must panic.
    #[test]
    #[should_panic]
    fn creating_index_with_identical_fields_panics() {
        let _index = Index::new(&["title", "body", "title"]);
    }
}