2 use std
::collections
::hash_map
::Entry
;
3 use std
::collections
::{HashMap, HashSet}
;
6 use utf8_ranges
::{Utf8Range, Utf8Sequences}
;
8 use crate::automaton
::Automaton
;
10 const STATE_LIMIT
: usize = 10_000; // currently at least 20MB >_<
12 /// An error that occurred while building a Levenshtein automaton.
14 /// This error is only defined when the `levenshtein` crate feature is enabled.
16 pub enum LevenshteinError
{
17 /// If construction of the automaton reaches some hard-coded limit
18 /// on the number of states, then this error is returned.
20 /// The number given is the limit that was exceeded.
24 impl fmt
::Display
for LevenshteinError
{
25 fn fmt(&self, f
: &mut fmt
::Formatter
<'_
>) -> fmt
::Result
{
27 LevenshteinError
::TooManyStates(size_limit
) => write
!(
29 "Levenshtein automaton exceeds size limit of \
37 impl std
::error
::Error
for LevenshteinError {}
39 /// A Unicode aware Levenshtein automaton for running efficient fuzzy queries.
41 /// This is only defined when the `levenshtein` crate feature is enabled.
43 /// A Levenshtein automata is one way to search any finite state transducer
44 /// for keys that *approximately* match a given query. A Levenshtein automaton
45 /// approximates this by returning all keys within a certain edit distance of
46 /// the query. The edit distance is defined by the number of insertions,
47 /// deletions and substitutions required to turn the query into the key.
48 /// Insertions, deletions and substitutions are based on
49 /// **Unicode characters** (where each character is a single Unicode scalar
54 /// This example shows how to find all keys within an edit distance of `1`
58 /// use fst::automaton::Levenshtein;
59 /// use fst::{IntoStreamer, Streamer, Set};
62 /// let keys = vec!["fa", "fo", "fob", "focus", "foo", "food", "foul"];
63 /// let set = Set::from_iter(keys).unwrap();
65 /// let lev = Levenshtein::new("foo", 1).unwrap();
66 /// let mut stream = set.search(&lev).into_stream();
68 /// let mut keys = vec![];
69 /// while let Some(key) = stream.next() {
70 /// keys.push(key.to_vec());
72 /// assert_eq!(keys, vec![
73 /// "fo".as_bytes(), // 1 deletion
74 /// "fob".as_bytes(), // 1 substitution
75 /// "foo".as_bytes(), // 0 insertions/deletions/substitutions
76 /// "food".as_bytes(), // 1 insertion
81 /// This example only uses ASCII characters, but it will work equally well
82 /// on Unicode characters.
84 /// # Warning: experimental
86 /// While executing this Levenshtein automaton against a finite state
87 /// transducer will be very fast, *constructing* an automaton may not be.
88 /// Namely, this implementation is a proof of concept. While I believe the
89 /// algorithmic complexity is not exponential, the implementation is not speedy
90 /// and it can use enormous amounts of memory (tens of MB before a hard-coded
91 /// limit will cause an error to be returned).
93 /// This is important functionality, so one should count on this implementation
94 /// being vastly improved in the future.
95 pub struct Levenshtein
{
96 prog
: DynamicLevenshtein
,
101 /// Create a new Levenshtein query.
103 /// The query finds all matching terms that are at most `distance`
104 /// edit operations from `query`. (An edit operation may be an insertion,
105 /// a deletion or a substitution.)
107 /// If the underlying automaton becomes too big, then an error is returned.
109 /// A `Levenshtein` value satisfies the `Automaton` trait, which means it
110 /// can be used with the `search` method of any finite state transducer.
115 ) -> Result
<Levenshtein
, LevenshteinError
> {
116 let lev
= DynamicLevenshtein
{
117 query
: query
.to_owned(),
118 dist
: distance
as usize,
120 let dfa
= DfaBuilder
::new(lev
.clone()).build()?
;
121 Ok(Levenshtein { prog: lev, dfa }
)
125 impl fmt
::Debug
for Levenshtein
{
126 fn fmt(&self, f
: &mut fmt
::Formatter
<'_
>) -> fmt
::Result
{
129 "Levenshtein(query: {:?}, distance: {:?})",
130 self.prog
.query
, self.prog
.dist
136 struct DynamicLevenshtein
{
141 impl DynamicLevenshtein
{
142 fn start(&self) -> Vec
<usize> {
143 (0..self.query
.chars().count() + 1).collect()
146 fn is_match(&self, state
: &[usize]) -> bool
{
147 state
.last().map(|&n
| n
<= self.dist
).unwrap_or(false)
150 fn can_match(&self, state
: &[usize]) -> bool
{
151 state
.iter().min().map(|&n
| n
<= self.dist
).unwrap_or(false)
154 fn accept(&self, state
: &[usize], chr
: Option
<char>) -> Vec
<usize> {
155 let mut next
= vec
![state
[0] + 1];
156 for (i
, c
) in self.query
.chars().enumerate() {
157 let cost
= if Some(c
) == chr { 0 }
else { 1 }
;
159 cmp
::min(next
[i
] + 1, state
[i
+ 1] + 1),
162 next
.push(cmp
::min(v
, self.dist
+ 1));
168 impl Automaton
for Levenshtein
{
169 type State
= Option
<usize>;
172 fn start(&self) -> Option
<usize> {
177 fn is_match(&self, state
: &Option
<usize>) -> bool
{
178 state
.map(|state
| self.dfa
.states
[state
].is_match
).unwrap_or(false)
182 fn can_match(&self, state
: &Option
<usize>) -> bool
{
187 fn accept(&self, state
: &Option
<usize>, byte
: u8) -> Option
<usize> {
188 state
.and_then(|state
| self.dfa
.states
[state
].next
[byte
as usize])
198 next
: [Option
<usize>; 256],
202 impl fmt
::Debug
for State
{
203 fn fmt(&self, f
: &mut fmt
::Formatter
<'_
>) -> fmt
::Result
{
204 writeln
!(f
, "State {{")?
;
205 writeln
!(f
, " is_match: {:?}", self.is_match
)?
;
207 if let Some(si
) = self.next
[i
] {
208 writeln
!(f
, " {:?}: {:?}", i
, si
)?
;
217 lev
: DynamicLevenshtein
,
218 cache
: HashMap
<Vec
<usize>, usize>,
222 fn new(lev
: DynamicLevenshtein
) -> DfaBuilder
{
224 dfa
: Dfa { states: Vec::with_capacity(16) }
,
226 cache
: HashMap
::with_capacity(1024),
230 fn build(mut self) -> Result
<Dfa
, LevenshteinError
> {
231 let mut stack
= vec
![self.lev
.start()];
232 let mut seen
= HashSet
::new();
233 let query
= self.lev
.query
.clone(); // temp work around of borrowck
234 while let Some(lev_state
) = stack
.pop() {
235 let dfa_si
= self.cached_state(&lev_state
).unwrap();
236 let mismatch
= self.add_mismatch_utf8_states(dfa_si
, &lev_state
);
237 if let Some((next_si
, lev_next
)) = mismatch
{
238 if !seen
.contains(&next_si
) {
239 seen
.insert(next_si
);
240 stack
.push(lev_next
);
243 for (i
, c
) in query
.chars().enumerate() {
244 if lev_state
[i
] > self.lev
.dist
{
247 let lev_next
= self.lev
.accept(&lev_state
, Some(c
));
248 let next_si
= self.cached_state(&lev_next
);
249 if let Some(next_si
) = next_si
{
250 self.add_utf8_sequences(true, dfa_si
, next_si
, c
, c
);
251 if !seen
.contains(&next_si
) {
252 seen
.insert(next_si
);
253 stack
.push(lev_next
);
257 if self.dfa
.states
.len() > STATE_LIMIT
{
258 return Err(LevenshteinError
::TooManyStates(STATE_LIMIT
));
264 fn cached_state(&mut self, lev_state
: &[usize]) -> Option
<usize> {
265 self.cached(lev_state
).map(|(si
, _
)| si
)
268 fn cached(&mut self, lev_state
: &[usize]) -> Option
<(usize, bool
)> {
269 if !self.lev
.can_match(lev_state
) {
272 Some(match self.cache
.entry(lev_state
.to_vec()) {
273 Entry
::Occupied(v
) => (*v
.get(), true),
274 Entry
::Vacant(v
) => {
275 let is_match
= self.lev
.is_match(lev_state
);
276 self.dfa
.states
.push(State { next: [None; 256], is_match }
);
277 (*v
.insert(self.dfa
.states
.len() - 1), false)
282 fn add_mismatch_utf8_states(
286 ) -> Option
<(usize, Vec
<usize>)> {
287 let mismatch_state
= self.lev
.accept(lev_state
, None
);
288 let to_si
= match self.cached(&mismatch_state
) {
291 // Some((si, true)) => return Some((si, mismatch_state)),
292 // Some((si, false)) => si,
294 self.add_utf8_sequences(false, from_si
, to_si
, '
\u{0}'
, '
\u{10FFFF}'
);
295 return Some((to_si
, mismatch_state
));
298 fn add_utf8_sequences(
306 for seq
in Utf8Sequences
::new(from_chr
, to_chr
) {
307 let mut fsi
= from_si
;
308 for range
in &seq
.as_slice()[0..seq
.len() - 1] {
309 let tsi
= self.new_state(false);
310 self.add_utf8_range(overwrite
, fsi
, tsi
, range
);
317 &seq
.as_slice()[seq
.len() - 1],
329 for b
in range
.start
as usize..range
.end
as usize + 1 {
330 if overwrite
|| self.dfa
.states
[from
].next
[b
].is_none() {
331 self.dfa
.states
[from
].next
[b
] = Some(to
);
336 fn new_state(&mut self, is_match
: bool
) -> usize {
337 self.dfa
.states
.push(State { next: [None; 256], is_match }
);
338 self.dfa
.states
.len() - 1