1 //! This library implements string similarity metrics.
4 use std
::cmp
::{max, min}
;
5 use std
::collections
::HashMap
;
7 #[derive(Debug, PartialEq)]
12 pub type HammingResult
= Result
<usize, StrSimError
>;
14 /// Calculates the number of positions in the two strings where the characters
15 /// differ. Returns an error if the strings have different lengths.
18 /// use strsim::hamming;
20 /// match hamming("hamming", "hammers") {
21 /// Ok(distance) => assert_eq!(3, distance),
22 /// Err(why) => panic!("{:?}", why)
25 pub fn hamming(a
: &str, b
: &str) -> HammingResult
{
26 let (mut ita
, mut itb
, mut count
) = (a
.chars(), b
.chars(), 0);
28 match (ita
.next(), itb
.next()){
29 (Some(x
), Some(y
)) => if x
!= y { count += 1 }
,
30 (None
, None
) => return Ok(count
),
31 _
=> return Err(StrSimError
::DifferentLengthArgs
),
36 /// Calculates the Jaro similarity between two strings. The returned value
37 /// is between 0.0 and 1.0 (higher value means more similar).
42 /// assert!((0.392 - jaro("Friedrich Nietzsche", "Jean-Paul Sartre")).abs() <
45 pub fn jaro(a
: &str, b
: &str) -> f64 {
46 if a
== b { return 1.0; }
48 let a_len
= a
.chars().count();
49 let b_len
= b
.chars().count();
51 // The check for lengths of one here is to prevent integer overflow when
52 // calculating the search range.
53 if a_len
== 0 || b_len
== 0 || (a_len
== 1 && b_len
== 1) {
57 let search_range
= (max(a_len
, b_len
) / 2) - 1;
59 let mut b_consumed
= Vec
::with_capacity(b_len
);
61 b_consumed
.push(false);
63 let mut matches
= 0.0;
65 let mut transpositions
= 0.0;
66 let mut b_match_index
= 0;
68 for (i
, a_char
) in a
.chars().enumerate() {
70 // prevent integer wrapping
72 max(0, i
- search_range
)
77 let max_bound
= min(b_len
- 1, i
+ search_range
);
79 if min_bound
> max_bound
{
83 for (j
, b_char
) in b
.chars().enumerate() {
84 if min_bound
<= j
&& j
<= max_bound
&& a_char
== b_char
&&
89 if j
< b_match_index
{
90 transpositions
+= 1.0;
102 (1.0 / 3.0) * ((matches
/ a_len
as f64) +
103 (matches
/ b_len
as f64) +
104 ((matches
- transpositions
) / matches
))
108 /// Like Jaro but gives a boost to strings that have a common prefix.
111 /// use strsim::jaro_winkler;
113 /// assert!((0.911 - jaro_winkler("cheeseburger", "cheese fries")).abs() <
116 pub fn jaro_winkler(a
: &str, b
: &str) -> f64 {
117 let jaro_distance
= jaro(a
, b
);
119 // Don't limit the length of the common prefix
120 let prefix_length
= a
.chars()
122 .take_while(|&(a_char
, b_char
)| a_char
== b_char
)
125 let jaro_winkler_distance
=
126 jaro_distance
+ (0.1 * prefix_length
as f64 * (1.0 - jaro_distance
));
128 if jaro_winkler_distance
<= 1.0 {
129 jaro_winkler_distance
135 /// Calculates the minimum number of insertions, deletions, and substitutions
136 /// required to change one string into the other.
139 /// use strsim::levenshtein;
141 /// assert_eq!(3, levenshtein("kitten", "sitting"));
143 pub fn levenshtein(a
: &str, b
: &str) -> usize {
144 if a
== b { return 0; }
146 let a_len
= a
.chars().count();
147 let b_len
= b
.chars().count();
149 if a_len
== 0 { return b_len; }
150 if b_len
== 0 { return a_len; }
152 let mut cache
: Vec
<usize> = (1..b_len
+1).collect();
158 for (i
, a_char
) in a
.chars().enumerate() {
162 for (j
, b_char
) in b
.chars().enumerate() {
163 let cost
= if a_char
== b_char { 0 }
else { 1 }
;
164 distance_a
= distance_b
+ cost
;
165 distance_b
= cache
[j
];
166 result
= min(result
+ 1, min(distance_a
, distance_b
+ 1));
174 /// Calculates a normalized score of the Levenshtein algorithm between 0.0 and
175 /// 1.0 (inclusive), where 1.0 means the strings are the same.
178 /// use strsim::normalized_levenshtein;
180 /// assert!((normalized_levenshtein("kitten", "sitting") - 0.57142).abs() < 0.00001);
181 /// assert!((normalized_levenshtein("", "") - 1.0).abs() < 0.00001);
182 /// assert!(normalized_levenshtein("", "second").abs() < 0.00001);
183 /// assert!(normalized_levenshtein("first", "").abs() < 0.00001);
184 /// assert!((normalized_levenshtein("string", "string") - 1.0).abs() < 0.00001);
186 pub fn normalized_levenshtein(a
: &str, b
: &str) -> f64 {
187 if a
.is_empty() && b
.is_empty() {
190 1.0 - (levenshtein(a
, b
) as f64) / (a
.chars().count().max(b
.chars().count()) as f64)
193 /// Like Levenshtein but allows for adjacent transpositions. Each substring can
194 /// only be edited once.
197 /// use strsim::osa_distance;
199 /// assert_eq!(3, osa_distance("ab", "bca"));
201 pub fn osa_distance(a
: &str, b
: &str) -> usize {
202 let a_len
= a
.chars().count();
203 let b_len
= b
.chars().count();
204 if a
== b { return 0; }
205 else if a_len
== 0 { return b_len; }
206 else if b_len
== 0 { return a_len; }
208 let mut prev_two_distances
: Vec
<usize> = Vec
::with_capacity(b_len
+ 1);
209 let mut prev_distances
: Vec
<usize> = Vec
::with_capacity(b_len
+ 1);
210 let mut curr_distances
: Vec
<usize> = Vec
::with_capacity(b_len
+ 1);
212 let mut prev_a_char
= char::MAX
;
213 let mut prev_b_char
= char::MAX
;
215 for i
in 0..(b_len
+ 1) {
216 prev_two_distances
.push(i
);
217 prev_distances
.push(i
);
218 curr_distances
.push(0);
221 for (i
, a_char
) in a
.chars().enumerate() {
222 curr_distances
[0] = i
+ 1;
224 for (j
, b_char
) in b
.chars().enumerate() {
225 let cost
= if a_char
== b_char { 0 }
else { 1 }
;
226 curr_distances
[j
+ 1] = min(curr_distances
[j
] + 1,
227 min(prev_distances
[j
+ 1] + 1,
228 prev_distances
[j
] + cost
));
229 if i
> 0 && j
> 0 && a_char
!= b_char
&&
230 a_char
== prev_b_char
&& b_char
== prev_a_char
{
231 curr_distances
[j
+ 1] = min(curr_distances
[j
+ 1],
232 prev_two_distances
[j
- 1] + 1);
235 prev_b_char
= b_char
;
238 prev_two_distances
.clone_from(&prev_distances
);
239 prev_distances
.clone_from(&curr_distances
);
240 prev_a_char
= a_char
;
243 curr_distances
[b_len
]
247 /// Like optimal string alignment, but substrings can be edited an unlimited
248 /// number of times, and the triangle inequality holds.
251 /// use strsim::damerau_levenshtein;
253 /// assert_eq!(2, damerau_levenshtein("ab", "bca"));
255 pub fn damerau_levenshtein(a
: &str, b
: &str) -> usize {
256 if a
== b { return 0; }
258 let a_chars
: Vec
<char> = a
.chars().collect();
259 let b_chars
: Vec
<char> = b
.chars().collect();
260 let a_len
= a_chars
.len();
261 let b_len
= b_chars
.len();
263 if a_len
== 0 { return b_len; }
264 if b_len
== 0 { return a_len; }
266 let mut distances
= vec
![vec
![0; b_len
+ 2]; a_len
+ 2];
267 let max_distance
= a_len
+ b_len
;
268 distances
[0][0] = max_distance
;
270 for i
in 0..(a_len
+ 1) {
271 distances
[i
+ 1][0] = max_distance
;
272 distances
[i
+ 1][1] = i
;
275 for j
in 0..(b_len
+ 1) {
276 distances
[0][j
+ 1] = max_distance
;
277 distances
[1][j
+ 1] = j
;
280 let mut chars
: HashMap
<char, usize> = HashMap
::new();
282 for i
in 1..(a_len
+ 1) {
285 for j
in 1..(b_len
+ 1) {
286 let k
= match chars
.get(&b_chars
[j
- 1]) {
287 Some(value
) => value
.clone(),
294 if a_chars
[i
- 1] == b_chars
[j
- 1] {
299 let substitution_cost
= distances
[i
][j
] + cost
;
300 let insertion_cost
= distances
[i
][j
+ 1] + 1;
301 let deletion_cost
= distances
[i
+ 1][j
] + 1;
302 let transposition_cost
= distances
[k
][l
] + (i
- k
- 1) + 1 +
305 distances
[i
+ 1][j
+ 1] = min(substitution_cost
,
308 transposition_cost
)));
311 chars
.insert(a_chars
[i
- 1], i
);
314 distances
[a_len
+ 1][b_len
+ 1]
317 /// Calculates a normalized score of the Damerau–Levenshtein algorithm between
318 /// 0.0 and 1.0 (inclusive), where 1.0 means the strings are the same.
321 /// use strsim::normalized_damerau_levenshtein;
323 /// assert!((normalized_damerau_levenshtein("levenshtein", "löwenbräu") - 0.27272).abs() < 0.00001);
324 /// assert!((normalized_damerau_levenshtein("", "") - 1.0).abs() < 0.00001);
325 /// assert!(normalized_damerau_levenshtein("", "flower").abs() < 0.00001);
326 /// assert!(normalized_damerau_levenshtein("tree", "").abs() < 0.00001);
327 /// assert!((normalized_damerau_levenshtein("sunglasses", "sunglasses") - 1.0).abs() < 0.00001);
329 pub fn normalized_damerau_levenshtein(a
: &str, b
: &str) -> f64 {
330 if a
.is_empty() && b
.is_empty() {
333 1.0 - (damerau_levenshtein(a
, b
) as f64) / (a
.chars().count().max(b
.chars().count()) as f64)
342 match hamming("", "") {
343 Ok(distance
) => { assert_eq!(0, distance); }
,
344 Err(why
) => { panic!("{:?}
", why); }
350 match hamming("hamming
", "hamming
") {
351 Ok(distance) => { assert_eq!(0, distance); },
352 Err(why) => { panic!("{:?}", why
); }
358 match hamming("hamming", "hammers") {
359 Ok(distance
) => { assert_eq!(3, distance); }
,
360 Err(why
) => { panic!("{:?}
", why); }
365 fn hamming_diff_multibyte() {
366 match hamming("hamming
", "h香mmüng
") {
367 Ok(distance) => { assert_eq!(2, distance); },
368 Err(why) => { panic!("{:?}", why
); }
373 fn hamming_unequal_length() {
374 match hamming("ham", "hamming") {
375 Ok(_
) => { panic!(); }
,
376 Err(why
) => { assert_eq!(why, StrSimError::DifferentLengthArgs); }
382 match hamming("Friedrich Nietzs", "Jean-Paul Sartre") {
383 Ok(distance
) => { assert_eq!(14, distance); }
,
384 Err(why
) => { panic!("{:?}
", why); }
389 fn jaro_both_empty() {
390 assert_eq!(1.0, jaro("", ""));
394 fn jaro_first_empty() {
395 assert_eq!(0.0, jaro("", "jaro
"));
399 fn jaro_second_empty() {
400 assert_eq!(0.0, jaro("distance
", ""));
405 assert_eq!(1.0, jaro("jaro
", "jaro
"));
409 fn jaro_multibyte() {
410 assert!((0.818 - jaro("testabctest
", "testöঙ香test
")) < 0.001);
411 assert!((0.818 - jaro("testöঙ香test
", "testabctest
")) < 0.001);
415 fn jaro_diff_short() {
416 assert!((0.767 - jaro("dixon
", "dicksonx
")).abs() < 0.001);
420 fn jaro_diff_one_character() {
421 assert_eq!(0.0, jaro("a
", "b
"));
425 fn jaro_diff_one_and_two() {
426 assert!((0.83 - jaro("a
", "ab
")).abs() < 0.01);
430 fn jaro_diff_two_and_one() {
431 assert!((0.83 - jaro("ab
", "a
")).abs() < 0.01);
435 fn jaro_diff_no_transposition() {
436 assert!((0.822 - jaro("dwayne
", "duane
")).abs() < 0.001);
440 fn jaro_diff_with_transposition() {
441 assert!((0.944 - jaro("martha
", "marhta
")).abs() < 0.001);
446 assert!((0.392 - jaro("Friedrich Nietzsche
",
447 "Jean
-Paul Sartre
")).abs() < 0.001);
451 fn jaro_winkler_both_empty() {
452 assert_eq!(1.0, jaro_winkler("", ""));
456 fn jaro_winkler_first_empty() {
457 assert_eq!(0.0, jaro_winkler("", "jaro
-winkler
"));
461 fn jaro_winkler_second_empty() {
462 assert_eq!(0.0, jaro_winkler("distance
", ""));
466 fn jaro_winkler_same() {
467 assert_eq!(1.0, jaro_winkler("Jaro
-Winkler
", "Jaro
-Winkler
"));
471 fn jaro_winkler_multibyte() {
472 assert!((0.89 - jaro_winkler("testabctest
", "testöঙ香test
")).abs() <
474 assert!((0.89 - jaro_winkler("testöঙ香test
", "testabctest
")).abs() <
479 fn jaro_winkler_diff_short() {
480 assert!((0.813 - jaro_winkler("dixon
", "dicksonx
")).abs() < 0.001);
481 assert!((0.813 - jaro_winkler("dicksonx
", "dixon
")).abs() < 0.001);
485 fn jaro_winkler_diff_one_character() {
486 assert_eq!(0.0, jaro_winkler("a
", "b
"));
490 fn jaro_winkler_diff_no_transposition() {
491 assert!((0.840 - jaro_winkler("dwayne
", "duane
")).abs() < 0.001);
495 fn jaro_winkler_diff_with_transposition() {
496 assert!((0.961 - jaro_winkler("martha
", "marhta
")).abs() < 0.001);
500 fn jaro_winkler_names() {
501 assert!((0.562 - jaro_winkler("Friedrich Nietzsche
",
502 "Fran
-Paul Sartre
")).abs() < 0.001);
506 fn jaro_winkler_long_prefix() {
507 assert!((0.911 - jaro_winkler("cheeseburger
", "cheese fries
")).abs() <
512 fn jaro_winkler_more_names() {
513 assert!((0.868 - jaro_winkler("Thorkel
", "Thorgier
")).abs() < 0.001);
517 fn jaro_winkler_length_of_one() {
518 assert!((0.738 - jaro_winkler("Dinsdale
", "D
")).abs() < 0.001);
522 fn jaro_winkler_very_long_prefix() {
523 assert!((1.0 - jaro_winkler("thequickbrownfoxjumpedoverx
",
524 "thequickbrownfoxjumpedovery
")).abs() <
529 fn levenshtein_empty() {
530 assert_eq!(0, levenshtein("", ""));
534 fn levenshtein_same() {
535 assert_eq!(0, levenshtein("levenshtein
", "levenshtein
"));
539 fn levenshtein_diff_short() {
540 assert_eq!(3, levenshtein("kitten
", "sitting
"));
544 fn levenshtein_diff_with_space() {
545 assert_eq!(5, levenshtein("hello
, world
", "bye
, world
"));
549 fn levenshtein_diff_multibyte() {
550 assert_eq!(3, levenshtein("öঙ香
", "abc
"));
551 assert_eq!(3, levenshtein("abc
", "öঙ香
"));
555 fn levenshtein_diff_longer() {
556 let a = "The quick brown fox jumped over the angry dog
.";
557 let b = "Lorem ipsum dolor sit amet
, dicta latine an eam
.";
558 assert_eq!(37, levenshtein(a, b));
562 fn levenshtein_first_empty() {
563 assert_eq!(7, levenshtein("", "sitting
"));
567 fn levenshtein_second_empty() {
568 assert_eq!(6, levenshtein("kitten
", ""));
572 fn normalized_levenshtein_diff_short() {
573 assert!((normalized_levenshtein("kitten
", "sitting
") - 0.57142).abs() < 0.00001);
577 fn normalized_levenshtein_for_empty_strings() {
578 assert!((normalized_levenshtein("", "") - 1.0).abs() < 0.00001);
582 fn normalized_levenshtein_first_empty() {
583 assert!(normalized_levenshtein("", "second
").abs() < 0.00001);
587 fn normalized_levenshtein_second_empty() {
588 assert!(normalized_levenshtein("first
", "").abs() < 0.00001);
592 fn normalized_levenshtein_identical_strings() {
593 assert!((normalized_levenshtein("identical
", "identical
") - 1.0).abs() < 0.00001);
597 fn osa_distance_empty() {
598 assert_eq!(0, osa_distance("", ""));
602 fn osa_distance_same() {
603 assert_eq!(0, osa_distance("damerau
", "damerau
"));
607 fn osa_distance_first_empty() {
608 assert_eq!(7, osa_distance("", "damerau
"));
612 fn osa_distance_second_empty() {
613 assert_eq!(7, osa_distance("damerau
", ""));
617 fn osa_distance_diff() {
618 assert_eq!(3, osa_distance("ca
", "abc
"));
622 fn osa_distance_diff_short() {
623 assert_eq!(3, osa_distance("damerau
", "aderua
"));
627 fn osa_distance_diff_reversed() {
628 assert_eq!(3, osa_distance("aderua
", "damerau
"));
632 fn osa_distance_diff_multibyte() {
633 assert_eq!(3, osa_distance("öঙ香
", "abc
"));
634 assert_eq!(3, osa_distance("abc
", "öঙ香
"));
638 fn osa_distance_diff_unequal_length() {
639 assert_eq!(6, osa_distance("damerau
", "aderuaxyz
"));
643 fn osa_distance_diff_unequal_length_reversed() {
644 assert_eq!(6, osa_distance("aderuaxyz
", "damerau
"));
648 fn osa_distance_diff_comedians() {
649 assert_eq!(5, osa_distance("Stewart
", "Colbert
"));
653 fn osa_distance_many_transpositions() {
654 assert_eq!(4, osa_distance("abcdefghijkl
", "bacedfgihjlk
"));
658 fn osa_distance_diff_longer() {
659 let a = "The quick brown fox jumped over the angry dog
.";
660 let b = "Lehem ipsum dolor sit amet
, dicta latine an eam
.";
661 assert_eq!(36, osa_distance(a, b));
665 fn osa_distance_beginning_transposition() {
666 assert_eq!(1, osa_distance("foobar
", "ofobar
"));
670 fn osa_distance_end_transposition() {
671 assert_eq!(1, osa_distance("specter
", "spectre
"));
675 fn osa_distance_restricted_edit() {
676 assert_eq!(4, osa_distance("a cat
", "an abct
"));
680 fn damerau_levenshtein_empty() {
681 assert_eq!(0, damerau_levenshtein("", ""));
685 fn damerau_levenshtein_same() {
686 assert_eq!(0, damerau_levenshtein("damerau
", "damerau
"));
690 fn damerau_levenshtein_first_empty() {
691 assert_eq!(7, damerau_levenshtein("", "damerau
"));
695 fn damerau_levenshtein_second_empty() {
696 assert_eq!(7, damerau_levenshtein("damerau
", ""));
700 fn damerau_levenshtein_diff() {
701 assert_eq!(2, damerau_levenshtein("ca
", "abc
"));
705 fn damerau_levenshtein_diff_short() {
706 assert_eq!(3, damerau_levenshtein("damerau
", "aderua
"));
710 fn damerau_levenshtein_diff_reversed() {
711 assert_eq!(3, damerau_levenshtein("aderua
", "damerau
"));
715 fn damerau_levenshtein_diff_multibyte() {
716 assert_eq!(3, damerau_levenshtein("öঙ香
", "abc
"));
717 assert_eq!(3, damerau_levenshtein("abc
", "öঙ香
"));
721 fn damerau_levenshtein_diff_unequal_length() {
722 assert_eq!(6, damerau_levenshtein("damerau
", "aderuaxyz
"));
726 fn damerau_levenshtein_diff_unequal_length_reversed() {
727 assert_eq!(6, damerau_levenshtein("aderuaxyz
", "damerau
"));
731 fn damerau_levenshtein_diff_comedians() {
732 assert_eq!(5, damerau_levenshtein("Stewart
", "Colbert
"));
736 fn damerau_levenshtein_many_transpositions() {
737 assert_eq!(4, damerau_levenshtein("abcdefghijkl
", "bacedfgihjlk
"));
741 fn damerau_levenshtein_diff_longer() {
742 let a = "The quick brown fox jumped over the angry dog
.";
743 let b = "Lehem ipsum dolor sit amet
, dicta latine an eam
.";
744 assert_eq!(36, damerau_levenshtein(a, b));
748 fn damerau_levenshtein_beginning_transposition() {
749 assert_eq!(1, damerau_levenshtein("foobar
", "ofobar
"));
753 fn damerau_levenshtein_end_transposition() {
754 assert_eq!(1, damerau_levenshtein("specter
", "spectre
"));
758 fn damerau_levenshtein_unrestricted_edit() {
759 assert_eq!(3, damerau_levenshtein("a cat
", "an abct
"));
763 fn normalized_damerau_levenshtein_diff_short() {
764 assert!((normalized_damerau_levenshtein("levenshtein
", "löwenbräu
") - 0.27272).abs() < 0.00001);
768 fn normalized_damerau_levenshtein_for_empty_strings() {
769 assert!((normalized_damerau_levenshtein("", "") - 1.0).abs() < 0.00001);
773 fn normalized_damerau_levenshtein_first_empty() {
774 assert!(normalized_damerau_levenshtein("", "flower
").abs() < 0.00001);
778 fn normalized_damerau_levenshtein_second_empty() {
779 assert!(normalized_damerau_levenshtein("tree
", "").abs() < 0.00001);
783 fn normalized_damerau_levenshtein_identical_strings() {
784 assert!((normalized_damerau_levenshtein("sunglasses
", "sunglasses
") - 1.0).abs() < 0.00001);