]>
Commit | Line | Data |
---|---|---|
07800601 IM |
1 | #include "cache.h" |
2 | #include "levenshtein.h" | |
3 | ||
4 | /* | |
5 | * This function implements the Damerau-Levenshtein algorithm to | |
6 | * calculate a distance between strings. | |
7 | * | |
8 | * Basically, it says how many letters need to be swapped, substituted, | |
9 | * deleted from, or added to string1, at least, to get string2. | |
10 | * | |
11 | * The idea is to build a distance matrix for the substrings of both | |
12 | * strings. To avoid a large space complexity, only the last three rows | |
13 | * are kept in memory (if swaps had the same or higher cost as one deletion | |
14 | * plus one insertion, only two rows would be needed). | |
15 | * | |
16 | * At any stage, "i + 1" denotes the length of the current substring of | |
17 | * string1 that the distance is calculated for. | |
18 | * | |
19 | * row2 holds the current row, row1 the previous row (i.e. for the substring | |
20 | * of string1 of length "i"), and row0 the row before that. | |
21 | * | |
22 | * In other words, at the start of the big loop, row2[j + 1] contains the | |
23 | * Damerau-Levenshtein distance between the substring of string1 of length | |
24 | * "i" and the substring of string2 of length "j + 1". | |
25 | * | |
26 | * All the big loop does is determine the partial minimum-cost paths. | |
27 | * | |
28 | * It does so by calculating the costs of the path ending in characters | |
29 | * i (in string1) and j (in string2), respectively, given that the last | |
30 | * operation is a substition, a swap, a deletion, or an insertion. | |
31 | * | |
32 | * This implementation allows the costs to be weighted: | |
33 | * | |
34 | * - w (as in "sWap") | |
35 | * - s (as in "Substitution") | |
36 | * - a (for insertion, AKA "Add") | |
37 | * - d (as in "Deletion") | |
38 | * | |
39 | * Note that this algorithm calculates a distance _iff_ d == a. | |
40 | */ | |
41 | int levenshtein(const char *string1, const char *string2, | |
42 | int w, int s, int a, int d) | |
43 | { | |
44 | int len1 = strlen(string1), len2 = strlen(string2); | |
45 | int *row0 = malloc(sizeof(int) * (len2 + 1)); | |
46 | int *row1 = malloc(sizeof(int) * (len2 + 1)); | |
47 | int *row2 = malloc(sizeof(int) * (len2 + 1)); | |
48 | int i, j; | |
49 | ||
50 | for (j = 0; j <= len2; j++) | |
51 | row1[j] = j * a; | |
52 | for (i = 0; i < len1; i++) { | |
53 | int *dummy; | |
54 | ||
55 | row2[0] = (i + 1) * d; | |
56 | for (j = 0; j < len2; j++) { | |
57 | /* substitution */ | |
58 | row2[j + 1] = row1[j] + s * (string1[i] != string2[j]); | |
59 | /* swap */ | |
60 | if (i > 0 && j > 0 && string1[i - 1] == string2[j] && | |
61 | string1[i] == string2[j - 1] && | |
62 | row2[j + 1] > row0[j - 1] + w) | |
63 | row2[j + 1] = row0[j - 1] + w; | |
64 | /* deletion */ | |
65 | if (row2[j + 1] > row1[j + 1] + d) | |
66 | row2[j + 1] = row1[j + 1] + d; | |
67 | /* insertion */ | |
68 | if (row2[j + 1] > row2[j] + a) | |
69 | row2[j + 1] = row2[j] + a; | |
70 | } | |
71 | ||
72 | dummy = row0; | |
73 | row0 = row1; | |
74 | row1 = row2; | |
75 | row2 = dummy; | |
76 | } | |
77 | ||
78 | i = row1[len2]; | |
79 | free(row0); | |
80 | free(row1); | |
81 | free(row2); | |
82 | ||
83 | return i; | |
84 | } |