2626using F23 . StringSimilarity . Interfaces ;
2727// ReSharper disable SuggestVarOrType_Elsewhere
2828// ReSharper disable TooWideLocalVariableScope
29+ // ReSharper disable IntroduceOptionalParameters.Global
2930
3031namespace F23 . StringSimilarity
3132{
@@ -34,24 +35,54 @@ namespace F23.StringSimilarity
3435 public class WeightedLevenshtein : IStringDistance
3536 {
3637 private readonly ICharacterSubstitution _characterSubstitution ;
38+ private readonly ICharacterInsDel _characterInsDel ;
3739
/// <summary>
/// Creates an instance that weights substitutions with the supplied strategy.
/// No insertion/deletion strategy is supplied: <c>null</c> is forwarded to the
/// two-argument constructor.
/// </summary>
/// <param name="characterSubstitution">Strategy that supplies the cost of substituting one character for another.</param>
public WeightedLevenshtein(ICharacterSubstitution characterSubstitution)
    : this(characterSubstitution, characterInsDel: null)
{
}
48+
/// <summary>
/// Creates an instance with a substitution strategy and an
/// insertion/deletion strategy.
/// </summary>
/// <param name="characterSubstitution">Strategy that supplies the cost of substituting one character for another.</param>
/// <param name="characterInsDel">Strategy that supplies per-character insertion and deletion costs.
/// May be <c>null</c>, in which case every insertion and deletion costs 1.0.</param>
public WeightedLevenshtein(ICharacterSubstitution characterSubstitution, ICharacterInsDel characterInsDel)
{
    // The two fields are independent; both are only stored here.
    _characterInsDel = characterInsDel;
    _characterSubstitution = characterSubstitution;
}
61+
/// <summary>
/// Computes the weighted Levenshtein distance with no early-termination
/// limit; identical to calling Distance(s1, s2, Double.MaxValue).
/// </summary>
/// <param name="s1">The first string to compare.</param>
/// <param name="s2">The second string to compare.</param>
/// <returns>The computed weighted Levenshtein distance.</returns>
public double Distance(string s1, string s2)
    => Distance(s1, s2, double.MaxValue);
4672
4773 /// <summary>
4874 /// Compute Levenshtein distance using provided weights for substitution.
4975 /// </summary>
5076 /// <param name="s1">The first string to compare.</param>
5177 /// <param name="s2">The second string to compare.</param>
78+ /// <param name="limit">The maximum result to compute before stopping. This
79+ /// means that the calculation can terminate early if you
80+ /// only care about strings with a certain similarity.
81+ /// Set this to Double.MaxValue if you want to run the
82+ /// calculation to completion in every case.</param>
5283 /// <returns>The computed weighted Levenshtein distance.</returns>
5384 /// <exception cref="ArgumentNullException">If s1 or s2 is null.</exception>
54- public double Distance ( string s1 , string s2 )
85+ public double Distance ( string s1 , string s2 , double limit )
5586 {
5687 if ( s1 == null )
5788 {
@@ -78,51 +109,79 @@ public double Distance(string s1, string s2)
78109 return s1 . Length ;
79110 }
80111
81- // create two work vectors of integer distances
112+ // create two work vectors of floating point (i.e. weighted) distances
82113 double [ ] v0 = new double [ s2 . Length + 1 ] ;
83114 double [ ] v1 = new double [ s2 . Length + 1 ] ;
84115 double [ ] vtemp ;
85116
86117 // initialize v0 (the previous row of distances)
87- // this row is A[0][i]: edit distance for an empty s
88- // the distance is just the number of characters to delete from t
89- for ( int i = 0 ; i < v0 . Length ; i ++ )
118+ // this row is A[0][i]: edit distance for an empty s1
119+ // the distance is the cost of inserting each character of s2
120+ v0 [ 0 ] = 0 ;
121+ for ( int i = 1 ; i < v0 . Length ; i ++ )
90122 {
91- v0 [ i ] = i ;
123+ v0 [ i ] = v0 [ i - 1 ] + InsertionCost ( s2 [ i - 1 ] ) ;
92124 }
93125
94126 for ( int i = 0 ; i < s1 . Length ; i ++ )
95127 {
128+ char s1i = s1 [ i ] ;
129+ double deletionCost = DeletionCost ( s1i ) ;
130+
96131 // calculate v1 (current row distances) from the previous row v0
97132 // first element of v1 is A[i+1][0]
98- // edit distance is delete (i+1) chars from s to match empty t
99- v1 [ 0 ] = i + 1 ;
133+ // Edit distance is the cost of deleting characters from s1
134+ // to match empty t.
135+ v1 [ 0 ] = v0 [ 0 ] + deletionCost ;
136+
137+ double minv1 = v1 [ 0 ] ;
100138
101139 // use formula to fill in the rest of the row
102140 for ( int j = 0 ; j < s2 . Length ; j ++ )
103141 {
142+ char s2j = s2 [ j ] ;
104143 double cost = 0 ;
105- if ( s1 [ i ] != s2 [ j ] )
144+
145+ if ( s1i != s2j )
106146 {
107- cost = _characterSubstitution . Cost ( s1 [ i ] , s2 [ j ] ) ;
147+ cost = _characterSubstitution . Cost ( s1i , s2j ) ;
108148 }
149+
150+ double insertionCost = InsertionCost ( s2j ) ;
151+
109152 v1 [ j + 1 ] = Math . Min (
110- v1 [ j ] + 1 , // Cost of insertion
153+ v1 [ j ] + insertionCost , // Cost of insertion
111154 Math . Min (
112- v0 [ j + 1 ] + 1 , // Cost of remove
155+ v0 [ j + 1 ] + deletionCost , // Cost of deletion
113156 v0 [ j ] + cost ) ) ; // Cost of substitution
157+
158+ minv1 = Math . Min ( minv1 , v1 [ j + 1 ] ) ;
159+ }
160+
161+ if ( minv1 >= limit )
162+ {
163+ return limit ;
114164 }
115165
116166 // copy v1 (current row) to v0 (previous row) for next iteration
117- //System.arraycopy(v1, 0, v0, 0, v0.length);
167+ // System.arraycopy(v1, 0, v0, 0, v0.length);
118168 // Flip references to current and previous row
119169 vtemp = v0 ;
120170 v0 = v1 ;
121171 v1 = vtemp ;
122-
123172 }
124173
125174 return v0 [ s2 . Length ] ;
126175 }
176+
/// <summary>
/// Returns the cost of inserting <paramref name="c"/>.
/// </summary>
/// <param name="c">The character being inserted.</param>
/// <returns>The cost reported by the insertion/deletion strategy, or 1.0
/// when no strategy was supplied.</returns>
private double InsertionCost(char c)
    => _characterInsDel?.InsertionCost(c) ?? 1.0;
181+
/// <summary>
/// Returns the cost of deleting <paramref name="c"/>.
/// </summary>
/// <param name="c">The character being deleted.</param>
/// <returns>The cost reported by the insertion/deletion strategy, or 1.0
/// when no strategy was supplied.</returns>
private double DeletionCost(char c)
    => _characterInsDel?.DeletionCost(c) ?? 1.0;
127186 }
128187}
0 commit comments