Skip to content
This repository was archived by the owner on Jan 27, 2023. It is now read-only.

Commit e8b4c08

Browse files
committed
update token filter to generate ngrams from min to max length size
1 parent 7af1d2b commit e8b4c08

1 file changed

Lines changed: 15 additions & 4 deletions

File tree

lib/cipherstash/analysis/token_filters.rb

Lines changed: 15 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -15,13 +15,24 @@ def perform(str_or_array)
 15  15
 16  16   class NGram < Base
 17  17     def perform(str_or_array)
 18     -     token_length = @opts["tokenLength"] || 3
     18 +     min_length = @opts["minLength"] || 3
     19 +     max_length = @opts["maxLength"] || 8
     20 +
 19  21       Array(str_or_array).flat_map do |token|
 20     -       [].tap do |out|
 21     -         (token.length - token_length + 1).times do |i|
 22     -           out << token[i, token_length]
     22 +       token_length = token.length
     23 +
     24 +       ngrams = [].tap do |out|
     25 +         (min_length..max_length).each do |n|
     26 +           ngram = token.chars.each_cons(n).map(&:join)
     27 +           out << ngram
     28 +         end
     29 +
     30 +         if token_length > max_length
     31 +           out << token
 23  32           end
 24  33         end
     34 +
     35 +       ngrams.flatten
 25  36       end
 26  37     end
 27  38   end

0 commit comments

Comments
 (0)