Skip to content
This repository was archived by the owner on Jan 27, 2023. It is now read-only.

Commit 492524a

Browse files
committed
add text processor spec
1 parent e8b4c08 commit 492524a

1 file changed

Lines changed: 87 additions & 0 deletions

File tree

Lines changed: 87 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,87 @@
1+
require 'cipherstash/analysis/text_processor'
2+
require "cipherstash/client"
3+
4+
# Specs for CipherStash::Analysis::TextProcessor configured with the
# "standard" tokenizer, with and without an "ngram" token filter.
RSpec.describe CipherStash::Analysis::TextProcessor do
  # Builds a processor from a settings hash that uses the "standard" tokenizer
  # and a "downcase" token filter, followed by any +extra_filters+ given.
  # The settings are passed as an explicit positional Hash (note the braces).
  def build_processor(*extra_filters)
    described_class.new({
      "tokenFilters" => [{ "kind" => "downcase" }, *extra_filters],
      "tokenizer" => { "kind" => "standard" }
    })
  end

  # Settings hash for an "ngram" token filter with the given length bounds.
  def ngram_filter(min_length, max_length)
    { "kind" => "ngram", "minLength" => min_length, "maxLength" => max_length }
  end

  describe "Standard text processor" do
    it "splits text based on word boundaries" do
      result = build_processor.perform("This is an example of a standard tokenizer")

      expect(result.length).to eq(8)
      expect(result).to eq(%w[this is an example of a standard tokenizer])
    end
  end

  describe "Standard text processor with an ngram filter" do
    # One example is generated per non-Integer value; the same value is used
    # for both bounds so only the type check can be the cause of the error.
    # NOTE: Object.new.inspect embeds a per-object address, so the description
    # of that generated example differs from run to run.
    ["1", { foo: "bar" }, Object.new].each do |length|
      it "raises an error if invalid length of #{length.inspect} provided" do
        expect { build_processor(ngram_filter(length, length)) }
          .to raise_error(
            CipherStash::Client::Error::InternalError,
            "The values provided to the min and max length must be of type Integer."
          )
      end
    end

    it "raises an error if the min length is greater than the max length" do
      expect { build_processor(ngram_filter(4, 3)) }
        .to raise_error(
          CipherStash::Client::Error::InternalError,
          "The ngram filter min length must be less than or equal to the max length"
        )
    end

    it "splits text into ngrams using min length of 3 and max length of 8" do
      result = build_processor(ngram_filter(3, 8)).perform("Example filter")

      # Expected order: per word, ngrams grouped by increasing length and,
      # within a length, by start position. Neither word reaches 8 characters,
      # so the longest ngram for each is the whole word.
      expect(result).to eq(%w[
        exa xam amp mpl ple
        exam xamp ampl mple
        examp xampl ample
        exampl xample
        example
        fil ilt lte ter
        filt ilte lter
        filte ilter
        filter
      ])
    end
  end
end

0 commit comments

Comments
 (0)