|
| 1 | +require 'cipherstash/analysis/text_processor' |
| 2 | +require "cipherstash/client" |
| 3 | + |
RSpec.describe CipherStash::Analysis::TextProcessor do
  # Builds a processor with a standard tokenizer and the supplied token
  # filters — every example in this group varies only the filter chain,
  # so the rest of the settings hash lives here in one place.
  def processor_with_filters(filters)
    described_class.new(
      "tokenFilters" => filters,
      "tokenizer" => { "kind" => "standard" }
    )
  end

  describe "Standard text processor" do
    it "splits text based on word boundaries" do
      processor = processor_with_filters([{ "kind" => "downcase" }])
      tokens = processor.perform("This is an example of a standard tokenizer")
      expect(tokens.length).to eq(8)
      expect(tokens).to eq(["this", "is", "an", "example", "of", "a", "standard", "tokenizer"])
    end
  end

  describe "Standard text processor with an ngram filter" do
    # Non-Integer lengths of several shapes must all be rejected up front.
    ["1", { foo: "bar" }, Object.new].each do |length|
      it "raises an error if invalid length of #{length.inspect} provided" do
        expect {
          processor_with_filters([
            { "kind" => "downcase" },
            { "kind" => "ngram", "minLength" => length, "maxLength" => length }
          ])
        }.to raise_error(CipherStash::Client::Error::InternalError, "The values provided to the min and max length must be of type Integer.")
      end
    end

    it "raises an error if the min length is greater than the max length" do
      expect {
        processor_with_filters([
          { "kind" => "downcase" },
          { "kind" => "ngram", "minLength" => 4, "maxLength" => 3 }
        ])
      }.to raise_error(CipherStash::Client::Error::InternalError, "The ngram filter min length must be less than or equal to the max length")
    end

    it "splits text into ngrams using min length of 3 and max length of 8" do
      processor = processor_with_filters([
        { "kind" => "downcase" },
        { "kind" => "ngram", "minLength" => 3, "maxLength" => 8 }
      ])
      tokens = processor.perform("Example filter")

      # All ngrams of "example" then "filter", grouped a row per ngram size.
      expect(tokens).to eq(%w[
        exa xam amp mpl ple
        exam xamp ampl mple
        examp xampl ample
        exampl xample
        example
        fil ilt lte ter
        filt ilte lter
        filte ilter
        filter
      ])
    end
  end
end
0 commit comments