diff --git a/.github/workflows/check.yml b/.github/workflows/check.yml index 4b5aae8..6dbda9a 100644 --- a/.github/workflows/check.yml +++ b/.github/workflows/check.yml @@ -8,6 +8,7 @@ on: - "assets/**" - "src/**" - "tests/**" + - "tools/**" - ".taplo.toml" - ".typos.toml" - "Cargo.lock" @@ -68,7 +69,7 @@ jobs: - name: Set up Environment run: ./tools/setup.sh ${{ matrix.version }} - name: Set up Sccache - uses: mozilla-actions/sccache-action@v0.0.7 + uses: mozilla-actions/sccache-action@v0.0.9 - name: Clippy run: cargo clippy --features pg${{ matrix.version }} - name: Unit Test diff --git a/Cargo.toml b/Cargo.toml index 87e809d..239a834 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,7 +11,6 @@ name = "pgrx_embed_pg_tokenizer" path = "./src/bin/pgrx_embed.rs" [features] -pg12 = ["pgrx/pg12", "pgrx-tests/pg12"] pg13 = ["pgrx/pg13", "pgrx-tests/pg13"] pg14 = ["pgrx/pg14", "pgrx-tests/pg14"] pg15 = ["pgrx/pg15", "pgrx-tests/pg15"] @@ -28,8 +27,8 @@ lindera-cc-cedict = ["lindera/cc-cedict"] anyhow = "1.0.97" dashmap = "6.1.0" jieba-rs = "0.7.2" -lindera = "0.37.0" -pgrx = "=0.13.1" +lindera = "0.42.2" +pgrx = "=0.14.1" regex = "1.11.1" rust-stemmers = { git = "https://github.com/tensorchord/rust-stemmers.git", rev = "51696378e352688b7ffd4fface615370ff5e8768" } serde = { version = "1.0.218", features = ["derive"] } @@ -42,10 +41,7 @@ unicode-segmentation = "1.12.0" validator = { version = "0.20.0", features = ["derive"] } [dev-dependencies] -pgrx-tests = "=0.13.1" - -[patch.crates-io] -pgrx = { git = "https://github.com/tensorchord/pgrx", branch = "patch-to-pg_tokenizer" } +pgrx-tests = "=0.14.1" [profile.release] opt-level = 3 diff --git a/docs/00-reference.md b/docs/00-reference.md index 016d47f..8aac0ed 100644 --- a/docs/00-reference.md +++ b/docs/00-reference.md @@ -92,6 +92,7 @@ You can choose only one of the above options for each character filter. 
| stopwords | String | Stopwords name, builtin: `lucene_english`, `nltk_english`, `iso_english` | | synonym | String | Synonym name | | pg_dict | String | Using [postgres text search dictionary](https://www.postgresql.org/docs/current/textsearch-dictionaries.html). We currently support all dictionaries except `Thesaurus Dictionary`. | +| ngram | Table | N-gram token filter, see [Options for `ngram`](#options-for-ngram) | You can choose only one of the above options for each token filter. @@ -99,6 +100,14 @@ You can choose only one of the above options for each token filter. arabic, armenian, basque, catalan, danish, dutch, english_porter, english_porter2, estonian, finnish, french, german, greek, hindi, hungarian, indonesian, irish, italian, lithuanian, nepali, norwegian, portuguese, romanian, russian, serbian, spanish, swedish, tamil, turkish, yiddish +#### Options for `ngram` + +| Key | Type | Description | +| ----------------- | ------- | -------------------------------------------------------- | +| max_gram | Integer | Maximum n-gram size, range: `1..=255`, default: `2` | +| min_gram | Integer | Minimum n-gram size, range: `1..=255`, default: `1` | +| preserve_original | Boolean | Whether to also emit the original token when its length is outside `min_gram..=max_gram`, default: `false` | + ### Options for `tokenizer` | Key | Type | Description | diff --git a/src/lib.rs b/src/lib.rs index e3a2bed..a44637e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -15,7 +15,7 @@ compile_error!("Target is not supported."); compiler_error!("PostgreSQL version must be selected."); #[pgrx::pg_guard] -unsafe extern "C" fn _PG_init() { +extern "C-unwind" fn _PG_init() { if unsafe { pgrx::pg_sys::IsUnderPostmaster } { pgrx::error!("pg_tokenizer must be loaded via shared_preload_libraries."); } diff --git a/src/token_filter/mod.rs b/src/token_filter/mod.rs index 02e3852..7a99320 100644 --- a/src/token_filter/mod.rs +++ b/src/token_filter/mod.rs @@ -1,3 +1,4 @@ +mod ngram; mod pg_dict; mod skip_non_alphanumeric; mod stemmer; @@ -6,6 +7,7
@@ mod synonym;
 
 use std::sync::Arc;
 
+use ngram::{Ngram, NgramConfig};
 use pg_dict::PgDictTokenFilter;
 use serde::{Deserialize, Serialize};
 use skip_non_alphanumeric::SkipNonAlphanumeric;
@@ -32,6 +34,8 @@ pub enum TokenFilterConfig {
     Stopwords(String),
     PgDict(String),
     Synonym(String),
+    #[serde(rename = "ngram")]
+    NGram(NgramConfig),
 }
 
 pub fn get_token_filter(config: TokenFilterConfig) -> TokenFilterPtr {
@@ -41,5 +45,6 @@ pub fn get_token_filter(config: TokenFilterConfig) -> TokenFilterPtr {
         TokenFilterConfig::Stopwords(name) => stopwords::get_stopwords_token_filter(&name),
         TokenFilterConfig::PgDict(name) => Arc::new(PgDictTokenFilter::new(&name)),
         TokenFilterConfig::Synonym(name) => synonym::get_synonym_token_filter(&name),
+        TokenFilterConfig::NGram(config) => Arc::new(Ngram::new(config)),
     }
 }
diff --git a/src/token_filter/ngram.rs b/src/token_filter/ngram.rs
new file mode 100644
index 0000000..5350de3
--- /dev/null
+++ b/src/token_filter/ngram.rs
@@ -0,0 +1,101 @@
+use serde::{Deserialize, Serialize};
+use validator::{Validate, ValidationError};
+
+use super::TokenFilter;
+
+/// Options of the `ngram` token filter.
+#[derive(Clone, Debug, Serialize, Deserialize, Validate)]
+#[serde(rename_all = "snake_case")]
+#[serde(deny_unknown_fields)]
+#[validate(schema(function = "NgramConfig::validate_grams"))]
+pub struct NgramConfig {
+    /// Maximum n-gram size in characters (`1..=255`, default `2`).
+    #[serde(default = "NgramConfig::default_max_gram")]
+    #[validate(range(min = 1, max = 255))]
+    pub max_gram: usize,
+    /// Minimum n-gram size in characters (`1..=255`, default `1`).
+    #[serde(default = "NgramConfig::default_min_gram")]
+    #[validate(range(min = 1, max = 255))]
+    pub min_gram: usize,
+    /// Also emit the original token when its length is outside
+    /// `min_gram..=max_gram` (default `false`).
+    #[serde(default = "NgramConfig::default_preserve_original")]
+    pub preserve_original: bool,
+}
+
+impl NgramConfig {
+    fn default_max_gram() -> usize {
+        2
+    }
+    fn default_min_gram() -> usize {
+        1
+    }
+    fn default_preserve_original() -> bool {
+        false
+    }
+    /// Schema-level check: rejects an inverted range (`min_gram > max_gram`).
+    fn validate_grams(&self) -> Result<(), ValidationError> {
+        if self.min_gram > self.max_gram {
+            return Err(ValidationError::new(
+                "min_gram must be less than or equal to max_gram",
+            ));
+        }
+        Ok(())
+    }
+}
+
+/// Token filter that expands a token into its character n-grams.
+pub struct Ngram {
+    config: NgramConfig,
+}
+
+impl TokenFilter for Ngram {
+    /// Returns every n-gram of `token` whose length, counted in characters, is
+    /// within `min_gram..=max_gram`, ordered by start position and then by length.
+    ///
+    /// When `preserve_original` is set and the token length is outside that range,
+    /// the token itself is appended last. A token shorter than `min_gram` (including
+    /// the empty token) produces no n-grams.
+    fn apply(&self, token: String) -> Vec<String> {
+        let min_gram = self.config.min_gram;
+        let max_gram = self.config.max_gram;
+        // Byte offset of every character boundary plus the end of the string, so
+        // slicing can never split a multi-byte UTF-8 sequence.
+        let bounds: Vec<usize> = token
+            .char_indices()
+            .map(|(offset, _)| offset)
+            .chain(std::iter::once(token.len()))
+            .collect();
+        let len = bounds.len() - 1;
+
+        let mut results = Vec::new();
+        // `checked_sub` skips tokens shorter than `min_gram` instead of underflowing.
+        if let Some(last_start) = len.checked_sub(min_gram) {
+            for start in 0..=last_start {
+                let longest_end = (start + max_gram).min(len);
+                for end in (start + min_gram)..=longest_end {
+                    results.push(token[bounds[start]..bounds[end]].to_string());
+                }
+            }
+        }
+        if self.config.preserve_original && !(min_gram..=max_gram).contains(&len) {
+            results.push(token);
+        }
+        results
+    }
+}
+
+impl Ngram {
+    /// Builds the filter from `config`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `config` fails validation.
+    pub fn new(config: NgramConfig) -> Self {
+        if let Err(e) = config.validate() {
+            panic!("Invalid NgramConfig: {}", e);
+        }
+
+        Ngram { config }
+    }
+}
diff --git a/tests/sqllogictest/ngram.slt b/tests/sqllogictest/ngram.slt
new file mode 100644
index 0000000..7867d8e
--- /dev/null
+++ b/tests/sqllogictest/ngram.slt
@@ -0,0 +1,29 @@
+statement ok
+BEGIN;
+
+statement ok
+SELECT tokenizer_catalog.create_text_analyzer('test_ngram', $$
+pre_tokenizer = "unicode_segmentation"
+[[token_filters]]
+[token_filters.ngram]
+$$);
+
+query T
+SELECT tokenizer_catalog.apply_text_analyzer('Quick fox', 'test_ngram');
+----
+{Q,Qu,u,ui,i,ic,c,ck,k,f,fo,o,ox,x}
+
+statement ok
+SELECT tokenizer_catalog.create_text_analyzer('test_ngram2', $$
+pre_tokenizer = "unicode_segmentation"
+[[token_filters]]
+[token_filters.ngram]
+max_gram = 3
+min_gram = 2
+preserve_original = true
+$$);
+
+query T
+SELECT tokenizer_catalog.apply_text_analyzer('Quick fox', 'test_ngram2');
+----
+{Qu,Qui,ui,uic,ic,ick,ck,Quick,fo,fox,ox}
diff --git a/tools/setup.sh b/tools/setup.sh
index 7ac6861..bb4c584 100755
--- a/tools/setup.sh
+++ b/tools/setup.sh
@@ -19,7 +19,7 @@ sudo -iu postgres createdb -O $USER $USER
 sudo -iu postgres psql -c 'ALTER SYSTEM SET shared_preload_libraries = "pg_tokenizer.so"'
 sudo systemctl stop postgresql
 
-curl -fsSL https://github.com/tensorchord/pgrx/releases/download/v0.13.1/cargo-pgrx-v0.13.1-$(uname -m)-unknown-linux-gnu.tar.gz | tar -xOzf -
./cargo-pgrx | install -m 755 /dev/stdin /usr/local/bin/cargo-pgrx +curl -fsSL https://github.com/tensorchord/pgrx/releases/download/v0.14.1/cargo-pgrx-v0.14.1-$(uname -m)-unknown-linux-gnu.tar.gz | tar -xOzf - ./cargo-pgrx | install -m 755 /dev/stdin /usr/local/bin/cargo-pgrx cargo pgrx init --pg${version}=$(which pg_config) curl -fsSL https://github.com/risinglightdb/sqllogictest-rs/releases/download/v0.26.4/sqllogictest-bin-v0.26.4-$(uname -m)-unknown-linux-musl.tar.gz | tar -xOzf - ./sqllogictest | install -m 755 /dev/stdin /usr/local/bin/sqllogictest