feat: Adding Support for Regex Tokenizer (paradedb#1463)
vaibhawvipul authored Aug 5, 2024
1 parent d9e051e commit 171bf64
Showing 5 changed files with 91 additions and 5 deletions.
8 changes: 8 additions & 0 deletions docs/search/full-text/index.mdx
@@ -291,6 +291,14 @@ The following tokenizer names are accepted by the `paradedb.tokenizer` function.
<ParamField body="lowercase">
Equivalent to `raw`, but also converts to lowercase.
</ParamField>
<ParamField body="regex">
Tokenizes text using a regular expression, which is specified with the `pattern` parameter.
<Expandable title="Config Options">
<ParamField body="pattern">
The regular expression pattern used to tokenize the text. Example: `\\W+` splits on non-word characters.
</ParamField>
</Expandable>
</ParamField>
<ParamField body="ngram">
Tokenizes text by splitting words into overlapping substrings based on the specified parameters.
<Expandable title="Config Options">
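For reference, the `regex` entry documented above corresponds to a small JSON configuration object handled in `manager.rs` further down in this diff. A minimal sketch of that shape, assuming `serde_json`; the pattern value is illustrative only:

```rust
use serde_json::json;

fn main() {
    // Configuration shape for the regex tokenizer: a "type" tag plus the
    // regular expression in "pattern" (pattern here is only an example).
    let config = json!({
        "type": "regex",
        "pattern": r"\b\w{4,}\b"
    });
    println!("{config}");
}
```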
12 changes: 10 additions & 2 deletions pg_search/src/api/config.rs
@@ -35,6 +35,7 @@ pub fn tokenizer(
max_gram: default!(Option<i32>, "NULL"),
prefix_only: default!(Option<bool>, "NULL"),
language: default!(Option<String>, "NULL"),
pattern: default!(Option<String>, "NULL"),
) -> JsonB {
let mut config = Map::new();

@@ -44,7 +45,7 @@ pub fn tokenizer(
max_gram.map(|v| config.insert("max_gram".to_string(), Value::Number(v.into())));
prefix_only.map(|v| config.insert("prefix_only".to_string(), Value::Bool(v)));
language.map(|v| config.insert("language".to_string(), Value::String(v)));

pattern.map(|v| config.insert("pattern".to_string(), Value::String(v)));
JsonB(json!(config))
}
#[cfg(test)]
@@ -74,7 +75,14 @@ mod tests {
Some(false),
Some("position".to_string()),
Some(true),
Some(tokenizer("ngram", Some(4), Some(4), Some(false), None)),
Some(tokenizer(
"ngram",
Some(4),
Some(4),
Some(false),
None,
None,
)),
Some("lowercase".to_string()),
);

33 changes: 33 additions & 0 deletions pg_search/tests/search_config.rs
@@ -221,6 +221,39 @@ fn raw_tokenizer_config(mut conn: PgConnection) {
assert_eq!(count.0, 1);
}

#[rstest]
fn regex_tokenizer_config(mut conn: PgConnection) {
"CALL paradedb.create_bm25_test_table(table_name => 'bm25_search', schema_name => 'paradedb')"
.execute(&mut conn);

r#"CALL paradedb.create_bm25(
index_name => 'bm25_search',
table_name => 'bm25_search',
schema_name => 'paradedb',
key_field => 'id',
text_fields => paradedb.field('description', tokenizer => paradedb.tokenizer('regex', pattern => '\b\w{4,}\b'))
);
INSERT INTO paradedb.bm25_search (id, description) VALUES
(11001, 'This is a simple test'),
(11002, 'Rust is awesome'),
(11003, 'Regex patterns are powerful'),
(11004, 'Find the longer words');
"#
.execute(&mut conn);

let count: (i64,) =
"SELECT COUNT(*) FROM bm25_search.search('description:simple')".fetch_one(&mut conn);
assert_eq!(count.0, 1);

let count: (i64,) =
"SELECT COUNT(*) FROM bm25_search.search('description:is')".fetch_one(&mut conn);
assert_eq!(count.0, 0);

let count: (i64,) =
"SELECT COUNT(*) FROM bm25_search.search('description:longer')".fetch_one(&mut conn);
assert_eq!(count.0, 1);
}

#[rstest]
fn language_stem_tokenizer_config(mut conn: PgConnection) {
// Define languages and corresponding test data
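The expectations in `regex_tokenizer_config` above follow from the pattern `\b\w{4,}\b`: the test assertions indicate that each regex match becomes a token, so only whole words of four or more characters are indexed, which is why `description:simple` and `description:longer` each return one row while `description:is` returns none. A standalone sketch using the `regex` crate (not part of this commit) showing which tokens that pattern yields:

```rust
use regex::Regex;

fn main() {
    // Pattern from the test: whole words of 4+ word characters.
    let pattern = Regex::new(r"\b\w{4,}\b").unwrap();

    // "is" and "a" are too short to match, so they never become tokens.
    let tokens: Vec<&str> = pattern
        .find_iter("This is a simple test")
        .map(|m| m.as_str())
        .collect();
    assert_eq!(tokens, vec!["This", "simple", "test"]);

    // "the" is three characters and is likewise dropped.
    let tokens: Vec<&str> = pattern
        .find_iter("Find the longer words")
        .map(|m| m.as_str())
        .collect();
    assert_eq!(tokens, vec!["Find", "longer", "words"]);

    println!("pattern tokenizes as expected");
}
```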
11 changes: 9 additions & 2 deletions tokenizers/src/lib.rs
@@ -26,8 +26,9 @@ use cjk::ChineseTokenizer;
use code::CodeTokenizer;
use lindera::{LinderaChineseTokenizer, LinderaJapaneseTokenizer, LinderaKoreanTokenizer};
use tantivy::tokenizer::{
AsciiFoldingFilter, Language, LowerCaser, NgramTokenizer, RawTokenizer, RemoveLongFilter,
SimpleTokenizer, Stemmer, TextAnalyzer, TokenizerManager, WhitespaceTokenizer,
AsciiFoldingFilter, Language, LowerCaser, NgramTokenizer, RawTokenizer, RegexTokenizer,
RemoveLongFilter, SimpleTokenizer, Stemmer, TextAnalyzer, TokenizerManager,
WhitespaceTokenizer,
};
use tracing::info;

@@ -66,6 +67,12 @@ pub fn create_tokenizer_manager(search_tokenizers: Vec<&SearchTokenizer>) -> Tok
.filter(LowerCaser)
.build(),
),
SearchTokenizer::RegexTokenizer { pattern } => Some(
TextAnalyzer::builder(RegexTokenizer::new(pattern.as_str()).unwrap())
.filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH))
.filter(LowerCaser)
.build(),
),
SearchTokenizer::ChineseCompatible => Some(
TextAnalyzer::builder(ChineseTokenizer)
.filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH))
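The new `RegexTokenizer` arm above wires the user-supplied pattern into a tantivy `TextAnalyzer` with the same long-token and lowercasing filters used by the other tokenizers. A minimal standalone sketch of that pipeline, assuming a recent tantivy version that ships `RegexTokenizer`; the filter limit of 255 stands in for `DEFAULT_REMOVE_TOKEN_LENGTH`, whose value is not shown in this diff:

```rust
use tantivy::tokenizer::{
    LowerCaser, RegexTokenizer, RemoveLongFilter, TextAnalyzer, TokenStream,
};

fn main() {
    // Build an analyzer like the RegexTokenizer arm in lib.rs:
    // regex-matched tokens, overly long tokens dropped, then lowercased.
    let mut analyzer = TextAnalyzer::builder(
        RegexTokenizer::new(r"\b\w{4,}\b").expect("valid regex pattern"),
    )
    .filter(RemoveLongFilter::limit(255)) // stand-in for DEFAULT_REMOVE_TOKEN_LENGTH
    .filter(LowerCaser)
    .build();

    // Stream the tokens for a sample document.
    let mut stream = analyzer.token_stream("Find the longer words");
    let mut tokens = Vec::new();
    while stream.advance() {
        tokens.push(stream.token().text.clone());
    }

    // Each regex match becomes a token; LowerCaser normalizes the case.
    assert_eq!(tokens, vec!["find", "longer", "words"]);
}
```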
32 changes: 31 additions & 1 deletion tokenizers/src/manager.rs
@@ -30,7 +30,7 @@ use tantivy::tokenizer::Language;
// `from_json_value` methods. We don't use serde_json to ser/de the
// SearchTokenizer, because our bincode serialization format is incompatible
// with the "tagged" format we use in our public API.
#[derive(Serialize, Deserialize, Default, Copy, Clone, Debug, PartialEq, Eq)]
#[derive(Serialize, Deserialize, Default, Clone, Debug, PartialEq, Eq)]
pub enum SearchTokenizer {
#[default]
Default,
@@ -41,6 +41,9 @@ pub enum SearchTokenizer {
},
Lowercase,
WhiteSpace,
RegexTokenizer {
pattern: String,
},
ChineseCompatible,
SourceCode,
Ngram {
@@ -64,6 +67,9 @@ impl SearchTokenizer {
SearchTokenizer::Stem { language } => json!({ "type": "stem", "language": language }),
SearchTokenizer::Lowercase => json!({ "type": "lowercase" }),
SearchTokenizer::WhiteSpace => json!({ "type": "whitespace" }),
SearchTokenizer::RegexTokenizer { pattern } => {
json!({ "type": "regex", "pattern": pattern })
}
SearchTokenizer::ChineseCompatible => json!({ "type": "chinese_compatible" }),
SearchTokenizer::SourceCode => json!({ "type": "source_code" }),
SearchTokenizer::Ngram {
@@ -106,6 +112,13 @@ impl SearchTokenizer {
}
"lowercase" => Ok(SearchTokenizer::Lowercase),
"whitespace" => Ok(SearchTokenizer::WhiteSpace),
"regex" => {
let pattern: String =
serde_json::from_value(value["pattern"].clone()).map_err(|_| {
anyhow::anyhow!("regex tokenizer requires a string 'pattern' field")
})?;
Ok(SearchTokenizer::RegexTokenizer { pattern })
}
"chinese_compatible" => Ok(SearchTokenizer::ChineseCompatible),
"source_code" => Ok(SearchTokenizer::SourceCode),
"ngram" => {
@@ -172,6 +185,7 @@ impl SearchTokenizer {
SearchTokenizer::Stem { language } => format!("stem_{}", language_to_str(language)),
SearchTokenizer::Lowercase => "lowercase".into(),
SearchTokenizer::WhiteSpace => "whitespace".into(),
SearchTokenizer::RegexTokenizer { .. } => "regex".into(),
SearchTokenizer::ChineseCompatible => "chinese_compatible".into(),
SearchTokenizer::SourceCode => "source_code".into(),
SearchTokenizer::Ngram {
@@ -237,6 +251,22 @@ mod tests {
);
}

#[rstest]
fn test_regexizer() {
let json = r#"{
"type": "regex",
"pattern": "a+b*"
}"#;
let tokenizer = SearchTokenizer::RegexTokenizer {
pattern: "a+b*".to_string(),
};

assert_eq!(
tokenizer,
SearchTokenizer::from_json_value(&serde_json::from_str(json).unwrap()).unwrap()
);
}

#[rstest]
fn test_search_normalizer() {
assert_eq!(SearchNormalizer::Lowercase.name(), "lowercase");
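One detail of the new `"regex"` branch in `from_json_value` above: if a config declares `"type": "regex"` but omits the `pattern` field, `value["pattern"]` is JSON null, the `serde_json::from_value::<String>` call fails, and the branch surfaces the "requires a string 'pattern' field" error instead of panicking. A hedged sketch of that behavior, assuming `SearchTokenizer` is in scope as in the tests above:

```rust
use serde_json::json;

#[test]
fn regex_tokenizer_requires_pattern() {
    // Missing "pattern": deserializing null into String fails, so
    // from_json_value returns the descriptive error rather than panicking.
    let config = json!({ "type": "regex" });
    assert!(SearchTokenizer::from_json_value(&config).is_err());

    // With a string pattern, the config parses into the RegexTokenizer variant.
    let config = json!({ "type": "regex", "pattern": "a+b*" });
    assert_eq!(
        SearchTokenizer::from_json_value(&config).unwrap(),
        SearchTokenizer::RegexTokenizer { pattern: "a+b*".to_string() }
    );
}
```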
