diff --git a/quickwit/quickwit-query/src/tokenizers/mod.rs b/quickwit/quickwit-query/src/tokenizers/mod.rs
index bbdae9f3898..00953a4477f 100644
--- a/quickwit/quickwit-query/src/tokenizers/mod.rs
+++ b/quickwit/quickwit-query/src/tokenizers/mod.rs
@@ -15,6 +15,7 @@
 mod chinese_compatible;
 mod code_tokenizer;
 mod tokenizer_manager;
+mod truncate_tokenizer;
 
 use std::sync::LazyLock;
 
@@ -26,8 +27,9 @@ use tantivy::tokenizer::{
 use self::chinese_compatible::ChineseTokenizer;
 pub use self::code_tokenizer::CodeTokenizer;
 pub use self::tokenizer_manager::{RAW_TOKENIZER_NAME, TokenizerManager};
-
+pub use self::truncate_tokenizer::TruncateLongFilter;
 pub const DEFAULT_REMOVE_TOKEN_LENGTH: usize = 255;
+pub const DEFAULT_TRUNCATE_TOKEN_LENGTH: usize = 255;
 
 /// Quickwit's tokenizer/analyzer manager.
 pub fn create_default_quickwit_tokenizer_manager() -> TokenizerManager {
@@ -85,11 +87,11 @@
 fn create_quickwit_fastfield_normalizer_manager() -> TokenizerManager {
     let raw_tokenizer = TextAnalyzer::builder(RawTokenizer::default())
-        .filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH))
+        .filter(TruncateLongFilter::limit(DEFAULT_TRUNCATE_TOKEN_LENGTH))
         .build();
     let lower_case_tokenizer = TextAnalyzer::builder(RawTokenizer::default())
         .filter(LowerCaser)
-        .filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH))
+        .filter(TruncateLongFilter::limit(DEFAULT_TRUNCATE_TOKEN_LENGTH))
         .build();
     let tokenizer_manager = TokenizerManager::new();
     tokenizer_manager.register("raw", raw_tokenizer, false);
@@ -166,4 +168,18 @@ mod tests {
         assert!(stream.token().text.chars().all(|c| !c.is_uppercase()));
         assert!(!stream.advance());
     }
+
+    #[test]
+    fn test_truncate_tokenizer() {
+        let tokenizer_manager = super::create_quickwit_fastfield_normalizer_manager();
+        let very_long_text = "a text, that is just too long, no one will type it, no one will like \
+                              it, no one shall find it. I just need some more chars, now you may \
+                              not pass.".repeat(3);
+
+        let mut truncate_tokenizer = tokenizer_manager.get_tokenizer("raw").unwrap();
+        let mut truncate_stream = truncate_tokenizer.token_stream(&very_long_text);
+        assert!(truncate_stream.advance());
+        // Check the token while the stream is still live: after `advance()`
+        // returns false the token's contents are unspecified.
+        assert!(truncate_stream.token().text.len() <= super::DEFAULT_TRUNCATE_TOKEN_LENGTH);
+        assert!(!truncate_stream.advance());
+    }
 }
diff --git a/quickwit/quickwit-query/src/tokenizers/truncate_tokenizer.rs b/quickwit/quickwit-query/src/tokenizers/truncate_tokenizer.rs
new file mode 100644
index 00000000000..7738b5cfbaf
--- /dev/null
+++ b/quickwit/quickwit-query/src/tokenizers/truncate_tokenizer.rs
@@ -0,0 +1,84 @@
+use tantivy::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};
+
+/// `TokenFilter` that truncates tokens longer than `max_bytes` UTF-8 bytes,
+/// instead of dropping them entirely like tantivy's `RemoveLongFilter`.
+#[derive(Clone)]
+pub struct TruncateLongFilter {
+    max_bytes: usize,
+}
+
+impl TruncateLongFilter {
+    /// Creates a filter truncating tokens to at most `max_bytes` bytes.
+    pub fn limit(max_bytes: usize) -> Self {
+        Self { max_bytes }
+    }
+}
+
+impl TokenFilter for TruncateLongFilter {
+    type Tokenizer<T: Tokenizer> = TruncateLongWrapper<T>;
+
+    fn transform<T: Tokenizer>(self, inner: T) -> Self::Tokenizer<T> {
+        TruncateLongWrapper {
+            max_bytes: self.max_bytes,
+            inner,
+        }
+    }
+}
+
+/// Tokenizer produced by [`TruncateLongFilter`], wrapping an inner tokenizer.
+#[derive(Clone)]
+pub struct TruncateLongWrapper<T> {
+    max_bytes: usize,
+    inner: T,
+}
+
+impl<T: Tokenizer> Tokenizer for TruncateLongWrapper<T> {
+    type TokenStream<'a> = TruncateLongStream<T::TokenStream<'a>> where T: 'a;
+
+    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
+        TruncateLongStream {
+            max_bytes: self.max_bytes,
+            tail: self.inner.token_stream(text),
+        }
+    }
+}
+
+/// Token stream that truncates each token emitted by the wrapped stream.
+pub struct TruncateLongStream<T> {
+    max_bytes: usize,
+    tail: T,
+}
+
+impl<T: TokenStream> TokenStream for TruncateLongStream<T> {
+    fn advance(&mut self) -> bool {
+        if !self.tail.advance() {
+            return false;
+        }
+        let tok = self.tail.token_mut();
+        if tok.text.len() > self.max_bytes {
+            truncate_at_char_boundary(&mut tok.text, self.max_bytes);
+            // Keep the byte offsets consistent with the truncated text.
+            tok.offset_to = tok.offset_from.saturating_add(tok.text.len());
+        }
+        true
+    }
+
+    fn token(&self) -> &Token {
+        self.tail.token()
+    }
+
+    fn token_mut(&mut self) -> &mut Token {
+        self.tail.token_mut()
+    }
+}
+
+/// Shrinks `s` to at most `max_bytes` UTF-8 bytes without splitting a
+/// multibyte character at the given `max_bytes` index.
+fn truncate_at_char_boundary(s: &mut String, max_bytes: usize) {
+    if s.len() <= max_bytes {
+        return;
+    }
+    // `is_char_boundary(0)` is always true, so this loop terminates.
+    let mut end = max_bytes;
+    while !s.is_char_boundary(end) {
+        end -= 1;
+    }
+    s.truncate(end);
+}