Skip to content

Commit

Permalink
support hex bytes (quickwit-oss#3459)
Browse files Browse the repository at this point in the history
* add input/output format config to byte fields

* support serializing bytes to hex

* support parsing bytes from hex strings

* derive SerDe
  • Loading branch information
trinity-1686a authored Jun 1, 2023
1 parent 32f2e1c commit 43ad400
Show file tree
Hide file tree
Showing 7 changed files with 155 additions and 40 deletions.
1 change: 1 addition & 0 deletions quickwit/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions quickwit/quickwit-doc-mapper/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ anyhow = { workspace = true }
base64 = { workspace = true }
dyn-clone = { workspace = true }
fnv = { workspace = true }
hex = { workspace = true }
indexmap = { workspace = true }
itertools = { workspace = true }
mockall = { workspace = true, optional = true }
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -734,7 +734,7 @@ mod tests {
"image": "invalid base64 data"
}"#,
);
let expected_msg = "The field `image` could not be parsed: Expected Base64 string, got \
let expected_msg = "The field `image` could not be parsed: Expected base64 string, got \
`invalid base64 data`: Invalid byte 32, offset 7.";
assert_eq!(result.unwrap_err().to_string(), expected_msg);
Ok(())
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,13 @@
use std::convert::TryFrom;

use anyhow::bail;
use base64::prelude::{Engine, BASE64_STANDARD};
use serde::{Deserialize, Serialize};
use serde_json::Value as JsonValue;
use tantivy::schema::{IndexRecordOption, JsonObjectOptions, TextFieldIndexing, TextOptions, Type};
use tantivy::schema::{
IndexRecordOption, JsonObjectOptions, TextFieldIndexing, TextOptions, Type,
Value as TantivyValue,
};

use super::date_time_type::QuickwitDateTimeOptions;
use super::{default_as_true, FieldMappingType};
Expand Down Expand Up @@ -103,6 +107,85 @@ impl Default for QuickwitNumericOptions {
}
}

/// Mapping options for a `bytes` field.
///
/// Unknown keys are rejected at deserialization time (`deny_unknown_fields`).
#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, utoipa::ToSchema)]
#[serde(deny_unknown_fields)]
pub struct QuickwitBytesOptions {
/// Optional free-form description; left out of the serialized form when `None`.
#[serde(default)]
#[serde(skip_serializing_if = "Option::is_none")]
pub description: Option<String>,
/// Falls back to the `default_as_true` helper when the key is absent.
// NOTE(review): presumably controls whether the raw value is kept in the doc store — confirm.
#[serde(default = "default_as_true")]
pub stored: bool,
/// Falls back to the `default_as_true` helper when the key is absent.
// NOTE(review): presumably controls whether the field is searchable — confirm.
#[serde(default = "default_as_true")]
pub indexed: bool,
/// Falls back to `bool::default()` (`false`) when the key is absent.
#[serde(default)]
pub fast: bool,
/// Falls back to `BinaryFormat::default()` when the key is absent.
// NOTE(review): presumably the string encoding expected when parsing incoming values — confirm.
#[serde(default)]
pub input_format: BinaryFormat,
/// Falls back to `BinaryFormat::default()` when the key is absent.
// NOTE(review): presumably the string encoding used when rendering values back to JSON — confirm.
#[serde(default)]
pub output_format: BinaryFormat,
}

impl Default for QuickwitBytesOptions {
fn default() -> Self {
Self {
description: None,
indexed: true,
stored: true,
fast: false,
input_format: BinaryFormat::default(),
output_format: BinaryFormat::default(),
}
}
}

/// Textual encoding used to represent a binary payload as a string.
///
/// Variants are (de)serialized in snake case, i.e. `"base64"` and `"hex"`.
#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash, Default, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum BinaryFormat {
/// Base64 encoding; this is the default variant.
#[default]
Base64,
/// Hexadecimal encoding.
Hex,
}

impl BinaryFormat {
    /// Returns the lowercase name of this format (`"base64"` or `"hex"`).
    ///
    /// The returned slice is a string literal, so it is `'static` and does not
    /// keep `self` borrowed.
    pub fn as_str(&self) -> &'static str {
        match self {
            BinaryFormat::Base64 => "base64",
            BinaryFormat::Hex => "hex",
        }
    }

    /// Encodes `value` with this format and wraps the result in a JSON string.
    pub fn format_to_json(&self, value: &[u8]) -> JsonValue {
        let encoded: String = match self {
            BinaryFormat::Base64 => BASE64_STANDARD.encode(value),
            BinaryFormat::Hex => hex::encode(value),
        };
        encoded.into()
    }

    /// Decodes a JSON string written in this format into a tantivy bytes value.
    ///
    /// # Errors
    ///
    /// Returns a human-readable message when `json_val` is not a JSON string,
    /// or when the string cannot be decoded with this format.
    pub fn parse_json(&self, json_val: JsonValue) -> Result<TantivyValue, String> {
        let byte_str = match json_val {
            JsonValue::String(byte_str) => byte_str,
            other => {
                return Err(format!(
                    "Expected {} string, got `{other}`.",
                    self.as_str()
                ));
            }
        };
        // Flatten both decoder error types to text so the user-facing message is
        // assembled in a single place and always names the format via `as_str`.
        let decoded: Result<Vec<u8>, String> = match self {
            BinaryFormat::Base64 => BASE64_STANDARD
                .decode(&byte_str)
                .map_err(|base64_decode_err| base64_decode_err.to_string()),
            BinaryFormat::Hex => {
                hex::decode(&byte_str).map_err(|hex_decode_err| hex_decode_err.to_string())
            }
        };
        let payload = decoded.map_err(|decode_err| {
            format!(
                "Expected {} string, got `{byte_str}`: {decode_err}",
                self.as_str()
            )
        })?;
        Ok(TantivyValue::Bytes(payload))
    }
}

#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, utoipa::ToSchema)]
#[serde(deny_unknown_fields)]
pub struct QuickwitIpAddrOptions {
Expand Down Expand Up @@ -393,7 +476,7 @@ fn deserialize_mapping_type(
}
Type::Facet => unimplemented!("Facet are not supported in quickwit yet."),
Type::Bytes => {
let numeric_options: QuickwitNumericOptions = serde_json::from_value(json)?;
let numeric_options: QuickwitBytesOptions = serde_json::from_value(json)?;
if numeric_options.fast && cardinality == Cardinality::MultiValues {
bail!("fast field is not allowed for array<bytes>.");
}
Expand Down Expand Up @@ -455,9 +538,9 @@ fn typed_mapping_to_json_params(
FieldMappingType::Text(text_options, _) => serialize_to_map(&text_options),
FieldMappingType::U64(options, _)
| FieldMappingType::I64(options, _)
| FieldMappingType::Bytes(options, _)
| FieldMappingType::F64(options, _)
| FieldMappingType::Bool(options, _) => serialize_to_map(&options),
FieldMappingType::Bytes(options, _) => serialize_to_map(&options),
FieldMappingType::IpAddr(options, _) => serialize_to_map(&options),
FieldMappingType::DateTime(date_time_options, _) => serialize_to_map(&date_time_options),
FieldMappingType::Json(json_options, _) => serialize_to_map(&json_options),
Expand Down Expand Up @@ -920,7 +1003,7 @@ mod tests {
"type": "i64",
"stored": true,
"fast": false,
"indexed": true
"indexed": true,
})
);
Ok(())
Expand Down Expand Up @@ -1006,7 +1089,7 @@ mod tests {
"type":"u64",
"stored": true,
"fast": false,
"indexed": true
"indexed": true,
})
);
}
Expand All @@ -1030,7 +1113,7 @@ mod tests {
"type":"f64",
"stored": true,
"fast": false,
"indexed": true
"indexed": true,
})
);
}
Expand All @@ -1054,7 +1137,7 @@ mod tests {
"type": "bool",
"stored": true,
"fast": false,
"indexed": true
"indexed": true,
})
);
}
Expand Down Expand Up @@ -1222,7 +1305,9 @@ mod tests {
r#"
{
"name": "my_field_name",
"type": "bytes"
"type": "bytes",
"input_format": "hex",
"output_format": "base64"
}
"#,
)
Expand All @@ -1236,6 +1321,8 @@ mod tests {
"stored": true,
"indexed": true,
"fast": false,
"input_format": "hex",
"output_format": "base64"
})
);
}
Expand All @@ -1260,6 +1347,8 @@ mod tests {
"stored": true,
"indexed": true,
"fast": false,
"input_format": "base64",
"output_format": "base64"
})
);
}
Expand Down Expand Up @@ -1375,7 +1464,7 @@ mod tests {
"type": "i64",
"stored": true,
"fast": false,
"indexed": true
"indexed": true,
})
);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ use tantivy::schema::Type;

use super::date_time_type::QuickwitDateTimeOptions;
use crate::default_doc_mapper::field_mapping_entry::{
QuickwitIpAddrOptions, QuickwitJsonOptions, QuickwitNumericOptions, QuickwitObjectOptions,
QuickwitTextOptions,
QuickwitBytesOptions, QuickwitIpAddrOptions, QuickwitJsonOptions, QuickwitNumericOptions,
QuickwitObjectOptions, QuickwitTextOptions,
};
use crate::Cardinality;

Expand All @@ -45,7 +45,7 @@ pub(crate) enum FieldMappingType {
/// IP Address mapping type configuration.
IpAddr(QuickwitIpAddrOptions, Cardinality),
/// Bytes mapping type configuration.
Bytes(QuickwitNumericOptions, Cardinality),
Bytes(QuickwitBytesOptions, Cardinality),
/// Json mapping type configuration.
Json(QuickwitJsonOptions, Cardinality),
/// Object mapping type configuration.
Expand Down
Loading

0 comments on commit 43ad400

Please sign in to comment.