diff --git a/parquet-geospatial/src/types.rs b/parquet-geospatial/src/types.rs index f19911ad055a..b19e9103fcc3 100644 --- a/parquet-geospatial/src/types.rs +++ b/parquet-geospatial/src/types.rs @@ -57,12 +57,10 @@ pub struct Metadata { /// The Coordinate Reference System (CRS) of the [`WkbType`], if present. /// /// This may be a raw string value (e.g., "EPSG:3857") or a JSON object (e.g., PROJJSON). - /// Note: Common lon/lat CRS representations (EPSG:4326, OGC:CRS84) are canonicalized - /// to `None` during serialization to match Parquet conventions. #[serde(skip_serializing_if = "Option::is_none")] pub crs: Option, /// The edge interpolation algorithm of the [`WkbType`], if present. - #[serde(skip_serializing_if = "Option::is_none")] + #[serde(rename = "edges", skip_serializing_if = "Option::is_none")] pub algorithm: Option, } @@ -88,8 +86,10 @@ impl Metadata { } } - /// Detect if the CRS is a common representation of lon/lat on the standard WGS84 ellipsoid - fn crs_is_lon_lat(&self) -> bool { + /// Detect if the CRS is a common representation of lon/lat on the standard WGS84 ellipsoid. + /// + /// Returns `true` for OGC:CRS84, EPSG:4326, and PROJJSON representations thereof. + pub fn crs_is_lon_lat(&self) -> bool { use serde_json::Value; let Some(crs) = &self.crs else { @@ -144,16 +144,7 @@ impl ExtensionType for WkbType { } fn serialize_metadata(&self) -> Option { - let md = if self.0.crs_is_lon_lat() { - &Metadata { - crs: None, // lon/lat CRS is canonicalized as omitted (None) for Parquet - algorithm: self.0.algorithm, - } - } else { - &self.0 - }; - - serde_json::to_string(md).ok() + serde_json::to_string(&self.0).ok() } fn deserialize_metadata(metadata: Option<&str>) -> ArrowResult { @@ -251,7 +242,7 @@ mod tests { let wkb = WkbType::new(Some(metadata)); let serialized = wkb.serialize_metadata().unwrap(); - assert_eq!(serialized, r#"{"algorithm":"spherical"}"#); + assert_eq!(serialized, r#"{"edges":"spherical"}"#); let deserialized = WkbType::deserialize_metadata(Some(&serialized))?; assert!(deserialized.crs.is_none()); @@ -267,7 +258,7 @@ mod tests { let wkb = WkbType::new(Some(metadata)); let serialized = wkb.serialize_metadata().unwrap(); - assert_eq!(serialized, r#"{"crs":"srid:1234","algorithm":"spherical"}"#); + assert_eq!(serialized, r#"{"crs":"srid:1234","edges":"spherical"}"#); let deserialized = WkbType::deserialize_metadata(Some(&serialized))?; assert_eq!( @@ -353,54 +344,43 @@ mod tests { Ok(()) } - /// Test CRS canonicalization logic for common lon/lat representations + /// Test crs_is_lon_lat() detection for common lon/lat representations #[test] - fn test_crs_canonicalization() -> ArrowResult<()> { - // EPSG:4326 as string should be omitted + fn test_crs_is_lon_lat() -> ArrowResult<()> { + // EPSG:4326 as string should be detected let metadata = Metadata::new(Some("EPSG:4326"), None); - let wkb = WkbType::new(Some(metadata)); - let serialized = wkb.serialize_metadata().unwrap(); - assert_eq!(serialized, "{}"); + assert!(metadata.crs_is_lon_lat()); - // OGC:CRS84 as string should be omitted + // OGC:CRS84 as string should be detected let metadata = Metadata::new(Some("OGC:CRS84"), None); - let wkb = WkbType::new(Some(metadata)); - let serialized = wkb.serialize_metadata().unwrap(); - assert_eq!(serialized, "{}"); + assert!(metadata.crs_is_lon_lat()); - // A JSON object that reasonably looks like PROJJSON for EPSG:4326 should be omitted + // A JSON object that reasonably looks like PROJJSON for EPSG:4326 should be detected // detect "4326" as a string let crs_json = r#"{"id":{"authority":"EPSG","code":"4326"}}"#; let metadata = Metadata::new(Some(crs_json), None); - let wkb = WkbType::new(Some(metadata)); - let serialized = wkb.serialize_metadata().unwrap(); - assert_eq!(serialized, "{}"); + assert!(metadata.crs_is_lon_lat()); // detect 4326 as a number let crs_json = r#"{"id":{"authority":"EPSG","code":4326}}"#; let metadata = Metadata::new(Some(crs_json), None); - let wkb = WkbType::new(Some(metadata)); - let serialized = wkb.serialize_metadata().unwrap(); - assert_eq!(serialized, "{}"); + assert!(metadata.crs_is_lon_lat()); - // A JSON object that reasonably looks like PROJJSON for OGC:CRS84 should be omitted + // A JSON object that reasonably looks like PROJJSON for OGC:CRS84 should be detected let crs_json = r#"{"id":{"authority":"OGC","code":"CRS84"}}"#; let metadata = Metadata::new(Some(crs_json), None); - let wkb = WkbType::new(Some(metadata)); - let serialized = wkb.serialize_metadata().unwrap(); - assert_eq!(serialized, "{}"); + assert!(metadata.crs_is_lon_lat()); - // Other input types should be preserved + // Other CRS values should NOT be detected as lon/lat let metadata = Metadata::new(Some("srid:1234"), None); - let wkb = WkbType::new(Some(metadata)); - let serialized = wkb.serialize_metadata().unwrap(); - assert_eq!(serialized, r#"{"crs":"srid:1234"}"#); + assert!(!metadata.crs_is_lon_lat()); - // Canonicalization should work with algorithm field - let metadata = Metadata::new(Some("EPSG:4326"), Some(Edges::Spherical)); - let wkb = WkbType::new(Some(metadata)); - let serialized = wkb.serialize_metadata().unwrap(); - assert_eq!(serialized, r#"{"algorithm":"spherical"}"#); + let metadata = Metadata::new(Some("EPSG:3857"), None); + assert!(!metadata.crs_is_lon_lat()); + + // None CRS should NOT be detected as lon/lat + let metadata = Metadata::new(None, None); + assert!(!metadata.crs_is_lon_lat()); Ok(()) } diff --git a/parquet/src/arrow/schema/extension.rs b/parquet/src/arrow/schema/extension.rs index 353770ddbdbb..51e1f1dbd5aa 100644 --- a/parquet/src/arrow/schema/extension.rs +++ b/parquet/src/arrow/schema/extension.rs @@ -67,7 +67,13 @@ pub(crate) fn try_add_extension_type( } #[cfg(feature = "geospatial")] LogicalType::Geometry(geometry) => { - let md = parquet_geospatial::WkbMetadata::new(geometry.crs.as_deref(), None); + // Per Parquet spec: omitted CRS defaults to OGC:CRS84, srid:0 means unset CRS + let crs = match geometry.crs.as_deref() { + None => Some("OGC:CRS84"), + Some("srid:0") => None, + Some(crs) => Some(crs), + }; + let md = parquet_geospatial::WkbMetadata::new(crs, None); let mut arrow_field = arrow_field; arrow_field.try_with_extension_type(parquet_geospatial::WkbType::new(Some(md)))?; arrow_field @@ -78,7 +84,13 @@ pub(crate) fn try_add_extension_type( .algorithm() .map(|a| a.try_as_edges()) .transpose()?; - let md = parquet_geospatial::WkbMetadata::new(geography.crs.as_deref(), algorithm); + // Per Parquet spec: omitted CRS defaults to OGC:CRS84, srid:0 means unset CRS + let crs = match geography.crs.as_deref() { + None => Some("OGC:CRS84"), + Some("srid:0") => None, + Some(crs) => Some(crs), + }; + let md = parquet_geospatial::WkbMetadata::new(crs, algorithm); let mut arrow_field = arrow_field; arrow_field.try_with_extension_type(parquet_geospatial::WkbType::new(Some(md)))?; arrow_field @@ -167,15 +179,31 @@ pub(crate) fn logical_type_for_binary(field: &Field) -> Option { match field.extension_type_name() { Some(n) if n == WkbType::NAME => match field.try_extension_type::() { - Ok(wkb_type) => match wkb_type.metadata().type_hint() { - WkbTypeHint::Geometry => Some(LogicalType::geometry( - wkb_type.metadata().crs.as_ref().map(|c| c.to_string()), - )), - WkbTypeHint::Geography => Some(LogicalType::geography( - wkb_type.metadata().crs.as_ref().map(|c| c.to_string()), - wkb_type.metadata().algorithm.map(|a| a.into()), - )), - }, + Ok(wkb_type) => { + // Convert Arrow CRS to Parquet CRS: + // - None → "srid:0" (unset CRS in Parquet) + // - lon/lat CRS (OGC:CRS84, EPSG:4326) → None (default in Parquet) + // - Other CRS → JSON string + let crs = match &wkb_type.metadata().crs { + None => Some("srid:0".to_string()), + Some(_) if wkb_type.metadata().crs_is_lon_lat() => None, + Some(c) => Some(c.to_string()), + }; + // Convert Arrow edges to Parquet algorithm: + // - Spherical → None (default for Geography) + // - Other algorithms → Some(algorithm) + let algorithm = wkb_type.metadata().algorithm.and_then(|a| { + use parquet_geospatial::WkbEdges; + match a { + WkbEdges::Spherical => None, // spherical is the default + _ => Some(a.into()), + } + }); + match wkb_type.metadata().type_hint() { + WkbTypeHint::Geometry => Some(LogicalType::geometry(crs)), + WkbTypeHint::Geography => Some(LogicalType::geography(crs, algorithm)), + } + } Err(_e) => None, }, _ => None, diff --git a/parquet/tests/geospatial.rs b/parquet/tests/geospatial.rs index bf34528d03e8..d7a6f5b067d4 100644 --- a/parquet/tests/geospatial.rs +++ b/parquet/tests/geospatial.rs @@ -30,7 +30,7 @@ mod test { ArrowSchemaConverter, ArrowWriter, arrow_reader::ParquetRecordBatchReaderBuilder, arrow_writer::ArrowWriterOptions, }, - basic::LogicalType, + basic::{EdgeInterpolationAlgorithm, LogicalType}, column::reader::ColumnReader, data_type::{ByteArray, ByteArrayType}, file::{ @@ -63,7 +63,7 @@ mod test { ( "crs-default.parquet", LogicalType::geometry(None), - WkbMetadata::new(None, None), + WkbMetadata::new(Some("OGC:CRS84"), None), // omitted CRS defaults to OGC:CRS84 ), ( "crs-srid.parquet", @@ -78,7 +78,7 @@ mod test { ( "crs-geography.parquet", LogicalType::geography(None, None), - WkbMetadata::new(None, Some(WkbEdges::Spherical)), + WkbMetadata::new(Some("OGC:CRS84"), Some(WkbEdges::Spherical)), // omitted CRS defaults to OGC:CRS84 ), ]; @@ -425,4 +425,221 @@ mod test { ); Arc::new(array) } + + #[test] + fn test_logical_type_to_field_conversion() { + use parquet::arrow::parquet_to_arrow_schema; + use parquet::basic::Type as PhysicalType; + use parquet::schema::types::{SchemaDescriptor, Type}; + + // Test cases: (LogicalType, expected metadata JSON) + let test_cases = [ + // Geometry with default CRS (defaults to OGC:CRS84 per Parquet spec) + (LogicalType::geometry(None), r#"{"crs":"OGC:CRS84"}"#), + // Geometry with srid:0 should result in an unset (omitted) CRS + (LogicalType::geometry(Some("srid:0".to_string())), r#"{}"#), + // Geometry with custom CRSes (authority:code and partial projjson) + ( + LogicalType::geometry(Some("EPSG:4267".to_string())), + r#"{"crs":"EPSG:4267"}"#, + ), + ( + LogicalType::geometry(Some( + r#"{"id":{"authority":"EPSG","code":4326}}"#.to_string(), + )), + r#"{"crs":{"id":{"authority":"EPSG","code":4326}}}"#, + ), + // Geography with default CRS (default OGC:CRS84, spherical edges) + ( + LogicalType::geography(None, None), + r#"{"crs":"OGC:CRS84","edges":"spherical"}"#, + ), + // Geography with explicit edges + ( + LogicalType::geography(None, Some(EdgeInterpolationAlgorithm::SPHERICAL)), + r#"{"crs":"OGC:CRS84","edges":"spherical"}"#, + ), + ( + LogicalType::geography(None, Some(EdgeInterpolationAlgorithm::KARNEY)), + r#"{"crs":"OGC:CRS84","edges":"karney"}"#, + ), + ( + LogicalType::geography(None, Some(EdgeInterpolationAlgorithm::VINCENTY)), + r#"{"crs":"OGC:CRS84","edges":"vincenty"}"#, + ), + ( + LogicalType::geography(None, Some(EdgeInterpolationAlgorithm::ANDOYER)), + r#"{"crs":"OGC:CRS84","edges":"andoyer"}"#, + ), + ( + LogicalType::geography(None, Some(EdgeInterpolationAlgorithm::THOMAS)), + r#"{"crs":"OGC:CRS84","edges":"thomas"}"#, + ), + // Geometry with srid:0 should result in an unset (omitted) CRS + // and spherical edges + ( + LogicalType::geography(Some("srid:0".to_string()), None), + r#"{"edges":"spherical"}"#, + ), + // Geography with custom CRSes (authority:code and partial projjson) + ( + LogicalType::geography(Some("EPSG:4267".to_string()), None), + r#"{"crs":"EPSG:4267","edges":"spherical"}"#, + ), + ( + LogicalType::geography( + Some(r#"{"id":{"authority":"EPSG","code":4326}}"#.to_string()), + None, + ), + r#"{"crs":{"id":{"authority":"EPSG","code":4326}},"edges":"spherical"}"#, + ), + ]; + + for (logical_type, expected_metadata) in test_cases { + // Build a Parquet schema with the given LogicalType + let parquet_schema = SchemaDescriptor::new(Arc::new( + Type::group_type_builder("schema") + .with_fields(vec![Arc::new( + Type::primitive_type_builder("geom", PhysicalType::BYTE_ARRAY) + .with_logical_type(Some(logical_type.clone())) + .build() + .unwrap(), + )]) + .build() + .unwrap(), + )); + + // Convert to Arrow schema + let arrow_schema = parquet_to_arrow_schema(&parquet_schema, None).unwrap(); + let field = arrow_schema.field(0); + + // Check extension type name + let ext_name = field.metadata().get("ARROW:extension:name"); + assert_eq!( + ext_name, + Some(&"geoarrow.wkb".to_string()), + "Extension name mismatch for {logical_type:?}" + ); + + // Check extension metadata + let ext_metadata = field.metadata().get("ARROW:extension:metadata"); + assert_eq!( + ext_metadata, + Some(&expected_metadata.to_string()), + "Extension metadata mismatch for {logical_type:?}" + ); + } + } + + #[test] + fn test_field_to_logical_type_conversion() { + use std::collections::HashMap; + + // Test cases: (extension metadata JSON, expected LogicalType) + let test_cases = [ + // Geometry with no CRS should be GEOMETRY(srid:0) + (r#"{}"#, LogicalType::geometry(Some("srid:0".to_string()))), + // Geometry with string CRS + ( + r#"{"crs":"EPSG:4267"}"#, + LogicalType::geometry(Some("\"EPSG:4267\"".to_string())), + ), + // Geometry with PROJJSON CRS + ( + r#"{"crs":{"id":{"authority":"EPSG","code":3857}}}"#, + LogicalType::geometry(Some( + r#"{"id":{"authority":"EPSG","code":3857}}"#.to_string(), + )), + ), + // Geometry with lon/lat CRSes (canonically removed because lon/lat is the + // default Parquet CRS) + (r#"{"crs":"OGC:CRS84"}"#, LogicalType::geometry(None)), + (r#"{"crs":"EPSG:4326"}"#, LogicalType::geometry(None)), + ( + r#"{"crs":{"id":{"authority":"EPSG","code":4326}}}"#, + LogicalType::geometry(None), + ), + ( + r#"{"crs":{"id":{"authority":"EPSG","code":"4326"}}}"#, + LogicalType::geometry(None), + ), + ( + r#"{"crs":{"id":{"authority":"OGC","code":"CRS84"}}}"#, + LogicalType::geometry(None), + ), + // Geography with no CRS, spherical edges + ( + r#"{"edges":"spherical"}"#, + LogicalType::geography(Some("srid:0".to_string()), None), + ), + // Geography with OGC:CRS84 and spherical edges + ( + r#"{"crs":"OGC:CRS84","edges":"spherical"}"#, + LogicalType::geography(None, None), + ), + // Geography with different edge algorithms + ( + r#"{"crs":"OGC:CRS84","edges":"karney"}"#, + LogicalType::geography(None, Some(EdgeInterpolationAlgorithm::KARNEY)), + ), + ( + r#"{"crs":"OGC:CRS84","edges":"vincenty"}"#, + LogicalType::geography(None, Some(EdgeInterpolationAlgorithm::VINCENTY)), + ), + ( + r#"{"crs":"OGC:CRS84","edges":"andoyer"}"#, + LogicalType::geography(None, Some(EdgeInterpolationAlgorithm::ANDOYER)), + ), + ( + r#"{"crs":"OGC:CRS84","edges":"thomas"}"#, + LogicalType::geography(None, Some(EdgeInterpolationAlgorithm::THOMAS)), + ), + // Geography with custom CRS and edges + ( + r#"{"crs":"EPSG:4267","edges":"karney"}"#, + LogicalType::geography( + Some("\"EPSG:4267\"".to_string()), + Some(EdgeInterpolationAlgorithm::KARNEY), + ), + ), + // Geography with PROJJSON CRS + ( + r#"{"crs":{"id":{"authority":"EPSG","code":4267}},"edges":"spherical"}"#, + LogicalType::geography( + Some(r#"{"id":{"authority":"EPSG","code":4267}}"#.to_string()), + None, + ), + ), + ]; + + for (ext_metadata, expected_logical_type) in test_cases { + // Create an Arrow Field with raw extension metadata + let metadata = HashMap::from([ + ( + "ARROW:extension:name".to_string(), + "geoarrow.wkb".to_string(), + ), + ( + "ARROW:extension:metadata".to_string(), + ext_metadata.to_string(), + ), + ]); + let field = + Arc::new(Field::new("geom", DataType::Binary, true).with_metadata(metadata)); + let schema = Schema::new(vec![field]); + + // Convert to Parquet schema + let parquet_schema = ArrowSchemaConverter::new().convert(&schema).unwrap(); + + // Check the logical type + let column_descr = parquet_schema.column(0); + let logical_type = column_descr.logical_type_ref(); + + assert_eq!( + logical_type, + Some(&expected_logical_type), + "LogicalType mismatch for extension metadata: {ext_metadata}" + ); + } + } }