From f7ab76b00f33cf6a5f678c675602052d3cc13e79 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 12 Jun 2026 15:37:09 -0600 Subject: [PATCH 1/2] fix: correct stale unix_timestamp NTZ reason and date_format codegen default note CometUnixTimestamp.getUnsupportedReasons claimed TimestampNTZType is unsupported because Comet incorrectly applies timezone conversion, but the native unix_timestamp path has a dedicated TimestampNTZ branch that divides raw microseconds without any timezone conversion, matching Spark. Align the reason text with isSupportedInputType and getSupportLevel, both of which already treat TimestampNTZType as supported. CometDateFormat.getCompatibleNotes described the codegen dispatcher as disabled by default; spark.comet.exec.scalaUDF.codegen.enabled now defaults to true. Correct the note. --- .../src/main/scala/org/apache/comet/serde/datetime.scala | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/spark/src/main/scala/org/apache/comet/serde/datetime.scala b/spark/src/main/scala/org/apache/comet/serde/datetime.scala index 2ce75ccc0d..a2dad7fe4b 100644 --- a/spark/src/main/scala/org/apache/comet/serde/datetime.scala +++ b/spark/src/main/scala/org/apache/comet/serde/datetime.scala @@ -292,9 +292,7 @@ object CometSecond extends CometExpressionSerde[Second] with CodegenDispatchFall object CometUnixTimestamp extends CometExpressionSerde[UnixTimestamp] { override def getUnsupportedReasons(): Seq[String] = Seq( - "Only `TimestampType` and `DateType` inputs are supported." + - " `TimestampNTZType` is not supported because Comet incorrectly applies timezone" + - " conversion to TimestampNTZ values.") + "Only `DateType`, `TimestampType`, and `TimestampNTZType` inputs are supported.") private def isSupportedInputType(expr: UnixTimestamp): Boolean = { expr.children.head.dataType match { @@ -695,9 +693,8 @@ object CometDateFormat extends CometExpressionSerde[DateFormatClass] { "Format strings in a curated allow-list run natively via DataFusion's `to_char` for UTC " + "sessions. Other format strings (including non-literal formats), as well as non-UTC " + "sessions, route through Spark's own `DateFormatClass.doGenCode` via the Arrow-direct " + - "codegen dispatcher when `spark.comet.exec.scalaUDF.codegen.enabled=true`. When the " + - "codegen dispatcher is disabled (default) the operator falls back to Spark in those " + - "cases.") + "codegen dispatcher when `spark.comet.exec.scalaUDF.codegen.enabled=true` (the default). " + + "When the codegen dispatcher is disabled the operator falls back to Spark in those cases.") override def convert( expr: DateFormatClass, From 51836d6a19bbee2d3e654a92ed4a1315e8098a89 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 12 Jun 2026 16:39:57 -0600 Subject: [PATCH 2/2] fix: use wrapping_sub in native date_diff to match Spark and avoid debug panic The native date_diff subtracted Date32 day values with a plain i32 -, which panics in debug builds when the result overflows i32. Spark computes the day difference with JVM int subtraction, which wraps. Switch to wrapping_sub to match Spark and remove the debug-only panic path, and add regression tests covering basic and overflow inputs. --- .../src/datetime_funcs/date_diff.rs | 59 ++++++++++++++++++- 1 file changed, 56 insertions(+), 3 deletions(-) diff --git a/native/spark-expr/src/datetime_funcs/date_diff.rs b/native/spark-expr/src/datetime_funcs/date_diff.rs index ca148c103a..0fddbc9331 100644 --- a/native/spark-expr/src/datetime_funcs/date_diff.rs +++ b/native/spark-expr/src/datetime_funcs/date_diff.rs @@ -100,9 +100,12 @@ impl ScalarUDFImpl for SparkDateDiff { ) })?; - // Date32 stores days since epoch, so difference is just subtraction - let result: Int32Array = - binary(end_date_array, start_date_array, |end, start| end - start)?; + // Date32 stores days since epoch, so difference is just subtraction. Use wrapping_sub + // to match Spark, whose JVM int subtraction wraps on overflow; a plain `i32 -` would + // panic in debug builds on extreme inputs. + let result: Int32Array = binary(end_date_array, start_date_array, |end, start| { + end.wrapping_sub(start) + })?; Ok(ColumnarValue::Array(Arc::new(result))) } @@ -111,3 +114,53 @@ impl ScalarUDFImpl for SparkDateDiff { &self.aliases } } + +#[cfg(test)] +mod tests { + use super::*; + use arrow::datatypes::Field; + use datafusion::config::ConfigOptions; + + fn date_diff(end: i32, start: i32) -> i32 { + let udf = SparkDateDiff::new(); + let return_field = Arc::new(Field::new("date_diff", DataType::Int32, true)); + let args = ScalarFunctionArgs { + args: vec![ + ColumnarValue::Array(Arc::new(Date32Array::from(vec![Some(end)]))), + ColumnarValue::Array(Arc::new(Date32Array::from(vec![Some(start)]))), + ], + number_rows: 1, + return_field, + config_options: Arc::new(ConfigOptions::default()), + arg_fields: vec![], + }; + match udf.invoke_with_args(args).unwrap() { + ColumnarValue::Array(array) => array + .as_any() + .downcast_ref::() + .unwrap() + .value(0), + _ => panic!("expected array result"), + } + } + + #[test] + fn test_date_diff_basic() { + // 2020-01-02 (18263) minus 2020-01-01 (18262) = 1 day + assert_eq!(date_diff(18263, 18262), 1); + assert_eq!(date_diff(18262, 18263), -1); + } + + #[test] + fn test_date_diff_wraps_on_overflow() { + // Extreme inputs overflow i32; Spark's JVM int subtraction wraps rather than panicking. + assert_eq!( + date_diff(i32::MAX, i32::MIN), + i32::MAX.wrapping_sub(i32::MIN) + ); + assert_eq!( + date_diff(i32::MIN, i32::MAX), + i32::MIN.wrapping_sub(i32::MAX) + ); + } +}