From badfb81fa0f1bccc7fe86193844bf071c2c9db75 Mon Sep 17 00:00:00 2001
From: yangjie01 <yangjie01@baidu.com>
Date: Tue, 21 Apr 2026 19:36:15 +0800
Subject: [PATCH 1/3] feat(partitioning): support multiple partition columns
 (#429)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extend `lance.partition.columns` TBLPROPERTY from a single column name to a
comma-separated list of columns. When every declared column is partition-
constant per fragment with identical coverage, the connector reports a
multi-key `KeyGroupedPartitioning` so Spark's SPJ can join on the full key
tuple without a shuffle.

Core changes
- `PartitionInfo` refactored from `(columnName, Map<Integer, Comparable<?>>)`
  to `(List<String>, Map<Integer, Comparable<?>[]>)` with constructor-
  enforced width invariants, defensive tuple clones, `restrictTo`,
  `withSoftCapped`, and `forSingleColumn` factory. `serialVersionUID`
  bumped to 2L for the shape change.
- `LanceScanBuilder.parsePartitionColumns` tokenizes on `,`, trims, drops
  empties, dedupes with WARN (preserving first), rejects nested paths.
- Type whitelist restricted to types that pass through Spark's InternalRow
  encoding without a converter (Boolean/Byte/Short/Int/Long/String).
  Date/Timestamp deferred pending JNI-returned-class pinning; Float/Double
  /Decimal/complex rejected for NaN/scale/equality reasons.
- `detectPartitioning` requires identical per-column fragment coverage;
  strict-subset intersection is rejected to avoid phantom null-key SPJ
  groups. Iterates in declaration order for deterministic diagnostics.
- Filter-pushdown path restricts `PartitionInfo` to the surviving fragment
  set; soft-cap re-evaluated against the restricted size.
- `LanceScan.outputPartitioning` falls back to `UnknownPartitioning` when
  soft-capped, disabled via `spark.lance.partition.reporting.enabled`, or
  gated by `SparkVersionUtil.supportsMultiKeySpj()` (allowlist: 3.5.x, 4+).
- `LanceScanBuilder.build()` wrapped in try/finally so the lazily-opened
  dataset is closed on exception paths.

Configs
- `spark.lance.partition.reporting.maxPartitions` (default 10_000): soft
  cap; above this the scan reports `UnknownPartitioning`.
- `spark.lance.partition.reporting.enabled` (default true): global escape
  hatch for SPJ reporting.

Tests
- New `PartitionInfoTest` (12 tests): invariants, defensive copies,
  factory equivalence, `restrictTo`, `withSoftCapped`, Java serialization
  round-trip, immutability.
- New `SparkVersionUtil` helper for the 3.4-vs-3.5+ multi-key gate.
- `LanceScanTest`: multi-column `KeyGroupedPartitioning`, soft-cap Unknown
  fallback.
- `LanceScanBuilderTest`: unknown column, nested path, whitespace, empty,
  delimiter-only, unsupported type — all fall back cleanly. Mismatched
  per-column coverage rejected; identical coverage accepted with correct
  tuple values.
- `ZonemapFragmentPrunerTest` migrated to new API (`forSingleColumn`,
  `getColumnNames`, `getFragmentPartitionKeys`).

316 tests pass on `lance-spark-base_2.12` (up from 311).
---
 .../java/org/lance/spark/read/LanceScan.java  |  24 +-
 .../lance/spark/read/LanceScanBuilder.java    | 384 ++++++++++++++----
 .../lance/spark/read/SparkVersionUtil.java    |  54 +++
 .../spark/read/ZonemapFragmentPruner.java     | 127 +++++-
 .../spark/read/LanceScanBuilderTest.java      | 135 ++++++
 .../org/lance/spark/read/LanceScanTest.java   |  63 ++-
 .../lance/spark/read/PartitionInfoTest.java   | 217 ++++++++++
 .../spark/read/ZonemapFragmentPrunerTest.java |  18 +-
 8 files changed, 896 insertions(+), 126 deletions(-)
 create mode 100644 lance-spark-base_2.12/src/main/java/org/lance/spark/read/SparkVersionUtil.java
 create mode 100644 lance-spark-base_2.12/src/test/java/org/lance/spark/read/PartitionInfoTest.java
diff --git a/lance-spark-base_2.12/src/main/java/org/lance/spark/read/LanceScan.java b/lance-spark-base_2.12/src/main/java/org/lance/spark/read/LanceScan.java
index 9cfb41f9b..8d840b4cb 100644
--- a/lance-spark-base_2.12/src/main/java/org/lance/spark/read/LanceScan.java
+++ b/lance-spark-base_2.12/src/main/java/org/lance/spark/read/LanceScan.java
@@ -366,17 +366,21 @@ private List<LanceSplit> pruneByZonemapStats(List<LanceSplit> allSplits) {
    */
   @Override
   public Partitioning outputPartitioning() {
-    if (partitionInfo != null) {
-      // Use partition info fragment count — available before
-      // planInputPartitions() is called. This allows
-      // V2ScanPartitioningAndOrdering to see the partitioning
-      // early enough for SPJ.
-      int partCount =
-          numPartitions >= 0 ? numPartitions : partitionInfo.getFragmentPartitionValues().size();
-      Expression[] keys = new Expression[] {FieldReference.apply(partitionInfo.getColumnName())};
-      return new KeyGroupedPartitioning(keys, partCount);
+    if (partitionInfo == null
+        || partitionInfo.isSoftCapped()
+        || !LanceScanBuilder.readReportingEnabledConf()) {
+      return new UnknownPartitioning(numPartitions >= 0 ? numPartitions : 0);
     }
-    return new UnknownPartitioning(numPartitions >= 0 ? numPartitions : 0);
+    List<String> colNames = partitionInfo.getColumnNames();
+    if (colNames.size() > 1 && !SparkVersionUtil.supportsMultiKeySpj()) {
+      return new UnknownPartitioning(numPartitions >= 0 ? numPartitions : 0);
+    }
+    int partCount = numPartitions >= 0 ? numPartitions : partitionInfo.size();
+    Expression[] keys = new Expression[colNames.size()];
+    for (int i = 0; i < colNames.size(); i++) {
+      keys[i] = FieldReference.apply(colNames.get(i));
+    }
+    return new KeyGroupedPartitioning(keys, partCount);
   }
 
   @Override
diff --git a/lance-spark-base_2.12/src/main/java/org/lance/spark/read/LanceScanBuilder.java b/lance-spark-base_2.12/src/main/java/org/lance/spark/read/LanceScanBuilder.java
index 1178bfa36..5016d8fcb 100644
--- a/lance-spark-base_2.12/src/main/java/org/lance/spark/read/LanceScanBuilder.java
+++ b/lance-spark-base_2.12/src/main/java/org/lance/spark/read/LanceScanBuilder.java
@@ -92,6 +92,11 @@ public class LanceScanBuilder
 
   private final java.util.Map<String, String> tableProperties;
 
+  static final String CONF_REPORTING_ENABLED = "spark.lance.partition.reporting.enabled";
+  static final String CONF_REPORTING_MAX_PARTITIONS =
+      "spark.lance.partition.reporting.maxPartitions";
+  static final int DEFAULT_REPORTING_MAX_PARTITIONS = 10_000;
+
   public LanceScanBuilder(
       StructType schema,
       LanceSparkReadOptions readOptions,
@@ -135,100 +140,98 @@ public Scan build() {
       return localScan;
     }
 
-    // Get statistics from manifest summary before closing dataset
-    ManifestSummary summary = getOrOpenDataset().getVersion().getManifestSummary();
-
-    // Collect all columns that need zonemap stats: filter columns + partition column (if declared).
-    Set<String> columnsToLoad = extractReferencedColumns(pushedFilters);
-    String partitionColumn = tableProperties.get(LanceConstant.TABLE_OPT_PARTITION_COLUMNS);
-    if (partitionColumn != null && !partitionColumn.trim().isEmpty()) {
-      partitionColumn = partitionColumn.trim();
-      columnsToLoad.add(partitionColumn);
-    } else {
-      partitionColumn = null;
-    }
-
-    // Load zonemap stats for all requested columns in one pass.
-    Map<String, List<ZoneStats>> zonemapStats = loadZonemapStats(getOrOpenDataset(), columnsToLoad);
+    try {
+      // Get statistics from manifest summary before closing dataset
+      ManifestSummary summary = getOrOpenDataset().getVersion().getManifestSummary();
+
+      // Parse and validate partition columns from TBLPROPERTIES. Nested paths and non-whitelisted
+      // types are rejected here with a WARN; detection falls through to null PartitionInfo.
+      List<String> partitionColumns =
+          parsePartitionColumns(tableProperties.get(LanceConstant.TABLE_OPT_PARTITION_COLUMNS));
+
+      // Collect all columns that need zonemap stats: filter columns + declared partition columns.
+      Set<String> columnsToLoad = extractReferencedColumns(pushedFilters);
+      columnsToLoad.addAll(partitionColumns);
+
+      // Load zonemap stats for all requested columns in one pass.
+      Map<String, List<ZoneStats>> zonemapStats =
+          loadZonemapStats(getOrOpenDataset(), columnsToLoad);
+
+      // Reject-all policy: if any declared column fails detection (missing stats, non-constant
+      // values, coverage mismatch), the whole scan falls back to UnknownPartitioning so SPJ
+      // symmetry with the joined counterpart is preserved.
+      ZonemapFragmentPruner.PartitionInfo partitionInfo =
+          detectPartitioning(partitionColumns, zonemapStats);
+
+      // Pre-compute fragment pruning so we can (a) estimate post-pruning statistics for
+      // JoinSelection (BroadcastHashJoin vs SortMergeJoin) and (b) pass the cached result
+      // to LanceScan to avoid re-computing during planInputPartitions().
+      Set<Integer> survivingFragmentIds = null;
+      if (pushedFilters.length > 0 && !zonemapStats.isEmpty()) {
+        survivingFragmentIds =
+            ZonemapFragmentPruner.pruneFragments(pushedFilters, zonemapStats).orElse(null);
+      }
 
-    // Detect partition-compatible columns, gated on lance.partition.columns table property.
-    // Currently a partitioned column is only valid if each fragment contains only a single
-    // value for that column (i.e., all zonemap zones have min == max with the same value).
-    ZonemapFragmentPruner.PartitionInfo partitionInfo = null;
-    if (partitionColumn != null) {
-      if (!zonemapStats.containsKey(partitionColumn)) {
-        LOG.warn(
-            "Partition column '{}' declared in {} has no zonemap index or stats;"
-                + " partition detection disabled",
-            partitionColumn,
-            LanceConstant.TABLE_OPT_PARTITION_COLUMNS);
-      } else {
-        Map<Integer, Comparable<?>> partValues =
-            ZonemapFragmentPruner.computeFragmentPartitionValues(zonemapStats.get(partitionColumn))
-                .orElse(null);
-        if (partValues != null) {
-          partitionInfo = new ZonemapFragmentPruner.PartitionInfo(partitionColumn, partValues);
-          LOG.info(
-              "Detected partition-compatible column '{}' with {} fragments",
-              partitionColumn,
-              partValues.size());
+      // Filter pushdown may have narrowed the surviving fragment set; restrict PartitionInfo so
+      // the partition count reported via SPJ matches the post-pushdown size. restrictTo clears
+      // the softCapped flag (cap is size-dependent) — re-apply if the restricted size still
+      // exceeds the cap.
+      if (partitionInfo != null && survivingFragmentIds != null) {
+        partitionInfo = partitionInfo.restrictTo(survivingFragmentIds);
+        if (partitionInfo.size() == 0) {
+          partitionInfo = null;
+        } else if (partitionInfo.size() > readMaxReportedPartitionsConf()) {
+          partitionInfo = partitionInfo.withSoftCapped();
         }
       }
-    }
 
-    // Pre-compute fragment pruning so we can (a) estimate post-pruning statistics for
-    // JoinSelection (BroadcastHashJoin vs SortMergeJoin) and (b) pass the cached result
-    // to LanceScan to avoid re-computing during planInputPartitions().
-    Set<Integer> survivingFragmentIds = null;
-    if (pushedFilters.length > 0 && !zonemapStats.isEmpty()) {
-      survivingFragmentIds =
-          ZonemapFragmentPruner.pruneFragments(pushedFilters, zonemapStats).orElse(null);
-    }
+      // Scale rows and full size by the zonemap fragment-pruning ratio first, then let
+      // LanceStatistics.estimateProjected apply the column-width ratio on top
+      // (when the projected schema is narrower than the full schema).
+      long projectedRows = summary.getTotalRows();
+      long projectedFullSize = summary.getTotalFilesSize();
+      if (survivingFragmentIds != null && summary.getTotalFragments() > 0) {
+        double ratio = (double) survivingFragmentIds.size() / summary.getTotalFragments();
+        projectedRows = (long) (projectedRows * ratio);
+        projectedFullSize = (long) (projectedFullSize * ratio);
+      }
+      LanceStatistics statistics =
+          LanceStatistics.estimateProjected(projectedRows, projectedFullSize, fullSchema, schema);
+      if (survivingFragmentIds != null) {
+        LOG.debug(
+            "Scan statistics after pruning: {} of {} fragments survive,"
+                + " estimatedSize={}, estimatedRows={} (full: size={}, rows={})",
+            survivingFragmentIds.size(),
+            summary.getTotalFragments(),
+            statistics.sizeInBytes(),
+            statistics.numRows(),
+            summary.getTotalFilesSize(),
+            summary.getTotalRows());
+      }
 
-    // Scale rows and full size by the zonemap fragment-pruning ratio first, then let
-    // LanceStatistics.estimateProjected apply the column-width ratio on top
-    // (when the projected schema is narrower than the full schema).
-    long projectedRows = summary.getTotalRows();
-    long projectedFullSize = summary.getTotalFilesSize();
-    if (survivingFragmentIds != null && summary.getTotalFragments() > 0) {
-      double ratio = (double) survivingFragmentIds.size() / summary.getTotalFragments();
-      projectedRows = (long) (projectedRows * ratio);
-      projectedFullSize = (long) (projectedFullSize * ratio);
-    }
-    LanceStatistics statistics =
-        LanceStatistics.estimateProjected(projectedRows, projectedFullSize, fullSchema, schema);
-    if (survivingFragmentIds != null) {
-      LOG.debug(
-          "Scan statistics after pruning: {} of {} fragments survive,"
-              + " estimatedSize={}, estimatedRows={} (full: size={}, rows={})",
-          survivingFragmentIds.size(),
-          summary.getTotalFragments(),
-          statistics.sizeInBytes(),
-          statistics.numRows(),
-          summary.getTotalFilesSize(),
-          summary.getTotalRows());
+      Optional<String> whereCondition =
+          FilterPushDown.compileFiltersToSqlWhereClause(pushedFilters);
+      return new LanceScan(
+          schema,
+          readOptions,
+          whereCondition,
+          limit,
+          offset,
+          topNSortOrders,
+          pushedAggregation,
+          pushedFilters,
+          statistics,
+          zonemapStats,
+          survivingFragmentIds,
+          partitionInfo,
+          initialStorageOptions,
+          namespaceImpl,
+          namespaceProperties);
+    } finally {
+      // Always close the lazily opened dataset, including on exception paths, so we don't leak
+      // the JNI handle when parsing/detection/pruning helpers throw.
+      closeLazyDataset();
     }
-
-    // Close the lazily opened dataset - it's no longer needed after build
-    closeLazyDataset();
-
-    Optional<String> whereCondition = FilterPushDown.compileFiltersToSqlWhereClause(pushedFilters);
-    return new LanceScan(
-        schema,
-        readOptions,
-        whereCondition,
-        limit,
-        offset,
-        topNSortOrders,
-        pushedAggregation,
-        pushedFilters,
-        statistics,
-        zonemapStats,
-        survivingFragmentIds,
-        partitionInfo,
-        initialStorageOptions,
-        namespaceImpl,
-        namespaceProperties);
   }
 
   @Override
@@ -406,4 +409,209 @@ private static Set<String> extractReferencedColumns(Filter[] filters) {
     }
     return columns;
   }
+
+  /**
+   * Tokenizes {@code lance.partition.columns} on {@code ,}, trims, drops empties, deduplicates,
+   * rejects nested paths, and validates each column's Spark type against the whitelist. Returns an
+   * empty list if the property is absent, empty, or any column fails validation (reject-all).
+   */
+  private List<String> parsePartitionColumns(String raw) {
+    // Treat null, empty, whitespace-only, and pure-delimiter values (",", ", ,", ...) all as
+    // "property not set" — these are the no-op cases; returning quietly avoids a spurious WARN.
+    if (raw == null || raw.replace(",", "").trim().isEmpty()) {
+      return Collections.emptyList();
+    }
+    List<String> tokens = new ArrayList<>();
+    Set<String> seen = new HashSet<>();
+    for (String part : raw.split(",")) {
+      String trimmed = part.trim();
+      if (trimmed.isEmpty()) {
+        continue;
+      }
+      if (!seen.add(trimmed)) {
+        LOG.warn(
+            "{} contains duplicate column '{}' (dropped)",
+            LanceConstant.TABLE_OPT_PARTITION_COLUMNS,
+            trimmed);
+        continue;
+      }
+      if (trimmed.contains(".")) {
+        LOG.warn("partition column '{}' has nested path; nested paths not supported", trimmed);
+        return Collections.emptyList();
+      }
+      if (!isSupportedPartitionType(trimmed)) {
+        return Collections.emptyList();
+      }
+      tokens.add(trimmed);
+    }
+    return tokens;
+  }
+
+  /**
+   * Looks up the column's type on the full read schema and returns true iff it is in the partition
+   * whitelist. Uses {@link #fullSchema} rather than {@link #schema} so column pruning does not
+   * remove partition columns from the lookup; returns false with a WARN if the column is missing or
+   * has an unsupported type.
+   */
+  private boolean isSupportedPartitionType(String columnName) {
+    int idx;
+    try {
+      idx = fullSchema.fieldIndex(columnName);
+    } catch (IllegalArgumentException e) {
+      LOG.warn(
+          "partition column '{}' is not in the table schema; partition detection disabled",
+          columnName);
+      return false;
+    }
+    org.apache.spark.sql.types.DataType type = fullSchema.fields()[idx].dataType();
+    // Whitelist types whose Spark InternalRow encoding matches the raw Java value returned by
+    // ZoneStats (primitive pass-through, or UTF8String for Strings). Date/Timestamp are
+    // deliberately excluded: Spark expects epoch-days int / epoch-micros long but ZoneStats
+    // may return java.sql.Date / java.time.Instant, which would corrupt SPJ keys without an
+    // explicit converter. Re-enable once the JNI-produced runtime class is pinned and a
+    // toSparkValue mapping is added for it.
+    // Use .equals() rather than == so a DataType materialized from a deserialized schema
+    // (e.g. JSON/Avro round-trip) still matches the singleton constants.
+    if (DataTypes.BooleanType.equals(type)
+        || DataTypes.ByteType.equals(type)
+        || DataTypes.ShortType.equals(type)
+        || DataTypes.IntegerType.equals(type)
+        || DataTypes.LongType.equals(type)
+        || DataTypes.StringType.equals(type)) {
+      return true;
+    }
+    LOG.warn(
+        "partition column '{}' has unsupported type {}: whitelist is"
+            + " Boolean/Byte/Short/Int/Long/String",
+        columnName,
+        type.typeName());
+    return false;
+  }
+
+  /**
+   * Runs per-column zone-constancy detection, verifies that every declared column covers the same
+   * fragment-id set, and assembles per-fragment partition tuples in declaration order. Returns null
+   * when any column fails — reject-all, so SPJ symmetry is preserved on the joined counterpart.
+   */
+  // Package-private for unit-test access to the multi-column detection logic.
+  ZonemapFragmentPruner.PartitionInfo detectPartitioning(
+      List<String> partitionColumns, Map<String, List<ZoneStats>> zonemapStats) {
+    if (partitionColumns.isEmpty()) {
+      return null;
+    }
+    Map<String, Map<Integer, Comparable<?>>> perColumnMaps = new HashMap<>();
+    for (String name : partitionColumns) {
+      if (!zonemapStats.containsKey(name)) {
+        LOG.warn("partition column '{}' has no zonemap stats; partition detection disabled", name);
+        return null;
+      }
+      Map<Integer, Comparable<?>> values =
+          ZonemapFragmentPruner.computeFragmentPartitionValues(zonemapStats.get(name)).orElse(null);
+      if (values == null || values.isEmpty()) {
+        LOG.warn(
+            "partition column '{}' has non-constant or null values; partition detection disabled",
+            name);
+        return null;
+      }
+      perColumnMaps.put(name, values);
+    }
+
+    // Require every declared partition column to cover the same fragment-id set. A strict-subset
+    // intersection would leave splits for uncovered fragments with a phantom null-key tuple —
+    // wrong input to Spark's SPJ. Iterate in declaration order so the mismatched-column WARN is
+    // deterministic across runs.
+    Set<Integer> intersection = null;
+    for (String name : partitionColumns) {
+      Set<Integer> columnFragments = perColumnMaps.get(name).keySet();
+      if (intersection == null) {
+        intersection = new HashSet<>(columnFragments);
+      } else if (!intersection.equals(columnFragments)) {
+        LOG.warn(
+            "partition columns {} have mismatched fragment-id coverage (column '{}' differs);"
+                + " partition detection disabled",
+            partitionColumns,
+            name);
+        return null;
+      }
+    }
+    if (intersection == null || intersection.isEmpty()) {
+      LOG.warn(
+          "partition columns {} have no covered fragments; partition detection disabled",
+          partitionColumns);
+      return null;
+    }
+
+    // Assemble tuples in declaration order.
+    int width = partitionColumns.size();
+    Map<Integer, Comparable<?>[]> tuples = new HashMap<>();
+    for (Integer fragId : intersection) {
+      Comparable<?>[] tuple = new Comparable<?>[width];
+      for (int i = 0; i < width; i++) {
+        tuple[i] = perColumnMaps.get(partitionColumns.get(i)).get(fragId);
+      }
+      tuples.put(fragId, tuple);
+    }
+    ZonemapFragmentPruner.PartitionInfo info =
+        new ZonemapFragmentPruner.PartitionInfo(partitionColumns, tuples);
+
+    // Apply soft cap based on session conf (if available) or the default. When the cap fires,
+    // the scan will report UnknownPartitioning — log that branch separately so operators don't
+    // see a success-looking "detected N fragments" INFO immediately after the soft-cap WARN.
+    int cap = readMaxReportedPartitionsConf();
+    if (info.size() > cap) {
+      LOG.warn(
+          "partition count {} exceeds {}={}; reporting UnknownPartitioning",
+          info.size(),
+          CONF_REPORTING_MAX_PARTITIONS,
+          cap);
+      return info.withSoftCapped();
+    }
+    LOG.info(
+        "lance.partition.detect cols={} columnCount={} fragments={}",
+        partitionColumns,
+        partitionColumns.size(),
+        info.size());
+    return info;
+  }
+
+  private static int readMaxReportedPartitionsConf() {
+    String val = null;
+    try {
+      org.apache.spark.sql.SparkSession session = org.apache.spark.sql.SparkSession.active();
+      val = session.conf().get(CONF_REPORTING_MAX_PARTITIONS, null);
+    } catch (Exception e) {
+      // No active SparkSession (e.g. unit-test / offline builder usage); log at DEBUG so real
+      // session-level misconfiguration is diagnosable, and fall through to the default.
+      LOG.debug(
+          "Could not read {}: {}; using default", CONF_REPORTING_MAX_PARTITIONS, e.toString());
+      return DEFAULT_REPORTING_MAX_PARTITIONS;
+    }
+    if (val == null) {
+      return DEFAULT_REPORTING_MAX_PARTITIONS;
+    }
+    try {
+      return Integer.parseInt(val.trim());
+    } catch (NumberFormatException e) {
+      LOG.warn(
+          "Could not parse {}='{}' as an integer; using default {}",
+          CONF_REPORTING_MAX_PARTITIONS,
+          val,
+          DEFAULT_REPORTING_MAX_PARTITIONS);
+      return DEFAULT_REPORTING_MAX_PARTITIONS;
+    }
+  }
+
+  static boolean readReportingEnabledConf() {
+    try {
+      org.apache.spark.sql.SparkSession session = org.apache.spark.sql.SparkSession.active();
+      String val = session.conf().get(CONF_REPORTING_ENABLED, null);
+      if (val != null) {
+        return !"false".equalsIgnoreCase(val.trim());
+      }
+    } catch (Exception e) {
+      // No active SparkSession; log at DEBUG and default to enabled.
+      LOG.debug("Could not read {}: {}; defaulting to true", CONF_REPORTING_ENABLED, e.toString());
+    }
+    return true;
+  }
 }
diff --git a/lance-spark-base_2.12/src/main/java/org/lance/spark/read/SparkVersionUtil.java b/lance-spark-base_2.12/src/main/java/org/lance/spark/read/SparkVersionUtil.java
new file mode 100644
index 000000000..19ddc6478
--- /dev/null
+++ b/lance-spark-base_2.12/src/main/java/org/lance/spark/read/SparkVersionUtil.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.lance.spark.read;
+
+import org.apache.spark.package$;
+
+/**
+ * Runtime Spark-version gates for features whose behavior differs across supported Spark versions.
+ *
+ * <p>Kept in a single helper so pre-merge findings on multi-key SPJ (Spark 3.4 vs 3.5+) can be
+ * adjusted in one place rather than scattered across the codebase.
+ */
+final class SparkVersionUtil {
+
+  private SparkVersionUtil() {}
+
+  /**
+   * Whether the running Spark version reliably honors multi-key {@code KeyGroupedPartitioning}
+   * without silently falling back to a shuffle. Uses an explicit allowlist (Spark 3.5.x, 4.x+)
+   * rather than a denylist so custom forks and unexpected version strings default to the safe "fall
+   * back to UnknownPartitioning" behavior.
+   */
+  static boolean supportsMultiKeySpj() {
+    String version = package$.MODULE$.SPARK_VERSION();
+    if (version == null) {
+      return false;
+    }
+    // Accept "3.5.x" or any 4.x+ build. Reject everything else (3.4.x, 3.3.x, custom strings).
+    if (version.startsWith("3.5.")) {
+      return true;
+    }
+    int dot = version.indexOf('.');
+    if (dot <= 0) {
+      return false;
+    }
+    try {
+      int major = Integer.parseInt(version.substring(0, dot));
+      return major >= 4;
+    } catch (NumberFormatException e) {
+      return false;
+    }
+  }
+}
diff --git a/lance-spark-base_2.12/src/main/java/org/lance/spark/read/ZonemapFragmentPruner.java b/lance-spark-base_2.12/src/main/java/org/lance/spark/read/ZonemapFragmentPruner.java
index 06b5a6deb..cfc326451 100644
--- a/lance-spark-base_2.12/src/main/java/org/lance/spark/read/ZonemapFragmentPruner.java
+++ b/lance-spark-base_2.12/src/main/java/org/lance/spark/read/ZonemapFragmentPruner.java
@@ -343,36 +343,124 @@ private enum ComparisonType {
   }
 
   /**
-   * Result of partition detection: the partition column name and a map from fragment ID to the
-   * partition value for that fragment.
+   * Result of partition detection: the ordered list of partition column names and a map from
+   * fragment ID to the partition tuple (one value per declared column, in declaration order).
+   *
+   * <p>Width invariants (enforced by the constructor): every tuple has length {@code
+   * columnNames.size()}; column names are distinct and non-empty.
    */
   public static final class PartitionInfo implements Serializable {
-    private static final long serialVersionUID = 1L;
+    private static final long serialVersionUID = 2L;
 
-    private final String columnName;
-    private final Map<Integer, Comparable<?>> fragmentPartitionValues;
+    private final List<String> columnNames;
+    private final Map<Integer, Comparable<?>[]> fragmentPartitionKeys;
+    private final boolean softCapped;
 
-    public PartitionInfo(String columnName, Map<Integer, Comparable<?>> fragmentPartitionValues) {
-      this.columnName = columnName;
-      this.fragmentPartitionValues = Collections.unmodifiableMap(fragmentPartitionValues);
+    public PartitionInfo(
+        List<String> columnNames, Map<Integer, Comparable<?>[]> fragmentPartitionKeys) {
+      this(columnNames, fragmentPartitionKeys, false);
     }
 
-    public String getColumnName() {
-      return columnName;
+    public PartitionInfo(
+        List<String> columnNames,
+        Map<Integer, Comparable<?>[]> fragmentPartitionKeys,
+        boolean softCapped) {
+      if (columnNames == null || columnNames.isEmpty()) {
+        throw new IllegalArgumentException("columnNames must be non-empty");
+      }
+      if (new HashSet<>(columnNames).size() != columnNames.size()) {
+        throw new IllegalArgumentException("columnNames must be distinct: " + columnNames);
+      }
+      int width = columnNames.size();
+      Map<Integer, Comparable<?>[]> copy = new HashMap<>();
+      for (Map.Entry<Integer, Comparable<?>[]> e : fragmentPartitionKeys.entrySet()) {
+        Comparable<?>[] tuple = e.getValue();
+        if (tuple == null || tuple.length != width) {
+          throw new IllegalArgumentException(
+              "tuple for fragment " + e.getKey() + " must have length " + width);
+        }
+        copy.put(e.getKey(), tuple.clone());
+      }
+      this.columnNames = Collections.unmodifiableList(new java.util.ArrayList<>(columnNames));
+      this.fragmentPartitionKeys = Collections.unmodifiableMap(copy);
+      this.softCapped = softCapped;
     }
 
-    public Map<Integer, Comparable<?>> getFragmentPartitionValues() {
-      return fragmentPartitionValues;
+    /**
+     * Factory for the single-column case. Wraps each scalar partition value into a length-1 tuple
+     * and delegates to the list-form constructor.
+     */
+    public static PartitionInfo forSingleColumn(
+        String columnName, Map<Integer, Comparable<?>> valueByFragment) {
+      Map<Integer, Comparable<?>[]> tupleMap = new HashMap<>();
+      for (Map.Entry<Integer, Comparable<?>> e : valueByFragment.entrySet()) {
+        tupleMap.put(e.getKey(), new Comparable<?>[] {e.getValue()});
+      }
+      return new PartitionInfo(Collections.singletonList(columnName), tupleMap);
+    }
+
+    public List<String> getColumnNames() {
+      return columnNames;
     }
 
     /**
-     * Returns a partition key {@link InternalRow} for the given fragment ID. The row contains a
-     * single column with the partition value, converted to a Spark-compatible type.
+     * Returns the fragment-id → tuple map as an unmodifiable snapshot; each tuple array is
+     * defensively cloned on every call so mutating the returned arrays cannot corrupt internal
+     * state. Prefer {@link #partitionKeyForFragment(int)} for hot paths — this getter exists for
+     * inspection, equality checks, and serialization round-trip tests.
+     */
+    public Map<Integer, Comparable<?>[]> getFragmentPartitionKeys() {
+      Map<Integer, Comparable<?>[]> snapshot = new HashMap<>(fragmentPartitionKeys.size());
+      for (Map.Entry<Integer, Comparable<?>[]> e : fragmentPartitionKeys.entrySet()) {
+        snapshot.put(e.getKey(), e.getValue().clone());
+      }
+      return Collections.unmodifiableMap(snapshot);
+    }
+
+    public int size() {
+      return fragmentPartitionKeys.size();
+    }
+
+    public boolean isSoftCapped() {
+      return softCapped;
+    }
+
+    /**
+     * Returns a new PartitionInfo restricted to the given fragment-id set. Preserves column order
+     * and tuple shape. The {@code softCapped} flag is NOT carried over because the cap decision is
+     * a function of size; if the restricted size still exceeds the cap, the caller must re-apply it
+     * via {@link #withSoftCapped()}. Used after filter pushdown narrows the surviving fragment set.
+     */
+    public PartitionInfo restrictTo(Set<Integer> survivingFragmentIds) {
+      Map<Integer, Comparable<?>[]> restricted = new HashMap<>();
+      for (Map.Entry<Integer, Comparable<?>[]> e : fragmentPartitionKeys.entrySet()) {
+        if (survivingFragmentIds.contains(e.getKey())) {
+          restricted.put(e.getKey(), e.getValue());
+        }
+      }
+      return new PartitionInfo(columnNames, restricted, false);
+    }
+
+    /** Marks this info as soft-capped, returning a new instance (immutability preserved). */
+    public PartitionInfo withSoftCapped() {
+      return new PartitionInfo(columnNames, fragmentPartitionKeys, true);
+    }
+
+    /**
+     * Returns a partition key {@link InternalRow} for the given fragment ID. The row contains one
+     * or more columns (in declaration order), each converted to a Spark-compatible type.
      */
     public InternalRow partitionKeyForFragment(int fragmentId) {
-      Comparable<?> value = fragmentPartitionValues.get(fragmentId);
-      Object sparkValue = toSparkValue(value);
-      return new GenericInternalRow(new Object[] {sparkValue});
+      Comparable<?>[] tuple = fragmentPartitionKeys.get(fragmentId);
+      int width = columnNames.size();
+      Object[] out = new Object[width];
+      if (tuple == null) {
+        return new GenericInternalRow(out);
+      }
+      for (int i = 0; i < width; i++) {
+        out[i] = toSparkValue(tuple[i]);
+      }
+      return new GenericInternalRow(out);
     }
 
     private static Object toSparkValue(Comparable<?> value) {
@@ -382,7 +470,10 @@ private static Object toSparkValue(Comparable<?> value) {
       if (value instanceof String) {
         return UTF8String.fromString((String) value);
       }
-      // Long, Double, Boolean, Integer are already compatible
+      // Boolean/Byte/Short/Integer/Long pass through unchanged — Spark's InternalRow accepts
+      // them as-is. Date/Timestamp mappings belong here once ZoneStats' concrete return class
+      // is pinned; until then the type whitelist in LanceScanBuilder rejects those columns
+      // upstream so we never see them here.
       return value;
     }
   }
diff --git a/lance-spark-base_2.12/src/test/java/org/lance/spark/read/LanceScanBuilderTest.java b/lance-spark-base_2.12/src/test/java/org/lance/spark/read/LanceScanBuilderTest.java
index 51addf596..eba11137b 100644
--- a/lance-spark-base_2.12/src/test/java/org/lance/spark/read/LanceScanBuilderTest.java
+++ b/lance-spark-base_2.12/src/test/java/org/lance/spark/read/LanceScanBuilderTest.java
@@ -13,6 +13,7 @@
  */
 package org.lance.spark.read;
 
+import org.lance.index.scalar.ZoneStats;
 import org.lance.spark.LanceSparkReadOptions;
 import org.lance.spark.TestUtils;
 
@@ -37,7 +38,11 @@
 import org.apache.spark.sql.types.StructType;
 import org.junit.jupiter.api.Test;
 
+import java.util.Arrays;
 import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
 
 import static org.junit.jupiter.api.Assertions.*;
 
@@ -327,4 +332,134 @@ public NullOrdering nullOrdering() {
       return nullOrdering;
     }
   }
+
+  // --- lance.partition.columns parsing guards ---
+
+  private LanceScanBuilder builderWithPartitionColumns(String value) {
+    return new LanceScanBuilder(
+        TEST_SCHEMA,
+        TestUtils.TestTable1Config.readOptions,
+        Collections.emptyMap(),
+        null,
+        Collections.emptyMap(),
+        Collections.singletonMap("lance.partition.columns", value));
+  }
+
+  @Test
+  public void testPartitionColumnsUnknownColumnFallsBackCleanly() {
+    // Unknown column must not throw IllegalArgumentException; the builder logs a WARN and
+    // falls back to a scan that reports UnknownPartitioning.
+    Scan scan = builderWithPartitionColumns("nonexistent_column").build();
+    assertInstanceOf(LanceScan.class, scan);
+    LanceScan ls = (LanceScan) scan;
+    ls.planInputPartitions();
+    assertInstanceOf(
+        org.apache.spark.sql.connector.read.partitioning.UnknownPartitioning.class,
+        ls.outputPartitioning());
+  }
+
+  @Test
+  public void testPartitionColumnsNestedPathFallsBackCleanly() {
+    // Nested field paths are not supported; builder rejects the property, scan reports Unknown.
+    Scan scan = builderWithPartitionColumns("outer.inner").build();
+    assertInstanceOf(LanceScan.class, scan);
+    LanceScan ls = (LanceScan) scan;
+    ls.planInputPartitions();
+    assertInstanceOf(
+        org.apache.spark.sql.connector.read.partitioning.UnknownPartitioning.class,
+        ls.outputPartitioning());
+  }
+
+  @Test
+  public void testPartitionColumnsWhitespaceOnlyIsAbsent() {
+    // Whitespace-only property must be treated exactly like an absent property: no WARN about
+    // empty tokenization for the common "users didn't set it" path.
+    Scan scan = builderWithPartitionColumns("   ").build();
+    assertInstanceOf(LanceScan.class, scan);
+  }
+
+  @Test
+  public void testPartitionColumnsEmptyStringIsAbsent() {
+    Scan scan = builderWithPartitionColumns("").build();
+    assertInstanceOf(LanceScan.class, scan);
+  }
+
+  @Test
+  public void testPartitionColumnsDelimitersOnlyIsAbsent() {
+    // Pure-delimiter input (",", ", , ,") must be treated as absent — no WARN about empty
+    // tokenization, since the effective user intent is "no partition columns declared".
+    Scan scan = builderWithPartitionColumns(",").build();
+    assertInstanceOf(LanceScan.class, scan);
+    scan = builderWithPartitionColumns(", , ,").build();
+    assertInstanceOf(LanceScan.class, scan);
+  }
+
+  @Test
+  public void testPartitionColumnsUnsupportedTypeFallsBackCleanly() {
+    // A column whose Spark type is outside the whitelist (Float/Double/Decimal/complex) must
+    // trigger reject-all: the scan still builds, but reports UnknownPartitioning.
+    StructType schemaWithDouble =
+        new StructType(
+            new StructField[] {
+              DataTypes.createStructField("x", DataTypes.LongType, true),
+              DataTypes.createStructField("y", DataTypes.LongType, true),
+              DataTypes.createStructField("b", DataTypes.LongType, true),
+              DataTypes.createStructField("c", DataTypes.LongType, true),
+              DataTypes.createStructField("score", DataTypes.DoubleType, true),
+            });
+    LanceScanBuilder builder =
+        new LanceScanBuilder(
+            schemaWithDouble,
+            TestUtils.TestTable1Config.readOptions,
+            Collections.emptyMap(),
+            null,
+            Collections.emptyMap(),
+            Collections.singletonMap("lance.partition.columns", "score"));
+    Scan scan = builder.build();
+    assertInstanceOf(LanceScan.class, scan);
+    LanceScan ls = (LanceScan) scan;
+    ls.planInputPartitions();
+    assertInstanceOf(
+        org.apache.spark.sql.connector.read.partitioning.UnknownPartitioning.class,
+        ls.outputPartitioning());
+  }
+
+  // --- detectPartitioning: identical per-column fragment coverage ---
+
+  @Test
+  public void testDetectPartitioningRejectsMismatchedCoverage() {
+    // Column "a" covers fragments {0, 1}; column "b" covers only {0}. Strict-subset coverage
+    // must reject detection entirely — otherwise fragment 1 would produce a phantom null tuple
+    // element for column "b" (same class of bug the per-column intersection used to allow).
+    LanceScanBuilder builder = createBuilder();
+    Map<String, List<ZoneStats>> stats = new HashMap<>();
+    stats.put(
+        "a", Arrays.asList(new ZoneStats(0, 0, 10, 1L, 1L, 0), new ZoneStats(1, 0, 10, 2L, 2L, 0)));
+    stats.put("b", Collections.singletonList(new ZoneStats(0, 0, 10, 100L, 100L, 0)));
+
+    ZonemapFragmentPruner.PartitionInfo info =
+        builder.detectPartitioning(Arrays.asList("a", "b"), stats);
+    assertNull(info, "Detection must reject when per-column fragment coverage differs");
+  }
+
+  @Test
+  public void testDetectPartitioningAcceptsIdenticalCoverage() {
+    LanceScanBuilder builder = createBuilder();
+    Map<String, List<ZoneStats>> stats = new HashMap<>();
+    stats.put(
+        "a", Arrays.asList(new ZoneStats(0, 0, 10, 1L, 1L, 0), new ZoneStats(1, 0, 10, 2L, 2L, 0)));
+    stats.put(
+        "b",
+        Arrays.asList(
+            new ZoneStats(0, 0, 10, 100L, 100L, 0), new ZoneStats(1, 0, 10, 200L, 200L, 0)));
+
+    ZonemapFragmentPruner.PartitionInfo info =
+        builder.detectPartitioning(Arrays.asList("a", "b"), stats);
+    assertNotNull(info);
+    assertEquals(Arrays.asList("a", "b"), info.getColumnNames());
+    assertEquals(2, info.size());
+    // Tuples are assembled in declaration order, fragment by fragment.
+    assertArrayEquals(new Object[] {1L, 100L}, info.getFragmentPartitionKeys().get(0));
+    assertArrayEquals(new Object[] {2L, 200L}, info.getFragmentPartitionKeys().get(1));
+  }
 }
diff --git a/lance-spark-base_2.12/src/test/java/org/lance/spark/read/LanceScanTest.java b/lance-spark-base_2.12/src/test/java/org/lance/spark/read/LanceScanTest.java
index 13fe3115b..f391ecd5e 100644
--- a/lance-spark-base_2.12/src/test/java/org/lance/spark/read/LanceScanTest.java
+++ b/lance-spark-base_2.12/src/test/java/org/lance/spark/read/LanceScanTest.java
@@ -199,7 +199,7 @@ public void testOutputPartitioningWithPartitionInfo() {
     fragValues.put(0, "east");
     fragValues.put(1, "west");
     ZonemapFragmentPruner.PartitionInfo partInfo =
-        new ZonemapFragmentPruner.PartitionInfo("region", fragValues);
+        ZonemapFragmentPruner.PartitionInfo.forSingleColumn("region", fragValues);
 
     LanceScan scan =
         new LanceScan(
@@ -243,6 +243,67 @@ public void testOutputPartitioningWithoutPartitionInfoIsUnknown() {
     assertInstanceOf(UnknownPartitioning.class, partitioning);
   }
 
+  private LanceScan buildScanWithPartitionInfo(ZonemapFragmentPruner.PartitionInfo info) {
+    return new LanceScan(
+        TEST_SCHEMA,
+        TestUtils.TestTable1Config.readOptions,
+        org.lance.spark.utils.Optional.empty(),
+        org.lance.spark.utils.Optional.empty(),
+        org.lance.spark.utils.Optional.empty(),
+        org.lance.spark.utils.Optional.empty(),
+        org.lance.spark.utils.Optional.empty(),
+        new Filter[0],
+        null,
+        Collections.emptyMap(),
+        null,
+        info,
+        Collections.emptyMap(),
+        null,
+        Collections.emptyMap());
+  }
+
+  /** On a Spark version that supports multi-key SPJ (3.5+), N keys are reported in order. */
+  @Test
+  public void testOutputPartitioningMultiColumn() {
+    java.util.Map<Integer, Comparable<?>[]> tuples = new HashMap<>();
+    tuples.put(0, new Comparable<?>[] {"us", 2024L});
+    tuples.put(1, new Comparable<?>[] {"eu", 2025L});
+    ZonemapFragmentPruner.PartitionInfo info =
+        new ZonemapFragmentPruner.PartitionInfo(java.util.Arrays.asList("region", "year"), tuples);
+
+    LanceScan scan = buildScanWithPartitionInfo(info);
+    scan.planInputPartitions();
+    Partitioning partitioning = scan.outputPartitioning();
+
+    // Gated off on 3.4 — skip the KGP assertion when the gate is closed.
+    if (!SparkVersionUtil.supportsMultiKeySpj()) {
+      assertInstanceOf(UnknownPartitioning.class, partitioning);
+      return;
+    }
+
+    assertInstanceOf(KeyGroupedPartitioning.class, partitioning);
+    KeyGroupedPartitioning kgp = (KeyGroupedPartitioning) partitioning;
+    Expression[] keys = kgp.keys();
+    assertEquals(2, keys.length);
+    assertEquals("region", ((FieldReference) keys[0]).fieldNames()[0]);
+    assertEquals("year", ((FieldReference) keys[1]).fieldNames()[0]);
+  }
+
+  /** A soft-capped PartitionInfo must cause outputPartitioning to report Unknown. */
+  @Test
+  public void testOutputPartitioningSoftCappedReturnsUnknown() {
+    java.util.Map<Integer, Comparable<?>[]> tuples = new HashMap<>();
+    tuples.put(0, new Comparable<?>[] {"us"});
+    ZonemapFragmentPruner.PartitionInfo info =
+        new ZonemapFragmentPruner.PartitionInfo(
+                java.util.Collections.singletonList("region"), tuples)
+            .withSoftCapped();
+
+    LanceScan scan = buildScanWithPartitionInfo(info);
+    scan.planInputPartitions();
+    assertInstanceOf(UnknownPartitioning.class, scan.outputPartitioning());
+  }
+
   // --- equals / hashCode (required for ReusedExchange) ---
 
   @Test
diff --git a/lance-spark-base_2.12/src/test/java/org/lance/spark/read/PartitionInfoTest.java b/lance-spark-base_2.12/src/test/java/org/lance/spark/read/PartitionInfoTest.java
new file mode 100644
index 000000000..6648649ce
--- /dev/null
+++ b/lance-spark-base_2.12/src/test/java/org/lance/spark/read/PartitionInfoTest.java
@@ -0,0 +1,217 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.lance.spark.read;
+
+import org.apache.spark.sql.catalyst.InternalRow;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.unsafe.types.UTF8String;
+import org.junit.jupiter.api.Test;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+
+import static org.junit.jupiter.api.Assertions.assertArrayEquals;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertNotSame;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+/**
+ * Unit tests for {@link ZonemapFragmentPruner.PartitionInfo} covering the multi-column refactor.
+ */
+public class PartitionInfoTest {
+
+  private static Map<Integer, Comparable<?>[]> tuples(Object[]... entries) {
+    Map<Integer, Comparable<?>[]> out = new HashMap<>();
+    for (int i = 0; i < entries.length; i++) {
+      Comparable<?>[] tuple = new Comparable<?>[entries[i].length];
+      for (int j = 0; j < entries[i].length; j++) {
+        tuple[j] = (Comparable<?>) entries[i][j];
+      }
+      out.put(i, tuple);
+    }
+    return out;
+  }
+
+  @Test
+  public void rejectsEmptyColumnNames() {
+    assertThrows(
+        IllegalArgumentException.class,
+        () -> new ZonemapFragmentPruner.PartitionInfo(Collections.emptyList(), new HashMap<>()));
+  }
+
+  @Test
+  public void rejectsDuplicateColumnNames() {
+    assertThrows(
+        IllegalArgumentException.class,
+        () ->
+            new ZonemapFragmentPruner.PartitionInfo(
+                Arrays.asList("a", "a"), tuples(new Object[] {"x", "y"})));
+  }
+
+  @Test
+  public void rejectsTupleWidthMismatch() {
+    Map<Integer, Comparable<?>[]> bad = new HashMap<>();
+    bad.put(0, new Comparable<?>[] {"x"}); // expects width 2
+    assertThrows(
+        IllegalArgumentException.class,
+        () -> new ZonemapFragmentPruner.PartitionInfo(Arrays.asList("a", "b"), bad));
+  }
+
+  @Test
+  public void constructorDefensivelyCopiesTuples() {
+    Comparable<?>[] tuple = new Comparable<?>[] {"east", 2024L};
+    Map<Integer, Comparable<?>[]> input = new HashMap<>();
+    input.put(0, tuple);
+    ZonemapFragmentPruner.PartitionInfo info =
+        new ZonemapFragmentPruner.PartitionInfo(Arrays.asList("region", "year"), input);
+    tuple[0] = "west"; // mutate caller's array
+    assertEquals("east", info.getFragmentPartitionKeys().get(0)[0]);
+  }
+
+  @Test
+  public void getFragmentPartitionKeysIsUnmodifiable() {
+    ZonemapFragmentPruner.PartitionInfo info =
+        new ZonemapFragmentPruner.PartitionInfo(
+            Collections.singletonList("region"), tuples(new Object[] {"east"}));
+    assertThrows(
+        UnsupportedOperationException.class,
+        () -> info.getFragmentPartitionKeys().put(1, new Comparable<?>[] {"west"}));
+  }
+
+  @Test
+  public void partitionKeyForFragmentMultiColumn() {
+    Map<Integer, Comparable<?>[]> map = new HashMap<>();
+    map.put(7, new Comparable<?>[] {"us", 2024L});
+    ZonemapFragmentPruner.PartitionInfo info =
+        new ZonemapFragmentPruner.PartitionInfo(Arrays.asList("region", "year"), map);
+
+    InternalRow row = info.partitionKeyForFragment(7);
+    assertEquals(2, row.numFields());
+    assertEquals(UTF8String.fromString("us"), row.get(0, DataTypes.StringType));
+    assertEquals(2024L, row.getLong(1));
+  }
+
+  @Test
+  public void partitionKeyForMissingFragmentReturnsNullRow() {
+    ZonemapFragmentPruner.PartitionInfo info =
+        new ZonemapFragmentPruner.PartitionInfo(
+            Arrays.asList("a", "b"), tuples(new Object[] {"x", 1L}));
+    InternalRow row = info.partitionKeyForFragment(999);
+    assertEquals(2, row.numFields());
+    assertTrue(row.isNullAt(0));
+    assertTrue(row.isNullAt(1));
+  }
+
+  @Test
+  public void forSingleColumnMatchesListForm() {
+    Map<Integer, Comparable<?>> scalarMap = new HashMap<>();
+    scalarMap.put(0, "east");
+    scalarMap.put(1, "west");
+    ZonemapFragmentPruner.PartitionInfo factory =
+        ZonemapFragmentPruner.PartitionInfo.forSingleColumn("region", scalarMap);
+
+    Map<Integer, Comparable<?>[]> listMap = new HashMap<>();
+    listMap.put(0, new Comparable<?>[] {"east"});
+    listMap.put(1, new Comparable<?>[] {"west"});
+    ZonemapFragmentPruner.PartitionInfo direct =
+        new ZonemapFragmentPruner.PartitionInfo(Collections.singletonList("region"), listMap);
+
+    assertEquals(direct.getColumnNames(), factory.getColumnNames());
+    assertEquals(direct.size(), factory.size());
+    // partitionKeyForFragment output must match for every fragment id.
+    for (int fragId : new int[] {0, 1}) {
+      InternalRow a = direct.partitionKeyForFragment(fragId);
+      InternalRow b = factory.partitionKeyForFragment(fragId);
+      assertEquals(a.numFields(), b.numFields());
+      assertEquals(a.get(0, DataTypes.StringType), b.get(0, DataTypes.StringType));
+    }
+  }
+
+  @Test
+  public void restrictToSubsetsFragments() {
+    Map<Integer, Comparable<?>[]> m = new HashMap<>();
+    m.put(0, new Comparable<?>[] {"us", 2024L});
+    m.put(1, new Comparable<?>[] {"us", 2025L});
+    m.put(2, new Comparable<?>[] {"eu", 2024L});
+    ZonemapFragmentPruner.PartitionInfo info =
+        new ZonemapFragmentPruner.PartitionInfo(Arrays.asList("region", "year"), m);
+    ZonemapFragmentPruner.PartitionInfo narrowed =
+        info.restrictTo(new HashSet<>(Arrays.asList(0, 2)));
+    assertNotSame(info, narrowed);
+    assertEquals(2, narrowed.size());
+    assertEquals(3, info.size()); // original unchanged
+    assertTrue(narrowed.getFragmentPartitionKeys().containsKey(0));
+    assertTrue(narrowed.getFragmentPartitionKeys().containsKey(2));
+    assertFalse(narrowed.getFragmentPartitionKeys().containsKey(1));
+  }
+
+  @Test
+  public void withSoftCappedCarriesFlag() {
+    ZonemapFragmentPruner.PartitionInfo info =
+        new ZonemapFragmentPruner.PartitionInfo(
+            Collections.singletonList("a"), tuples(new Object[] {"x"}));
+    assertFalse(info.isSoftCapped());
+    ZonemapFragmentPruner.PartitionInfo capped = info.withSoftCapped();
+    assertTrue(capped.isSoftCapped());
+    // Original untouched.
+    assertFalse(info.isSoftCapped());
+    // Data preserved.
+    assertEquals(info.getColumnNames(), capped.getColumnNames());
+    assertEquals(info.size(), capped.size());
+  }
+
+  @Test
+  public void javaSerializationRoundTrip() throws Exception {
+    Map<Integer, Comparable<?>[]> m = new HashMap<>();
+    m.put(0, new Comparable<?>[] {"us", 2024L});
+    m.put(1, new Comparable<?>[] {"eu", 2025L});
+    ZonemapFragmentPruner.PartitionInfo info =
+        new ZonemapFragmentPruner.PartitionInfo(Arrays.asList("region", "year"), m, true);
+
+    ByteArrayOutputStream baos = new ByteArrayOutputStream();
+    try (ObjectOutputStream oos = new ObjectOutputStream(baos)) {
+      oos.writeObject(info);
+    }
+    ZonemapFragmentPruner.PartitionInfo restored;
+    try (ObjectInputStream ois =
+        new ObjectInputStream(new ByteArrayInputStream(baos.toByteArray()))) {
+      restored = (ZonemapFragmentPruner.PartitionInfo) ois.readObject();
+    }
+
+    assertEquals(Arrays.asList("region", "year"), restored.getColumnNames());
+    assertEquals(2, restored.size());
+    assertTrue(restored.isSoftCapped());
+    assertArrayEquals(new Object[] {"us", 2024L}, restored.getFragmentPartitionKeys().get(0));
+  }
+
+  @Test
+  public void columnNamesAreImmutableView() {
+    List<String> names = new java.util.ArrayList<>(Arrays.asList("a", "b"));
+    ZonemapFragmentPruner.PartitionInfo info =
+        new ZonemapFragmentPruner.PartitionInfo(names, tuples(new Object[] {"x", 1L}));
+    names.add("c"); // mutate caller's list after construction
+    assertEquals(Arrays.asList("a", "b"), info.getColumnNames());
+    assertThrows(UnsupportedOperationException.class, () -> info.getColumnNames().add("c"));
+  }
+}
diff --git a/lance-spark-base_2.12/src/test/java/org/lance/spark/read/ZonemapFragmentPrunerTest.java b/lance-spark-base_2.12/src/test/java/org/lance/spark/read/ZonemapFragmentPrunerTest.java
index 35678a03d..f0786d04a 100644
--- a/lance-spark-base_2.12/src/test/java/org/lance/spark/read/ZonemapFragmentPrunerTest.java
+++ b/lance-spark-base_2.12/src/test/java/org/lance/spark/read/ZonemapFragmentPrunerTest.java
@@ -563,7 +563,7 @@ public void testPartitionKeyForFragmentString() {
     values.put(0, "east");
     values.put(1, "west");
     ZonemapFragmentPruner.PartitionInfo info =
-        new ZonemapFragmentPruner.PartitionInfo("region", values);
+        ZonemapFragmentPruner.PartitionInfo.forSingleColumn("region", values);
 
     InternalRow row0 = info.partitionKeyForFragment(0);
     assertNotNull(row0);
@@ -583,7 +583,7 @@ public void testPartitionKeyForFragmentLong() {
     values.put(0, 2023L);
     values.put(1, 2024L);
     ZonemapFragmentPruner.PartitionInfo info =
-        new ZonemapFragmentPruner.PartitionInfo("year", values);
+        ZonemapFragmentPruner.PartitionInfo.forSingleColumn("year", values);
 
     InternalRow row0 = info.partitionKeyForFragment(0);
     assertEquals(2023L, row0.getLong(0));
@@ -597,7 +597,7 @@ public void testPartitionKeyForMissingFragment() {
     Map<Integer, Comparable<?>> values = new HashMap<>();
     values.put(0, "east");
     ZonemapFragmentPruner.PartitionInfo info =
-        new ZonemapFragmentPruner.PartitionInfo("region", values);
+        ZonemapFragmentPruner.PartitionInfo.forSingleColumn("region", values);
 
     InternalRow row = info.partitionKeyForFragment(99);
     assertNotNull(row);
@@ -610,7 +610,7 @@ public void testPartitionInfoIsSerializable() throws Exception {
     values.put(0, "east");
     values.put(1, "west");
     ZonemapFragmentPruner.PartitionInfo info =
-        new ZonemapFragmentPruner.PartitionInfo("region", values);
+        ZonemapFragmentPruner.PartitionInfo.forSingleColumn("region", values);
 
     java.io.ByteArrayOutputStream baos = new java.io.ByteArrayOutputStream();
     java.io.ObjectOutputStream oos = new java.io.ObjectOutputStream(baos);
@@ -622,9 +622,9 @@ public void testPartitionInfoIsSerializable() throws Exception {
     ZonemapFragmentPruner.PartitionInfo deserialized =
         (ZonemapFragmentPruner.PartitionInfo) ois.readObject();
 
-    assertEquals("region", deserialized.getColumnName());
-    assertEquals("east", deserialized.getFragmentPartitionValues().get(0));
-    assertEquals("west", deserialized.getFragmentPartitionValues().get(1));
+    assertEquals(java.util.Collections.singletonList("region"), deserialized.getColumnNames());
+    assertEquals("east", deserialized.getFragmentPartitionKeys().get(0)[0]);
+    assertEquals("west", deserialized.getFragmentPartitionKeys().get(1)[0]);
   }
 
   @Test
@@ -632,10 +632,10 @@ public void testPartitionInfoImmutableMap() {
     Map<Integer, Comparable<?>> values = new HashMap<>();
     values.put(0, "east");
     ZonemapFragmentPruner.PartitionInfo info =
-        new ZonemapFragmentPruner.PartitionInfo("region", values);
+        ZonemapFragmentPruner.PartitionInfo.forSingleColumn("region", values);
 
     assertThrows(
         UnsupportedOperationException.class,
-        () -> info.getFragmentPartitionValues().put(1, "west"));
+        () -> info.getFragmentPartitionKeys().put(1, new Comparable<?>[] {"west"}));
   }
 }

From e7a47624a8f0362486835286ed1902d60c9e3a50 Mon Sep 17 00:00:00 2001
From: yangjie01 <yangjie01@baidu.com>
Date: Wed, 22 Apr 2026 10:25:51 +0800
Subject: [PATCH 2/3] fix(partitioning): type-aware InternalRow encoding for
 partition keys

PartitionInfo.toSparkValue previously passed ZoneStats values through
unchanged, assuming the returned Comparable<?> already matched Spark's
InternalRow encoding. Verified against lance-core 6.0.0-beta.1 that
ZoneStats.getMin/getMax returns java.lang.Long for every integral Arrow
width (int8/16/32 included) and for Date (epoch-days) / Timestamp
(epoch-micros) - so the pass-through would hand a Long to Spark's
InternalRow.getByte/getShort/getInt on Byte/Short/Int columns and
ClassCastException at scan time.

Make toSparkValue type-dispatched: carry a parallel List<DataType>
alongside columnNames and narrow Long -> byte/short/int for
Byte/Short/Int/Date slots, pass through for Long/Timestamp, wrap
Strings in UTF8String, Booleans pass through. This also closes the
Date/Timestamp deferral since narrowing handles both cleanly.

Changes
- ZonemapFragmentPruner.PartitionInfo: add columnTypes field, bump
  serialVersionUID 2L -> 3L, type-aware toSparkValue, preserve types
  through restrictTo / withSoftCapped. forSingleColumn takes a DataType.
- LanceScanBuilder.isSupportedPartitionType: add DateType, TimestampType
  to the whitelist; WARN message updated.
- LanceScanBuilder.detectPartitioning: resolve per-column types from
  fullSchema and thread them into PartitionInfo.

Tests
- PartitionInfoTest grows from 12 to 20: adds rejectsColumnTypesSize
  Mismatch, explicit narrowing assertions for Byte/Short/Int/Date/
  Timestamp/Boolean/String.
- LanceScanBuilderTest.detectPartitioning* switched to real TEST_SCHEMA
  columns (x, y) and asserts the resolved columnTypes.
- Existing callers (LanceScanTest, ZonemapFragmentPrunerTest) updated
  to the new forSingleColumn / constructor signatures.

All 333 tests pass in lance-spark-base_2.12; lance-spark-3.4_2.12
cross-compile green.
---
 .../lance/spark/read/LanceScanBuilder.java    |  25 ++--
 .../spark/read/ZonemapFragmentPruner.java     |  74 ++++++++---
 .../spark/read/LanceScanBuilderTest.java      |  21 +--
 .../org/lance/spark/read/LanceScanTest.java   |  15 ++-
 .../lance/spark/read/PartitionInfoTest.java   | 124 +++++++++++++++---
 .../spark/read/ZonemapFragmentPrunerTest.java |  15 ++-
 6 files changed, 214 insertions(+), 60 deletions(-)

diff --git a/lance-spark-base_2.12/src/main/java/org/lance/spark/read/LanceScanBuilder.java b/lance-spark-base_2.12/src/main/java/org/lance/spark/read/LanceScanBuilder.java
index 5016d8fcb..077f43813 100644
--- a/lance-spark-base_2.12/src/main/java/org/lance/spark/read/LanceScanBuilder.java
+++ b/lance-spark-base_2.12/src/main/java/org/lance/spark/read/LanceScanBuilder.java
@@ -464,12 +464,9 @@ private boolean isSupportedPartitionType(String columnName) {
       return false;
     }
     org.apache.spark.sql.types.DataType type = fullSchema.fields()[idx].dataType();
-    // Whitelist types whose Spark InternalRow encoding matches the raw Java value returned by
-    // ZoneStats (primitive pass-through, or UTF8String for Strings). Date/Timestamp are
-    // deliberately excluded: Spark expects epoch-days int / epoch-micros long but ZoneStats
-    // may return java.sql.Date / java.time.Instant, which would corrupt SPJ keys without an
-    // explicit converter. Re-enable once the JNI-produced runtime class is pinned and a
-    // toSparkValue mapping is added for it.
+    // Whitelist types that PartitionInfo.toSparkValue can encode into Spark's InternalRow.
+    // ZoneStats always returns Long for integral Arrow widths (int8/16/32 too) and for
+    // Date (epoch-days) / Timestamp (epoch-micros); toSparkValue narrows/wraps appropriately.
     // Use .equals() rather than == so a DataType materialized from a deserialized schema
     // (e.g. JSON/Avro round-trip) still matches the singleton constants.
     if (DataTypes.BooleanType.equals(type)
@@ -477,12 +474,14 @@ private boolean isSupportedPartitionType(String columnName) {
         || DataTypes.ShortType.equals(type)
         || DataTypes.IntegerType.equals(type)
         || DataTypes.LongType.equals(type)
-        || DataTypes.StringType.equals(type)) {
+        || DataTypes.StringType.equals(type)
+        || DataTypes.DateType.equals(type)
+        || DataTypes.TimestampType.equals(type)) {
       return true;
     }
     LOG.warn(
         "partition column '{}' has unsupported type {}: whitelist is"
-            + " Boolean/Byte/Short/Int/Long/String",
+            + " Boolean/Byte/Short/Int/Long/String/Date/Timestamp",
         columnName,
         type.typeName());
     return false;
@@ -541,7 +540,9 @@ ZonemapFragmentPruner.PartitionInfo detectPartitioning(
       return null;
     }
 
-    // Assemble tuples in declaration order.
+    // Assemble tuples in declaration order, and resolve per-column Spark types from the full
+    // read schema so PartitionInfo can encode each value into the right InternalRow slot
+    // (narrowing Long -> byte/short/int for Byte/Short/Int/Date columns, pass-through otherwise).
     int width = partitionColumns.size();
     Map<Integer, Comparable<?>[]> tuples = new HashMap<>();
     for (Integer fragId : intersection) {
@@ -551,8 +552,12 @@ ZonemapFragmentPruner.PartitionInfo detectPartitioning(
       }
       tuples.put(fragId, tuple);
     }
+    List<org.apache.spark.sql.types.DataType> columnTypes = new java.util.ArrayList<>(width);
+    for (String name : partitionColumns) {
+      columnTypes.add(fullSchema.fields()[fullSchema.fieldIndex(name)].dataType());
+    }
     ZonemapFragmentPruner.PartitionInfo info =
-        new ZonemapFragmentPruner.PartitionInfo(partitionColumns, tuples);
+        new ZonemapFragmentPruner.PartitionInfo(partitionColumns, columnTypes, tuples);
 
     // Apply soft cap based on session conf (if available) or the default. When the cap fires,
     // the scan will report UnknownPartitioning — log that branch separately so operators don't
diff --git a/lance-spark-base_2.12/src/main/java/org/lance/spark/read/ZonemapFragmentPruner.java b/lance-spark-base_2.12/src/main/java/org/lance/spark/read/ZonemapFragmentPruner.java
index cfc326451..0ad67b3a2 100644
--- a/lance-spark-base_2.12/src/main/java/org/lance/spark/read/ZonemapFragmentPruner.java
+++ b/lance-spark-base_2.12/src/main/java/org/lance/spark/read/ZonemapFragmentPruner.java
@@ -29,6 +29,8 @@
 import org.apache.spark.sql.sources.LessThanOrEqual;
 import org.apache.spark.sql.sources.Not;
 import org.apache.spark.sql.sources.Or;
+import org.apache.spark.sql.types.DataType;
+import org.apache.spark.sql.types.DataTypes;
 import org.apache.spark.unsafe.types.UTF8String;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -343,31 +345,43 @@ private enum ComparisonType {
   }
 
   /**
-   * Result of partition detection: the ordered list of partition column names and a map from
-   * fragment ID to the partition tuple (one value per declared column, in declaration order).
+   * Result of partition detection: the ordered list of partition column names, a parallel list of
+   * Spark {@link DataType}s (used to encode each fragment's tuple into an {@link InternalRow}), and
+   * a map from fragment ID to the partition tuple (one value per declared column, in declaration
+   * order).
    *
-   * <p>Width invariants (enforced by the constructor): every tuple has length {@code
-   * columnNames.size()}; column names are distinct and non-empty.
+   * <p>Width invariants (enforced by the constructor): {@code columnTypes.size()} equals {@code
+   * columnNames.size()}; every tuple has length {@code columnNames.size()}; column names are
+   * distinct and non-empty.
    */
   public static final class PartitionInfo implements Serializable {
+    // Bumped from the single-column shape (1L on upstream main): adds columnTypes and uses
+    // tuple storage (Map<Integer, Comparable<?>[]>) for multi-column type-aware encoding.
     private static final long serialVersionUID = 2L;
 
     private final List<String> columnNames;
+    private final List<DataType> columnTypes;
     private final Map<Integer, Comparable<?>[]> fragmentPartitionKeys;
     private final boolean softCapped;
 
     public PartitionInfo(
-        List<String> columnNames, Map<Integer, Comparable<?>[]> fragmentPartitionKeys) {
-      this(columnNames, fragmentPartitionKeys, false);
+        List<String> columnNames,
+        List<DataType> columnTypes,
+        Map<Integer, Comparable<?>[]> fragmentPartitionKeys) {
+      this(columnNames, columnTypes, fragmentPartitionKeys, false);
     }
 
     public PartitionInfo(
         List<String> columnNames,
+        List<DataType> columnTypes,
         Map<Integer, Comparable<?>[]> fragmentPartitionKeys,
         boolean softCapped) {
       if (columnNames == null || columnNames.isEmpty()) {
         throw new IllegalArgumentException("columnNames must be non-empty");
       }
+      if (columnTypes == null || columnTypes.size() != columnNames.size()) {
+        throw new IllegalArgumentException("columnTypes must have the same size as columnNames");
+      }
       if (new HashSet<>(columnNames).size() != columnNames.size()) {
         throw new IllegalArgumentException("columnNames must be distinct: " + columnNames);
       }
@@ -382,6 +396,7 @@ public PartitionInfo(
         copy.put(e.getKey(), tuple.clone());
       }
       this.columnNames = Collections.unmodifiableList(new java.util.ArrayList<>(columnNames));
+      this.columnTypes = Collections.unmodifiableList(new java.util.ArrayList<>(columnTypes));
       this.fragmentPartitionKeys = Collections.unmodifiableMap(copy);
       this.softCapped = softCapped;
     }
@@ -391,18 +406,23 @@ public PartitionInfo(
      * and delegates to the list-form constructor.
      */
     public static PartitionInfo forSingleColumn(
-        String columnName, Map<Integer, Comparable<?>> valueByFragment) {
+        String columnName, DataType columnType, Map<Integer, Comparable<?>> valueByFragment) {
       Map<Integer, Comparable<?>[]> tupleMap = new HashMap<>();
       for (Map.Entry<Integer, Comparable<?>> e : valueByFragment.entrySet()) {
         tupleMap.put(e.getKey(), new Comparable<?>[] {e.getValue()});
       }
-      return new PartitionInfo(Collections.singletonList(columnName), tupleMap);
+      return new PartitionInfo(
+          Collections.singletonList(columnName), Collections.singletonList(columnType), tupleMap);
     }
 
     public List<String> getColumnNames() {
       return columnNames;
     }
 
+    public List<DataType> getColumnTypes() {
+      return columnTypes;
+    }
+
     /**
      * Returns the fragment-id → tuple map as an unmodifiable snapshot; each tuple array is
      * defensively cloned on every call so mutating the returned arrays cannot corrupt internal
@@ -438,12 +458,12 @@ public PartitionInfo restrictTo(Set<Integer> survivingFragmentIds) {
           restricted.put(e.getKey(), e.getValue());
         }
       }
-      return new PartitionInfo(columnNames, restricted, false);
+      return new PartitionInfo(columnNames, columnTypes, restricted, false);
     }
 
     /** Marks this info as soft-capped, returning a new instance (immutability preserved). */
     public PartitionInfo withSoftCapped() {
-      return new PartitionInfo(columnNames, fragmentPartitionKeys, true);
+      return new PartitionInfo(columnNames, columnTypes, fragmentPartitionKeys, true);
     }
 
     /**
@@ -458,23 +478,41 @@ public InternalRow partitionKeyForFragment(int fragmentId) {
         return new GenericInternalRow(out);
       }
       for (int i = 0; i < width; i++) {
-        out[i] = toSparkValue(tuple[i]);
+        out[i] = toSparkValue(tuple[i], columnTypes.get(i));
       }
       return new GenericInternalRow(out);
     }
 
-    private static Object toSparkValue(Comparable<?> value) {
+    /**
+     * Converts a ZoneStats value to the exact Java class Spark's {@link InternalRow} expects for
+     * the target slot. ZoneStats returns {@code Long} for every integral Arrow width (int8/16/32
+     * included) and for Date (epoch-days) / Timestamp (epoch-micros); those need explicit narrowing
+     * to match {@code getByte}/{@code getShort}/{@code getInt} accessors. Boolean is already typed;
+     * Strings are wrapped in {@link UTF8String}.
+     */
+    private static Object toSparkValue(Comparable<?> value, DataType type) {
       if (value == null) {
         return null;
       }
-      if (value instanceof String) {
+      if (DataTypes.BooleanType.equals(type)) {
+        return value;
+      }
+      if (DataTypes.ByteType.equals(type)) {
+        return ((Number) value).byteValue();
+      }
+      if (DataTypes.ShortType.equals(type)) {
+        return ((Number) value).shortValue();
+      }
+      if (DataTypes.IntegerType.equals(type) || DataTypes.DateType.equals(type)) {
+        return ((Number) value).intValue();
+      }
+      if (DataTypes.LongType.equals(type) || DataTypes.TimestampType.equals(type)) {
+        return ((Number) value).longValue();
+      }
+      if (DataTypes.StringType.equals(type)) {
         return UTF8String.fromString((String) value);
       }
-      // Boolean/Byte/Short/Integer/Long pass through unchanged — Spark's InternalRow accepts
-      // them as-is. Date/Timestamp mappings belong here once ZoneStats' concrete return class
-      // is pinned; until then the type whitelist in LanceScanBuilder rejects those columns
-      // upstream so we never see them here.
-      return value;
+      throw new IllegalArgumentException("Unsupported partition column type: " + type);
     }
   }
 
diff --git a/lance-spark-base_2.12/src/test/java/org/lance/spark/read/LanceScanBuilderTest.java b/lance-spark-base_2.12/src/test/java/org/lance/spark/read/LanceScanBuilderTest.java
index eba11137b..d55db4d0e 100644
--- a/lance-spark-base_2.12/src/test/java/org/lance/spark/read/LanceScanBuilderTest.java
+++ b/lance-spark-base_2.12/src/test/java/org/lance/spark/read/LanceScanBuilderTest.java
@@ -428,17 +428,18 @@ public void testPartitionColumnsUnsupportedTypeFallsBackCleanly() {
 
   @Test
   public void testDetectPartitioningRejectsMismatchedCoverage() {
-    // Column "a" covers fragments {0, 1}; column "b" covers only {0}. Strict-subset coverage
+    // Column "x" covers fragments {0, 1}; column "y" covers only {0}. Strict-subset coverage
     // must reject detection entirely — otherwise fragment 1 would produce a phantom null tuple
-    // element for column "b" (same class of bug the per-column intersection used to allow).
+    // element for column "y" (same class of bug the per-column intersection used to allow).
+    // Column names chosen from TEST_SCHEMA (x, y, b, c) so fullSchema.fieldIndex resolves.
     LanceScanBuilder builder = createBuilder();
     Map<String, List<ZoneStats>> stats = new HashMap<>();
     stats.put(
-        "a", Arrays.asList(new ZoneStats(0, 0, 10, 1L, 1L, 0), new ZoneStats(1, 0, 10, 2L, 2L, 0)));
-    stats.put("b", Collections.singletonList(new ZoneStats(0, 0, 10, 100L, 100L, 0)));
+        "x", Arrays.asList(new ZoneStats(0, 0, 10, 1L, 1L, 0), new ZoneStats(1, 0, 10, 2L, 2L, 0)));
+    stats.put("y", Collections.singletonList(new ZoneStats(0, 0, 10, 100L, 100L, 0)));
 
     ZonemapFragmentPruner.PartitionInfo info =
-        builder.detectPartitioning(Arrays.asList("a", "b"), stats);
+        builder.detectPartitioning(Arrays.asList("x", "y"), stats);
     assertNull(info, "Detection must reject when per-column fragment coverage differs");
   }
 
@@ -447,16 +448,18 @@ public void testDetectPartitioningAcceptsIdenticalCoverage() {
     LanceScanBuilder builder = createBuilder();
     Map<String, List<ZoneStats>> stats = new HashMap<>();
     stats.put(
-        "a", Arrays.asList(new ZoneStats(0, 0, 10, 1L, 1L, 0), new ZoneStats(1, 0, 10, 2L, 2L, 0)));
+        "x", Arrays.asList(new ZoneStats(0, 0, 10, 1L, 1L, 0), new ZoneStats(1, 0, 10, 2L, 2L, 0)));
     stats.put(
-        "b",
+        "y",
         Arrays.asList(
             new ZoneStats(0, 0, 10, 100L, 100L, 0), new ZoneStats(1, 0, 10, 200L, 200L, 0)));
 
     ZonemapFragmentPruner.PartitionInfo info =
-        builder.detectPartitioning(Arrays.asList("a", "b"), stats);
+        builder.detectPartitioning(Arrays.asList("x", "y"), stats);
     assertNotNull(info);
-    assertEquals(Arrays.asList("a", "b"), info.getColumnNames());
+    assertEquals(Arrays.asList("x", "y"), info.getColumnNames());
+    // Types resolved from TEST_SCHEMA (both x and y are LongType).
+    assertEquals(Arrays.asList(DataTypes.LongType, DataTypes.LongType), info.getColumnTypes());
     assertEquals(2, info.size());
     // Tuples are assembled in declaration order, fragment by fragment.
     assertArrayEquals(new Object[] {1L, 100L}, info.getFragmentPartitionKeys().get(0));
diff --git a/lance-spark-base_2.12/src/test/java/org/lance/spark/read/LanceScanTest.java b/lance-spark-base_2.12/src/test/java/org/lance/spark/read/LanceScanTest.java
index f391ecd5e..18737e6cd 100644
--- a/lance-spark-base_2.12/src/test/java/org/lance/spark/read/LanceScanTest.java
+++ b/lance-spark-base_2.12/src/test/java/org/lance/spark/read/LanceScanTest.java
@@ -199,7 +199,8 @@ public void testOutputPartitioningWithPartitionInfo() {
     fragValues.put(0, "east");
     fragValues.put(1, "west");
     ZonemapFragmentPruner.PartitionInfo partInfo =
-        ZonemapFragmentPruner.PartitionInfo.forSingleColumn("region", fragValues);
+        ZonemapFragmentPruner.PartitionInfo.forSingleColumn(
+            "region", org.apache.spark.sql.types.DataTypes.StringType, fragValues);
 
     LanceScan scan =
         new LanceScan(
@@ -269,7 +270,12 @@ public void testOutputPartitioningMultiColumn() {
     tuples.put(0, new Comparable<?>[] {"us", 2024L});
     tuples.put(1, new Comparable<?>[] {"eu", 2025L});
     ZonemapFragmentPruner.PartitionInfo info =
-        new ZonemapFragmentPruner.PartitionInfo(java.util.Arrays.asList("region", "year"), tuples);
+        new ZonemapFragmentPruner.PartitionInfo(
+            java.util.Arrays.asList("region", "year"),
+            java.util.Arrays.asList(
+                org.apache.spark.sql.types.DataTypes.StringType,
+                org.apache.spark.sql.types.DataTypes.LongType),
+            tuples);
 
     LanceScan scan = buildScanWithPartitionInfo(info);
     scan.planInputPartitions();
@@ -296,7 +302,10 @@ public void testOutputPartitioningSoftCappedReturnsUnknown() {
     tuples.put(0, new Comparable<?>[] {"us"});
     ZonemapFragmentPruner.PartitionInfo info =
         new ZonemapFragmentPruner.PartitionInfo(
-                java.util.Collections.singletonList("region"), tuples)
+                java.util.Collections.singletonList("region"),
+                java.util.Collections.singletonList(
+                    org.apache.spark.sql.types.DataTypes.StringType),
+                tuples)
             .withSoftCapped();
 
     LanceScan scan = buildScanWithPartitionInfo(info);
diff --git a/lance-spark-base_2.12/src/test/java/org/lance/spark/read/PartitionInfoTest.java b/lance-spark-base_2.12/src/test/java/org/lance/spark/read/PartitionInfoTest.java
index 6648649ce..1bc288c7d 100644
--- a/lance-spark-base_2.12/src/test/java/org/lance/spark/read/PartitionInfoTest.java
+++ b/lance-spark-base_2.12/src/test/java/org/lance/spark/read/PartitionInfoTest.java
@@ -14,6 +14,7 @@
 package org.lance.spark.read;
 
 import org.apache.spark.sql.catalyst.InternalRow;
+import org.apache.spark.sql.types.DataType;
 import org.apache.spark.sql.types.DataTypes;
 import org.apache.spark.unsafe.types.UTF8String;
 import org.junit.jupiter.api.Test;
@@ -53,11 +54,18 @@ private static Map<Integer, Comparable<?>[]> tuples(Object[]... entries) {
     return out;
   }
 
+  private static final List<DataType> STRING_LONG =
+      Arrays.asList(DataTypes.StringType, DataTypes.LongType);
+
+  private static final List<DataType> STRING_ONLY = Collections.singletonList(DataTypes.StringType);
+
   @Test
   public void rejectsEmptyColumnNames() {
     assertThrows(
         IllegalArgumentException.class,
-        () -> new ZonemapFragmentPruner.PartitionInfo(Collections.emptyList(), new HashMap<>()));
+        () ->
+            new ZonemapFragmentPruner.PartitionInfo(
+                Collections.emptyList(), Collections.emptyList(), new HashMap<>()));
   }
 
   @Test
@@ -66,7 +74,9 @@ public void rejectsDuplicateColumnNames() {
         IllegalArgumentException.class,
         () ->
             new ZonemapFragmentPruner.PartitionInfo(
-                Arrays.asList("a", "a"), tuples(new Object[] {"x", "y"})));
+                Arrays.asList("a", "a"),
+                Arrays.asList(DataTypes.StringType, DataTypes.StringType),
+                tuples(new Object[] {"x", "y"})));
   }
 
   @Test
@@ -75,7 +85,18 @@ public void rejectsTupleWidthMismatch() {
     bad.put(0, new Comparable<?>[] {"x"}); // expects width 2
     assertThrows(
         IllegalArgumentException.class,
-        () -> new ZonemapFragmentPruner.PartitionInfo(Arrays.asList("a", "b"), bad));
+        () -> new ZonemapFragmentPruner.PartitionInfo(Arrays.asList("a", "b"), STRING_LONG, bad));
+  }
+
+  @Test
+  public void rejectsColumnTypesSizeMismatch() {
+    assertThrows(
+        IllegalArgumentException.class,
+        () ->
+            new ZonemapFragmentPruner.PartitionInfo(
+                Arrays.asList("a", "b"),
+                Collections.singletonList(DataTypes.StringType),
+                tuples(new Object[] {"x", 1L})));
   }
 
   @Test
@@ -84,7 +105,8 @@ public void constructorDefensivelyCopiesTuples() {
     Map<Integer, Comparable<?>[]> input = new HashMap<>();
     input.put(0, tuple);
     ZonemapFragmentPruner.PartitionInfo info =
-        new ZonemapFragmentPruner.PartitionInfo(Arrays.asList("region", "year"), input);
+        new ZonemapFragmentPruner.PartitionInfo(
+            Arrays.asList("region", "year"), STRING_LONG, input);
     tuple[0] = "west"; // mutate caller's array
     assertEquals("east", info.getFragmentPartitionKeys().get(0)[0]);
   }
@@ -93,7 +115,7 @@ public void constructorDefensivelyCopiesTuples() {
   public void getFragmentPartitionKeysIsUnmodifiable() {
     ZonemapFragmentPruner.PartitionInfo info =
         new ZonemapFragmentPruner.PartitionInfo(
-            Collections.singletonList("region"), tuples(new Object[] {"east"}));
+            Collections.singletonList("region"), STRING_ONLY, tuples(new Object[] {"east"}));
     assertThrows(
         UnsupportedOperationException.class,
         () -> info.getFragmentPartitionKeys().put(1, new Comparable<?>[] {"west"}));
@@ -104,7 +126,7 @@ public void partitionKeyForFragmentMultiColumn() {
     Map<Integer, Comparable<?>[]> map = new HashMap<>();
     map.put(7, new Comparable<?>[] {"us", 2024L});
     ZonemapFragmentPruner.PartitionInfo info =
-        new ZonemapFragmentPruner.PartitionInfo(Arrays.asList("region", "year"), map);
+        new ZonemapFragmentPruner.PartitionInfo(Arrays.asList("region", "year"), STRING_LONG, map);
 
     InternalRow row = info.partitionKeyForFragment(7);
     assertEquals(2, row.numFields());
@@ -116,7 +138,7 @@ public void partitionKeyForFragmentMultiColumn() {
   public void partitionKeyForMissingFragmentReturnsNullRow() {
     ZonemapFragmentPruner.PartitionInfo info =
         new ZonemapFragmentPruner.PartitionInfo(
-            Arrays.asList("a", "b"), tuples(new Object[] {"x", 1L}));
+            Arrays.asList("a", "b"), STRING_LONG, tuples(new Object[] {"x", 1L}));
     InternalRow row = info.partitionKeyForFragment(999);
     assertEquals(2, row.numFields());
     assertTrue(row.isNullAt(0));
@@ -129,13 +151,15 @@ public void forSingleColumnMatchesListForm() {
     scalarMap.put(0, "east");
     scalarMap.put(1, "west");
     ZonemapFragmentPruner.PartitionInfo factory =
-        ZonemapFragmentPruner.PartitionInfo.forSingleColumn("region", scalarMap);
+        ZonemapFragmentPruner.PartitionInfo.forSingleColumn(
+            "region", DataTypes.StringType, scalarMap);
 
     Map<Integer, Comparable<?>[]> listMap = new HashMap<>();
     listMap.put(0, new Comparable<?>[] {"east"});
     listMap.put(1, new Comparable<?>[] {"west"});
     ZonemapFragmentPruner.PartitionInfo direct =
-        new ZonemapFragmentPruner.PartitionInfo(Collections.singletonList("region"), listMap);
+        new ZonemapFragmentPruner.PartitionInfo(
+            Collections.singletonList("region"), STRING_ONLY, listMap);
 
     assertEquals(direct.getColumnNames(), factory.getColumnNames());
     assertEquals(direct.size(), factory.size());
@@ -155,7 +179,7 @@ public void restrictToSubsetsFragments() {
     m.put(1, new Comparable<?>[] {"us", 2025L});
     m.put(2, new Comparable<?>[] {"eu", 2024L});
     ZonemapFragmentPruner.PartitionInfo info =
-        new ZonemapFragmentPruner.PartitionInfo(Arrays.asList("region", "year"), m);
+        new ZonemapFragmentPruner.PartitionInfo(Arrays.asList("region", "year"), STRING_LONG, m);
     ZonemapFragmentPruner.PartitionInfo narrowed =
         info.restrictTo(new HashSet<>(Arrays.asList(0, 2)));
     assertNotSame(info, narrowed);
@@ -164,20 +188,23 @@ public void restrictToSubsetsFragments() {
     assertTrue(narrowed.getFragmentPartitionKeys().containsKey(0));
     assertTrue(narrowed.getFragmentPartitionKeys().containsKey(2));
     assertFalse(narrowed.getFragmentPartitionKeys().containsKey(1));
+    // Column types survive restriction.
+    assertEquals(STRING_LONG, narrowed.getColumnTypes());
   }
 
   @Test
-  public void withSoftCappedCarriesFlag() {
+  public void withSoftCappedCarriesFlagAndTypes() {
     ZonemapFragmentPruner.PartitionInfo info =
         new ZonemapFragmentPruner.PartitionInfo(
-            Collections.singletonList("a"), tuples(new Object[] {"x"}));
+            Collections.singletonList("a"), STRING_ONLY, tuples(new Object[] {"x"}));
     assertFalse(info.isSoftCapped());
     ZonemapFragmentPruner.PartitionInfo capped = info.withSoftCapped();
     assertTrue(capped.isSoftCapped());
     // Original untouched.
     assertFalse(info.isSoftCapped());
-    // Data preserved.
+    // Data and types preserved.
     assertEquals(info.getColumnNames(), capped.getColumnNames());
+    assertEquals(info.getColumnTypes(), capped.getColumnTypes());
     assertEquals(info.size(), capped.size());
   }
 
@@ -187,7 +214,8 @@ public void javaSerializationRoundTrip() throws Exception {
     m.put(0, new Comparable<?>[] {"us", 2024L});
     m.put(1, new Comparable<?>[] {"eu", 2025L});
     ZonemapFragmentPruner.PartitionInfo info =
-        new ZonemapFragmentPruner.PartitionInfo(Arrays.asList("region", "year"), m, true);
+        new ZonemapFragmentPruner.PartitionInfo(
+            Arrays.asList("region", "year"), STRING_LONG, m, true);
 
     ByteArrayOutputStream baos = new ByteArrayOutputStream();
     try (ObjectOutputStream oos = new ObjectOutputStream(baos)) {
@@ -200,6 +228,7 @@ public void javaSerializationRoundTrip() throws Exception {
     }
 
     assertEquals(Arrays.asList("region", "year"), restored.getColumnNames());
+    assertEquals(STRING_LONG, restored.getColumnTypes());
     assertEquals(2, restored.size());
     assertTrue(restored.isSoftCapped());
     assertArrayEquals(new Object[] {"us", 2024L}, restored.getFragmentPartitionKeys().get(0));
@@ -209,9 +238,74 @@ public void javaSerializationRoundTrip() throws Exception {
   public void columnNamesAreImmutableView() {
     List<String> names = new java.util.ArrayList<>(Arrays.asList("a", "b"));
     ZonemapFragmentPruner.PartitionInfo info =
-        new ZonemapFragmentPruner.PartitionInfo(names, tuples(new Object[] {"x", 1L}));
+        new ZonemapFragmentPruner.PartitionInfo(names, STRING_LONG, tuples(new Object[] {"x", 1L}));
     names.add("c"); // mutate caller's list after construction
     assertEquals(Arrays.asList("a", "b"), info.getColumnNames());
     assertThrows(UnsupportedOperationException.class, () -> info.getColumnNames().add("c"));
   }
+
+  // --- Type-aware narrowing (ZoneStats returns Long for every integral Arrow width) ---
+
+  @Test
+  public void byteColumnNarrowsLongToByte() {
+    ZonemapFragmentPruner.PartitionInfo info =
+        ZonemapFragmentPruner.PartitionInfo.forSingleColumn(
+            "b", DataTypes.ByteType, Collections.singletonMap(0, 5L));
+    InternalRow row = info.partitionKeyForFragment(0);
+    assertEquals((byte) 5, row.getByte(0));
+  }
+
+  @Test
+  public void shortColumnNarrowsLongToShort() {
+    ZonemapFragmentPruner.PartitionInfo info =
+        ZonemapFragmentPruner.PartitionInfo.forSingleColumn(
+            "s", DataTypes.ShortType, Collections.singletonMap(0, 1234L));
+    assertEquals((short) 1234, info.partitionKeyForFragment(0).getShort(0));
+  }
+
+  @Test
+  public void intColumnNarrowsLongToInt() {
+    ZonemapFragmentPruner.PartitionInfo info =
+        ZonemapFragmentPruner.PartitionInfo.forSingleColumn(
+            "i", DataTypes.IntegerType, Collections.singletonMap(0, 100_000L));
+    assertEquals(100_000, info.partitionKeyForFragment(0).getInt(0));
+  }
+
+  @Test
+  public void dateColumnEncodesAsEpochDaysInt() {
+    // ZoneStats returns epoch-days as Long (e.g. 19737 == 2024-01-15); Spark's InternalRow
+    // for DateType holds an int. Narrow without loss of information.
+    ZonemapFragmentPruner.PartitionInfo info =
+        ZonemapFragmentPruner.PartitionInfo.forSingleColumn(
+            "d", DataTypes.DateType, Collections.singletonMap(0, 19737L));
+    assertEquals(19737, info.partitionKeyForFragment(0).getInt(0));
+  }
+
+  @Test
+  public void timestampColumnEncodesAsEpochMicrosLong() {
+    // ZoneStats returns epoch-micros as Long; Spark's InternalRow for TimestampType holds long.
+    long micros = 1_705_276_800_000_000L;
+    ZonemapFragmentPruner.PartitionInfo info =
+        ZonemapFragmentPruner.PartitionInfo.forSingleColumn(
+            "t", DataTypes.TimestampType, Collections.singletonMap(0, micros));
+    assertEquals(micros, info.partitionKeyForFragment(0).getLong(0));
+  }
+
+  @Test
+  public void booleanColumnPassesThrough() {
+    ZonemapFragmentPruner.PartitionInfo info =
+        ZonemapFragmentPruner.PartitionInfo.forSingleColumn(
+            "b", DataTypes.BooleanType, Collections.singletonMap(0, Boolean.TRUE));
+    assertTrue(info.partitionKeyForFragment(0).getBoolean(0));
+  }
+
+  @Test
+  public void stringColumnWrapsAsUtf8String() {
+    ZonemapFragmentPruner.PartitionInfo info =
+        ZonemapFragmentPruner.PartitionInfo.forSingleColumn(
+            "r", DataTypes.StringType, Collections.singletonMap(0, "east"));
+    assertEquals(
+        UTF8String.fromString("east"),
+        info.partitionKeyForFragment(0).get(0, DataTypes.StringType));
+  }
 }
diff --git a/lance-spark-base_2.12/src/test/java/org/lance/spark/read/ZonemapFragmentPrunerTest.java b/lance-spark-base_2.12/src/test/java/org/lance/spark/read/ZonemapFragmentPrunerTest.java
index f0786d04a..3772840d5 100644
--- a/lance-spark-base_2.12/src/test/java/org/lance/spark/read/ZonemapFragmentPrunerTest.java
+++ b/lance-spark-base_2.12/src/test/java/org/lance/spark/read/ZonemapFragmentPrunerTest.java
@@ -563,7 +563,8 @@ public void testPartitionKeyForFragmentString() {
     values.put(0, "east");
     values.put(1, "west");
     ZonemapFragmentPruner.PartitionInfo info =
-        ZonemapFragmentPruner.PartitionInfo.forSingleColumn("region", values);
+        ZonemapFragmentPruner.PartitionInfo.forSingleColumn(
+            "region", org.apache.spark.sql.types.DataTypes.StringType, values);
 
     InternalRow row0 = info.partitionKeyForFragment(0);
     assertNotNull(row0);
@@ -583,7 +584,8 @@ public void testPartitionKeyForFragmentLong() {
     values.put(0, 2023L);
     values.put(1, 2024L);
     ZonemapFragmentPruner.PartitionInfo info =
-        ZonemapFragmentPruner.PartitionInfo.forSingleColumn("year", values);
+        ZonemapFragmentPruner.PartitionInfo.forSingleColumn(
+            "year", org.apache.spark.sql.types.DataTypes.LongType, values);
 
     InternalRow row0 = info.partitionKeyForFragment(0);
     assertEquals(2023L, row0.getLong(0));
@@ -597,7 +599,8 @@ public void testPartitionKeyForMissingFragment() {
     Map<Integer, Comparable<?>> values = new HashMap<>();
     values.put(0, "east");
     ZonemapFragmentPruner.PartitionInfo info =
-        ZonemapFragmentPruner.PartitionInfo.forSingleColumn("region", values);
+        ZonemapFragmentPruner.PartitionInfo.forSingleColumn(
+            "region", org.apache.spark.sql.types.DataTypes.StringType, values);
 
     InternalRow row = info.partitionKeyForFragment(99);
     assertNotNull(row);
@@ -610,7 +613,8 @@ public void testPartitionInfoIsSerializable() throws Exception {
     values.put(0, "east");
     values.put(1, "west");
     ZonemapFragmentPruner.PartitionInfo info =
-        ZonemapFragmentPruner.PartitionInfo.forSingleColumn("region", values);
+        ZonemapFragmentPruner.PartitionInfo.forSingleColumn(
+            "region", org.apache.spark.sql.types.DataTypes.StringType, values);
 
     java.io.ByteArrayOutputStream baos = new java.io.ByteArrayOutputStream();
     java.io.ObjectOutputStream oos = new java.io.ObjectOutputStream(baos);
@@ -632,7 +636,8 @@ public void testPartitionInfoImmutableMap() {
     Map<Integer, Comparable<?>> values = new HashMap<>();
     values.put(0, "east");
     ZonemapFragmentPruner.PartitionInfo info =
-        ZonemapFragmentPruner.PartitionInfo.forSingleColumn("region", values);
+        ZonemapFragmentPruner.PartitionInfo.forSingleColumn(
+            "region", org.apache.spark.sql.types.DataTypes.StringType, values);
 
     assertThrows(
         UnsupportedOperationException.class,

From 3b9225b39f4437255d2f4416b630a317c0ff745a Mon Sep 17 00:00:00 2001
From: yangjie01 <yangjie01@baidu.com>
Date: Wed, 22 Apr 2026 10:55:38 +0800
Subject: [PATCH 3/3] test(partitioning): fill coverage gaps identified in
 review
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Coverage audit exposed three areas with no direct assertions. Close them:

1. SparkVersionUtil.supportsMultiKeySpj had no unit tests — the running
   Spark version was the only input exercised. Split into a pure-function
   overload that takes the version string and cover it with two table-
   driven tests (accept vs reject): 3.5.x (incl. snapshot / rc / vendor
   suffixes) and 4+/5+ for accept; 3.4.x-and-earlier, conservative 3.6+
   reject, and malformed input (null / empty / no-dot / leading-dot /
   non-numeric major) for reject.

2. parsePartitionColumns had no direct assertion of the dedupe path.
   Promoted to package-private and added one test on "y, x , x, b" →
   ["y", "x", "b"] covering dedupe, whitespace trimming, and declaration-
   order preservation in a single shot.

3. PartitionInfo had no test for unsupported-type handling at encode
   time. Added a test that a non-whitelisted type (DoubleType) reaching
   toSparkValue throws IllegalArgumentException — guards against a
   future bypass of detection.

Visibility change: parsePartitionColumns is now package-private (was
private); SparkVersionUtil gets a package-private string-input overload.

Remaining gap (tracked separately): end-to-end SPJ integration across
Spark versions is blocked on the lance-core describeIndices retrain
limitation.

All tests pass in lance-spark-base_2.12.
---
 .../lance/spark/read/LanceScanBuilder.java    |  3 +-
 .../lance/spark/read/SparkVersionUtil.java    |  9 ++-
 .../spark/read/LanceScanBuilderTest.java      | 12 +++
 .../lance/spark/read/PartitionInfoTest.java   | 10 +++
 .../spark/read/SparkVersionUtilTest.java      | 77 +++++++++++++++++++
 5 files changed, 109 insertions(+), 2 deletions(-)
 create mode 100644 lance-spark-base_2.12/src/test/java/org/lance/spark/read/SparkVersionUtilTest.java

diff --git a/lance-spark-base_2.12/src/main/java/org/lance/spark/read/LanceScanBuilder.java b/lance-spark-base_2.12/src/main/java/org/lance/spark/read/LanceScanBuilder.java
index 077f43813..ac43e7d66 100644
--- a/lance-spark-base_2.12/src/main/java/org/lance/spark/read/LanceScanBuilder.java
+++ b/lance-spark-base_2.12/src/main/java/org/lance/spark/read/LanceScanBuilder.java
@@ -415,7 +415,8 @@ private static Set<String> extractReferencedColumns(Filter[] filters) {
    * rejects nested paths, and validates each column's Spark type against the whitelist. Returns an
    * empty list if the property is absent, empty, or any column fails validation (reject-all).
    */
-  private List<String> parsePartitionColumns(String raw) {
+  // Package-private so LanceScanBuilderTest can assert dedup / ordering directly.
+  List<String> parsePartitionColumns(String raw) {
     // Treat null, empty, whitespace-only, and pure-delimiter values (",", ", ,", ...) all as
     // "property not set" — these are the no-op cases; returning quietly avoids a spurious WARN.
     if (raw == null || raw.replace(",", "").trim().isEmpty()) {
diff --git a/lance-spark-base_2.12/src/main/java/org/lance/spark/read/SparkVersionUtil.java b/lance-spark-base_2.12/src/main/java/org/lance/spark/read/SparkVersionUtil.java
index 19ddc6478..16cbc4bfe 100644
--- a/lance-spark-base_2.12/src/main/java/org/lance/spark/read/SparkVersionUtil.java
+++ b/lance-spark-base_2.12/src/main/java/org/lance/spark/read/SparkVersionUtil.java
@@ -32,7 +32,14 @@ private SparkVersionUtil() {}
    * back to UnknownPartitioning" behavior.
    */
   static boolean supportsMultiKeySpj() {
-    String version = package$.MODULE$.SPARK_VERSION();
+    return supportsMultiKeySpj(package$.MODULE$.SPARK_VERSION());
+  }
+
+  /**
+   * Package-private pure-function overload for unit tests. Inputs the version string directly so
+   * the parse/allowlist logic can be exercised without a running SparkContext.
+   */
+  static boolean supportsMultiKeySpj(String version) {
     if (version == null) {
       return false;
     }
diff --git a/lance-spark-base_2.12/src/test/java/org/lance/spark/read/LanceScanBuilderTest.java b/lance-spark-base_2.12/src/test/java/org/lance/spark/read/LanceScanBuilderTest.java
index d55db4d0e..8aa7d8165 100644
--- a/lance-spark-base_2.12/src/test/java/org/lance/spark/read/LanceScanBuilderTest.java
+++ b/lance-spark-base_2.12/src/test/java/org/lance/spark/read/LanceScanBuilderTest.java
@@ -465,4 +465,16 @@ public void testDetectPartitioningAcceptsIdenticalCoverage() {
     assertArrayEquals(new Object[] {1L, 100L}, info.getFragmentPartitionKeys().get(0));
     assertArrayEquals(new Object[] {2L, 200L}, info.getFragmentPartitionKeys().get(1));
   }
+
+  // --- parsePartitionColumns: direct assertions on the token list ---
+
+  @Test
+  public void testParsePartitionColumnsDedupesTrimsAndPreservesOrder() {
+    // "y, x , x, b" exercises all three behaviors together: whitespace trimming happens before
+    // dedup (so " x " collapses with "x"), duplicates after the first are dropped with a WARN,
+    // and the surviving tokens keep source-string order (not alphabetic).
+    LanceScanBuilder builder = createBuilder();
+    List<String> result = builder.parsePartitionColumns("y, x , x, b");
+    assertEquals(Arrays.asList("y", "x", "b"), result);
+  }
 }
diff --git a/lance-spark-base_2.12/src/test/java/org/lance/spark/read/PartitionInfoTest.java b/lance-spark-base_2.12/src/test/java/org/lance/spark/read/PartitionInfoTest.java
index 1bc288c7d..0f45ff42c 100644
--- a/lance-spark-base_2.12/src/test/java/org/lance/spark/read/PartitionInfoTest.java
+++ b/lance-spark-base_2.12/src/test/java/org/lance/spark/read/PartitionInfoTest.java
@@ -308,4 +308,14 @@ public void stringColumnWrapsAsUtf8String() {
         UTF8String.fromString("east"),
         info.partitionKeyForFragment(0).get(0, DataTypes.StringType));
   }
+
+  @Test
+  public void unsupportedPartitionTypeThrowsAtEncodeTime() {
+    // If detection is bypassed and a non-whitelisted type reaches toSparkValue, we must fail loud
+    // rather than hand Spark a slot that silently contains the wrong Java class.
+    ZonemapFragmentPruner.PartitionInfo info =
+        ZonemapFragmentPruner.PartitionInfo.forSingleColumn(
+            "d", DataTypes.DoubleType, Collections.singletonMap(0, 1.5));
+    assertThrows(IllegalArgumentException.class, () -> info.partitionKeyForFragment(0));
+  }
 }
diff --git a/lance-spark-base_2.12/src/test/java/org/lance/spark/read/SparkVersionUtilTest.java b/lance-spark-base_2.12/src/test/java/org/lance/spark/read/SparkVersionUtilTest.java
new file mode 100644
index 000000000..b227e2f26
--- /dev/null
+++ b/lance-spark-base_2.12/src/test/java/org/lance/spark/read/SparkVersionUtilTest.java
@@ -0,0 +1,77 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.lance.spark.read;
+
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+/**
+ * Unit tests for {@link SparkVersionUtil#supportsMultiKeySpj(String)} — the pure-function overload
+ * that the allowlist logic delegates to. Two cases: strings the allowlist accepts, and strings it
+ * rejects (including malformed input).
+ */
+public class SparkVersionUtilTest {
+
+  @Test
+  public void acceptsAllowlistedVersions() {
+    String[] accepted = {
+      // Spark 3.5.x is the minimum version that reliably honors multi-key KGP.
+      "3.5.0",
+      "3.5.1",
+      "3.5.10",
+      // 3.5.x with snapshot / rc / vendor suffix: startsWith("3.5.") still matches.
+      "3.5.0-SNAPSHOT",
+      "3.5.1-rc1",
+      "3.5.2-databricks",
+      // Any 4.x+ build is accepted via the major-version branch.
+      "4.0.0",
+      "4.1.0",
+      "4.0.0-preview",
+      "5.0.0",
+      "10.0.0",
+    };
+    for (String v : accepted) {
+      assertTrue(SparkVersionUtil.supportsMultiKeySpj(v), "expected accepted: " + v);
+    }
+  }
+
+  @Test
+  public void rejectsUnsupportedAndMalformed() {
+    String[] rejected = {
+      // Pre-3.5 Spark lines: 3.4.x and earlier don't reliably honor multi-key KGP.
+      "3.4.0",
+      "3.4.1",
+      "3.4.0-preview",
+      "3.3.0",
+      "3.0.0",
+      "2.4.8",
+      // 3.6+ through 3.x: conservatively rejected until the allowlist is explicitly updated.
+      "3.6.0",
+      "3.9.9",
+      // null / empty / no-dot / leading-dot / non-numeric major: all unparseable → reject.
+      null,
+      "",
+      "3",
+      "custom",
+      ".5.0",
+      "vX.0.0",
+      "custom-fork.1.0",
+    };
+    for (String v : rejected) {
+      assertFalse(SparkVersionUtil.supportsMultiKeySpj(v), "expected rejected: " + v);
+    }
+  }
+}