From 5bd77142112aaf1ba89725290dce42ad070eb589 Mon Sep 17 00:00:00 2001 From: ugbotueferhire <266243254+ugbotueferhire@users.noreply.github.com> Date: Thu, 14 May 2026 10:44:30 +0100 Subject: [PATCH 1/2] fix(unique-value-count): drop NA from label set UniqueValueCountCalculation._all_unique_values used pd.Series.unique() which preserves pd.NA / np.nan / None. The seed dict then carried the missing-value sentinel as a label key, which pydantic rejected against Label = Union[StrictBool, int, str, None]. value_counts(dropna=True) already drops NA on the count side, so this aligns the label set with the count set. Missing values continue to be reported separately by MissingValueCount. Fixes #1616 for DataSummaryPreset / DataDriftPreset on categorical columns containing pd.NA. --- src/evidently/metrics/column_statistics.py | 4 +-- .../future/metrics/test_unique_value_count.py | 36 +++++++++++++++++++ 2 files changed, 38 insertions(+), 2 deletions(-) diff --git a/src/evidently/metrics/column_statistics.py b/src/evidently/metrics/column_statistics.py index ddde369af1..6ae47f054b 100644 --- a/src/evidently/metrics/column_statistics.py +++ b/src/evidently/metrics/column_statistics.py @@ -869,9 +869,9 @@ def share_label_display_name(self, label: Label) -> str: return f"Unique Value Share: {self.metric.column} for label {label}" def _all_unique_values(self, current: Dataset, reference: Optional[Dataset]) -> set: - values = set(current.as_dataframe()[self.metric.column].unique()) + values = set(current.as_dataframe()[self.metric.column].dropna().unique()) if reference is not None: - values.update(reference.as_dataframe()[self.metric.column].unique()) + values.update(reference.as_dataframe()[self.metric.column].dropna().unique()) return values def _calculate_value(self, dataset: Dataset, values: set): diff --git a/tests/future/metrics/test_unique_value_count.py b/tests/future/metrics/test_unique_value_count.py index 35b71d5fa2..99c2b8d1d3 100644 --- a/tests/future/metrics/test_unique_value_count.py +++ b/tests/future/metrics/test_unique_value_count.py @@ -1,5 +1,7 @@ +import numpy as np import pandas as pd +from evidently import DataDefinition from evidently import Dataset from evidently import Report from evidently.core.metric_types import ByLabelCountValue @@ -40,3 +42,37 @@ def test_unique_value_count_metric(): assert label_count.display_name == result[label]["count_display_name"] assert label_share.value == result[label]["share"] assert label_share.display_name == result[label]["share_display_name"] + + +def test_unique_value_count_with_pd_na_in_string_dtype(): + """Issue #1616: pd.NA in a nullable-string column must not become a label key.""" + metric = UniqueValueCount(column="col1") + data = pd.DataFrame({"col1": pd.Series(["a", "a", "b", pd.NA], dtype="string")}) + dataset = Dataset.from_pandas(data, DataDefinition(categorical_columns=["col1"])) + res = Report([metric]).run(dataset, None)._context.get_metric_result(metric) + assert isinstance(res, ByLabelCountValue) + assert set(res.labels()) == {"a", "b"} + a_count, _ = res.get_label_result("a") + b_count, _ = res.get_label_result("b") + assert a_count.value == 2 + assert b_count.value == 1 + + +def test_unique_value_count_with_nan_in_object_dtype(): + """np.nan / None in an object column also must not surface as a label.""" + metric = UniqueValueCount(column="col1") + data = pd.DataFrame({"col1": pd.Series(["a", "b", None, np.nan], dtype="object")}) + dataset = Dataset.from_pandas(data, DataDefinition(categorical_columns=["col1"])) + res = Report([metric]).run(dataset, None)._context.get_metric_result(metric) + assert isinstance(res, ByLabelCountValue) + assert set(res.labels()) == {"a", "b"} + + +def test_unique_value_count_with_pd_na_in_int64_dtype(): + """pd.NA in a nullable Int64 column.""" + metric = UniqueValueCount(column="col1") + data = pd.DataFrame({"col1": pd.array([1, 2, 2, pd.NA], dtype="Int64")}) + dataset = Dataset.from_pandas(data, DataDefinition(categorical_columns=["col1"])) + res = Report([metric]).run(dataset, None)._context.get_metric_result(metric) + assert isinstance(res, ByLabelCountValue) + assert set(res.labels()) == {1, 2} From e9569a0d167eaf0dcf4a99399e4219cb3069bac1 Mon Sep 17 00:00:00 2001 From: ugbotueferhire <266243254+ugbotueferhire@users.noreply.github.com> Date: Thu, 14 May 2026 10:49:16 +0100 Subject: [PATCH 2/2] fix(by-label-count): coerce dict keys and handle pd.NA in convert_types ByLabelCountValue lacked the pre=True key-coercion validator that its sibling ByLabelValue has, so np.int64 / pd.NA / other non-Label-typed keys reached pydantic uncoerced. Add @validator("counts", "shares", pre=True) mirroring ByLabelValue.convert_labels. convert_types relied on np.isnan(val) which raises TypeError on pd.NA ("boolean value of NA is ambiguous"). Add an identity check for pd.NA that normalizes it to None, consistent with Label = Union[..., None]. np.nan behavior is preserved (still falls through to pydantic's str-coercion path, matching the pre-existing test contract). Adds end-to-end coverage matching the exact reproduction from #1616 on both DataSummaryPreset and DataDriftPreset, plus parametrize cases for pd.NA and np.int64 keys. Fixes #1616. --- src/evidently/core/metric_types.py | 8 +++++++ tests/future/presets/test_dataset_stats_na.py | 22 +++++++++++++++++++ tests/future/test_metric_types.py | 3 +++ 3 files changed, 33 insertions(+) create mode 100644 tests/future/presets/test_dataset_stats_na.py diff --git a/src/evidently/core/metric_types.py b/src/evidently/core/metric_types.py index 62b2b3a90e..531bf89357 100644 --- a/src/evidently/core/metric_types.py +++ b/src/evidently/core/metric_types.py @@ -486,6 +486,12 @@ class Config: count_display_name_template: str = "Missing label {label} count" share_display_name_template: str = "Missing label {label} share" + @validator("counts", "shares", pre=True) + def _convert_label_keys(cls, value): + if not isinstance(value, dict): + return value + return {convert_types(k): v for k, v in value.items()} + def labels(self) -> List[Label]: return list(self.counts.keys()) @@ -551,6 +557,8 @@ def convert_labels(cls, value): def convert_types(val): + if val is pd.NA: + return None if isinstance( val, ( diff --git a/tests/future/presets/test_dataset_stats_na.py b/tests/future/presets/test_dataset_stats_na.py new file mode 100644 index 0000000000..2020ca0cc3 --- /dev/null +++ b/tests/future/presets/test_dataset_stats_na.py @@ -0,0 +1,22 @@ +import pandas as pd + +from evidently import DataDefinition +from evidently import Dataset +from evidently import Report +from evidently.presets import DataDriftPreset +from evidently.presets import DataSummaryPreset + + +def _make_dataset() -> Dataset: + df = pd.DataFrame({"a": pd.Series(["x", "y", "z", pd.NA], dtype="string")}) + return Dataset.from_pandas(df, DataDefinition(categorical_columns=["a"])) + + +def test_data_summary_preset_handles_pd_na(): + """Issue #1616: DataSummaryPreset must not fail on pd.NA in a categorical column.""" + Report([DataSummaryPreset()]).run(current_data=_make_dataset()) + + +def test_data_drift_preset_handles_pd_na(): + """Issue #1616: DataDriftPreset must not fail on pd.NA in a categorical column.""" + Report([DataDriftPreset()]).run(current_data=_make_dataset(), reference_data=_make_dataset()) diff --git a/tests/future/test_metric_types.py b/tests/future/test_metric_types.py index d6af0c6945..732870eadd 100644 --- a/tests/future/test_metric_types.py +++ b/tests/future/test_metric_types.py @@ -1,4 +1,5 @@ import numpy as np +import pandas as pd import pytest from evidently.core.metric_types import ByLabelCountValue @@ -9,6 +10,8 @@ "input,output", [ ({np.nan: (1.0, 1.0)}, ({"nan": 1.0}, {"nan": 1.0})), + ({pd.NA: (1.0, 1.0)}, ({None: 1.0}, {None: 1.0})), + ({np.int64(7): (3.0, 0.5)}, ({7: 3.0}, {7: 0.5})), ], ) def test_by_label_count_value(input: dict, output: tuple):