Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions src/evidently/core/metric_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -486,6 +486,12 @@ class Config:
count_display_name_template: str = "Missing label {label} count"
share_display_name_template: str = "Missing label {label} share"

@validator("counts", "shares", pre=True)
def _convert_label_keys(cls, value):
    """Normalise the label keys of ``counts`` / ``shares`` (runs with ``pre=True``).

    For dict input, every key is passed through ``convert_types`` and the
    values are kept untouched. Any non-dict input is handed back unchanged.
    """
    if isinstance(value, dict):
        converted = {}
        for raw_label, payload in value.items():
            converted[convert_types(raw_label)] = payload
        return converted
    return value

def labels(self) -> List[Label]:
    """Return the keys of ``self.counts`` as a list of labels."""
    return [*self.counts.keys()]

Expand Down Expand Up @@ -551,6 +557,8 @@ def convert_labels(cls, value):


def convert_types(val):
if val is pd.NA:
return None
if isinstance(
val,
(
Expand Down
4 changes: 2 additions & 2 deletions src/evidently/metrics/column_statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -869,9 +869,9 @@ def share_label_display_name(self, label: Label) -> str:
return f"Unique Value Share: {self.metric.column} for label {label}"

def _all_unique_values(self, current: Dataset, reference: Optional[Dataset]) -> set:
values = set(current.as_dataframe()[self.metric.column].unique())
values = set(current.as_dataframe()[self.metric.column].dropna().unique())
if reference is not None:
values.update(reference.as_dataframe()[self.metric.column].unique())
values.update(reference.as_dataframe()[self.metric.column].dropna().unique())
return values

def _calculate_value(self, dataset: Dataset, values: set):
Expand Down
36 changes: 36 additions & 0 deletions tests/future/metrics/test_unique_value_count.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import numpy as np
import pandas as pd

from evidently import DataDefinition
from evidently import Dataset
from evidently import Report
from evidently.core.metric_types import ByLabelCountValue
Expand Down Expand Up @@ -40,3 +42,37 @@ def test_unique_value_count_metric():
assert label_count.display_name == result[label]["count_display_name"]
assert label_share.value == result[label]["share"]
assert label_share.display_name == result[label]["share_display_name"]


def test_unique_value_count_with_pd_na_in_string_dtype():
    """Issue #1616: a pd.NA entry in a nullable-string column is never a label key."""
    metric = UniqueValueCount(column="col1")
    column_values = pd.Series(["a", "a", "b", pd.NA], dtype="string")
    frame = pd.DataFrame({"col1": column_values})
    definition = DataDefinition(categorical_columns=["col1"])
    dataset = Dataset.from_pandas(frame, definition)

    snapshot = Report([metric]).run(dataset, None)
    outcome = snapshot._context.get_metric_result(metric)

    assert isinstance(outcome, ByLabelCountValue)
    assert set(outcome.labels()) == {"a", "b"}
    count_for_a, _share_for_a = outcome.get_label_result("a")
    count_for_b, _share_for_b = outcome.get_label_result("b")
    assert count_for_a.value == 2
    assert count_for_b.value == 1


def test_unique_value_count_with_nan_in_object_dtype():
    """Neither np.nan nor None in an object column may surface as a label."""
    metric = UniqueValueCount(column="col1")
    column_values = pd.Series(["a", "b", None, np.nan], dtype="object")
    frame = pd.DataFrame({"col1": column_values})
    definition = DataDefinition(categorical_columns=["col1"])
    dataset = Dataset.from_pandas(frame, definition)

    snapshot = Report([metric]).run(dataset, None)
    outcome = snapshot._context.get_metric_result(metric)

    assert isinstance(outcome, ByLabelCountValue)
    assert set(outcome.labels()) == {"a", "b"}


def test_unique_value_count_with_pd_na_in_int64_dtype():
    """A pd.NA entry in a nullable Int64 column is not reported as a label."""
    metric = UniqueValueCount(column="col1")
    column_values = pd.array([1, 2, 2, pd.NA], dtype="Int64")
    frame = pd.DataFrame({"col1": column_values})
    definition = DataDefinition(categorical_columns=["col1"])
    dataset = Dataset.from_pandas(frame, definition)

    snapshot = Report([metric]).run(dataset, None)
    outcome = snapshot._context.get_metric_result(metric)

    assert isinstance(outcome, ByLabelCountValue)
    assert set(outcome.labels()) == {1, 2}
22 changes: 22 additions & 0 deletions tests/future/presets/test_dataset_stats_na.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import pandas as pd

from evidently import DataDefinition
from evidently import Dataset
from evidently import Report
from evidently.presets import DataDriftPreset
from evidently.presets import DataSummaryPreset


def _make_dataset() -> Dataset:
    """Build a dataset with one categorical nullable-string column ``a`` that contains pd.NA."""
    column_values = pd.Series(["x", "y", "z", pd.NA], dtype="string")
    frame = pd.DataFrame({"a": column_values})
    definition = DataDefinition(categorical_columns=["a"])
    return Dataset.from_pandas(frame, definition)


def test_data_summary_preset_handles_pd_na():
    """Issue #1616: running DataSummaryPreset on a categorical column with pd.NA must not raise."""
    report = Report([DataSummaryPreset()])
    current = _make_dataset()
    report.run(current_data=current)


def test_data_drift_preset_handles_pd_na():
    """Issue #1616: running DataDriftPreset on a categorical column with pd.NA must not raise."""
    report = Report([DataDriftPreset()])
    current = _make_dataset()
    reference = _make_dataset()
    report.run(current_data=current, reference_data=reference)
3 changes: 3 additions & 0 deletions tests/future/test_metric_types.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import numpy as np
import pandas as pd
import pytest

from evidently.core.metric_types import ByLabelCountValue
Expand All @@ -9,6 +10,8 @@
"input,output",
[
({np.nan: (1.0, 1.0)}, ({"nan": 1.0}, {"nan": 1.0})),
({pd.NA: (1.0, 1.0)}, ({None: 1.0}, {None: 1.0})),
({np.int64(7): (3.0, 0.5)}, ({7: 3.0}, {7: 0.5})),
],
)
def test_by_label_count_value(input: dict, output: tuple):
Expand Down