Skip to content
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,10 @@ New Features
Additionally, negative numbers indicated with parentheses can be converted to the
regular numeric format (``(432)`` becomes ``-432``). :pr:`1772` by :user:`Gabriela
Gómez Jiménez <gabrielapgomezji>`.
- :meth:`TableReport.json` now includes histogram data for numeric and datetime
columns (the bin counts and edges, and the numbers of low and high outliers).
``json()`` now contains all the information shown in the report's HTML rendering,
including the plots. :pr:`2164` by :user:`Jérôme Dockès <jeromedockes>`.

Changes
-------
Expand Down
4 changes: 2 additions & 2 deletions skrub/_reporting/_data/templates/column-summary.html
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@
data-name-repr="{{ column.name.__repr__() }}"
data-column-name="{{ column.name }}"
data-column-idx="{{ column.idx }}"
{% if column['n_low_outliers'] %}
{% if column['histogram_data'] and column['histogram_data']['n_low_outliers'] %}
data-has-low-outliers
{% endif %}
{% if column['n_high_outliers'] %}
{% if column['histogram_data'] and column['histogram_data']['n_high_outliers'] %}
data-has-high-outliers
{% endif %}
data-manager="FilterableColumn {% if in_sample_tab %}SampleColumnSummary{% endif %}"
Expand Down
49 changes: 33 additions & 16 deletions skrub/_reporting/_plotting.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@
import numpy as np
from matplotlib import pyplot as plt

from skrub import _dataframe as sbd

from .. import _dataframe as sbd
from .. import _datetime_encoder
from . import _utils

__all__ = ["COLORS", "COLOR_0", "histogram", "line", "value_counts"]
Expand Down Expand Up @@ -192,15 +192,35 @@ def _get_range(values, frac=0.2, factor=3.0):
return low, high


def _robust_hist(values, ax, color):
def _robust_hist(col, ax=None, color=None):
col = sbd.drop_nulls(col)
if sbd.is_float(col):
# avoid any issues with pandas nullable dtypes
# (to_numpy can yield a numpy array with object dtype in old pandas
# version if there are inf or nan)
col = sbd.to_float32(col)
values = sbd.to_numpy(col)
if sbd.is_any_date(col):
# numpy histogram does not handle datetimes but matplotlib does

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you add a comment here explaining that this converts dates to seconds since the epoch? It's not clear from the code.

np_histogram_values = sbd.to_numpy(
_datetime_encoder.DatetimeEncoder(resolution=None).fit_transform(col)
).ravel()
else:
np_histogram_values = values
low, high = _get_range(values)
inliers = values[(low <= values) & (values <= high)]
inlier_mask = (low <= values) & (values <= high)
n_low_outliers = (values < low).sum()
n_high_outliers = (high < values).sum()
n, bins, patches = ax.hist(inliers)
result = {"n_low_outliers": n_low_outliers, "n_high_outliers": n_high_outliers}
result["bin_counts"], result["bin_edges"] = np.histogram(
np_histogram_values[inlier_mask]
)
if ax is None:
return result
n, bins, patches = ax.hist(values[inlier_mask])
n_out = n_low_outliers + n_high_outliers
if not n_out:
return 0, 0
return result
width = bins[1] - bins[0]
start, stop = bins[0], bins[-1]
line_params = dict(color=_RED, linestyle="--", ymax=0.95)
Expand Down Expand Up @@ -229,28 +249,25 @@ def _robust_hist(values, ax, color):
color=_RED,
)
ax.set_xlim(start, stop)
return n_low_outliers, n_high_outliers
return result


def histogram_data(col):
    """Return the histogram summary for ``col`` without drawing a plot.

    Thin wrapper around ``_robust_hist`` called with ``ax=None`` and
    ``color=None``; the helper's return value is passed through unchanged.
    """
    no_plot = {"ax": None, "color": None}
    return _robust_hist(col, **no_plot)


@_plot
def histogram(col, duration_unit=None, color=COLOR_0):
"""Histogram for a numeric column."""
col = sbd.drop_nulls(col)
if sbd.is_float(col):
# avoid any issues with pandas nullable dtypes
# (to_numpy can yield a numpy array with object dtype in old pandas
# version if there are inf or nan)
col = sbd.to_float32(col)
values = sbd.to_numpy(col)
fig, ax = plt.subplots()
_despine(ax)
n_low_outliers, n_high_outliers = _robust_hist(values, ax, color=color)
histogram_data = _robust_hist(col, ax=ax, color=color)
if duration_unit is not None:
ax.set_xlabel(f"{duration_unit.capitalize()}s")
if sbd.is_any_date(col):
_rotate_ticklabels(ax)
_adjust_fig_size(fig, ax, 2.0, 1.0)
return _serialize(fig), n_low_outliers, n_high_outliers
return _serialize(fig), histogram_data


@_plot
Expand Down
12 changes: 5 additions & 7 deletions skrub/_reporting/_summarize.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,9 +255,10 @@ def _add_datetime_summary(summary, column, with_plots):
if with_plots:
(
summary["histogram_plot"],
summary["n_low_outliers"],
summary["n_high_outliers"],
summary["histogram_data"],
) = _plotting.histogram(column, color=_plotting.COLORS[0])
else:
summary["histogram_data"] = _plotting.histogram_data(column)


def _add_numeric_summary(
Expand Down Expand Up @@ -289,13 +290,10 @@ def _add_numeric_summary(
summary["value_is_constant"] = False
summary["quantiles"] = quantiles
if not with_plots:
summary["histogram_data"] = _plotting.histogram_data(column)

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you add a small comment here to explain what this is used for?

return
if order_by_column is None:
(
summary["histogram_plot"],
summary["n_low_outliers"],
summary["n_high_outliers"],
) = _plotting.histogram(
summary["histogram_plot"], summary["histogram_data"] = _plotting.histogram(
column, duration_unit=duration_unit, color=_plotting.COLORS[0]
)
else:
Expand Down
2 changes: 2 additions & 0 deletions skrub/_reporting/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,8 @@ def default(self, value):
return int(value)
if isinstance(value, np.floating):
return float(value)
if isinstance(value, np.ndarray):
return value.tolist()
raise


Expand Down
20 changes: 10 additions & 10 deletions skrub/_reporting/tests/test_plotting.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,21 +10,21 @@ def test_histogram():
o = rng.uniform(-100, 100, size=10)

data = pd.Series(np.concatenate([x, o]))
_, n_low, n_high = _plotting.histogram(data)
assert (n_low, n_high) == (5, 4)
_, hist = _plotting.histogram(data)
assert (hist["n_low_outliers"], hist["n_high_outliers"]) == (5, 4)

data = pd.Series(np.concatenate([x, o - 1000]))
_, n_low, n_high = _plotting.histogram(data)
assert (n_low, n_high) == (10, 0)
_, hist = _plotting.histogram(data)
assert (hist["n_low_outliers"], hist["n_high_outliers"]) == (10, 0)

data = pd.Series(np.concatenate([x, o + 1000]))
_, n_low, n_high = _plotting.histogram(data)
assert (n_low, n_high) == (0, 10)
_, hist = _plotting.histogram(data)
assert (hist["n_low_outliers"], hist["n_high_outliers"]) == (0, 10)

data = pd.Series(x)
_, n_low, n_high = _plotting.histogram(data)
assert (n_low, n_high) == (0, 0)
_, hist = _plotting.histogram(data)
assert (hist["n_low_outliers"], hist["n_high_outliers"]) == (0, 0)

data = pd.Series([0.0])
_, n_low, n_high = _plotting.histogram(data)
assert (n_low, n_high) == (0, 0)
_, hist = _plotting.histogram(data)
assert (hist["n_low_outliers"], hist["n_high_outliers"]) == (0, 0)
3 changes: 3 additions & 0 deletions skrub/_reporting/tests/test_summarize.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,9 @@ def test_summarize(
0.75: 33.6,
1.0: 78.3,
}
if order_by is None:
assert len(summary["columns"][5]["histogram_data"]["bin_counts"]) == 10
assert len(summary["columns"][5]["histogram_data"]["bin_edges"]) == 11
assert summary["columns"][7]["null_count"] == 9
assert summary["columns"][7]["nulls_level"] == "warning"
assert summary["columns"][8]["null_count"] == 17
Expand Down
2 changes: 1 addition & 1 deletion skrub/_reporting/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ def test_json_encoder():
d = {"a": x[0], "b": y[0]}
assert json.dumps(d, cls=_utils.JSONEncoder) == '{"a": 1, "b": 1.0}'
with pytest.raises(TypeError, match=".*JSON serializable"):
json.dumps({"a": x}, cls=_utils.JSONEncoder)
json.dumps({"a": np}, cls=_utils.JSONEncoder)


def test_svg_to_img_src():
Expand Down