From d5e3aed2fa1884d60213c107547030afa2e5faff Mon Sep 17 00:00:00 2001 From: Jerome Dockes Date: Mon, 15 Jun 2026 09:53:00 +0200 Subject: [PATCH 1/7] include bin counts and edges in tablereport json output --- .../_data/templates/column-summary.html | 4 +-- skrub/_reporting/_plotting.py | 33 ++++++++++++------- skrub/_reporting/_summarize.py | 7 ++-- skrub/_reporting/_utils.py | 2 ++ 4 files changed, 27 insertions(+), 19 deletions(-) diff --git a/skrub/_reporting/_data/templates/column-summary.html b/skrub/_reporting/_data/templates/column-summary.html index 7c658a092..fcf4fa1bb 100644 --- a/skrub/_reporting/_data/templates/column-summary.html +++ b/skrub/_reporting/_data/templates/column-summary.html @@ -4,10 +4,10 @@ data-name-repr="{{ column.name.__repr__() }}" data-column-name="{{ column.name }}" data-column-idx="{{ column.idx }}" - {% if column['n_low_outliers'] %} + {% if column['histogram_data']['n_low_outliers'] %} data-has-low-outliers {% endif %} - {% if column['n_high_outliers'] %} + {% if column['histogram_data']['n_high_outliers'] %} data-has-high-outliers {% endif %} data-manager="FilterableColumn {% if in_sample_tab %}SampleColumnSummary{% endif %}" diff --git a/skrub/_reporting/_plotting.py b/skrub/_reporting/_plotting.py index 131be0113..90dd02283 100644 --- a/skrub/_reporting/_plotting.py +++ b/skrub/_reporting/_plotting.py @@ -192,15 +192,27 @@ def _get_range(values, frac=0.2, factor=3.0): return low, high -def _robust_hist(values, ax, color): +def _robust_hist(col, ax=None, color=None): + col = sbd.drop_nulls(col) + if sbd.is_float(col): + # avoid any issues with pandas nullable dtypes + # (to_numpy can yield a numpy array with object dtype in old pandas + # version if there are inf or nan) + col = sbd.to_float32(col) + values = sbd.to_numpy(col) + low, high = _get_range(values) inliers = values[(low <= values) & (values <= high)] n_low_outliers = (values < low).sum() n_high_outliers = (high < values).sum() + result = {"n_low_outliers": n_low_outliers, "n_high_outliers": n_high_outliers} + result["bin_counts"], result["bin_edges"] = np.histogram(inliers) + if ax is None: + return result n, bins, patches = ax.hist(inliers) n_out = n_low_outliers + n_high_outliers if not n_out: - return 0, 0 + return result width = bins[1] - bins[0] start, stop = bins[0], bins[-1] line_params = dict(color=_RED, linestyle="--", ymax=0.95) @@ -229,28 +241,25 @@ def _robust_hist(values, ax, color): color=_RED, ) ax.set_xlim(start, stop) - return n_low_outliers, n_high_outliers + return result + + +def histogram_data(col): + return _robust_hist(col, ax=None, color=None) @_plot def histogram(col, duration_unit=None, color=COLOR_0): """Histogram for a numeric column.""" - col = sbd.drop_nulls(col) - if sbd.is_float(col): - # avoid any issues with pandas nullable dtypes - # (to_numpy can yield a numpy array with object dtype in old pandas - # version if there are inf or nan) - col = sbd.to_float32(col) - values = sbd.to_numpy(col) fig, ax = plt.subplots() _despine(ax) - n_low_outliers, n_high_outliers = _robust_hist(values, ax, color=color) + histogram_data = _robust_hist(col, ax=ax, color=color) if duration_unit is not None: ax.set_xlabel(f"{duration_unit.capitalize()}s") if sbd.is_any_date(col): _rotate_ticklabels(ax) _adjust_fig_size(fig, ax, 2.0, 1.0) - return _serialize(fig), n_low_outliers, n_high_outliers + return _serialize(fig), histogram_data @_plot diff --git a/skrub/_reporting/_summarize.py b/skrub/_reporting/_summarize.py index b544986e2..663472d0f 100644 --- a/skrub/_reporting/_summarize.py +++ b/skrub/_reporting/_summarize.py @@ -289,13 +289,10 @@ def _add_numeric_summary( summary["value_is_constant"] = False summary["quantiles"] = quantiles if not with_plots: + summary["histogram_data"] = _plotting.histogram_data(column) return if order_by_column is None: - ( - summary["histogram_plot"], - summary["n_low_outliers"], - summary["n_high_outliers"], - ) = _plotting.histogram( + summary["histogram_plot"], summary["histogram_data"] = _plotting.histogram( column, duration_unit=duration_unit, color=_plotting.COLORS[0] ) else: diff --git a/skrub/_reporting/_utils.py b/skrub/_reporting/_utils.py index e6dd0b44d..6238f07d8 100644 --- a/skrub/_reporting/_utils.py +++ b/skrub/_reporting/_utils.py @@ -115,6 +115,8 @@ def default(self, value): return int(value) if isinstance(value, np.floating): return float(value) + if isinstance(value, np.ndarray): + return value.tolist() raise From 9b10644ef93774822f089898d52fcfcdb005ae29 Mon Sep 17 00:00:00 2001 From: Jerome Dockes Date: Mon, 15 Jun 2026 09:58:45 +0200 Subject: [PATCH 2/7] _ --- skrub/_reporting/_summarize.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/skrub/_reporting/_summarize.py b/skrub/_reporting/_summarize.py index 663472d0f..fce85ad2b 100644 --- a/skrub/_reporting/_summarize.py +++ b/skrub/_reporting/_summarize.py @@ -255,9 +255,10 @@ def _add_datetime_summary(summary, column, with_plots): if with_plots: ( summary["histogram_plot"], - summary["n_low_outliers"], - summary["n_high_outliers"], + summary["histogram_data"], ) = _plotting.histogram(column, color=_plotting.COLORS[0]) + else: + summary["histogram_data"] = _plotting.histogram_data(column) def _add_numeric_summary( From d5eabf938fc723e9e8c1e6d852e6f2c3a5de73c6 Mon Sep 17 00:00:00 2001 From: Jerome Dockes Date: Mon, 15 Jun 2026 13:53:51 +0200 Subject: [PATCH 3/7] _ --- .../_data/templates/column-summary.html | 4 ++-- skrub/_reporting/_plotting.py | 7 ++++++- skrub/_reporting/tests/test_plotting.py | 20 +++++++++---------- skrub/_reporting/tests/test_utils.py | 2 +- 4 files changed, 19 insertions(+), 14 deletions(-) diff --git a/skrub/_reporting/_data/templates/column-summary.html b/skrub/_reporting/_data/templates/column-summary.html index fcf4fa1bb..e7482b5d0 100644 --- a/skrub/_reporting/_data/templates/column-summary.html +++ b/skrub/_reporting/_data/templates/column-summary.html @@ -4,10 +4,10 @@ data-name-repr="{{ column.name.__repr__() }}" data-column-name="{{ column.name }}" data-column-idx="{{ column.idx }}" - {% if column['histogram_data']['n_low_outliers'] %} + {% if column['histogram_data'] and column['histogram_data']['n_low_outliers'] %} data-has-low-outliers {% endif %} - {% if column['histogram_data']['n_high_outliers'] %} + {% if column['histogram_data'] and column['histogram_data']['n_high_outliers'] %} data-has-high-outliers {% endif %} data-manager="FilterableColumn {% if in_sample_tab %}SampleColumnSummary{% endif %}" diff --git a/skrub/_reporting/_plotting.py b/skrub/_reporting/_plotting.py index 90dd02283..c1e4f8847 100644 --- a/skrub/_reporting/_plotting.py +++ b/skrub/_reporting/_plotting.py @@ -206,7 +206,12 @@ def _robust_hist(col, ax=None, color=None): n_low_outliers = (values < low).sum() n_high_outliers = (high < values).sum() result = {"n_low_outliers": n_low_outliers, "n_high_outliers": n_high_outliers} - result["bin_counts"], result["bin_edges"] = np.histogram(inliers) + if sbd.is_any_date(col): + # numpy histogram does not handle datetimes + np_inliers = inliers.astype("datetime64[s]").astype("float") + else: + np_inliers = inliers + result["bin_counts"], result["bin_edges"] = np.histogram(np_inliers) if ax is None: return result n, bins, patches = ax.hist(inliers) diff --git a/skrub/_reporting/tests/test_plotting.py b/skrub/_reporting/tests/test_plotting.py index c49eb98aa..601f96526 100644 --- a/skrub/_reporting/tests/test_plotting.py +++ b/skrub/_reporting/tests/test_plotting.py @@ -10,21 +10,21 @@ def test_histogram(): o = rng.uniform(-100, 100, size=10) data = pd.Series(np.concatenate([x, o])) - _, n_low, n_high = _plotting.histogram(data) - assert (n_low, n_high) == (5, 4) + _, hist = _plotting.histogram(data) + assert (hist["n_low_outliers"], hist["n_high_outliers"]) == (5, 4) data = pd.Series(np.concatenate([x, o - 1000])) - _, n_low, n_high = _plotting.histogram(data) - assert (n_low, n_high) == (10, 0) + _, hist = _plotting.histogram(data) + assert (hist["n_low_outliers"], hist["n_high_outliers"]) == (10, 0) data = pd.Series(np.concatenate([x, o + 1000])) - _, n_low, n_high = _plotting.histogram(data) - assert (n_low, n_high) == (0, 10) + _, hist = _plotting.histogram(data) + assert (hist["n_low_outliers"], hist["n_high_outliers"]) == (0, 10) data = pd.Series(x) - _, n_low, n_high = _plotting.histogram(data) - assert (n_low, n_high) == (0, 0) + _, hist = _plotting.histogram(data) + assert (hist["n_low_outliers"], hist["n_high_outliers"]) == (0, 0) data = pd.Series([0.0]) - _, n_low, n_high = _plotting.histogram(data) - assert (n_low, n_high) == (0, 0) + _, hist = _plotting.histogram(data) + assert (hist["n_low_outliers"], hist["n_high_outliers"]) == (0, 0) diff --git a/skrub/_reporting/tests/test_utils.py b/skrub/_reporting/tests/test_utils.py index c2f6b7a68..23dcf571e 100644 --- a/skrub/_reporting/tests/test_utils.py +++ b/skrub/_reporting/tests/test_utils.py @@ -104,7 +104,7 @@ def test_json_encoder(): d = {"a": x[0], "b": y[0]} assert json.dumps(d, cls=_utils.JSONEncoder) == '{"a": 1, "b": 1.0}' with pytest.raises(TypeError, match=".*JSON serializable"): - json.dumps({"a": x}, cls=_utils.JSONEncoder) + json.dumps({"a": np}, cls=_utils.JSONEncoder) def test_svg_to_img_src(): From 78d4aa4050e9033ede97ace4b050adf8c0824cee Mon Sep 17 00:00:00 2001 From: Jerome Dockes Date: Mon, 15 Jun 2026 13:58:11 +0200 Subject: [PATCH 4/7] changelog --- CHANGES.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGES.rst b/CHANGES.rst index db21d87ec..e4ee84e02 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -40,6 +40,10 @@ New Features Additionally, negative numbers indicated with parentheses can be converted to the regular numeric format (``(432)`` becomes ``-432``). :pr:`1772` by :user:`Gabriela Gómez Jiménez `. +- :meth:`TableReport.json` now includes histogram data for numeric and datetime + columns (the bin count and edges, and numbers of low and high outliers). Now + ``json()`` contains all the information shown in the report html rendering, + including the plots. :pr:`2164` by :user:`Jérôme Dockès `. Changes ------- From ee96623ae451566bc476bfa1ff818fc9a4dfe238 Mon Sep 17 00:00:00 2001 From: Jerome Dockes Date: Mon, 15 Jun 2026 14:11:20 +0200 Subject: [PATCH 5/7] _ --- skrub/_reporting/_plotting.py | 25 +++++++++++++----------- skrub/_reporting/tests/test_summarize.py | 3 +++ 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/skrub/_reporting/_plotting.py b/skrub/_reporting/_plotting.py index c1e4f8847..d0a8e9be5 100644 --- a/skrub/_reporting/_plotting.py +++ b/skrub/_reporting/_plotting.py @@ -11,8 +11,8 @@ import numpy as np from matplotlib import pyplot as plt -from skrub import _dataframe as sbd - +from .. import _dataframe as sbd +from .. import _datetime_encoder from . import _utils __all__ = ["COLORS", "COLOR_0", "histogram", "line", "value_counts"] @@ -200,21 +200,24 @@ def _robust_hist(col, ax=None, color=None): # version if there are inf or nan) col = sbd.to_float32(col) values = sbd.to_numpy(col) - + if sbd.is_any_date(col): + # numpy histogram does not handle datetimes but matplotlib does + np_histogram_values = sbd.to_numpy( + _datetime_encoder.DatetimeEncoder(resolution=None).fit_transform(col) + ).ravel() + else: + np_histogram_values = values low, high = _get_range(values) - inliers = values[(low <= values) & (values <= high)] + inlier_mask = (low <= values) & (values <= high) n_low_outliers = (values < low).sum() n_high_outliers = (high < values).sum() result = {"n_low_outliers": n_low_outliers, "n_high_outliers": n_high_outliers} - if sbd.is_any_date(col): - # numpy histogram does not handle datetimes - np_inliers = inliers.astype("datetime64[s]").astype("float") - else: - np_inliers = inliers - result["bin_counts"], result["bin_edges"] = np.histogram(np_inliers) + result["bin_counts"], result["bin_edges"] = np.histogram( + np_histogram_values[inlier_mask] + ) if ax is None: return result - n, bins, patches = ax.hist(inliers) + n, bins, patches = ax.hist(values[inlier_mask]) n_out = n_low_outliers + n_high_outliers if not n_out: return result diff --git a/skrub/_reporting/tests/test_summarize.py b/skrub/_reporting/tests/test_summarize.py index 245465239..bde2c4df9 100644 --- a/skrub/_reporting/tests/test_summarize.py +++ b/skrub/_reporting/tests/test_summarize.py @@ -84,6 +84,9 @@ def test_summarize( 0.75: 33.6, 1.0: 78.3, } + if order_by is None: + assert len(summary["columns"][5]["histogram_data"]["bin_counts"]) == 10 + assert len(summary["columns"][5]["histogram_data"]["bin_edges"]) == 11 assert summary["columns"][7]["null_count"] == 9 assert summary["columns"][7]["nulls_level"] == "warning" assert summary["columns"][8]["null_count"] == 17 From f4fe1ef0f10aa3d3e840f62160cb2ef14eac8348 Mon Sep 17 00:00:00 2001 From: Jerome Dockes Date: Tue, 16 Jun 2026 17:57:14 +0200 Subject: [PATCH 6/7] add comments --- skrub/_reporting/_plotting.py | 3 ++- skrub/_reporting/_summarize.py | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/skrub/_reporting/_plotting.py b/skrub/_reporting/_plotting.py index d0a8e9be5..bd4c512e4 100644 --- a/skrub/_reporting/_plotting.py +++ b/skrub/_reporting/_plotting.py @@ -201,7 +201,8 @@ def _robust_hist(col, ax=None, color=None): col = sbd.to_float32(col) values = sbd.to_numpy(col) if sbd.is_any_date(col): - # numpy histogram does not handle datetimes but matplotlib does + # numpy histogram does not handle datetimes but matplotlib does, so we + # convert to the total number of seconds since epoch (a float) np_histogram_values = sbd.to_numpy( _datetime_encoder.DatetimeEncoder(resolution=None).fit_transform(col) ).ravel() diff --git a/skrub/_reporting/_summarize.py b/skrub/_reporting/_summarize.py index fce85ad2b..e582e07fd 100644 --- a/skrub/_reporting/_summarize.py +++ b/skrub/_reporting/_summarize.py @@ -258,6 +258,8 @@ def _add_datetime_summary(summary, column, with_plots): summary["histogram_data"], ) = _plotting.histogram(column, color=_plotting.COLORS[0]) else: + # besides the plots, the bin counts and edges are always stored and + # available in the json output. summary["histogram_data"] = _plotting.histogram_data(column) @@ -290,6 +292,8 @@ def _add_numeric_summary( summary["value_is_constant"] = False summary["quantiles"] = quantiles if not with_plots: + # besides the plots, the bin counts and edges are always stored and + # available in the json output. summary["histogram_data"] = _plotting.histogram_data(column) return if order_by_column is None: From 0e276e3c6f767294b478ac8ea1287f39da6391e0 Mon Sep 17 00:00:00 2001 From: Jerome Dockes Date: Wed, 17 Jun 2026 11:07:40 +0200 Subject: [PATCH 7/7] add comment --- skrub/_reporting/_plotting.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/skrub/_reporting/_plotting.py b/skrub/_reporting/_plotting.py index bd4c512e4..4920621e7 100644 --- a/skrub/_reporting/_plotting.py +++ b/skrub/_reporting/_plotting.py @@ -203,6 +203,10 @@ def _robust_hist(col, ax=None, color=None): if sbd.is_any_date(col): # numpy histogram does not handle datetimes but matplotlib does, so we # convert to the total number of seconds since epoch (a float) + # + # note that the dtype cannot be duration (timedelta) here as they are + # handled higher in the call stack and converted to floats with + # _utils.duration_to_numeric np_histogram_values = sbd.to_numpy( _datetime_encoder.DatetimeEncoder(resolution=None).fit_transform(col) ).ravel()