From d5e3aed2fa1884d60213c107547030afa2e5faff Mon Sep 17 00:00:00 2001
From: Jerome Dockes <jerome@dockes.org>
Date: Mon, 15 Jun 2026 09:53:00 +0200
Subject: [PATCH 1/7] include bin counts and edges in tablereport json output

---
 .../_data/templates/column-summary.html       |  4 +--
 skrub/_reporting/_plotting.py                 | 33 ++++++++++++-------
 skrub/_reporting/_summarize.py                |  7 ++--
 skrub/_reporting/_utils.py                    |  2 ++
 4 files changed, 27 insertions(+), 19 deletions(-)

diff --git a/skrub/_reporting/_data/templates/column-summary.html b/skrub/_reporting/_data/templates/column-summary.html
index 7c658a092..fcf4fa1bb 100644
--- a/skrub/_reporting/_data/templates/column-summary.html
+++ b/skrub/_reporting/_data/templates/column-summary.html
@@ -4,10 +4,10 @@
     data-name-repr="{{ column.name.__repr__() }}"
     data-column-name="{{ column.name }}"
     data-column-idx="{{ column.idx }}"
-     {% if column['n_low_outliers'] %}
+     {% if column['histogram_data']['n_low_outliers'] %}
      data-has-low-outliers
      {% endif %}
-     {% if column['n_high_outliers'] %}
+     {% if column['histogram_data']['n_high_outliers'] %}
      data-has-high-outliers
      {% endif %}
     data-manager="FilterableColumn {% if in_sample_tab %}SampleColumnSummary{% endif %}"
diff --git a/skrub/_reporting/_plotting.py b/skrub/_reporting/_plotting.py
index 131be0113..90dd02283 100644
--- a/skrub/_reporting/_plotting.py
+++ b/skrub/_reporting/_plotting.py
@@ -192,15 +192,27 @@ def _get_range(values, frac=0.2, factor=3.0):
     return low, high
 
 
-def _robust_hist(values, ax, color):
+def _robust_hist(col, ax=None, color=None):
+    col = sbd.drop_nulls(col)
+    if sbd.is_float(col):
+        # avoid any issues with pandas nullable dtypes
+        # (to_numpy can yield a numpy array with object dtype in old pandas
+        # version if there are inf or nan)
+        col = sbd.to_float32(col)
+    values = sbd.to_numpy(col)
+
     low, high = _get_range(values)
     inliers = values[(low <= values) & (values <= high)]
     n_low_outliers = (values < low).sum()
     n_high_outliers = (high < values).sum()
+    result = {"n_low_outliers": n_low_outliers, "n_high_outliers": n_high_outliers}
+    result["bin_counts"], result["bin_edges"] = np.histogram(inliers)
+    if ax is None:
+        return result
     n, bins, patches = ax.hist(inliers)
     n_out = n_low_outliers + n_high_outliers
     if not n_out:
-        return 0, 0
+        return result
     width = bins[1] - bins[0]
     start, stop = bins[0], bins[-1]
     line_params = dict(color=_RED, linestyle="--", ymax=0.95)
@@ -229,28 +241,25 @@ def _robust_hist(values, ax, color):
         color=_RED,
     )
     ax.set_xlim(start, stop)
-    return n_low_outliers, n_high_outliers
+    return result
+
+
+def histogram_data(col):
+    return _robust_hist(col, ax=None, color=None)
 
 
 @_plot
 def histogram(col, duration_unit=None, color=COLOR_0):
     """Histogram for a numeric column."""
-    col = sbd.drop_nulls(col)
-    if sbd.is_float(col):
-        # avoid any issues with pandas nullable dtypes
-        # (to_numpy can yield a numpy array with object dtype in old pandas
-        # version if there are inf or nan)
-        col = sbd.to_float32(col)
-    values = sbd.to_numpy(col)
     fig, ax = plt.subplots()
     _despine(ax)
-    n_low_outliers, n_high_outliers = _robust_hist(values, ax, color=color)
+    histogram_data = _robust_hist(col, ax=ax, color=color)
     if duration_unit is not None:
         ax.set_xlabel(f"{duration_unit.capitalize()}s")
     if sbd.is_any_date(col):
         _rotate_ticklabels(ax)
     _adjust_fig_size(fig, ax, 2.0, 1.0)
-    return _serialize(fig), n_low_outliers, n_high_outliers
+    return _serialize(fig), histogram_data
 
 
 @_plot
diff --git a/skrub/_reporting/_summarize.py b/skrub/_reporting/_summarize.py
index b544986e2..663472d0f 100644
--- a/skrub/_reporting/_summarize.py
+++ b/skrub/_reporting/_summarize.py
@@ -289,13 +289,10 @@ def _add_numeric_summary(
     summary["value_is_constant"] = False
     summary["quantiles"] = quantiles
     if not with_plots:
+        summary["histogram_data"] = _plotting.histogram_data(column)
         return
     if order_by_column is None:
-        (
-            summary["histogram_plot"],
-            summary["n_low_outliers"],
-            summary["n_high_outliers"],
-        ) = _plotting.histogram(
+        summary["histogram_plot"], summary["histogram_data"] = _plotting.histogram(
             column, duration_unit=duration_unit, color=_plotting.COLORS[0]
         )
     else:
diff --git a/skrub/_reporting/_utils.py b/skrub/_reporting/_utils.py
index e6dd0b44d..6238f07d8 100644
--- a/skrub/_reporting/_utils.py
+++ b/skrub/_reporting/_utils.py
@@ -115,6 +115,8 @@ def default(self, value):
                 return int(value)
             if isinstance(value, np.floating):
                 return float(value)
+            if isinstance(value, np.ndarray):
+                return value.tolist()
             raise
 
 

From 9b10644ef93774822f089898d52fcfcdb005ae29 Mon Sep 17 00:00:00 2001
From: Jerome Dockes <jerome@dockes.org>
Date: Mon, 15 Jun 2026 09:58:45 +0200
Subject: [PATCH 2/7] store histogram data in datetime column summaries

---
 skrub/_reporting/_summarize.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/skrub/_reporting/_summarize.py b/skrub/_reporting/_summarize.py
index 663472d0f..fce85ad2b 100644
--- a/skrub/_reporting/_summarize.py
+++ b/skrub/_reporting/_summarize.py
@@ -255,9 +255,10 @@ def _add_datetime_summary(summary, column, with_plots):
     if with_plots:
         (
             summary["histogram_plot"],
-            summary["n_low_outliers"],
-            summary["n_high_outliers"],
+            summary["histogram_data"],
         ) = _plotting.histogram(column, color=_plotting.COLORS[0])
+    else:
+        summary["histogram_data"] = _plotting.histogram_data(column)
 
 
 def _add_numeric_summary(

From d5eabf938fc723e9e8c1e6d852e6f2c3a5de73c6 Mon Sep 17 00:00:00 2001
From: Jerome Dockes <jerome@dockes.org>
Date: Mon, 15 Jun 2026 13:53:51 +0200
Subject: [PATCH 3/7] handle datetime columns and missing histogram data, update tests

---
 .../_data/templates/column-summary.html       |  4 ++--
 skrub/_reporting/_plotting.py                 |  7 ++++++-
 skrub/_reporting/tests/test_plotting.py       | 20 +++++++++----------
 skrub/_reporting/tests/test_utils.py          |  2 +-
 4 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/skrub/_reporting/_data/templates/column-summary.html b/skrub/_reporting/_data/templates/column-summary.html
index fcf4fa1bb..e7482b5d0 100644
--- a/skrub/_reporting/_data/templates/column-summary.html
+++ b/skrub/_reporting/_data/templates/column-summary.html
@@ -4,10 +4,10 @@
     data-name-repr="{{ column.name.__repr__() }}"
     data-column-name="{{ column.name }}"
     data-column-idx="{{ column.idx }}"
-     {% if column['histogram_data']['n_low_outliers'] %}
+     {% if column['histogram_data'] and column['histogram_data']['n_low_outliers'] %}
      data-has-low-outliers
      {% endif %}
-     {% if column['histogram_data']['n_high_outliers'] %}
+     {% if column['histogram_data'] and column['histogram_data']['n_high_outliers'] %}
      data-has-high-outliers
      {% endif %}
     data-manager="FilterableColumn {% if in_sample_tab %}SampleColumnSummary{% endif %}"
diff --git a/skrub/_reporting/_plotting.py b/skrub/_reporting/_plotting.py
index 90dd02283..c1e4f8847 100644
--- a/skrub/_reporting/_plotting.py
+++ b/skrub/_reporting/_plotting.py
@@ -206,7 +206,12 @@ def _robust_hist(col, ax=None, color=None):
     n_low_outliers = (values < low).sum()
     n_high_outliers = (high < values).sum()
     result = {"n_low_outliers": n_low_outliers, "n_high_outliers": n_high_outliers}
-    result["bin_counts"], result["bin_edges"] = np.histogram(inliers)
+    if sbd.is_any_date(col):
+        # numpy histogram does not handle datetimes
+        np_inliers = inliers.astype("datetime64[s]").astype("float")
+    else:
+        np_inliers = inliers
+    result["bin_counts"], result["bin_edges"] = np.histogram(np_inliers)
     if ax is None:
         return result
     n, bins, patches = ax.hist(inliers)
diff --git a/skrub/_reporting/tests/test_plotting.py b/skrub/_reporting/tests/test_plotting.py
index c49eb98aa..601f96526 100644
--- a/skrub/_reporting/tests/test_plotting.py
+++ b/skrub/_reporting/tests/test_plotting.py
@@ -10,21 +10,21 @@ def test_histogram():
     o = rng.uniform(-100, 100, size=10)
 
     data = pd.Series(np.concatenate([x, o]))
-    _, n_low, n_high = _plotting.histogram(data)
-    assert (n_low, n_high) == (5, 4)
+    _, hist = _plotting.histogram(data)
+    assert (hist["n_low_outliers"], hist["n_high_outliers"]) == (5, 4)
 
     data = pd.Series(np.concatenate([x, o - 1000]))
-    _, n_low, n_high = _plotting.histogram(data)
-    assert (n_low, n_high) == (10, 0)
+    _, hist = _plotting.histogram(data)
+    assert (hist["n_low_outliers"], hist["n_high_outliers"]) == (10, 0)
 
     data = pd.Series(np.concatenate([x, o + 1000]))
-    _, n_low, n_high = _plotting.histogram(data)
-    assert (n_low, n_high) == (0, 10)
+    _, hist = _plotting.histogram(data)
+    assert (hist["n_low_outliers"], hist["n_high_outliers"]) == (0, 10)
 
     data = pd.Series(x)
-    _, n_low, n_high = _plotting.histogram(data)
-    assert (n_low, n_high) == (0, 0)
+    _, hist = _plotting.histogram(data)
+    assert (hist["n_low_outliers"], hist["n_high_outliers"]) == (0, 0)
 
     data = pd.Series([0.0])
-    _, n_low, n_high = _plotting.histogram(data)
-    assert (n_low, n_high) == (0, 0)
+    _, hist = _plotting.histogram(data)
+    assert (hist["n_low_outliers"], hist["n_high_outliers"]) == (0, 0)
diff --git a/skrub/_reporting/tests/test_utils.py b/skrub/_reporting/tests/test_utils.py
index c2f6b7a68..23dcf571e 100644
--- a/skrub/_reporting/tests/test_utils.py
+++ b/skrub/_reporting/tests/test_utils.py
@@ -104,7 +104,7 @@ def test_json_encoder():
     d = {"a": x[0], "b": y[0]}
     assert json.dumps(d, cls=_utils.JSONEncoder) == '{"a": 1, "b": 1.0}'
     with pytest.raises(TypeError, match=".*JSON serializable"):
-        json.dumps({"a": x}, cls=_utils.JSONEncoder)
+        json.dumps({"a": np}, cls=_utils.JSONEncoder)
 
 
 def test_svg_to_img_src():

From 78d4aa4050e9033ede97ace4b050adf8c0824cee Mon Sep 17 00:00:00 2001
From: Jerome Dockes <jerome@dockes.org>
Date: Mon, 15 Jun 2026 13:58:11 +0200
Subject: [PATCH 4/7] changelog

---
 CHANGES.rst | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/CHANGES.rst b/CHANGES.rst
index db21d87ec..e4ee84e02 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -40,6 +40,10 @@ New Features
   Additionally, negative numbers indicated with parentheses can be converted to the
   regular numeric format (``(432)`` becomes ``-432``). :pr:`1772` by :user:`Gabriela
   Gómez Jiménez <gabrielapgomezji>`.
+- :meth:`TableReport.json` now includes histogram data for numeric and datetime
+  columns (the bin counts and edges, and the numbers of low and high outliers).
+  ``json()`` now contains all the information shown in the report's HTML
+  rendering, including the plots. :pr:`2164` by :user:`Jérôme Dockès <jeromedockes>`.
 
 Changes
 -------

From ee96623ae451566bc476bfa1ff818fc9a4dfe238 Mon Sep 17 00:00:00 2001
From: Jerome Dockes <jerome@dockes.org>
Date: Mon, 15 Jun 2026 14:11:20 +0200
Subject: [PATCH 5/7] use DatetimeEncoder to convert datetimes for numpy histogram

---
 skrub/_reporting/_plotting.py            | 25 +++++++++++++-----------
 skrub/_reporting/tests/test_summarize.py |  3 +++
 2 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/skrub/_reporting/_plotting.py b/skrub/_reporting/_plotting.py
index c1e4f8847..d0a8e9be5 100644
--- a/skrub/_reporting/_plotting.py
+++ b/skrub/_reporting/_plotting.py
@@ -11,8 +11,8 @@
 import numpy as np
 from matplotlib import pyplot as plt
 
-from skrub import _dataframe as sbd
-
+from .. import _dataframe as sbd
+from .. import _datetime_encoder
 from . import _utils
 
 __all__ = ["COLORS", "COLOR_0", "histogram", "line", "value_counts"]
@@ -200,21 +200,24 @@ def _robust_hist(col, ax=None, color=None):
         # version if there are inf or nan)
         col = sbd.to_float32(col)
     values = sbd.to_numpy(col)
-
+    if sbd.is_any_date(col):
+        # numpy histogram does not handle datetimes but matplotlib does
+        np_histogram_values = sbd.to_numpy(
+            _datetime_encoder.DatetimeEncoder(resolution=None).fit_transform(col)
+        ).ravel()
+    else:
+        np_histogram_values = values
     low, high = _get_range(values)
-    inliers = values[(low <= values) & (values <= high)]
+    inlier_mask = (low <= values) & (values <= high)
     n_low_outliers = (values < low).sum()
     n_high_outliers = (high < values).sum()
     result = {"n_low_outliers": n_low_outliers, "n_high_outliers": n_high_outliers}
-    if sbd.is_any_date(col):
-        # numpy histogram does not handle datetimes
-        np_inliers = inliers.astype("datetime64[s]").astype("float")
-    else:
-        np_inliers = inliers
-    result["bin_counts"], result["bin_edges"] = np.histogram(np_inliers)
+    result["bin_counts"], result["bin_edges"] = np.histogram(
+        np_histogram_values[inlier_mask]
+    )
     if ax is None:
         return result
-    n, bins, patches = ax.hist(inliers)
+    n, bins, patches = ax.hist(values[inlier_mask])
     n_out = n_low_outliers + n_high_outliers
     if not n_out:
         return result
diff --git a/skrub/_reporting/tests/test_summarize.py b/skrub/_reporting/tests/test_summarize.py
index 245465239..bde2c4df9 100644
--- a/skrub/_reporting/tests/test_summarize.py
+++ b/skrub/_reporting/tests/test_summarize.py
@@ -84,6 +84,9 @@ def test_summarize(
         0.75: 33.6,
         1.0: 78.3,
     }
+    if order_by is None:
+        assert len(summary["columns"][5]["histogram_data"]["bin_counts"]) == 10
+        assert len(summary["columns"][5]["histogram_data"]["bin_edges"]) == 11
     assert summary["columns"][7]["null_count"] == 9
     assert summary["columns"][7]["nulls_level"] == "warning"
     assert summary["columns"][8]["null_count"] == 17

From f4fe1ef0f10aa3d3e840f62160cb2ef14eac8348 Mon Sep 17 00:00:00 2001
From: Jerome Dockes <jerome@dockes.org>
Date: Tue, 16 Jun 2026 17:57:14 +0200
Subject: [PATCH 6/7] add comments

---
 skrub/_reporting/_plotting.py  | 3 ++-
 skrub/_reporting/_summarize.py | 4 ++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/skrub/_reporting/_plotting.py b/skrub/_reporting/_plotting.py
index d0a8e9be5..bd4c512e4 100644
--- a/skrub/_reporting/_plotting.py
+++ b/skrub/_reporting/_plotting.py
@@ -201,7 +201,8 @@ def _robust_hist(col, ax=None, color=None):
         col = sbd.to_float32(col)
     values = sbd.to_numpy(col)
     if sbd.is_any_date(col):
-        # numpy histogram does not handle datetimes but matplotlib does
+        # numpy histogram does not handle datetimes but matplotlib does, so we
+        # convert to the total number of seconds since epoch (a float)
         np_histogram_values = sbd.to_numpy(
             _datetime_encoder.DatetimeEncoder(resolution=None).fit_transform(col)
         ).ravel()
diff --git a/skrub/_reporting/_summarize.py b/skrub/_reporting/_summarize.py
index fce85ad2b..e582e07fd 100644
--- a/skrub/_reporting/_summarize.py
+++ b/skrub/_reporting/_summarize.py
@@ -258,6 +258,8 @@ def _add_datetime_summary(summary, column, with_plots):
             summary["histogram_data"],
         ) = _plotting.histogram(column, color=_plotting.COLORS[0])
     else:
+        # even when no plots are generated, the bin counts and edges are
+        # still stored so that they are available in the json output.
         summary["histogram_data"] = _plotting.histogram_data(column)
 
 
@@ -290,6 +292,8 @@ def _add_numeric_summary(
     summary["value_is_constant"] = False
     summary["quantiles"] = quantiles
     if not with_plots:
+        # even when no plots are generated, the bin counts and edges are
+        # still stored so that they are available in the json output.
         summary["histogram_data"] = _plotting.histogram_data(column)
         return
     if order_by_column is None:

From 0e276e3c6f767294b478ac8ea1287f39da6391e0 Mon Sep 17 00:00:00 2001
From: Jerome Dockes <jerome@dockes.org>
Date: Wed, 17 Jun 2026 11:07:40 +0200
Subject: [PATCH 7/7] add comment

---
 skrub/_reporting/_plotting.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/skrub/_reporting/_plotting.py b/skrub/_reporting/_plotting.py
index bd4c512e4..4920621e7 100644
--- a/skrub/_reporting/_plotting.py
+++ b/skrub/_reporting/_plotting.py
@@ -203,6 +203,10 @@ def _robust_hist(col, ax=None, color=None):
     if sbd.is_any_date(col):
         # numpy histogram does not handle datetimes but matplotlib does, so we
         # convert to the total number of seconds since epoch (a float)
+        #
+        # note that the dtype cannot be a duration (timedelta) here: durations
+        # are handled higher in the call stack and converted to floats with
+        # _utils.duration_to_numeric
         np_histogram_values = sbd.to_numpy(
             _datetime_encoder.DatetimeEncoder(resolution=None).fit_transform(col)
         ).ravel()