diff --git a/CHANGES.rst b/CHANGES.rst index fce0bc259..7b1ef93f7 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -41,6 +41,7 @@ New Features regular numeric format (``(432)`` becomes ``-432``). :pr:`1772` by :user:`Gabriela Gómez Jiménez `. + Changes ------- - :meth:`choose_from` now transparently converts `outcomes` to a list when it is another type of sequence. :pr:`2100` by @@ -54,6 +55,8 @@ Changes :pr:`2096` by :user:`Ayesha Siddiqua `. - The :class:`TableReport` can now be exported in markdown format with ``.markdown``. :pr:`2048` by :user:`Riccardo Cappuzzo `. +- The :class:`TableReport` can now display the estimated memory usage of the data it is applied to. + :pr:`2153` by :user:`Salam AlKaissi Sanae Janati Idrissi `. Bugfixes -------- diff --git a/skrub/_reporting/_data/templates/dataframe-sample.html b/skrub/_reporting/_data/templates/dataframe-sample.html index 9544f6f5a..372606413 100644 --- a/skrub/_reporting/_data/templates/dataframe-sample.html +++ b/skrub/_reporting/_data/templates/dataframe-sample.html @@ -86,6 +86,13 @@ {{ summary.n_rows | format_number }} rows ✕ {{ summary.n_columns | format_number }} columns + {%- if summary.get("memory_usage_kb") is not none -%} + (estimated memory usage: {{ "%.1f" | format(summary.get("memory_usage_kb")) }} KB) + {%- if summary.get("memory_estimate_unreliable") -%} + — estimate may be inaccurate for complex objects + {%- endif -%} + + {%- endif -%} {% if 'is_subsampled' in summary %} (subsampled from more rows) {% endif %} diff --git a/skrub/_reporting/_data/templates/report.md b/skrub/_reporting/_data/templates/report.md index 3239b6184..b836c5f56 100644 --- a/skrub/_reporting/_data/templates/report.md +++ b/skrub/_reporting/_data/templates/report.md @@ -2,6 +2,12 @@ The provided dataframe uses the {{ summary.dataframe_module }} library. It has **shape** {{ summary.n_rows }} rows × {{ summary.n_columns }} columns. 
+{% if summary.get("memory_usage_kb") is not none %} +**memory usage** {{ "%.1f" | format(summary.get("memory_usage_kb")) }} KB. +{% if summary.get("memory_estimate_unreliable") %} +_Note: memory estimate may be inaccurate for complex object columns._ +{% endif %} +{% endif %} Columns are marked as "high cardinality" if they contain more than {{ summary.cardinality_threshold }} unique values. diff --git a/skrub/_reporting/_summarize.py b/skrub/_reporting/_summarize.py index b544986e2..6f8500bfa 100644 --- a/skrub/_reporting/_summarize.py +++ b/skrub/_reporting/_summarize.py @@ -2,6 +2,9 @@ import sys +from skrub._dataframe._common import raise_dispatch_unregistered_type +from skrub._dispatch import dispatch + from .. import _column_associations, _config from .. import _dataframe as sbd from . import _plotting, _sample_table, _utils @@ -18,6 +21,37 @@ _N_TOP_ASSOCIATIONS = 1000 +@dispatch +def _memory_usage_kb(obj): + """Return the estimated memory usage of a dataframe, in KB (bytes / 1024).""" + raise_dispatch_unregistered_type(obj) + + +@_memory_usage_kb.specialize("pandas") +def _memory_usage_pandas(obj): + """Return the shallow (``deep=False``) pandas memory usage, in KB.""" + memory_usage_bytes = obj.memory_usage(deep=False).sum() + return memory_usage_bytes / 1024 + + +@_memory_usage_kb.specialize("polars") +def _memory_usage_polars(obj): + """Return the polars ``estimated_size()`` of the dataframe, in KB.""" + memory_usage_bytes = obj.estimated_size() + return memory_usage_bytes / 1024 + + +def _has_complex_objects(df): + """Return True when pandas has object-dtype columns. + + The memory estimate is less reliable for object-dtype columns, so we warn + as soon as any are present. 
+ """ + if sbd.dataframe_module_name(df) != "pandas": + return False + return any(dtype == object for dtype in df.dtypes) + + def summarize_dataframe( df, *, @@ -77,6 +111,7 @@ def summarize_dataframe( "dataframe_module": sbd.dataframe_module_name(df), "n_rows": n_rows, "n_columns": n_columns, + "memory_usage_kb": _memory_usage_kb(df), "columns": [], "dataframe_is_empty": not n_rows or not n_columns, "plots_skipped": not with_plots, @@ -90,6 +125,8 @@ def summarize_dataframe( } if title is not None: summary["title"] = title + # detect complex objects that make memory estimates unreliable + summary["memory_estimate_unreliable"] = _has_complex_objects(df) if order_by is not None: df = sbd.sort(df, by=order_by) summary["order_by"] = order_by diff --git a/skrub/_reporting/_table_report.py b/skrub/_reporting/_table_report.py index db491170f..d8c4286f6 100644 --- a/skrub/_reporting/_table_report.py +++ b/skrub/_reporting/_table_report.py @@ -240,7 +240,7 @@ class TableReport: >>> j = TableReport(df, plot_distributions=False).json() >>> print(j) - {"dataframe_module": "pandas", "n_rows": 2, "n_columns": 3, "columns": ... + {"dataframe_module": "pandas", "n_rows": 2, "n_columns": 3, ...} Advanced configuration: you can add custom column filters that will appear diff --git a/skrub/_reporting/tests/test_markdown_template.py b/skrub/_reporting/tests/test_markdown_template.py index 55d3b81b7..53187b8e0 100644 --- a/skrub/_reporting/tests/test_markdown_template.py +++ b/skrub/_reporting/tests/test_markdown_template.py @@ -37,6 +37,7 @@ def test_markdown_report_structure_and_titles(df_module): assert "# " in markdown_default # Header should exist # Shape info should be present assert "**shape** 3 rows × 3 columns" in markdown + assert "**memory usage**" in markdown # Unique values should be present (default value) assert "40 unique values." 
in markdown diff --git a/skrub/_reporting/tests/test_table_report.py b/skrub/_reporting/tests/test_table_report.py index a5a3eaab2..953de0f76 100644 --- a/skrub/_reporting/tests/test_table_report.py +++ b/skrub/_reporting/tests/test_table_report.py @@ -46,6 +46,7 @@ def test_report(air_quality): assert "With nulls" in html assert "First 10" in html assert "First 2" in html + assert "memory usage:" in html for col_name in sbd.column_names(air_quality): assert col_name in html report_id = get_report_id(html)