diff --git a/marimo/_plugins/ui/_impl/dataframes/transforms/handlers.py b/marimo/_plugins/ui/_impl/dataframes/transforms/handlers.py index 62f3eb21491..9c1519bcbef 100644 --- a/marimo/_plugins/ui/_impl/dataframes/transforms/handlers.py +++ b/marimo/_plugins/ui/_impl/dataframes/transforms/handlers.py @@ -498,7 +498,32 @@ def handle_explode_columns( def handle_expand_dict( df: DataFrame, transform: ExpandDictTransform ) -> DataFrame: - return df.explode(transform.column_id) + collected_df, undo = collect_and_preserve_type(df) + native_df = collected_df.to_native() + + # Keep pandas handling fully pandas-native so mixed/object columns in + # unrelated fields do not trigger Arrow coercion errors. + if nw.dependencies.is_pandas_dataframe(native_df): + import pandas as pd + + result_df = native_df.copy() + # max_level=0 stops pandas from recursively unnesting nested dicts, + # which would make the pandas result differ from the polars one. + # The map() call replaces top-level None rows with empty dicts so that + # pandas 2.x can normalise them; this is needed for older versions of + # pandas running on py310, where CI fails otherwise. + expanded = pd.json_normalize( + result_df.pop(transform.column_id).map( + lambda value: {} if value is None else value + ), # type: ignore[arg-type] + max_level=0, + ) + expanded.index = result_df.index + return undo(nw.from_native(result_df.join(expanded))) + + polars_df = collected_df.to_polars() + unnested = polars_df.unnest(transform.column_id) + return undo(nw.from_native(unnested)) @staticmethod def handle_unique(df: DataFrame, transform: UniqueTransform) -> DataFrame: diff --git a/marimo/_plugins/ui/_impl/dataframes/transforms/print_code.py b/marimo/_plugins/ui/_impl/dataframes/transforms/print_code.py index d27d1d07698..70e324f7b73 100644 --- a/marimo/_plugins/ui/_impl/dataframes/transforms/print_code.py +++ b/marimo/_plugins/ui/_impl/dataframes/transforms/print_code.py @@ -222,8 +222,11 @@ def generate_where_clause(df_name: str, where: 
FilterCondition) -> str: elif transform.type == TransformType.EXPAND_DICT: column_id = _as_literal(transform.column_id) - args = f"{df_name}.pop({column_id}).values.tolist()" - return f"{df_name}.join(pd.DataFrame({args}))" + return ( + f"{df_name}.join(" + f"pd.json_normalize({df_name}.pop({column_id}).map(lambda value: {{}} if value is None else value), max_level=0).set_axis({df_name}.index, axis=0)" + f")" + ) elif transform.type == TransformType.UNIQUE: column_ids = transform.column_ids @@ -465,7 +468,7 @@ def generate_where_clause_polars(where: FilterCondition) -> str: elif transform.type == TransformType.EXPAND_DICT: column_id = _as_literal(transform.column_id) - return f"{df_name}.hstack(pl.DataFrame({df_name}.select({column_id}).to_series().to_list())).drop({column_id})" + return f"{df_name}.unnest({column_id})" elif transform.type == TransformType.UNIQUE: column_ids = transform.column_ids diff --git a/tests/_plugins/ui/_impl/dataframes/test_handlers.py b/tests/_plugins/ui/_impl/dataframes/test_handlers.py index cf1b60d507e..04b7efee859 100644 --- a/tests/_plugins/ui/_impl/dataframes/test_handlers.py +++ b/tests/_plugins/ui/_impl/dataframes/test_handlers.py @@ -47,8 +47,8 @@ pytest.importorskip("ibis") pd = pytest.importorskip("pandas") -pytest.importorskip("polars") pytest.importorskip("pyarrow") +pytest.importorskip("polars") def apply(df: DataFrameType, transform: Transform) -> DataFrameType: @@ -86,7 +86,10 @@ def assert_frame_equal(a: DataFrameType, b: DataFrameType) -> None: def assert_frame_equal_with_nans( - a: DataFrameType, b: DataFrameType, allow_nan_equals_zero: bool = False + a: DataFrameType, + b: DataFrameType, + allow_nan_equals_zero: bool = False, + allow_none_equals_nan: bool = False, ) -> None: """ Assert two dataframes are equal, treating NaNs in the same locations as equal. @@ -97,6 +100,9 @@ def assert_frame_equal_with_nans( allow_nan_equals_zero: If True, treat NaN and 0.0 as equivalent values. 
This is useful for pivot operations where missing aggregations may be filled with 0.0 or NaN depending on the backend. + allow_none_equals_nan: If True, treat None and NaN as equivalent + missing values. This is useful when different backends materialise + missing numeric values differently. """ import math @@ -137,7 +143,25 @@ def assert_frame_equal_with_nans( or val_b == 0.0 ) ) - if not (val_a == val_b or both_nan or nan_or_zero_match): + # For expand dict: treat None and NaN as the same missing value + none_nan_match = allow_none_equals_nan and ( + ( + val_a is None + and isinstance(val_b, float) + and math.isnan(val_b) + ) + or ( + val_b is None + and isinstance(val_a, float) + and math.isnan(val_a) + ) + ) + if not ( + val_a == val_b + or both_nan + or nan_or_zero_match + or none_nan_match + ): raise AssertionError( f"DataFrame values differ at column '{col}', row {idx}: {val_a} != {val_b}" ) @@ -1733,18 +1757,15 @@ def test_explode_columns(df: DataFrameType) -> None: assert nw_result.columns == ["A", "B", "C"] @staticmethod - @pytest.mark.skip( - reason="Dict/struct expansion not supported uniformly across backends" - ) @pytest.mark.parametrize( ("df", "expected"), list( zip( create_test_dataframes( - {"A": [{"foo": 1, "bar": "hello"}], "B": [1]} + {"A": [{"foo": 1, "bar": "hello"}, None], "B": [1, 2]}, ), create_test_dataframes( - {"B": [1], "foo": [1], "bar": ["hello"]} + {"B": [1, 2], "foo": [1, None], "bar": ["hello", None]}, ), strict=False, ) @@ -1760,9 +1781,54 @@ def test_expand_dict(df: DataFrameType, expected: DataFrameType) -> None: nw_expected = collect_df(expected) result_cols = sorted(nw_result.columns) expected_cols = sorted(nw_expected.columns) - assert_frame_equal( + assert_frame_equal_with_nans( + nw_expected.select(expected_cols), + nw_result.select(result_cols), + allow_none_equals_nan=True, + ) + + @staticmethod + @pytest.mark.parametrize( + ("df", "expected"), + list( + zip( + create_test_dataframes( + { + "A": [ + {"foo": 1, "nested": 
{"x": 2}}, + None, + ], + "B": [1, 2], + }, + include=["pandas", "polars"], + ), + create_test_dataframes( + { + "B": [1, 2], + "foo": [1, None], + "nested": [{"x": 2}, None], + }, + include=["pandas", "polars"], + ), + strict=False, + ) + ), + ) + def test_expand_dict_nested_dicts( + df: DataFrameType, expected: DataFrameType + ) -> None: + transform = ExpandDictTransform( + type=TransformType.EXPAND_DICT, column_id="A" + ) + result = apply(df, transform) + nw_result = collect_df(result) + nw_expected = collect_df(expected) + result_cols = sorted(nw_result.columns) + expected_cols = sorted(nw_expected.columns) + assert_frame_equal_with_nans( nw_expected.select(expected_cols), nw_result.select(result_cols), + allow_none_equals_nan=True, ) @staticmethod @@ -2341,41 +2407,6 @@ def test_filter_rows_nulls_pandas( result = apply(df, in_transform) assert_frame_equal_with_nans(result, expected) - @staticmethod - @pytest.mark.parametrize( - ("df", "expected"), - list( - zip( - create_test_dataframes( - {"nulls": [1, 2, 3, None, "hello"]}, include=["pandas"] - ), - create_test_dataframes({"nulls": [None]}, include=["pandas"]), - strict=False, - ) - ), - ) - def test_filter_rows_null_pandas_object( - df: DataFrameType, expected: DataFrameType - ) -> None: - in_transform = FilterRowsTransform( - type=TransformType.FILTER_ROWS, - operation="keep_rows", - where=FilterGroup( - type="group", - operator="and", - children=[ - FilterCondition( - type="condition", - column_id="nulls", - operator="in", - value=[None], - ) - ], - ), - ) - result = apply(df, in_transform) - assert_frame_equal_with_nans(result, expected) - @staticmethod @pytest.mark.parametrize( ("df", "expected"), diff --git a/tests/_plugins/ui/_impl/dataframes/test_print_code.py b/tests/_plugins/ui/_impl/dataframes/test_print_code.py index 57a297b0cb4..14cf22be05b 100644 --- a/tests/_plugins/ui/_impl/dataframes/test_print_code.py +++ b/tests/_plugins/ui/_impl/dataframes/test_print_code.py @@ -525,6 +525,52 @@ def 
test_print_code_result_matches_actual_transform_pandas( ) +@pytest.mark.skipif( + not DependencyManager.pandas.has(), reason="pandas not installed" +) +def test_print_code_expand_dict_nested_dict_pandas() -> None: + import pandas as pd + + transform = ExpandDictTransform( + type=TransformType.EXPAND_DICT, + column_id="dicts", + ) + transformations = Transformations([transform]) + my_df = pd.DataFrame( + { + "dicts": [{"a": 1, "nested": {"x": 2}}, None], + "other": [10, 20], + } + ) + + pandas_code = python_print_transforms( + "my_df", + list(my_df.columns), + transformations.transforms, + python_print_pandas, + ) + assert pandas_code + + loc = {"pd": pd, "my_df": my_df.copy()} + exec(pandas_code, {}, loc) + code_result = loc["my_df_next"] + + nw_df = nw.from_native(my_df.copy(), eager_only=True).lazy() + result_nw = _apply_transforms( + nw_df, + NarwhalsTransformHandler(), + transformations, + ) + real_result = result_nw.collect().to_native().reset_index(drop=True) + + pd.testing.assert_frame_equal( + code_result.reset_index(drop=True), + real_result, + ) + + assert list(code_result.columns) == ["other", "a", "nested"] + + @given( transform=create_transform_strategy( defined_column_id, @@ -772,6 +818,50 @@ def test_print_code_result_matches_actual_transform_polars( pl_testing.assert_frame_equal(code_result, real_result) +@pytest.mark.skipif( + not DependencyManager.polars.has(), reason="polars not installed" +) +def test_print_code_expand_dict_nested_dict_polars() -> None: + import polars as pl + import polars.testing as pl_testing + + transform = ExpandDictTransform( + type=TransformType.EXPAND_DICT, + column_id="dicts", + ) + transformations = Transformations([transform]) + my_df = pl.DataFrame( + { + "dicts": [{"a": 1, "nested": {"x": 2}}, None], + "other": [10, 20], + } + ) + + polars_code = python_print_transforms( + "my_df", + my_df.columns, + transformations.transforms, + python_print_polars, + ) + assert polars_code + + loc = {"pl": pl, "my_df": 
my_df.clone()} + exec(polars_code, globals(), loc) + code_result = loc["my_df_next"] + + nw_df = nw.from_native(my_df.clone(), eager_only=True).lazy() + result_nw = _apply_transforms( + nw_df, + NarwhalsTransformHandler(), + transformations, + ) + real_result = result_nw.collect().to_native() + + pl_testing.assert_frame_equal(code_result, real_result) + + assert code_result.columns == ["a", "nested", "other"] + + @given( transform=create_transform_strategy( defined_column_id,