-
Notifications
You must be signed in to change notification settings - Fork 521
feat: dataframe support via narwhals
#3912
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: devel
Are you sure you want to change the base?
Changes from 8 commits
a597301
1408db5
a6ad157
b2adf3d
7b55df9
35c803a
61e510f
5df5b4b
a62a381
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,20 @@ | ||
| from __future__ import annotations | ||
|
|
||
| from typing import TYPE_CHECKING, Any | ||
|
|
||
| import narwhals | ||
| from narwhals.typing import IntoDataFrame | ||
|
|
||
| if TYPE_CHECKING: | ||
| from dlt.common.libs.pyarrow import pyarrow | ||
|
|
||
|
|
||
| def df_to_arrow(df: IntoDataFrame) -> pyarrow.Table: | ||
| """Convert a narwhals-compatible eager or lazy frame to a pyarrow table. | ||
| Lazy frames are collected into eager frames before the conversion. | ||
| Series are not accepted (`allow_series=False`). | ||
| """ | ||
| # wrap the native object in a narwhals frame; series inputs are disallowed | ||
| nw_df = narwhals.from_native(df, allow_series=False) | ||
| if isinstance(nw_df, narwhals.LazyFrame): | ||
| # materialize the lazy frame before converting it | ||
| nw_df = nw_df.collect() | ||
|
|
||
| return nw_df.to_arrow() | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -9,7 +9,8 @@ | |
| DestinationCapabilitiesContext, | ||
| adjust_schema_to_capabilities, | ||
| ) | ||
| from dlt.common.libs import is_pandas_frame, is_polars_frame | ||
| from dlt.common.libs import is_arrow_object | ||
| from dlt.common.libs.narwhals import df_to_arrow | ||
| from dlt.common.metrics import DataWriterMetrics | ||
| from dlt.common.runtime.collector import Collector, NULL_COLLECTOR | ||
| from dlt.common.typing import TDataItems, TDataItem, TLoaderFileFormat | ||
|
|
@@ -35,19 +36,6 @@ | |
| from dlt.common.libs.pyarrow import pyarrow as pa, TAnyArrowItem | ||
|
|
||
|
|
||
| def _to_arrow_table(item: Any) -> Any: | ||
| """Convert a pandas or polars frame to a pyarrow Table; pass arrow items through.""" | ||
| if is_pandas_frame(item): | ||
| from dlt.common.libs.pandas import pandas_to_arrow | ||
|
|
||
| return pandas_to_arrow(item) | ||
| if is_polars_frame(item): | ||
| from dlt.common.libs.polars import polars_to_arrow | ||
|
Comment on lines
-44
to
-45
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Notice that unlike […]
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thanks to you and Marco for the reviews! It's actually saving us a lot of time. @FBruzzesi it would be nice to have a type guard to […] that catches […]. @rudolfix IMO, the "eager arrow code path" shouldn't support lazy frames and should raise "Received LazyFrame, call […]". Lazy objects should be supported via the "lazy model code path", where we have Ibis expressions, SQLGlot, etc., for now. Hopefully, we can unify both by implementing load package preparations and incremental logic via Narwhals. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yeah, I think calling […] instead of calling […] |
||
|
|
||
| return polars_to_arrow(item) | ||
| return item | ||
|
|
||
|
|
||
| class MaterializedEmptyList(List[Any]): | ||
| """A list variant that will materialize tables even if empty list was yielded""" | ||
|
|
||
|
|
@@ -384,14 +372,22 @@ def _retrieve_normalize_config(self) -> ItemsNormalizerConfiguration: | |
| ) | ||
|
|
||
| def write_items(self, resource: DltResource, items: TDataItems, meta: Any) -> None: | ||
| static_table_name = self._get_static_table_name(resource, meta) | ||
|
|
||
| items_list = items if isinstance(items, list) else [items] | ||
| items = [] | ||
| for item in items_list: | ||
| if not is_arrow_object(item): | ||
| try: | ||
| item = df_to_arrow(item) | ||
| except TypeError: | ||
| raise TypeError( | ||
| f"Received unsupported type `{type(item)}`. Not supported by pyarrow nor" | ||
| " narwhals." | ||
| ) | ||
|
|
||
| items.append(self._apply_contract_filters(item, resource, static_table_name)) | ||
|
|
||
| static_table_name = self._get_static_table_name(resource, meta) | ||
| items = [ | ||
| # 2. remove columns and rows in data contract filters | ||
| self._apply_contract_filters(_to_arrow_table(item), resource, static_table_name) | ||
| for item in items_list | ||
| ] | ||
| super().write_items(resource, items, meta) | ||
|
|
||
| def _write_to_static_table( | ||
|
|
||
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Uh oh!
There was an error while loading. Please reload this page.