diff --git a/dlt/sources/filesystem/readers.py b/dlt/sources/filesystem/readers.py index e1178c40ba..84d9c8b202 100644 --- a/dlt/sources/filesystem/readers.py +++ b/dlt/sources/filesystem/readers.py @@ -1,9 +1,9 @@ from typing import TYPE_CHECKING, Any, Iterable, Iterator, Optional - from dlt.common import json from dlt.common.typing import copy_sig_any from dlt.sources import TDataItems, DltResource, DltSource from dlt.sources.filesystem import FileItemDict +from dlt.common.libs.pandas import pandas from .helpers import fetch_arrow, fetch_json @@ -23,8 +23,6 @@ def _read_csv( Returns: TDataItem: The file content """ - import pandas as pd - # apply defaults to pandas kwargs kwargs = {**{"header": "infer", "chunksize": chunksize}, **pandas_kwargs} # For some remote file systems (for example, sftp/paramiko), decoding may happen before @@ -42,10 +40,9 @@ def _read_csv( # Here we use pandas chunksize to read the file in chunks and avoid loading the whole file # in memory. with file_obj.open(mode=open_mode, **open_kwargs) as file: - for df in pd.read_csv(file, **kwargs): + for df in pandas.read_csv(file, **kwargs): yield df.to_dict(orient="records") - # NOTE inconsistent kwarg convention across readers `chunk_size` vs. `chunksize` # snakecased `chunk_size` is the more appropriate Python convention def _read_jsonl(items: Iterable[FileItemDict], chunksize: int = 1000) -> Iterator[TDataItems]: @@ -156,4 +153,4 @@ def read_parquet(self) -> DltResource: ... def read_csv_duckdb(self) -> DltResource: ... else: - ReadersSource = DltSource + ReadersSource = DltSource \ No newline at end of file