Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 3 additions & 6 deletions dlt/sources/filesystem/readers.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
from typing import TYPE_CHECKING, Any, Iterable, Iterator, Optional

from dlt.common import json
from dlt.common.typing import copy_sig_any
from dlt.sources import TDataItems, DltResource, DltSource
from dlt.sources.filesystem import FileItemDict
from dlt.common.libs.pandas import pandas

from .helpers import fetch_arrow, fetch_json

Expand All @@ -23,8 +23,6 @@ def _read_csv(
Returns:
TDataItem: The file content
"""
import pandas as pd

# apply defaults to pandas kwargs
kwargs = {**{"header": "infer", "chunksize": chunksize}, **pandas_kwargs}
# For some remote file systems (for example, sftp/paramiko), decoding may happen before
Expand All @@ -42,10 +40,9 @@ def _read_csv(
# Here we use pandas chunksize to read the file in chunks and avoid loading the whole file
# in memory.
with file_obj.open(mode=open_mode, **open_kwargs) as file:
for df in pd.read_csv(file, **kwargs):
for df in pandas.read_csv(file, **kwargs):
yield df.to_dict(orient="records")


# NOTE: the chunk-size kwarg is named inconsistently across readers (`chunk_size` vs. `chunksize`);
# the snake_case `chunk_size` is the more conventional Python spelling.
def _read_jsonl(items: Iterable[FileItemDict], chunksize: int = 1000) -> Iterator[TDataItems]:
Expand Down Expand Up @@ -156,4 +153,4 @@ def read_parquet(self) -> DltResource: ...
def read_csv_duckdb(self) -> DltResource: ...

else:
ReadersSource = DltSource
ReadersSource = DltSource
Loading