Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions dlt/destinations/impl/filesystem/filesystem.py
Original file line number Diff line number Diff line change
Expand Up @@ -588,7 +588,7 @@ def dataset_path(self) -> str:
"""A path within a bucket to tables in a dataset
NOTE: dataset_name changes if with_staging_dataset is active
"""
return self.pathlib.join(self.bucket_path, self.dataset_name, "") # type: ignore[no-any-return]
return self.pathlib.join(self.bucket_path, self.dataset_name) # type: ignore[no-any-return]

@contextmanager
def with_staging_dataset(self) -> Iterator["FilesystemClient"]:
Expand Down Expand Up @@ -874,12 +874,13 @@ def prepare_load_table(self, table_name: str) -> PreparedTableSchema:
def get_table_dir(
self, table_name: str, remote: bool = False, schema_name: Optional[str] = None
) -> str:
"""Returns a directory containing table files, ending with separator.
Note that many tables can share the same table dir
"""Returns a directory containing table files.

Note that many tables can share the same table dir.
"""
# dlt tables do not respect layout (for now)
table_prefix = self.get_table_prefix(table_name, schema_name=schema_name)
table_dir: str = self.pathlib.dirname(table_prefix) + self.pathlib.sep
table_dir: str = self.pathlib.dirname(table_prefix)
if remote:
table_dir = self.make_remote_url(table_dir)
return table_dir
Expand Down
3 changes: 1 addition & 2 deletions tests/destinations/test_destination_name_and_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,5 +214,4 @@ def test_destination_config_in_name(environment: DictStrStr) -> None:
environment["DESTINATION__FILESYSTEM-PROD__BUCKET_URL"] = FilesystemConfiguration.make_file_url(
get_test_storage_root()
)
pathlib = p._fs_client().pathlib # type: ignore[attr-defined]
assert p._fs_client().dataset_path.endswith(p.dataset_name + pathlib.sep)
assert p._fs_client().dataset_path.endswith(p.dataset_name)
48 changes: 40 additions & 8 deletions tests/load/filesystem/test_filesystem_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,18 +134,18 @@ def test_trailing_separators(layout: str, with_gdrive_buckets_env: str) -> None:
os.environ["DESTINATION__FILESYSTEM__LAYOUT"] = layout
load = setup_loader("_data")
client: FilesystemClient = load.get_destination_client(Schema("empty")) # type: ignore[assignment]
# assert separators
assert client.dataset_path.endswith("_data/")
assert client.get_table_dir("_dlt_versions").endswith("_dlt_versions/")
assert client.get_table_dir("_dlt_versions", remote=True).endswith("_dlt_versions/")
# assert paths no longer carry a trailing separator after the strip-trailing-slash fix
assert client.dataset_path.endswith("_data")
assert client.get_table_dir("_dlt_versions").endswith("_dlt_versions")
assert client.get_table_dir("_dlt_versions", remote=True).endswith("_dlt_versions")
is_folder = layout.startswith("{table_name}/")
if is_folder:
assert client.get_table_dir("letters").endswith("_data/letters/")
assert client.get_table_dir("letters", remote=True).endswith("_data/letters/")
assert client.get_table_dir("letters").endswith("_data/letters")
assert client.get_table_dir("letters", remote=True).endswith("_data/letters")
else:
# strip prefix
assert client.get_table_dir("letters").endswith("_data/")
assert client.get_table_dir("letters", remote=True).endswith("_data/")
assert client.get_table_dir("letters").endswith("_data")
assert client.get_table_dir("letters", remote=True).endswith("_data")
if is_folder:
assert client.get_table_prefix("letters").endswith("_data/letters/")
else:
Expand Down Expand Up @@ -555,3 +555,35 @@ def assert_hf_endpoint_set(*args, **kwargs):

with patch("huggingface_hub.metadata_update", side_effect=assert_hf_endpoint_set):
client.update_dataset_card_metadata(load_id="test")


def test_dataset_path_has_no_trailing_separator() -> None:
"""`dataset_path` must not end with `/`.

OneLake (Microsoft Fabric) responds with `403 ClientAuthenticationError`
when `BlobClient.exists` targets a blob name ending in `/`, instead of
the `404 ResourceNotFoundError` that other Azure backends return. That
makes `FilesystemClient.initialize_storage` blow up on its first
`fs.isdir(self.dataset_path)` call before any data is written. Non-OneLake
backends observe the same latent defect as a silent `False`.
"""
client = _client_factory(filesystem(bucket_url="file:///tmp/dlt-test-bucket"))
assert not client.dataset_path.endswith(
"/"
), f"dataset_path must not end with '/', got {client.dataset_path!r}"


def test_get_table_dir_has_no_trailing_separator() -> None:
"""`get_table_dir` must not end with `/` or `\\`.

`FilesystemClient.truncate_tables` iterates `get_table_dirs(...)` and
calls `self.fs_client.exists(table_dir)` for each one. On OneLake that
produces a `403 ClientAuthenticationError` on every truncated table
once the `dataset_path` trailing-slash bug (see previous test) is
already fixed. Same root cause, one level deeper.
"""
client = _client_factory(filesystem(bucket_url="file:///tmp/dlt-test-bucket"))
table_dir = client.get_table_dir("some_table")
assert not table_dir.endswith(
("/", "\\")
), f"get_table_dir must not end with a separator, got {table_dir!r}"
Loading