From 82927d059752abc61b099f4ebdccfc12cd9f0c19 Mon Sep 17 00:00:00 2001 From: travior Date: Wed, 22 Apr 2026 10:42:49 +0200 Subject: [PATCH 01/16] add join compatibility based on `physical_location` to destination configs --- dlt/common/destination/client.py | 22 +- dlt/dataset/dataset.py | 11 +- dlt/destinations/impl/athena/configuration.py | 11 + .../impl/bigquery/configuration.py | 12 +- .../impl/clickhouse/configuration.py | 7 +- .../impl/databricks/configuration.py | 8 +- dlt/destinations/impl/dremio/configuration.py | 7 +- dlt/destinations/impl/duckdb/configuration.py | 6 + .../impl/ducklake/configuration.py | 22 +- dlt/destinations/impl/fabric/configuration.py | 7 + .../impl/filesystem/configuration.py | 32 + dlt/destinations/impl/lance/configuration.py | 27 +- .../impl/lancedb/configuration.py | 41 +- .../impl/motherduck/configuration.py | 26 +- dlt/destinations/impl/mssql/configuration.py | 8 +- .../impl/postgres/configuration.py | 32 +- dlt/destinations/impl/qdrant/configuration.py | 18 +- .../impl/redshift/configuration.py | 8 +- .../impl/snowflake/configuration.py | 7 +- .../impl/sqlalchemy/configuration.py | 68 +- .../impl/weaviate/configuration.py | 18 +- tests/destinations/test_join_compatibility.py | 1038 +++++++++++++++++ tests/load/ducklake/test_ducklake_client.py | 29 +- .../load/filesystem/test_filesystem_client.py | 2 +- .../redshift/test_redshift_table_builder.py | 2 +- 25 files changed, 1380 insertions(+), 89 deletions(-) create mode 100644 tests/destinations/test_join_compatibility.py diff --git a/dlt/common/destination/client.py b/dlt/common/destination/client.py index db3f3f28fa..dfa08c7316 100644 --- a/dlt/common/destination/client.py +++ b/dlt/common/destination/client.py @@ -59,6 +59,7 @@ from dlt.common.storages.load_storage import ParsedLoadJobFileName from dlt.common.storages.load_package import LoadJobInfo, TPipelineStateDoc from dlt.common.typing import is_optional_type +from dlt.common.utils import digest128 TDestinationDwhClient = 
TypeVar("TDestinationDwhClient", bound="DestinationClientDwhConfiguration") @@ -163,10 +164,29 @@ class DestinationClientConfiguration(BaseConfiguration): __recommended_sections__: ClassVar[Sequence[str]] = (known_sections.DESTINATION, "") + def physical_destination(self) -> str: + """Returns a non-secret destination identity, or "" when unavailable.""" + return "" + def fingerprint(self) -> str: - """Returns a destination fingerprint which is a hash of selected configuration fields. ie. host in case of connection string""" + """Returns a hash of physical_destination(), or "" when unavailable.""" + phys_dest = self.physical_destination() + if phys_dest: + return digest128(phys_dest) return "" + def can_join_with(self, other: "DestinationClientConfiguration") -> bool: + """Returns True for same-type destinations with the same non-empty identity.""" + if not isinstance(other, DestinationClientConfiguration): + return False + if self.destination_type != other.destination_type: + return False + self_phys = self.physical_destination() + other_phys = other.physical_destination() + if self_phys and other_phys and self_phys == other_phys: + return True + return False + def __str__(self) -> str: """Return displayable destination location""" return str(self.credentials) diff --git a/dlt/dataset/dataset.py b/dlt/dataset/dataset.py index 607c917d32..a1e7825d64 100644 --- a/dlt/dataset/dataset.py +++ b/dlt/dataset/dataset.py @@ -498,12 +498,11 @@ def get_dataset_sql_client(dataset: dlt.Dataset) -> SqlClientBase[Any]: def is_same_physical_destination(dataset1: dlt.Dataset, dataset2: dlt.Dataset) -> bool: - """Check if both datasets are at the same physical destination. - - This is done by comparing the fingerprint of both destination configs. There - are potential false positive if two different config give access to the same destination. 
- """ - return str(dataset1.destination_client.config) == str(dataset2.destination_client.config) + """Check if tables from both datasets can be joined in a single query.""" + # NOTE: the name is historical -- this actually checks join compatibility via + # can_join_with(), which may return True even when the physical storage + # locations differ (e.g. filesystem destinations backed by different buckets). + return dataset1.destination_client.config.can_join_with(dataset2.destination_client.config) def _get_dataset_schema_from_destination_using_schema_name( diff --git a/dlt/destinations/impl/athena/configuration.py b/dlt/destinations/impl/athena/configuration.py index f7cc38fa6a..9b0f2adf48 100644 --- a/dlt/destinations/impl/athena/configuration.py +++ b/dlt/destinations/impl/athena/configuration.py @@ -60,6 +60,17 @@ def to_connector_params(self, use_catalog_name: bool = True) -> Dict[str, Any]: def _is_s3_tables_catalog(self) -> bool: return is_s3_tables_catalog(self.aws_data_catalog) + def physical_destination(self) -> str: + """Returns region/catalog, or "" when region is unavailable.""" + catalog = self.aws_data_catalog or DEFAULT_AWS_DATA_CATALOG + region = None + if self.credentials: + region = self.credentials.region_name + + if region: + return f"{region}/{catalog}" + return "" + def __str__(self) -> str: """Return displayable destination location""" if self.staging_config: diff --git a/dlt/destinations/impl/bigquery/configuration.py b/dlt/destinations/impl/bigquery/configuration.py index 792a5f7eec..9bd59fa1cb 100644 --- a/dlt/destinations/impl/bigquery/configuration.py +++ b/dlt/destinations/impl/bigquery/configuration.py @@ -3,7 +3,6 @@ from dlt.common.configuration import configspec from dlt.common.configuration.specs import GcpServiceAccountCredentials, GcpOAuthCredentials -from dlt.common.utils import digest128 from dlt.common.destination.client import DestinationClientDwhWithStagingConfiguration @@ -38,8 +37,9 @@ class 
BigQueryClientConfiguration(DestinationClientDwhWithStagingConfiguration): def get_location(self) -> str: return self.location - def fingerprint(self) -> str: - """Returns a fingerprint of project_id""" - if self.credentials and self.credentials.project_id: - return digest128(self.credentials.project_id) - return "" + def physical_destination(self) -> str: + """Returns configured project id, falling back to credentials.""" + project_id = self.project_id + if not project_id and self.credentials: + project_id = self.credentials.project_id + return project_id or "" diff --git a/dlt/destinations/impl/clickhouse/configuration.py b/dlt/destinations/impl/clickhouse/configuration.py index 4a7a08d1dd..837c29b837 100644 --- a/dlt/destinations/impl/clickhouse/configuration.py +++ b/dlt/destinations/impl/clickhouse/configuration.py @@ -8,7 +8,6 @@ from dlt.common.destination.client import ( DestinationClientDwhWithStagingConfiguration, ) -from dlt.common.utils import digest128 from dlt.destinations.impl.clickhouse.typing import TSecureConnection, TTableEngineType @@ -97,8 +96,8 @@ class ClickHouseClientConfiguration(DestinationClientDwhWithStagingConfiguration "table_engine_type", ] - def fingerprint(self) -> str: - """Returns a fingerprint of the host part of a connection string.""" + def physical_destination(self) -> str: + """Returns host:port.""" if self.credentials and self.credentials.host: - return digest128(self.credentials.host) + return f"{self.credentials.host}:{self.credentials.port}" return "" diff --git a/dlt/destinations/impl/databricks/configuration.py b/dlt/destinations/impl/databricks/configuration.py index 9a256317c2..2eba3d396f 100644 --- a/dlt/destinations/impl/databricks/configuration.py +++ b/dlt/destinations/impl/databricks/configuration.py @@ -14,13 +14,11 @@ from dlt.common.typing import TSecretStrValue from dlt.common.destination.client import DestinationClientDwhWithStagingConfiguration from dlt.common.configuration.exceptions import 
ConfigurationValueError -from dlt.common.utils import digest128 from dlt.destinations.impl.databricks.typing import TDatabricksInsertApi if TYPE_CHECKING: from zerobus import ArrowStreamConfigurationOptions, IPCCompression - DATABRICKS_APPLICATION_ID = "dltHub_dlt" DEFAULT_DATABRICKS_INSERT_API: TDatabricksInsertApi = "copy_into" # ZSTD was fastest in my benchmarks out of the three `ipc_compression` options @@ -286,8 +284,8 @@ def on_resolved(self) -> None: " `destination.databricks.credentials.client_secret`." ) - def fingerprint(self) -> str: - """Returns a fingerprint of host part of a connection string""" + def physical_destination(self) -> str: + """Returns the server hostname.""" if self.credentials and self.credentials.server_hostname: - return digest128(self.credentials.server_hostname) + return self.credentials.server_hostname return "" diff --git a/dlt/destinations/impl/dremio/configuration.py b/dlt/destinations/impl/dremio/configuration.py index 12ec842bba..9d38c188ce 100644 --- a/dlt/destinations/impl/dremio/configuration.py +++ b/dlt/destinations/impl/dremio/configuration.py @@ -5,7 +5,6 @@ from dlt.common.configuration.specs import ConnectionStringCredentials from dlt.common.destination.client import DestinationClientDwhWithStagingConfiguration from dlt.common.typing import TSecretStrValue -from dlt.common.utils import digest128 @configspec(init=False) @@ -37,8 +36,8 @@ class DremioClientConfiguration(DestinationClientDwhWithStagingConfiguration): staging_data_source: str = None """The name of the staging data source""" - def fingerprint(self) -> str: - """Returns a fingerprint of host part of a connection string""" + def physical_destination(self) -> str: + """Returns host:port.""" if self.credentials and self.credentials.host: - return digest128(self.credentials.host) + return f"{self.credentials.host}:{self.credentials.port}" return "" diff --git a/dlt/destinations/impl/duckdb/configuration.py b/dlt/destinations/impl/duckdb/configuration.py index 
80ffac3c9e..c85aa71c73 100644 --- a/dlt/destinations/impl/duckdb/configuration.py +++ b/dlt/destinations/impl/duckdb/configuration.py @@ -318,5 +318,11 @@ def __init__( ) self.create_indexes = create_indexes + def physical_destination(self) -> str: + """Returns the database file path or ':memory:'.""" + if self.credentials and self.credentials.database: + return self.credentials.database + return "" + def on_resolved(self) -> None: self.credentials.database = self.make_location(self.credentials.database, DUCK_DB_NAME_PAT) diff --git a/dlt/destinations/impl/ducklake/configuration.py b/dlt/destinations/impl/ducklake/configuration.py index 412eedbe12..e55cefb603 100644 --- a/dlt/destinations/impl/ducklake/configuration.py +++ b/dlt/destinations/impl/ducklake/configuration.py @@ -13,7 +13,6 @@ FilesystemConfigurationWithLocalFiles, WithLocalFiles, ) -from dlt.common.utils import digest128 from dlt.destinations.impl.duckdb.configuration import DuckDbConnectionPool, DuckDbBaseCredentials from dlt.destinations.impl.duckdb.factory import _set_duckdb_raw_capabilities @@ -143,11 +142,24 @@ class DuckLakeClientConfiguration(WithLocalFiles, DestinationClientDwhWithStagin automatic_migration: bool = False """When true, attaches with `AUTOMATIC_MIGRATION true` so DuckDB migrates an older DuckLake catalog schema on attach.""" - def fingerprint(self) -> str: - """Use fingerprint of underlying storage. 
This is precise to bucket level""" - if self.credentials.storage is None: + def physical_destination(self) -> str: + """Returns credential-free catalog identity plus ducklake name.""" + if not self.credentials or not self.credentials.catalog: return "" - return self.credentials.storage.fingerprint() + + catalog = self.credentials.catalog + ducklake_name = self.credentials.ducklake_name or DEFAULT_DUCKLAKE_NAME + + if catalog.host: + port_str = f":{catalog.port}" if catalog.port else "" + db_str = f"/{catalog.database}" if catalog.database else "" + catalog_id = f"{catalog.drivername}://{catalog.host}{port_str}{db_str}" + elif catalog.database: + catalog_id = f"{catalog.drivername}://{catalog.database}" + else: + catalog_id = catalog.drivername or "unknown" + + return f"{catalog_id}#{ducklake_name}" def on_resolved(self) -> None: # redirect local catalog database file to `local_dir` diff --git a/dlt/destinations/impl/fabric/configuration.py b/dlt/destinations/impl/fabric/configuration.py index e7cfcc2ccf..01b52d01e6 100644 --- a/dlt/destinations/impl/fabric/configuration.py +++ b/dlt/destinations/impl/fabric/configuration.py @@ -165,5 +165,12 @@ class FabricClientConfiguration(DestinationClientDwhWithStagingConfiguration): Both have UTF-8 encoding. LongAsMax=yes is automatically configured. 
""" + def physical_destination(self) -> str: + """Returns host:port.""" + if self.credentials and self.credentials.host: + port = self.credentials.port or 1433 + return f"{self.credentials.host}:{port}" + return "" + __all__ = ["FabricCredentials", "FabricClientConfiguration"] diff --git a/dlt/destinations/impl/filesystem/configuration.py b/dlt/destinations/impl/filesystem/configuration.py index 8bd4a69f60..2ee22bfccf 100644 --- a/dlt/destinations/impl/filesystem/configuration.py +++ b/dlt/destinations/impl/filesystem/configuration.py @@ -10,6 +10,7 @@ from dlt.common.configuration.specs.hf_credentials import HfCredentials from dlt.common.destination.client import ( CredentialsConfiguration, + DestinationClientConfiguration, DestinationClientStagingConfiguration, ) from dlt.common.storages import FilesystemConfigurationWithLocalFiles @@ -44,6 +45,37 @@ class FilesystemDestinationClientConfiguration(FilesystemConfigurationWithLocalF def resolve_credentials_type(self) -> Type[CredentialsConfiguration]: return super().resolve_credentials_type() + def physical_destination(self) -> str: + """Returns scheme://netloc for remote filesystems, or "" for local.""" + if not self.bucket_url: + return "" + + if self.is_local_path(self.bucket_url): + return "" + + from urllib.parse import urlparse + + url = urlparse(self.bucket_url) + return f"{url.scheme}://{url.netloc}" + + def fingerprint(self) -> str: + # Explicit override to resolve MRO ambiguity: without it, Python picks + # FilesystemConfiguration.fingerprint() (which hashes the raw bucket URL) + # over DestinationClientConfiguration.fingerprint() (which hashes + # physical_destination()). Do not remove. + return DestinationClientStagingConfiguration.fingerprint(self) + + def can_join_with(self, other: DestinationClientConfiguration) -> bool: + """Returns True for any other filesystem destination. + + Filesystem tables are queried through a local engine (e.g. 
DuckDB) that + can access multiple storage backends in a single query, so join + compatibility is determined by the engine, not by the storage location. + """ + if isinstance(other, FilesystemDestinationClientConfiguration): + return True + return False + def on_resolved(self) -> None: # Validate layout and show unused placeholders _, layout_placeholders = check_layout(self.layout, self.extra_placeholders) diff --git a/dlt/destinations/impl/lance/configuration.py b/dlt/destinations/impl/lance/configuration.py index e2ab9f4a62..a80f8f5b0a 100644 --- a/dlt/destinations/impl/lance/configuration.py +++ b/dlt/destinations/impl/lance/configuration.py @@ -12,7 +12,10 @@ resolve_type, ) from dlt.common.configuration.specs.mixins import WithObjectStoreRsCredentials -from dlt.common.destination.client import DestinationClientDwhConfiguration +from dlt.common.destination.client import ( + DestinationClientConfiguration, + DestinationClientDwhConfiguration, +) from dlt.common.storages.configuration import ( FileSystemCredentials, FilesystemConfiguration, @@ -353,5 +356,23 @@ def make_namespace(self) -> "LanceNamespace": props.update(self.credentials.to_namespace_properties()) return connect(self.catalog_type, props) - def fingerprint(self) -> str: - return self.storage.fingerprint() if self.storage else "" + def physical_destination(self) -> str: + """Returns the resolved Lance catalog root.""" + if ( + isinstance(self.credentials, DirectoryCatalogCredentials) + and self.credentials.bucket_url + ): + return f"{self.catalog_type}:{self.credentials.bucket_url.rstrip('/')}" + return "" + + def can_join_with(self, other: DestinationClientConfiguration) -> bool: + """Returns True for the same Lance catalog and bound dlt dataset.""" + if not isinstance(other, LanceClientConfiguration): + return False + + self_phys = self.physical_destination() + other_phys = other.physical_destination() + if not self_phys or not other_phys or self_phys != other_phys: + return False + + return 
self.dataset_name == other.dataset_name diff --git a/dlt/destinations/impl/lancedb/configuration.py b/dlt/destinations/impl/lancedb/configuration.py index 4b8d725917..e22b61f096 100644 --- a/dlt/destinations/impl/lancedb/configuration.py +++ b/dlt/destinations/impl/lancedb/configuration.py @@ -9,12 +9,13 @@ CredentialsConfiguration, NotResolved, ) -from dlt.common.destination.client import DestinationClientDwhConfiguration +from dlt.common.destination.client import ( + DestinationClientConfiguration, + DestinationClientDwhConfiguration, +) from dlt.common.pendulum import timedelta from dlt.common.storages.configuration import FilesystemConfiguration, WithLocalFiles from dlt.common.typing import TSecretStrValue, Annotated -from dlt.common.utils import digest128 - from dlt.destinations.impl.lancedb.warnings import uri_on_credentials_deprecated if TYPE_CHECKING: @@ -190,9 +191,31 @@ def on_resolved(self) -> None: # TODO: move uri back to credentials to make it more like other connections self.credentials.uri = self.lance_uri - def fingerprint(self) -> str: - """Returns a fingerprint of a connection string.""" - - if self.lance_uri: - return digest128(self.lance_uri) - return "" + def physical_destination(self) -> str: + """Returns the resolved LanceDB URI, or "" for external native clients.""" + if not self.lance_uri or self.lance_uri == ":external:": + return "" + + if self.lance_uri.startswith("db://"): + region = self.credentials.region if self.credentials else None + host_override = self.credentials.host_override if self.credentials else None + endpoint_parts = [self.lance_uri] + if region: + endpoint_parts.append(region) + if host_override: + endpoint_parts.append(host_override) + return "|".join(endpoint_parts) + + return self.lance_uri + + def can_join_with(self, other: DestinationClientConfiguration) -> bool: + """Returns True for the same LanceDB URI and table naming layout.""" + if not isinstance(other, LanceDBClientConfiguration): + return False + + 
self_phys = self.physical_destination() + other_phys = other.physical_destination() + if not self_phys or not other_phys or self_phys != other_phys: + return False + + return self.dataset_separator == other.dataset_separator diff --git a/dlt/destinations/impl/motherduck/configuration.py b/dlt/destinations/impl/motherduck/configuration.py index cf91483c12..640746c690 100644 --- a/dlt/destinations/impl/motherduck/configuration.py +++ b/dlt/destinations/impl/motherduck/configuration.py @@ -8,10 +8,13 @@ from dlt.version import __version__ from dlt.common.configuration import configspec from dlt.common.configuration.specs.exceptions import NativeValueError -from dlt.common.destination.client import DestinationClientDwhWithStagingConfiguration +from dlt.common.destination.client import ( + DestinationClientConfiguration, + DestinationClientDwhWithStagingConfiguration, +) from dlt.common.destination.exceptions import DestinationTerminalException -from dlt.common.typing import TSecretStrValue from dlt.common.utils import digest128 +from dlt.common.typing import TSecretStrValue from dlt.destinations.impl.duckdb.configuration import DuckDbBaseCredentials, DuckDbConnectionPool @@ -123,12 +126,29 @@ class MotherDuckClientConfiguration(DestinationClientDwhWithStagingConfiguration False # should unique indexes be created, this slows loading down massively ) + def physical_destination(self) -> str: + """Returns "" because MotherDuck has no non-secret account identity.""" + return "" + def fingerprint(self) -> str: - """Returns a fingerprint of user access token""" + """Returns a fingerprint of user access token.""" if self.credentials and self.credentials.password: return digest128(self.credentials.password) return "" + def can_join_with(self, other: DestinationClientConfiguration) -> bool: + """Returns True for MotherDuck configs with the same token.""" + if not isinstance(other, MotherDuckClientConfiguration): + return False + + self_token = self.credentials.password if 
self.credentials else None + other_token = other.credentials.password if other.credentials else None + + if not self_token or not other_token: + return False + + return self_token == other_token + class MotherDuckCatalogMissing(NativeValueError): pass diff --git a/dlt/destinations/impl/mssql/configuration.py b/dlt/destinations/impl/mssql/configuration.py index fcf4145084..97314deb41 100644 --- a/dlt/destinations/impl/mssql/configuration.py +++ b/dlt/destinations/impl/mssql/configuration.py @@ -3,7 +3,6 @@ from dlt.common.configuration import configspec from dlt.common.configuration.specs import ConnectionStringCredentials -from dlt.common.utils import digest128 from dlt.common.typing import TSecretStrValue from dlt.common.exceptions import SystemConfigurationException @@ -135,8 +134,9 @@ class MsSqlClientConfiguration(DestinationClientDwhWithStagingConfiguration): create_indexes: bool = False has_case_sensitive_identifiers: bool = False - def fingerprint(self) -> str: - """Returns a fingerprint of host part of a connection string""" + def physical_destination(self) -> str: + """Returns host:port.""" if self.credentials and self.credentials.host: - return digest128(self.credentials.host) + port = self.credentials.port or 1433 + return f"{self.credentials.host}:{port}" return "" diff --git a/dlt/destinations/impl/postgres/configuration.py b/dlt/destinations/impl/postgres/configuration.py index ab86fa2d0b..b7232bf4f4 100644 --- a/dlt/destinations/impl/postgres/configuration.py +++ b/dlt/destinations/impl/postgres/configuration.py @@ -4,10 +4,12 @@ from dlt.common.destination.configuration import CsvFormatConfiguration from dlt.common.configuration import configspec from dlt.common.configuration.specs import ConnectionStringCredentials -from dlt.common.utils import digest128 from dlt.common.typing import TSecretStrValue -from dlt.common.destination.client import DestinationClientDwhWithStagingConfiguration +from dlt.common.destination.client import ( + 
DestinationClientConfiguration, + DestinationClientDwhWithStagingConfiguration, +) @configspec(init=False) @@ -46,8 +48,28 @@ class PostgresClientConfiguration(DestinationClientDwhWithStagingConfiguration): csv_format: Optional[CsvFormatConfiguration] = None """Optional csv format configuration""" - def fingerprint(self) -> str: - """Returns a fingerprint of host part of a connection string""" + def physical_destination(self) -> str: + """Returns host:port as the physical destination identifier.""" if self.credentials and self.credentials.host: - return digest128(self.credentials.host) + port = self.credentials.port or 5432 + return f"{self.credentials.host}:{port}" return "" + + def can_join_with(self, other: DestinationClientConfiguration) -> bool: + """Returns True for the same Postgres host:port and database.""" + if not isinstance(other, PostgresClientConfiguration): + return False + if self.destination_type != other.destination_type: + return False + + self_phys = self.physical_destination() + other_phys = other.physical_destination() + if not self_phys or not other_phys or self_phys != other_phys: + return False + + self_db = self.credentials.database if self.credentials else None + other_db = other.credentials.database if other.credentials else None + if not self_db or not other_db or self_db != other_db: + return False + + return True diff --git a/dlt/destinations/impl/qdrant/configuration.py b/dlt/destinations/impl/qdrant/configuration.py index 8220d3ad0f..d388d5d87d 100644 --- a/dlt/destinations/impl/qdrant/configuration.py +++ b/dlt/destinations/impl/qdrant/configuration.py @@ -8,8 +8,10 @@ BaseConfiguration, CredentialsConfiguration, ) -from dlt.common.destination.client import DestinationClientDwhConfiguration -from dlt.common.utils import digest128 +from dlt.common.destination.client import ( + DestinationClientConfiguration, + DestinationClientDwhConfiguration, +) from dlt.common.storages.configuration import WithLocalFiles from 
dlt.destinations.impl.qdrant.exceptions import InvalidInMemoryQdrantCredentials @@ -146,11 +148,13 @@ def on_resolved(self) -> None: if self.qd_path and not os.path.isabs(self.qd_path): self.qd_path = self.make_location(self.qd_path, "%s.qdrant") - def fingerprint(self) -> str: - """Returns a fingerprint of a connection string""" - if self.qd_location: - return digest128(self.qd_location) - return "" + def physical_destination(self) -> str: + """Returns the Qdrant connection location.""" + return self.qd_location or "" + + def can_join_with(self, other: DestinationClientConfiguration) -> bool: + """Qdrant does not support dlt SQL joins.""" + return False def __str__(self) -> str: """Return displayable destination location""" diff --git a/dlt/destinations/impl/redshift/configuration.py b/dlt/destinations/impl/redshift/configuration.py index 0545be9ccb..bef2ee4b2c 100644 --- a/dlt/destinations/impl/redshift/configuration.py +++ b/dlt/destinations/impl/redshift/configuration.py @@ -3,7 +3,6 @@ from dlt.common.typing import TSecretStrValue from dlt.common.configuration import configspec -from dlt.common.utils import digest128 from dlt.destinations.impl.postgres.configuration import ( PostgresCredentials, @@ -28,8 +27,9 @@ class RedshiftClientConfiguration(PostgresClientConfiguration): staging_iam_role: Optional[str] = None has_case_sensitive_identifiers: bool = False - def fingerprint(self) -> str: - """Returns a fingerprint of host part of a connection string""" + def physical_destination(self) -> str: + """Returns host:port.""" if self.credentials and self.credentials.host: - return digest128(self.credentials.host) + port = self.credentials.port or 5439 + return f"{self.credentials.host}:{port}" return "" diff --git a/dlt/destinations/impl/snowflake/configuration.py b/dlt/destinations/impl/snowflake/configuration.py index 71933d53e9..6eeea1e50b 100644 --- a/dlt/destinations/impl/snowflake/configuration.py +++ b/dlt/destinations/impl/snowflake/configuration.py @@ 
-10,7 +10,6 @@ from dlt.common.configuration.exceptions import ConfigurationValueError from dlt.common.configuration import configspec from dlt.common.destination.client import DestinationClientDwhWithStagingConfiguration -from dlt.common.utils import digest128 from dlt.destinations.impl.snowflake.utils import ( read_snowflake_session_token, snowflake_session_token_available, @@ -179,8 +178,8 @@ class SnowflakeClientConfiguration(DestinationClientDwhWithStagingConfiguration) use_decfloat: bool = False """Whether to use DECFLOAT type for unbound decimals instead of DECIMAL""" - def fingerprint(self) -> str: - """Returns a fingerprint of host part of a connection string""" + def physical_destination(self) -> str: + """Returns the account host.""" if self.credentials and self.credentials.host: - return digest128(self.credentials.host) + return self.credentials.host return "" diff --git a/dlt/destinations/impl/sqlalchemy/configuration.py b/dlt/destinations/impl/sqlalchemy/configuration.py index 88ad45c39c..9fbd071c9c 100644 --- a/dlt/destinations/impl/sqlalchemy/configuration.py +++ b/dlt/destinations/impl/sqlalchemy/configuration.py @@ -8,7 +8,10 @@ from dlt.common.configuration import configspec from dlt.common.configuration.specs import ConnectionStringCredentials from dlt.common.configuration.specs.base_configuration import NotResolved -from dlt.common.destination.client import DestinationClientDwhConfiguration +from dlt.common.destination.client import ( + DestinationClientConfiguration, + DestinationClientDwhConfiguration, +) from dlt.common.storages.configuration import WithLocalFiles from dlt.common.typing import Annotated from dlt.common.warnings import DltDeprecationWarning @@ -262,3 +265,66 @@ def on_resolved(self) -> None: self.credentials.database = os.path.normpath( self.make_location(db or None, SQLITE_DB_NAME_PAT) ) + + def physical_destination(self) -> str: + """Returns sqlite path for sqlite, otherwise host:port.""" + if not self.credentials: + return 
"" + + drivername = self.credentials.drivername or "" + database = self.credentials.database + host = self.credentials.host + port = self.credentials.port + + if drivername == "sqlite": + if SqlalchemyCredentials.is_memory_database(database, self.credentials.query): + return ":memory:" + return database or "" + + if host: + # Default-vs-explicit port mismatches may reject otherwise valid joins. + if port: + return f"{host}:{port}" + return host + return "" + + def can_join_with(self, other: DestinationClientConfiguration) -> bool: + """Returns True when dialect-specific destination identities match.""" + if not isinstance(other, SqlalchemyClientConfiguration): + return False + + if not self.credentials or not other.credentials: + return False + + self_dialect = (self.credentials.drivername or "").lower() + other_dialect = (other.credentials.drivername or "").lower() + + if self_dialect != other_dialect: + return False + + if self_dialect == "sqlite": + self_phys = self.physical_destination() + other_phys = other.physical_destination() + return bool(self_phys and other_phys and self_phys == other_phys) + + if self_dialect == "postgresql": + self_phys = self.physical_destination() + other_phys = other.physical_destination() + if not self_phys or not other_phys or self_phys != other_phys: + return False + self_db = self.credentials.database + other_db = other.credentials.database + return self_db is not None and other_db is not None and self_db == other_db + + if self_dialect in ("mysql", "mssql", "oracle", "db2"): + self_phys = self.physical_destination() + other_phys = other.physical_destination() + return bool(self_phys and other_phys and self_phys == other_phys) + + self_phys = self.physical_destination() + other_phys = other.physical_destination() + if not self_phys or not other_phys or self_phys != other_phys: + return False + self_db = self.credentials.database + other_db = other.credentials.database + return self_db is not None and other_db is not None and 
self_db == other_db diff --git a/dlt/destinations/impl/weaviate/configuration.py b/dlt/destinations/impl/weaviate/configuration.py index ae4c20f55e..a75de83cbf 100644 --- a/dlt/destinations/impl/weaviate/configuration.py +++ b/dlt/destinations/impl/weaviate/configuration.py @@ -5,8 +5,10 @@ from dlt.common.configuration import configspec, NotResolved from dlt.common.configuration.specs.base_configuration import CredentialsConfiguration -from dlt.common.destination.client import DestinationClientDwhConfiguration -from dlt.common.utils import digest128 +from dlt.common.destination.client import ( + DestinationClientConfiguration, + DestinationClientDwhConfiguration, +) TWeaviateBatchConsistency = Literal["ONE", "QUORUM", "ALL"] TWeaviateConnectionType = Literal["cloud", "local", "custom"] @@ -62,10 +64,12 @@ class WeaviateClientConfiguration(DestinationClientDwhConfiguration): } ) - def fingerprint(self) -> str: - """Returns a fingerprint of host part of a connection string""" - + def physical_destination(self) -> str: + """Returns the host part of the connection URL.""" if self.credentials and self.credentials.url: - hostname = urlparse(self.credentials.url).hostname - return digest128(hostname) + return urlparse(self.credentials.url).hostname or "" return "" + + def can_join_with(self, other: DestinationClientConfiguration) -> bool: + """Weaviate does not support dlt SQL joins.""" + return False diff --git a/tests/destinations/test_join_compatibility.py b/tests/destinations/test_join_compatibility.py new file mode 100644 index 0000000000..72b49a0201 --- /dev/null +++ b/tests/destinations/test_join_compatibility.py @@ -0,0 +1,1038 @@ +"""Tests for destination configuration-level join-compatibility semantics.""" + +from typing import Callable, cast + +from typing_extensions import TypeAlias + +import pytest + +from dlt.common.utils import digest128 +from dlt.common.configuration.specs import ( + AwsCredentials, + ConnectionStringCredentials, + 
GcpServiceAccountCredentials, +) +from dlt.common.destination.client import DestinationClientConfiguration +from dlt.dataset.dataset import Dataset, is_same_physical_destination +from dlt.destinations.impl.postgres.configuration import ( + PostgresClientConfiguration, + PostgresCredentials, +) +from dlt.destinations.impl.redshift.configuration import ( + RedshiftClientConfiguration, + RedshiftCredentials, +) +from dlt.destinations.impl.snowflake.configuration import ( + SnowflakeClientConfiguration, + SnowflakeCredentials, +) +from dlt.destinations.impl.bigquery.configuration import BigQueryClientConfiguration +from dlt.destinations.impl.mssql.configuration import ( + MsSqlClientConfiguration, + MsSqlCredentials, +) +from dlt.destinations.impl.synapse.configuration import ( + SynapseClientConfiguration, + SynapseCredentials, +) +from dlt.destinations.impl.clickhouse.configuration import ( + ClickHouseClientConfiguration, + ClickHouseCredentials, +) +from dlt.destinations.impl.databricks.configuration import ( + DatabricksClientConfiguration, + DatabricksCredentials, +) +from dlt.destinations.impl.athena.configuration import AthenaClientConfiguration +from dlt.destinations.impl.dremio.configuration import ( + DremioClientConfiguration, + DremioCredentials, +) +from dlt.destinations.impl.duckdb.configuration import ( + DuckDbClientConfiguration, + DuckDbCredentials, +) +from dlt.destinations.impl.filesystem.configuration import ( + FilesystemDestinationClientConfiguration, +) +from dlt.destinations.impl.ducklake.configuration import ( + DuckLakeClientConfiguration, + DuckLakeCredentials, + DEFAULT_DUCKLAKE_NAME, +) +from dlt.destinations.impl.fabric.configuration import ( + FabricClientConfiguration, + FabricCredentials, +) +from dlt.destinations.impl.motherduck.configuration import ( + MotherDuckClientConfiguration, + MotherDuckCredentials, +) +from dlt.destinations.impl.sqlalchemy.configuration import ( + SqlalchemyClientConfiguration, + SqlalchemyCredentials, +) 
+from dlt.destinations.impl.lancedb.configuration import ( + LanceDBClientConfiguration, + LanceDBCredentials, +) +from dlt.destinations.impl.lance.configuration import ( + DirectoryCatalogCredentials, + LanceClientConfiguration, + LanceStorageConfiguration, +) +from dlt.destinations.impl.qdrant.configuration import QdrantClientConfiguration +from dlt.destinations.impl.weaviate.configuration import ( + WeaviateClientConfiguration, + WeaviateCredentials, +) + + +ConfigFactory: TypeAlias = Callable[[], DestinationClientConfiguration] + + +class _PhysicalDestinationConfig(DestinationClientConfiguration): + def __init__(self, physical_destination: str = "") -> None: + super().__init__() + self._physical_destination = physical_destination + + def physical_destination(self) -> str: + return self._physical_destination + + +class _StringyPhysicalDestinationConfig(_PhysicalDestinationConfig): + def __init__(self, physical_destination: str, display_value: str) -> None: + super().__init__(physical_destination) + self._display_value = display_value + + def __str__(self) -> str: + return self._display_value + + +class _DestinationClientStub: + def __init__(self, config: DestinationClientConfiguration) -> None: + self.config = config + + +class _DatasetStub: + def __init__(self, config: DestinationClientConfiguration) -> None: + self.destination_client = _DestinationClientStub(config) + + +def assert_joinable( + config1: DestinationClientConfiguration, config2: DestinationClientConfiguration +) -> None: + assert config1.can_join_with(config2) + assert config2.can_join_with(config1) + + +def assert_not_joinable( + config1: DestinationClientConfiguration, config2: DestinationClientConfiguration +) -> None: + assert not config1.can_join_with(config2) + assert not config2.can_join_with(config1) + + +def assert_join_result( + config1: DestinationClientConfiguration, + config2: DestinationClientConfiguration, + expected: bool, +) -> None: + if expected: + assert_joinable(config1, 
config2) + else: + assert_not_joinable(config1, config2) + + +def _athena_config(region: str, catalog: str = "awsdatacatalog") -> AthenaClientConfiguration: + """Build Athena config.""" + return AthenaClientConfiguration( + credentials=AwsCredentials(region_name=region), + aws_data_catalog=catalog, + ) + + +def _ducklake_creds(catalog_str: str, name: str = DEFAULT_DUCKLAKE_NAME) -> DuckLakeCredentials: + """Build DuckLake credentials.""" + return DuckLakeCredentials( + ducklake_name=name, + catalog=ConnectionStringCredentials(catalog_str), + ) + + +def _fabric_creds(host: str, database: str) -> FabricCredentials: + """Build Fabric credentials.""" + # Fabric is normally configured via structured fields, not a connection string. + credentials = FabricCredentials() + credentials.host = host + credentials.database = database + return credentials + + +def _sqla_creds(connection_string: str) -> SqlalchemyCredentials: + """Parse SQLAlchemy credentials.""" + creds = SqlalchemyCredentials() + creds.parse_native_representation(connection_string) + return creds + + +def _sqla_config(conn_str: str) -> SqlalchemyClientConfiguration: + """Build SQLAlchemy config.""" + c = SqlalchemyClientConfiguration() + c.credentials = _sqla_creds(conn_str) + return c + + +def _lancedb_config( + lance_uri: str, + dataset_name: str = "dataset", + dataset_separator: str = "___", +) -> LanceDBClientConfiguration: + """Build resolved LanceDB config.""" + c = LanceDBClientConfiguration( + lance_uri=lance_uri, + credentials=LanceDBCredentials(uri=lance_uri), + dataset_separator=dataset_separator, + ) + c._bind_dataset_name(dataset_name) + return c + + +def _lance_config(catalog_root: str, dataset_name: str = "dataset") -> LanceClientConfiguration: + """Build resolved Lance config.""" + c = LanceClientConfiguration( + credentials=DirectoryCatalogCredentials(bucket_url=catalog_root), + storage=LanceStorageConfiguration(bucket_url=catalog_root), + ) + c._bind_dataset_name(dataset_name) + 
c.credentials.bucket_url = catalog_root + return c + + +# Base DestinationClientConfiguration contract +def test_base_fingerprint_derived_from_physical_destination() -> None: + config = _PhysicalDestinationConfig("test-host:5432") + assert config.fingerprint() == digest128("test-host:5432") + + +def test_base_fingerprint_empty_when_physical_destination_empty() -> None: + config = DestinationClientConfiguration() + assert config.physical_destination() == "" + assert config.fingerprint() == "" + + +def test_base_can_join_with_default_false_when_physical_destinations_differ() -> None: + config1 = _PhysicalDestinationConfig("host1") + config2 = _PhysicalDestinationConfig("host2") + assert_not_joinable(config1, config2) + + +def test_base_can_join_with_default_true_when_same_physical_destination() -> None: + config1 = _PhysicalDestinationConfig("host1") + config2 = _PhysicalDestinationConfig("host1") + assert_joinable(config1, config2) + + +def test_base_can_join_with_default_false_when_empty_physical_destination() -> None: + config1 = DestinationClientConfiguration() + config2 = _PhysicalDestinationConfig("host1") + assert_not_joinable(config1, config2) + + +def test_base_can_join_with_returns_false_for_non_config() -> None: + config = _PhysicalDestinationConfig("host1") + assert not config.can_join_with("not a config") # type: ignore[arg-type] + assert not config.can_join_with(None) + assert not config.can_join_with(42) # type: ignore[arg-type] + + +def test_is_same_physical_destination_delegates_to_can_join_with() -> None: + config1 = _StringyPhysicalDestinationConfig("host1", "first-display") + config2 = _StringyPhysicalDestinationConfig("host1", "second-display") + assert str(config1) != str(config2) + assert is_same_physical_destination( + cast(Dataset, _DatasetStub(config1)), cast(Dataset, _DatasetStub(config2)) + ) + + +# physical_destination() extraction across destinations + +PHYSICAL_DEST_CASES = [ + # Postgres: host:port format + pytest.param( + lambda: 
PostgresClientConfiguration( + credentials=PostgresCredentials("postgresql://u:p@h:5432/db") + ), + "h:5432", + id="pg_explicit_port", + ), + pytest.param( + lambda: PostgresClientConfiguration(credentials=PostgresCredentials("postgresql://h")), + "h:5432", + id="pg_default_port", + ), + pytest.param( + lambda: PostgresClientConfiguration(credentials=PostgresCredentials()), "", id="pg_no_host" + ), + # Redshift + pytest.param( + lambda: RedshiftClientConfiguration( + credentials=RedshiftCredentials("redshift://u:p@h:5439/db") + ), + "h:5439", + id="rs_explicit_port", + ), + pytest.param( + lambda: RedshiftClientConfiguration(credentials=RedshiftCredentials("redshift://h")), + "h:5439", + id="rs_default_port", + ), + # Snowflake + pytest.param( + lambda: SnowflakeClientConfiguration( + credentials=SnowflakeCredentials("snowflake://u:p@sf.snowflakecomputing.com/db") + ), + "sf.snowflakecomputing.com", + id="sf_host", + ), + pytest.param( + lambda: SnowflakeClientConfiguration(credentials=SnowflakeCredentials()), + "", + id="sf_no_host", + ), + # BigQuery: project_id from config or credentials + pytest.param( + lambda: BigQueryClientConfiguration( + credentials=GcpServiceAccountCredentials(project_id="cred-proj"), + project_id="cfg-proj", + ), + "cfg-proj", + id="bq_config_project", + ), + pytest.param( + lambda: BigQueryClientConfiguration( + credentials=GcpServiceAccountCredentials(project_id="cred-proj") + ), + "cred-proj", + id="bq_cred_project", + ), + pytest.param(lambda: BigQueryClientConfiguration(), "", id="bq_no_project"), + # MSSQL / Synapse + pytest.param( + lambda: MsSqlClientConfiguration(credentials=MsSqlCredentials("mssql://h")), + "h:1433", + id="mssql_host", + ), + pytest.param( + lambda: MsSqlClientConfiguration(credentials=MsSqlCredentials()), "", id="mssql_no_host" + ), + pytest.param( + lambda: SynapseClientConfiguration(credentials=SynapseCredentials("mssql://h")), + "h:1433", + id="synapse_host", + ), + # ClickHouse + pytest.param( + lambda: 
ClickHouseClientConfiguration(credentials=ClickHouseCredentials("clickhouse://h")), + "h:9440", + id="ch_host", + ), + # Databricks + pytest.param( + lambda: DatabricksClientConfiguration( + credentials=DatabricksCredentials(server_hostname="w.cloud.databricks.com") + ), + "w.cloud.databricks.com", + id="dbr_server", + ), + # Athena + pytest.param( + lambda: _athena_config("us-west-2", "cat"), "us-west-2/cat", id="athena_region_catalog" + ), + pytest.param( + lambda: AthenaClientConfiguration( + credentials=AwsCredentials(), + aws_data_catalog="cat", + ), + "", + id="athena_no_region", + ), + pytest.param( + lambda: _athena_config("eu-central-1"), + "eu-central-1/awsdatacatalog", + id="athena_default_catalog", + ), + # Dremio + pytest.param( + lambda: DremioClientConfiguration(credentials=DremioCredentials("grpc://h")), + "h:32010", + id="dremio_host", + ), + # DuckDB + pytest.param( + lambda: DuckDbClientConfiguration(credentials=DuckDbCredentials("/p/db.duckdb")), + "/p/db.duckdb", + id="duckdb_path", + ), + pytest.param( + lambda: FilesystemDestinationClientConfiguration(bucket_url="s3://b/p"), + "s3://b", + id="fs_remote", + ), + pytest.param( + lambda: FilesystemDestinationClientConfiguration(bucket_url="/local/p"), "", id="fs_local" + ), + # DuckLake + pytest.param( + lambda: DuckLakeClientConfiguration( + credentials=_ducklake_creds("pg://u@h:5432/db", "lake") + ), + "pg://h:5432/db#lake", + id="dl_remote_cat", + ), + pytest.param( + lambda: DuckLakeClientConfiguration( + credentials=_ducklake_creds("sqlite:///cat.sqlite", "lake") + ), + "sqlite://cat.sqlite#lake", + id="dl_local_cat", + ), + pytest.param( + lambda: DuckLakeClientConfiguration(credentials=_ducklake_creds("sqlite:///cat.sqlite")), + f"sqlite://cat.sqlite#{DEFAULT_DUCKLAKE_NAME}", + id="dl_default_name", + ), + # Fabric + pytest.param( + lambda: FabricClientConfiguration( + credentials=_fabric_creds("h.fabric.microsoft.com", "db") + ), + "h.fabric.microsoft.com:1433", + id="fabric_port", + ), 
+ pytest.param( + lambda: FabricClientConfiguration( + credentials=_fabric_creds("h.fabric.microsoft.com", "db") + ), + "h.fabric.microsoft.com:1433", + id="fabric_default_port", + ), + pytest.param( + lambda: MotherDuckClientConfiguration( + credentials=MotherDuckCredentials("md:db?motherduck_token=token") + ), + "", + id="md_empty", + ), +] + + +@pytest.mark.parametrize("factory,expected", PHYSICAL_DEST_CASES) +def test_physical_destination(factory: ConfigFactory, expected: str) -> None: + assert factory().physical_destination() == expected + + +@pytest.mark.parametrize( + "factory,expected_fp", + [ + pytest.param( + lambda: PostgresClientConfiguration(credentials=PostgresCredentials("postgresql://h")), + digest128("h:5432"), + id="pg", + ), + pytest.param( + lambda: SnowflakeClientConfiguration( + credentials=SnowflakeCredentials("snowflake://u:p@h/db") + ), + digest128("h"), + id="sf", + ), + pytest.param( + lambda: BigQueryClientConfiguration( + credentials=GcpServiceAccountCredentials(project_id="p") + ), + digest128("p"), + id="bq", + ), + pytest.param( + lambda: FilesystemDestinationClientConfiguration(bucket_url="s3://b/p"), + digest128("s3://b"), + id="fs", + ), + pytest.param( + lambda: MotherDuckClientConfiguration( + credentials=MotherDuckCredentials("md:db?motherduck_token=token") + ), + digest128("token"), + id="md_token_hash", + ), + ], +) +def test_fingerprint(factory: ConfigFactory, expected_fp: str) -> None: + assert factory().fingerprint() == expected_fp + + +# can_join_with() matrices (symmetric) + +MSSQL_JOIN_CASES = [ + pytest.param( + lambda: MsSqlClientConfiguration(credentials=MsSqlCredentials("mssql://u:p@h:1433/db1")), + lambda: MsSqlClientConfiguration(credentials=MsSqlCredentials("mssql://u:p@h:1433/db2")), + True, + id="mssql_same_host_diff_db", + ), + pytest.param( + lambda: MsSqlClientConfiguration(credentials=MsSqlCredentials("mssql://h1")), + lambda: MsSqlClientConfiguration(credentials=MsSqlCredentials("mssql://h2")), + False, + 
id="mssql_diff_host", + ), + pytest.param( + lambda: MsSqlClientConfiguration(credentials=MsSqlCredentials("mssql://h:1433/db")), + lambda: MsSqlClientConfiguration(credentials=MsSqlCredentials("mssql://h:1434/db")), + False, + id="mssql_same_host_diff_port", + ), +] + +SYNAPSE_JOIN_CASES = [ + pytest.param( + lambda: SynapseClientConfiguration( + credentials=SynapseCredentials("mssql://u:p@h:1433/db1") + ), + lambda: SynapseClientConfiguration( + credentials=SynapseCredentials("mssql://u:p@h:1433/db2") + ), + True, + id="synapse_same_host_diff_db", + ), + pytest.param( + lambda: SynapseClientConfiguration(credentials=SynapseCredentials("mssql://h:1433/db")), + lambda: SynapseClientConfiguration(credentials=SynapseCredentials("mssql://h:1434/db")), + False, + id="synapse_same_host_diff_port", + ), +] + +CLICKHOUSE_JOIN_CASES = [ + pytest.param( + lambda: ClickHouseClientConfiguration( + credentials=ClickHouseCredentials("clickhouse://u:p@h/db1") + ), + lambda: ClickHouseClientConfiguration( + credentials=ClickHouseCredentials("clickhouse://u:p@h/db2") + ), + True, + id="ch_same_host_diff_db", + ), + pytest.param( + lambda: ClickHouseClientConfiguration(credentials=ClickHouseCredentials("clickhouse://h1")), + lambda: ClickHouseClientConfiguration(credentials=ClickHouseCredentials("clickhouse://h2")), + False, + id="ch_diff_host", + ), + pytest.param( + lambda: ClickHouseClientConfiguration( + credentials=ClickHouseCredentials("clickhouse://h:9440/db") + ), + lambda: ClickHouseClientConfiguration( + credentials=ClickHouseCredentials("clickhouse://h:9000/db") + ), + False, + id="ch_same_host_diff_port", + ), +] + +DREMIO_JOIN_CASES = [ + pytest.param( + lambda: DremioClientConfiguration(credentials=DremioCredentials("grpc://h")), + lambda: DremioClientConfiguration(credentials=DremioCredentials("grpc://h")), + True, + id="dremio_same_host", + ), + pytest.param( + lambda: DremioClientConfiguration(credentials=DremioCredentials("grpc://h:32010")), + lambda: 
DremioClientConfiguration(credentials=DremioCredentials("grpc://h:32011")), + False, + id="dremio_same_host_diff_port", + ), +] + +FABRIC_JOIN_CASES = [ + pytest.param( + lambda: FabricClientConfiguration(credentials=_fabric_creds("h", "db1")), + lambda: FabricClientConfiguration(credentials=_fabric_creds("h", "db2")), + True, + id="fabric_same_host_diff_db", + ), +] + +POSTGRES_JOIN_CASES = [ + pytest.param( + lambda: PostgresClientConfiguration( + credentials=PostgresCredentials("postgresql://u:p@h:5432/db") + ), + lambda: PostgresClientConfiguration( + credentials=PostgresCredentials("postgresql://u:p@h:5432/db") + ), + True, + id="pg_same_host_db", + ), + pytest.param( + lambda: PostgresClientConfiguration( + credentials=PostgresCredentials("postgresql://u:p@h:5432/db1") + ), + lambda: PostgresClientConfiguration( + credentials=PostgresCredentials("postgresql://u:p@h:5432/db2") + ), + False, + id="pg_same_host_diff_db", + ), + pytest.param( + lambda: PostgresClientConfiguration( + credentials=PostgresCredentials("postgresql://u:p@h1:5432/db") + ), + lambda: PostgresClientConfiguration( + credentials=PostgresCredentials("postgresql://u:p@h2:5432/db") + ), + False, + id="pg_diff_host", + ), +] + +REDSHIFT_JOIN_CASES = [ + pytest.param( + lambda: RedshiftClientConfiguration( + credentials=RedshiftCredentials("redshift://u:p@h:5439/db") + ), + lambda: RedshiftClientConfiguration( + credentials=RedshiftCredentials("redshift://u:p@h:5439/db") + ), + True, + id="rs_same_host_db", + ), + pytest.param( + lambda: RedshiftClientConfiguration( + credentials=RedshiftCredentials("redshift://u:p@h:5439/db1") + ), + lambda: RedshiftClientConfiguration( + credentials=RedshiftCredentials("redshift://u:p@h:5439/db2") + ), + False, + id="rs_same_host_diff_db", + ), +] + +DUCKDB_JOIN_CASES = [ + pytest.param( + lambda: DuckDbClientConfiguration(credentials=DuckDbCredentials("/p/db.duckdb")), + lambda: DuckDbClientConfiguration(credentials=DuckDbCredentials("/p/db.duckdb")), + True, 
+ id="duckdb_same_path", + ), + pytest.param( + lambda: DuckDbClientConfiguration(credentials=DuckDbCredentials("/p/db1.duckdb")), + lambda: DuckDbClientConfiguration(credentials=DuckDbCredentials("/p/db2.duckdb")), + False, + id="duckdb_diff_path", + ), +] + +DUCKLAKE_JOIN_CASES = [ + pytest.param( + lambda: DuckLakeClientConfiguration( + credentials=_ducklake_creds("sqlite:///cat.sqlite", "lake1") + ), + lambda: DuckLakeClientConfiguration( + credentials=_ducklake_creds("sqlite:///cat.sqlite", "lake1") + ), + True, + id="dl_same_cat_name", + ), + pytest.param( + lambda: DuckLakeClientConfiguration( + credentials=_ducklake_creds("sqlite:///cat.sqlite", "lake1") + ), + lambda: DuckLakeClientConfiguration( + credentials=_ducklake_creds("sqlite:///cat.sqlite", "lake2") + ), + False, + id="dl_same_cat_diff_name", + ), +] + +SNOWFLAKE_JOIN_CASES = [ + pytest.param( + lambda: SnowflakeClientConfiguration( + credentials=SnowflakeCredentials("snowflake://u:p@a.snowflake.com/db1") + ), + lambda: SnowflakeClientConfiguration( + credentials=SnowflakeCredentials("snowflake://u:p@a.snowflake.com/db2") + ), + True, + id="sf_same_account", + ), + pytest.param( + lambda: SnowflakeClientConfiguration( + credentials=SnowflakeCredentials("snowflake://u:p@a1.snowflake.com/db") + ), + lambda: SnowflakeClientConfiguration( + credentials=SnowflakeCredentials("snowflake://u:p@a2.snowflake.com/db") + ), + False, + id="sf_diff_account", + ), +] + +BIGQUERY_JOIN_CASES = [ + pytest.param( + lambda: BigQueryClientConfiguration( + credentials=GcpServiceAccountCredentials(project_id="proj") + ), + lambda: BigQueryClientConfiguration( + credentials=GcpServiceAccountCredentials(project_id="proj") + ), + True, + id="bq_same_project", + ), + pytest.param( + lambda: BigQueryClientConfiguration( + credentials=GcpServiceAccountCredentials(project_id="p1") + ), + lambda: BigQueryClientConfiguration( + credentials=GcpServiceAccountCredentials(project_id="p2") + ), + False, + id="bq_diff_project", + ), 
+] + +DATABRICKS_JOIN_CASES = [ + pytest.param( + lambda: DatabricksClientConfiguration( + credentials=DatabricksCredentials(server_hostname="w.databricks.com") + ), + lambda: DatabricksClientConfiguration( + credentials=DatabricksCredentials(server_hostname="w.databricks.com") + ), + True, + id="dbr_same_server", + ), +] + +# Athena Glue catalogs are regional, so physical identity includes region and catalog. +ATHENA_JOIN_CASES = [ + pytest.param( + lambda: _athena_config("us-west-2", "cat"), + lambda: _athena_config("us-west-2", "cat"), + True, + id="athena_same_region_catalog", + ), + pytest.param( + lambda: _athena_config("us-west-2", "cat"), + lambda: _athena_config("eu-central-1", "cat"), + False, + id="athena_diff_region", + ), + pytest.param( + lambda: _athena_config("us-west-2", "c1"), + lambda: _athena_config("us-west-2", "c2"), + False, + id="athena_diff_catalog", + ), + pytest.param( + lambda: AthenaClientConfiguration( + credentials=AwsCredentials(), + aws_data_catalog="cat", + ), + lambda: AthenaClientConfiguration( + credentials=AwsCredentials(), + aws_data_catalog="cat", + ), + False, + id="athena_no_region", + ), +] + +CAN_JOIN_WITH_CASES = ( + POSTGRES_JOIN_CASES + + REDSHIFT_JOIN_CASES + + MSSQL_JOIN_CASES + + SYNAPSE_JOIN_CASES + + CLICKHOUSE_JOIN_CASES + + DREMIO_JOIN_CASES + + FABRIC_JOIN_CASES + + SNOWFLAKE_JOIN_CASES + + BIGQUERY_JOIN_CASES + + DATABRICKS_JOIN_CASES + + ATHENA_JOIN_CASES + + DUCKDB_JOIN_CASES + + DUCKLAKE_JOIN_CASES +) + + +@pytest.mark.parametrize("f1,f2,expected", CAN_JOIN_WITH_CASES) +def test_can_join_with_matrix(f1: ConfigFactory, f2: ConfigFactory, expected: bool) -> None: + c1, c2 = f1(), f2() + assert_join_result(c1, c2, expected) + + +# Cross-type rejection + + +@pytest.mark.parametrize( + "f1,f2", + [ + pytest.param( + lambda: PostgresClientConfiguration(credentials=PostgresCredentials("postgresql://h")), + lambda: _PhysicalDestinationConfig("h:5432"), + id="pg_vs_base", + ), + pytest.param( + lambda: 
PostgresClientConfiguration(credentials=PostgresCredentials("postgresql://h")), + lambda: MsSqlClientConfiguration(credentials=MsSqlCredentials("mssql://h")), + id="pg_vs_mssql", + ), + pytest.param( + lambda: SnowflakeClientConfiguration( + credentials=SnowflakeCredentials("snowflake://u:p@h/db") + ), + lambda: ClickHouseClientConfiguration( + credentials=ClickHouseCredentials("clickhouse://h") + ), + id="default_same_identity_different_type", + ), + pytest.param( + lambda: PostgresClientConfiguration( + credentials=PostgresCredentials("postgresql://u:p@h:5439/db") + ), + lambda: RedshiftClientConfiguration( + credentials=RedshiftCredentials("redshift://u:p@h:5439/db") + ), + id="postgres_vs_redshift_same_identity", + ), + pytest.param( + lambda: MsSqlClientConfiguration(credentials=MsSqlCredentials("mssql://h")), + lambda: SynapseClientConfiguration(credentials=SynapseCredentials("mssql://h")), + id="mssql_vs_synapse_same_identity", + ), + ], +) +def test_cross_type_rejection(f1: ConfigFactory, f2: ConfigFactory) -> None: + c1, c2 = f1(), f2() + if isinstance(c2, _PhysicalDestinationConfig): + c2._physical_destination = c1.physical_destination() + assert_not_joinable(c1, c2) + + +def test_cross_type_different_physical_destinations() -> None: + sf = SnowflakeClientConfiguration( + credentials=SnowflakeCredentials("snowflake://u:p@a1.snowflake.com/db") + ) + bq = BigQueryClientConfiguration(credentials=GcpServiceAccountCredentials(project_id="p2")) + assert sf.physical_destination() != bq.physical_destination() + assert_not_joinable(sf, bq) + + +# Filesystem special cases + + +def test_filesystem_joinability_is_engine_based_not_location_based() -> None: + c1 = FilesystemDestinationClientConfiguration(bucket_url="s3://b1/p") + c2 = FilesystemDestinationClientConfiguration(bucket_url="s3://b2/p") + c3 = FilesystemDestinationClientConfiguration(bucket_url="/local/p") + c4 = FilesystemDestinationClientConfiguration(bucket_url="gs://b/p") + assert_joinable(c1, c2) + 
assert_joinable(c1, c3) + assert_joinable(c1, c4) + + +def test_filesystem_cannot_join_with_non_filesystem() -> None: + c = FilesystemDestinationClientConfiguration(bucket_url="s3://b/p") + other = _PhysicalDestinationConfig("s3://b") + assert_not_joinable(c, other) + + +def test_filesystem_fingerprint_empty_for_local() -> None: + c = FilesystemDestinationClientConfiguration(bucket_url="/local/p") + assert c.physical_destination() == "" + assert c.fingerprint() == "" + + +# MotherDuck token-based joinability + + +def test_motherduck_token_not_exposed_as_physical_destination() -> None: + md = MotherDuckClientConfiguration( + credentials=MotherDuckCredentials("md:db?motherduck_token=token") + ) + assert md.physical_destination() == "" + + +def test_motherduck_fingerprint_hashes_token() -> None: + md = MotherDuckClientConfiguration( + credentials=MotherDuckCredentials("md:db?motherduck_token=token") + ) + assert md.fingerprint() == digest128("token") + + +def test_motherduck_can_join_with_same_token_without_exposing_location() -> None: + """Same token can join without exposing token via physical destination.""" + c1 = MotherDuckClientConfiguration( + credentials=MotherDuckCredentials("md:db?motherduck_token=token") + ) + c2 = MotherDuckClientConfiguration( + credentials=MotherDuckCredentials("md:db?motherduck_token=token") + ) + assert_joinable(c1, c2) + + +def test_motherduck_different_tokens_are_not_proven_joinable() -> None: + """Different tokens are treated as not joinable.""" + c1 = MotherDuckClientConfiguration( + credentials=MotherDuckCredentials("md:db?motherduck_token=token1") + ) + c2 = MotherDuckClientConfiguration( + credentials=MotherDuckCredentials("md:db?motherduck_token=token2") + ) + # Tokens may belong to the same account, but we do not have a safe account id to prove it. 
+ assert_not_joinable(c1, c2) + + +def test_motherduck_can_join_with_missing_token() -> None: + """Missing token cannot join.""" + with_token = MotherDuckClientConfiguration( + credentials=MotherDuckCredentials("md:db?motherduck_token=token") + ) + without_token = MotherDuckClientConfiguration(credentials=MotherDuckCredentials("md:db")) + assert_not_joinable(with_token, without_token) + w1 = MotherDuckClientConfiguration(credentials=MotherDuckCredentials("md:db1")) + w2 = MotherDuckClientConfiguration(credentials=MotherDuckCredentials("md:db2")) + assert_not_joinable(w1, w2) + + +def test_motherduck_can_join_with_non_motherduck() -> None: + """MotherDuck cannot join with other destination types.""" + md = MotherDuckClientConfiguration( + credentials=MotherDuckCredentials("md:db?motherduck_token=token") + ) + pg = PostgresClientConfiguration(credentials=PostgresCredentials("postgresql://h")) + assert_not_joinable(md, pg) + + +# SQLAlchemy dialect-specific cases + + +SQLA_CASES = [ + pytest.param("postgresql://u@h:5432/db", "postgresql://u@h:5432/db", True, id="pg_same"), + pytest.param("postgresql://u@h:5432/db1", "postgresql://u@h:5432/db2", False, id="pg_diff_db"), + pytest.param( + "postgresql://u@h1:5432/db", "postgresql://u@h2:5432/db", False, id="pg_diff_host" + ), + pytest.param("mysql://u@h:3306/db1", "mysql://u@h:3306/db2", True, id="mysql_same_host"), + pytest.param("mysql://u@h1:3306/db", "mysql://u@h2:3306/db", False, id="mysql_diff_host"), + pytest.param("sqlite:////p/db.sqlite", "sqlite:////p/db.sqlite", True, id="sqlite_same"), + pytest.param("sqlite:////p/db1.sqlite", "sqlite:////p/db2.sqlite", False, id="sqlite_diff"), + pytest.param("postgresql://u@h:5432/db", "mysql://u@h:3306/db", False, id="diff_dialects"), + pytest.param("unknown://u@h:1234/db", "unknown://u@h:1234/db", True, id="unknown_same"), + pytest.param("unknown://u@h:1234/db1", "unknown://u@h:1234/db2", False, id="unknown_diff_db"), +] + + 
+@pytest.mark.parametrize("conn1,conn2,expected", SQLA_CASES) +def test_sqlalchemy_can_join_with(conn1: str, conn2: str, expected: bool) -> None: + c1 = _sqla_config(conn1) + c2 = _sqla_config(conn2) + assert_join_result(c1, c2, expected) + + +@pytest.mark.parametrize( + "f1,f2,expected", + [ + pytest.param( + lambda: _lancedb_config("/tmp/db.lancedb"), + lambda: _lancedb_config("/tmp/db.lancedb"), + True, + id="same_uri_dataset_separator", + ), + pytest.param( + lambda: _lancedb_config("/tmp/db1.lancedb"), + lambda: _lancedb_config("/tmp/db2.lancedb"), + False, + id="different_uri", + ), + pytest.param( + lambda: _lancedb_config("/tmp/db.lancedb", dataset_name="dataset1"), + lambda: _lancedb_config("/tmp/db.lancedb", dataset_name="dataset2"), + True, + id="different_dataset_same_uri", + ), + pytest.param( + lambda: _lancedb_config("/tmp/db.lancedb", dataset_separator="___"), + lambda: _lancedb_config("/tmp/db.lancedb", dataset_separator="__"), + False, + id="different_separator", + ), + pytest.param( + lambda: _lancedb_config(":external:"), + lambda: _lancedb_config(":external:"), + False, + id="external_native_client", + ), + ], +) +def test_lancedb_can_join_with(f1: ConfigFactory, f2: ConfigFactory, expected: bool) -> None: + assert_join_result(f1(), f2(), expected) + + +@pytest.mark.parametrize( + "f1,f2,expected", + [ + pytest.param( + lambda: _lance_config("file:///tmp/lance"), + lambda: _lance_config("file:///tmp/lance"), + True, + id="same_catalog_dataset", + ), + pytest.param( + lambda: _lance_config("file:///tmp/lance1"), + lambda: _lance_config("file:///tmp/lance2"), + False, + id="different_catalog", + ), + pytest.param( + lambda: _lance_config("file:///tmp/lance", dataset_name="dataset1"), + lambda: _lance_config("file:///tmp/lance", dataset_name="dataset2"), + False, + id="different_dataset", + ), + ], +) +def test_lance_can_join_with(f1: ConfigFactory, f2: ConfigFactory, expected: bool) -> None: + assert_join_result(f1(), f2(), expected) + + +def 
test_lance_and_lancedb_cannot_join_with_each_other() -> None: + lance = _lance_config("file:///tmp/lance") + lancedb = _lancedb_config("file:///tmp/lance") + assert_not_joinable(lance, lancedb) + + +def test_weaviate_physical_destination_but_not_joinable() -> None: + c1 = WeaviateClientConfiguration( + credentials=WeaviateCredentials(url="https://cluster.weaviate.cloud") + ) + c2 = WeaviateClientConfiguration( + credentials=WeaviateCredentials(url="https://cluster.weaviate.cloud") + ) + assert c1.physical_destination() == "cluster.weaviate.cloud" + assert c1.fingerprint() == digest128("cluster.weaviate.cloud") + assert_not_joinable(c1, c2) + + +def test_qdrant_physical_destination_but_not_joinable() -> None: + c1 = QdrantClientConfiguration(qd_location="https://cluster.qdrant.io") + c2 = QdrantClientConfiguration(qd_location="https://cluster.qdrant.io") + assert c1.physical_destination() == "https://cluster.qdrant.io" + assert c1.fingerprint() == digest128("https://cluster.qdrant.io") + assert_not_joinable(c1, c2) diff --git a/tests/load/ducklake/test_ducklake_client.py b/tests/load/ducklake/test_ducklake_client.py index 319ae51461..20d1ef5ae9 100644 --- a/tests/load/ducklake/test_ducklake_client.py +++ b/tests/load/ducklake/test_ducklake_client.py @@ -102,8 +102,10 @@ def test_ducklake_configuration_default() -> None: assert credentials.storage_url == str(local_dir / "ducklake.files") # file url assert credentials.storage.bucket_url.startswith("file://") - # fingerprint is local - assert configuration.fingerprint() == digest128("file://") + # fingerprint derived from catalog identity + ducklake name + expected_phys = f"sqlite://{local_dir / 'ducklake.sqlite'}#{DEFAULT_DUCKLAKE_NAME}" + assert configuration.physical_destination() == expected_phys + assert configuration.fingerprint() == digest128(expected_phys) def test_ducklake_configuration_duckdb_catalog() -> None: @@ -121,7 +123,9 @@ def test_ducklake_configuration_duckdb_catalog() -> None: assert 
credentials.ducklake_name == DEFAULT_DUCKLAKE_NAME conn_str = credentials.catalog.to_native_representation() assert conn_str.endswith(str(local_dir / "ducklake.duckdb")) - assert configuration.fingerprint() == digest128("file://") + expected_phys = f"duckdb://{local_dir / 'ducklake.duckdb'}#{DEFAULT_DUCKLAKE_NAME}" + assert configuration.physical_destination() == expected_phys + assert configuration.fingerprint() == digest128(expected_phys) def test_ducklake_configuration_ducklake_name() -> None: @@ -138,8 +142,10 @@ def test_ducklake_configuration_ducklake_name() -> None: conn_str = credentials.catalog.to_native_representation() assert conn_str.endswith(str(local_dir / "my_ducklake.sqlite")) assert credentials.storage_url == str(local_dir / "my_ducklake.files") - # fingerprint is local - assert configuration.fingerprint() == digest128("file://") + # fingerprint derived from catalog identity + ducklake name + expected_phys = f"sqlite://{local_dir / 'my_ducklake.sqlite'}#my_ducklake" + assert configuration.physical_destination() == expected_phys + assert configuration.fingerprint() == digest128(expected_phys) def test_ducklake_configuration_destination_name() -> None: @@ -156,8 +162,10 @@ def test_ducklake_configuration_destination_name() -> None: conn_str = credentials.catalog.to_native_representation() assert conn_str.endswith(str(local_dir / "ducklake.sqlite")) assert credentials.storage_url == str(local_dir / "ducklake.files") - # fingerprint is local - assert configuration.fingerprint() == digest128("file://") + # fingerprint derived from catalog identity + ducklake name + expected_phys = f"sqlite://{local_dir / 'ducklake.sqlite'}#{DEFAULT_DUCKLAKE_NAME}" + assert configuration.physical_destination() == expected_phys + assert configuration.fingerprint() == digest128(expected_phys) def test_ducklake_configuration_pipeline_name() -> None: @@ -202,8 +210,11 @@ def test_ducklake_configuration_storage_credentials() -> None: ) # NOTE: dataset folders will be created 
in /lake/ assert credentials.storage_url == "s3://dlt-ci-test-bucket/lake" - # fingerprint is NOT local - assert configuration.fingerprint() == digest128("s3://dlt-ci-test-bucket") + # fingerprint derived from remote catalog identity + ducklake name + assert ( + configuration.physical_destination() == "postgresql://localhost:5432/dlt_data#my_ducklake" + ) + assert configuration.fingerprint() == digest128(configuration.physical_destination()) def test_ducklake_configuration_catalog_credentials() -> None: diff --git a/tests/load/filesystem/test_filesystem_client.py b/tests/load/filesystem/test_filesystem_client.py index 9ff9a008be..41dd779c8d 100644 --- a/tests/load/filesystem/test_filesystem_client.py +++ b/tests/load/filesystem/test_filesystem_client.py @@ -79,7 +79,7 @@ def _client_factory(fs: filesystem) -> FilesystemClient: "url, exp", ( (None, ""), - ("/path/path2", digest128("")), + ("/path/path2", ""), ("file:///home/ducklake.d", digest128("file://")), ("s3://cool", digest128("s3://cool")), ("s3://cool.domain/path/path2", digest128("s3://cool.domain")), diff --git a/tests/load/redshift/test_redshift_table_builder.py b/tests/load/redshift/test_redshift_table_builder.py index ac85d1abcf..bc33f4f16f 100644 --- a/tests/load/redshift/test_redshift_table_builder.py +++ b/tests/load/redshift/test_redshift_table_builder.py @@ -64,7 +64,7 @@ def test_redshift_configuration() -> None: RedshiftCredentials(), explicit_value="postgres://user1:pass@host1/db1?warehouse=warehouse1&role=role1", ) - assert RedshiftClientConfiguration(credentials=c).fingerprint() == digest128("host1") + assert RedshiftClientConfiguration(credentials=c).fingerprint() == digest128("host1:5439") def test_create_table(client: RedshiftClient) -> None: From 7a5f3dc5effcd038e72db2f0db39089c72437c23 Mon Sep 17 00:00:00 2001 From: travior Date: Wed, 29 Apr 2026 14:35:42 +0200 Subject: [PATCH 02/16] fix absolute paths failing on windows in tests --- tests/destinations/test_join_compatibility.py | 12 
++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/destinations/test_join_compatibility.py b/tests/destinations/test_join_compatibility.py index 72b49a0201..81af1e303a 100644 --- a/tests/destinations/test_join_compatibility.py +++ b/tests/destinations/test_join_compatibility.py @@ -370,8 +370,8 @@ def test_is_same_physical_destination_delegates_to_can_join_with() -> None: ), # DuckDB pytest.param( - lambda: DuckDbClientConfiguration(credentials=DuckDbCredentials("/p/db.duckdb")), - "/p/db.duckdb", + lambda: DuckDbClientConfiguration(credentials=DuckDbCredentials("p/db.duckdb")), + "p/db.duckdb", id="duckdb_path", ), pytest.param( @@ -625,14 +625,14 @@ def test_fingerprint(factory: ConfigFactory, expected_fp: str) -> None: DUCKDB_JOIN_CASES = [ pytest.param( - lambda: DuckDbClientConfiguration(credentials=DuckDbCredentials("/p/db.duckdb")), - lambda: DuckDbClientConfiguration(credentials=DuckDbCredentials("/p/db.duckdb")), + lambda: DuckDbClientConfiguration(credentials=DuckDbCredentials("p/db.duckdb")), + lambda: DuckDbClientConfiguration(credentials=DuckDbCredentials("p/db.duckdb")), True, id="duckdb_same_path", ), pytest.param( - lambda: DuckDbClientConfiguration(credentials=DuckDbCredentials("/p/db1.duckdb")), - lambda: DuckDbClientConfiguration(credentials=DuckDbCredentials("/p/db2.duckdb")), + lambda: DuckDbClientConfiguration(credentials=DuckDbCredentials("p/db1.duckdb")), + lambda: DuckDbClientConfiguration(credentials=DuckDbCredentials("p/db2.duckdb")), False, id="duckdb_diff_path", ), From afa3768429b1eb262e735e8354a6632143d97726 Mon Sep 17 00:00:00 2001 From: travior Date: Tue, 5 May 2026 10:17:35 +0200 Subject: [PATCH 03/16] fix outdated clickhouse fingerprints --- tests/load/clickhouse/test_clickhouse_configuration.py | 4 +++- tests/load/clickhouse/test_clickhouse_table_builder.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/load/clickhouse/test_clickhouse_configuration.py 
b/tests/load/clickhouse/test_clickhouse_configuration.py index ee6b3562f1..2b058a7598 100644 --- a/tests/load/clickhouse/test_clickhouse_configuration.py +++ b/tests/load/clickhouse/test_clickhouse_configuration.py @@ -84,7 +84,9 @@ def test_clickhouse_configuration() -> None: ClickHouseCredentials(), explicit_value="clickhouse://user1:pass1@host1:9000/db1", ) - assert ClickHouseClientConfiguration(credentials=config).fingerprint() == digest128("host1") + assert ClickHouseClientConfiguration(credentials=config).fingerprint() == digest128( + "host1:9000" + ) def test_clickhouse_connection_settings(client: ClickHouseClient) -> None: diff --git a/tests/load/clickhouse/test_clickhouse_table_builder.py b/tests/load/clickhouse/test_clickhouse_table_builder.py index 1f892d19cc..04152bedb5 100644 --- a/tests/load/clickhouse/test_clickhouse_table_builder.py +++ b/tests/load/clickhouse/test_clickhouse_table_builder.py @@ -40,7 +40,7 @@ def test_clickhouse_configuration() -> None: ClickHouseCredentials(), explicit_value="clickhouse://user1:pass@host1/db1", ) - assert ClickHouseClientConfiguration(credentials=c).fingerprint() == digest128("host1") + assert ClickHouseClientConfiguration(credentials=c).fingerprint() == digest128("host1:9440") def test_clickhouse_create_table(clickhouse_client: ClickHouseClient) -> None: From 179d819ea8fb6aaee21d716eea22b604d08ad45f Mon Sep 17 00:00:00 2001 From: travior Date: Tue, 12 May 2026 13:35:19 +0200 Subject: [PATCH 04/16] add integration test shell --- tests/destinations/test_join_compatibility.py | 5 +- .../load/pipeline/test_join_compatibility.py | 208 ++++++++++++++++++ 2 files changed, 211 insertions(+), 2 deletions(-) create mode 100644 tests/load/pipeline/test_join_compatibility.py diff --git a/tests/destinations/test_join_compatibility.py b/tests/destinations/test_join_compatibility.py index 81af1e303a..096c266610 100644 --- a/tests/destinations/test_join_compatibility.py +++ b/tests/destinations/test_join_compatibility.py @@ 
-200,12 +200,13 @@ def _lancedb_config( def _lance_config(catalog_root: str, dataset_name: str = "dataset") -> LanceClientConfiguration: """Build resolved Lance config.""" + credentials = DirectoryCatalogCredentials(bucket_url=catalog_root) c = LanceClientConfiguration( - credentials=DirectoryCatalogCredentials(bucket_url=catalog_root), + credentials=credentials, storage=LanceStorageConfiguration(bucket_url=catalog_root), ) c._bind_dataset_name(dataset_name) - c.credentials.bucket_url = catalog_root + credentials.bucket_url = catalog_root return c diff --git a/tests/load/pipeline/test_join_compatibility.py b/tests/load/pipeline/test_join_compatibility.py new file mode 100644 index 0000000000..0ed405c047 --- /dev/null +++ b/tests/load/pipeline/test_join_compatibility.py @@ -0,0 +1,208 @@ +from copy import deepcopy +from pathlib import Path +from typing import Any, Optional + +import pytest + +import dlt +from dlt.common.destination import TDestinationReferenceArg +from dlt.common.utils import uniq_id +from dlt.destinations.impl.ducklake.configuration import DuckLakeCredentials +from dlt.destinations.impl.snowflake.configuration import SnowflakeCredentials + +from tests.load.utils import ( + DestinationTestConfiguration, + destinations_configs, +) +from tests.pipeline.utils import assert_load_info + + +pytestmark = pytest.mark.essential + + +SAME_DATABASE_JOIN_COMPATIBILITY_CONFIGS = destinations_configs( + default_sql_configs=True, + local_filesystem_configs=True, + subset=[ + "clickhouse", + "dremio", + "duckdb", + "ducklake", + "filesystem", + "postgres", + "snowflake", + "sqlalchemy", + ], +) + +FILESYSTEM_DIFFERENT_LOCATION_JOIN_COMPATIBILITY_CONFIGS = destinations_configs( + local_filesystem_configs=True, + subset=["filesystem"], +) + +# Same-host/different-database compatibility needs a pre-existing second database. 
+SAME_HOST_DIFFERENT_DATABASE_JOIN_COMPATIBILITY_CONFIGS = destinations_configs( + default_sql_configs=True, + subset=["snowflake"], +) + + +def _load_table( + pipeline: dlt.Pipeline, + destination_config: DestinationTestConfiguration, + table_name: str, + rows: list[dict[str, Any]], +) -> None: + info = pipeline.run(rows, table_name=table_name, **destination_config.run_kwargs) + assert_load_info(info) + + +def _make_same_database_destinations( + destination_config: DestinationTestConfiguration, + tmp_path: Path, + test_id: str, +) -> tuple[Optional[TDestinationReferenceArg], Optional[TDestinationReferenceArg]]: + if destination_config.destination_type == "duckdb": + database_path = tmp_path / f"join_compat_{test_id}.duckdb" + return dlt.destinations.duckdb(str(database_path)), dlt.destinations.duckdb( + str(database_path) + ) + + if destination_config.destination_type == "ducklake": + credentials = DuckLakeCredentials( + ducklake_name=f"join_compat_{test_id}", + catalog=f"sqlite:///{tmp_path / f'join_compat_{test_id}.sqlite'}", + storage=str(tmp_path / f"join_compat_{test_id}.files"), + ) + return ( + dlt.destinations.ducklake(credentials=deepcopy(credentials)), + dlt.destinations.ducklake(credentials=deepcopy(credentials)), + ) + + if destination_config.destination_name == "sqlalchemy_sqlite": + connection_string = f"sqlite:///{tmp_path / f'join_compat_{test_id}.sqlite'}" + return ( + dlt.destinations.sqlalchemy(credentials=connection_string), + dlt.destinations.sqlalchemy(credentials=connection_string), + ) + + return None, None + + +def _make_same_host_different_database_destinations( + destination_config: DestinationTestConfiguration, +) -> tuple[Optional[TDestinationReferenceArg], Optional[TDestinationReferenceArg]]: + if destination_config.destination_type == "snowflake": + second_database: Optional[str] = dlt.secrets.get( + "destination.snowflake.join_compatibility_database" + ) + if not second_database: + pytest.skip("Second Snowflake database not 
configured") + + destination_config.setup() + base_credentials = dlt.secrets.get( + "destination.snowflake.credentials", SnowflakeCredentials + ) + + first_credentials = deepcopy(base_credentials) + second_credentials = deepcopy(base_credentials) + second_credentials.database = second_database + + return ( + dlt.destinations.snowflake(credentials=first_credentials), + dlt.destinations.snowflake(credentials=second_credentials), + ) + + return None, None + + +def _make_filesystem_different_location_destinations( + tmp_path: Path, + test_id: str, +) -> tuple[TDestinationReferenceArg, TDestinationReferenceArg]: + return ( + dlt.destinations.filesystem(str(tmp_path / f"join_compat_first_{test_id}")), + dlt.destinations.filesystem(str(tmp_path / f"join_compat_second_{test_id}")), + ) + + +def _run_two_pipeline_check( + destination_config: DestinationTestConfiguration, + first_destination: Optional[TDestinationReferenceArg], + second_destination: Optional[TDestinationReferenceArg], + expected: bool, +) -> None: + test_id = uniq_id() + first_pipeline = destination_config.setup_pipeline( + "join_first_" + test_id, + dataset_name="join_compat_first_" + test_id, + destination=first_destination, + ) + second_pipeline = destination_config.setup_pipeline( + "join_second_" + test_id, + dataset_name="join_compat_second_" + test_id, + destination=second_destination, + ) + + _load_table( + first_pipeline, + destination_config, + "join_items", + [{"id": 1, "name": "first"}], + ) + _load_table( + second_pipeline, + destination_config, + "join_items", + [{"id": 1, "name": "second"}], + ) + + first_config = first_pipeline.dataset().destination_client.config + second_config = second_pipeline.dataset().destination_client.config + assert first_config.can_join_with(second_config) is expected + assert second_config.can_join_with(first_config) is expected + + +@pytest.mark.parametrize( + "destination_config", + SAME_DATABASE_JOIN_COMPATIBILITY_CONFIGS, + ids=lambda x: x.name, +) +def 
test_same_database_join_compatibility( + destination_config: DestinationTestConfiguration, + tmp_path: Path, +) -> None: + test_id = uniq_id() + first_destination, second_destination = _make_same_database_destinations( + destination_config, tmp_path, test_id + ) + _run_two_pipeline_check(destination_config, first_destination, second_destination, True) + + +@pytest.mark.parametrize( + "destination_config", + FILESYSTEM_DIFFERENT_LOCATION_JOIN_COMPATIBILITY_CONFIGS, + ids=lambda x: x.name, +) +def test_filesystem_different_location_join_compatibility( + destination_config: DestinationTestConfiguration, + tmp_path: Path, +) -> None: + first_destination, second_destination = _make_filesystem_different_location_destinations( + tmp_path, uniq_id() + ) + _run_two_pipeline_check(destination_config, first_destination, second_destination, True) + + +@pytest.mark.parametrize( + "destination_config", + SAME_HOST_DIFFERENT_DATABASE_JOIN_COMPATIBILITY_CONFIGS, + ids=lambda x: x.name, +) +def test_same_host_different_database_join_compatibility( + destination_config: DestinationTestConfiguration, +) -> None: + first_destination, second_destination = _make_same_host_different_database_destinations( + destination_config + ) + _run_two_pipeline_check(destination_config, first_destination, second_destination, True) From 8c2736fa2d1c8b3f0f436e3870c938c6c16440c9 Mon Sep 17 00:00:00 2001 From: travior Date: Wed, 27 May 2026 12:06:47 +0200 Subject: [PATCH 05/16] rename physical_destination to physical_location --- dlt/common/destination/client.py | 18 +++--- dlt/dataset/dataset.py | 8 +-- dlt/dataset/relation.py | 2 +- dlt/destinations/impl/athena/configuration.py | 2 +- .../impl/bigquery/configuration.py | 2 +- .../impl/clickhouse/configuration.py | 2 +- .../impl/databricks/configuration.py | 2 +- dlt/destinations/impl/dremio/configuration.py | 2 +- dlt/destinations/impl/duckdb/configuration.py | 2 +- .../impl/ducklake/configuration.py | 2 +- 
dlt/destinations/impl/fabric/configuration.py | 2 +- .../impl/filesystem/configuration.py | 4 +- dlt/destinations/impl/lance/configuration.py | 8 +-- .../impl/lancedb/configuration.py | 8 +-- .../impl/motherduck/configuration.py | 2 +- dlt/destinations/impl/mssql/configuration.py | 2 +- .../impl/postgres/configuration.py | 10 ++-- dlt/destinations/impl/qdrant/configuration.py | 2 +- .../impl/redshift/configuration.py | 2 +- .../impl/snowflake/configuration.py | 2 +- .../impl/sqlalchemy/configuration.py | 26 ++++----- .../impl/weaviate/configuration.py | 2 +- tests/destinations/test_join_compatibility.py | 58 +++++++++---------- tests/load/ducklake/test_ducklake_client.py | 28 ++++----- 24 files changed, 99 insertions(+), 99 deletions(-) diff --git a/dlt/common/destination/client.py b/dlt/common/destination/client.py index dfa08c7316..9711cf78bd 100644 --- a/dlt/common/destination/client.py +++ b/dlt/common/destination/client.py @@ -164,15 +164,15 @@ class DestinationClientConfiguration(BaseConfiguration): __recommended_sections__: ClassVar[Sequence[str]] = (known_sections.DESTINATION, "") - def physical_destination(self) -> str: - """Returns a non-secret destination identity, or "" when unavailable.""" + def physical_location(self) -> str: + """Returns a non-secret physical location identity, or "" when unavailable.""" return "" def fingerprint(self) -> str: - """Returns a hash of physical_destination(), or "" when unavailable.""" - phys_dest = self.physical_destination() - if phys_dest: - return digest128(phys_dest) + """Returns a hash of physical_location(), or "" when unavailable.""" + phys_loc = self.physical_location() + if phys_loc: + return digest128(phys_loc) return "" def can_join_with(self, other: "DestinationClientConfiguration") -> bool: @@ -181,9 +181,9 @@ def can_join_with(self, other: "DestinationClientConfiguration") -> bool: return False if self.destination_type != other.destination_type: return False - self_phys = self.physical_destination() - 
other_phys = other.physical_destination() - if self_phys and other_phys and self_phys == other_phys: + self_loc = self.physical_location() + other_loc = other.physical_location() + if self_loc and other_loc and self_loc == other_loc: return True return False diff --git a/dlt/dataset/dataset.py b/dlt/dataset/dataset.py index a1e7825d64..ad3f190a2a 100644 --- a/dlt/dataset/dataset.py +++ b/dlt/dataset/dataset.py @@ -219,12 +219,12 @@ def open_table_client(self) -> SupportsOpenTables: # TODO remove method; need to update `dlthub` to avoid conflict # this is only used by `dlt.hub.transformation` currently - def is_same_physical_destination(self, other: dlt.Dataset) -> bool: + def is_same_physical_location(self, other: dlt.Dataset) -> bool: """ - Returns true if the other dataset is on the same physical destination + Returns true if the other dataset is on the same physical location helpful if we want to run sql queries without extracting the data """ - return is_same_physical_destination(self, other) + return is_same_physical_location(self, other) def query( self, @@ -497,7 +497,7 @@ def get_dataset_sql_client(dataset: dlt.Dataset) -> SqlClientBase[Any]: raise SqlClientNotAvailable("dataset", dataset.dataset_name, client.config.destination_type) -def is_same_physical_destination(dataset1: dlt.Dataset, dataset2: dlt.Dataset) -> bool: +def is_same_physical_location(dataset1: dlt.Dataset, dataset2: dlt.Dataset) -> bool: """Check if tables from both datasets can be joined in a single query.""" # NOTE: the name is historical -- this actually checks join compatibility via # can_join_with(), which may return True even when the physical storage diff --git a/dlt/dataset/relation.py b/dlt/dataset/relation.py index 8f477e9ef0..995f9bbef3 100644 --- a/dlt/dataset/relation.py +++ b/dlt/dataset/relation.py @@ -418,7 +418,7 @@ def join( if isinstance(other, dlt.Relation): # TODO: remove once we allow cross-dataset joins if not ( - 
self._dataset.is_same_physical_destination(other._dataset) + self._dataset.is_same_physical_location(other._dataset) and self._dataset.dataset_name == other._dataset.dataset_name ): raise ValueError( diff --git a/dlt/destinations/impl/athena/configuration.py b/dlt/destinations/impl/athena/configuration.py index 9b0f2adf48..0406bfabbd 100644 --- a/dlt/destinations/impl/athena/configuration.py +++ b/dlt/destinations/impl/athena/configuration.py @@ -60,7 +60,7 @@ def to_connector_params(self, use_catalog_name: bool = True) -> Dict[str, Any]: def _is_s3_tables_catalog(self) -> bool: return is_s3_tables_catalog(self.aws_data_catalog) - def physical_destination(self) -> str: + def physical_location(self) -> str: """Returns region/catalog, or "" when region is unavailable.""" catalog = self.aws_data_catalog or DEFAULT_AWS_DATA_CATALOG region = None diff --git a/dlt/destinations/impl/bigquery/configuration.py b/dlt/destinations/impl/bigquery/configuration.py index 9bd59fa1cb..efb02aae8b 100644 --- a/dlt/destinations/impl/bigquery/configuration.py +++ b/dlt/destinations/impl/bigquery/configuration.py @@ -37,7 +37,7 @@ class BigQueryClientConfiguration(DestinationClientDwhWithStagingConfiguration): def get_location(self) -> str: return self.location - def physical_destination(self) -> str: + def physical_location(self) -> str: """Returns configured project id, falling back to credentials.""" project_id = self.project_id if not project_id and self.credentials: diff --git a/dlt/destinations/impl/clickhouse/configuration.py b/dlt/destinations/impl/clickhouse/configuration.py index 837c29b837..aac5f32bca 100644 --- a/dlt/destinations/impl/clickhouse/configuration.py +++ b/dlt/destinations/impl/clickhouse/configuration.py @@ -96,7 +96,7 @@ class ClickHouseClientConfiguration(DestinationClientDwhWithStagingConfiguration "table_engine_type", ] - def physical_destination(self) -> str: + def physical_location(self) -> str: """Returns host:port.""" if self.credentials and 
self.credentials.host: return f"{self.credentials.host}:{self.credentials.port}" diff --git a/dlt/destinations/impl/databricks/configuration.py b/dlt/destinations/impl/databricks/configuration.py index 2eba3d396f..b8757d383c 100644 --- a/dlt/destinations/impl/databricks/configuration.py +++ b/dlt/destinations/impl/databricks/configuration.py @@ -284,7 +284,7 @@ def on_resolved(self) -> None: " `destination.databricks.credentials.client_secret`." ) - def physical_destination(self) -> str: + def physical_location(self) -> str: """Returns the server hostname.""" if self.credentials and self.credentials.server_hostname: return self.credentials.server_hostname diff --git a/dlt/destinations/impl/dremio/configuration.py b/dlt/destinations/impl/dremio/configuration.py index 9d38c188ce..93292afbc6 100644 --- a/dlt/destinations/impl/dremio/configuration.py +++ b/dlt/destinations/impl/dremio/configuration.py @@ -36,7 +36,7 @@ class DremioClientConfiguration(DestinationClientDwhWithStagingConfiguration): staging_data_source: str = None """The name of the staging data source""" - def physical_destination(self) -> str: + def physical_location(self) -> str: """Returns host:port.""" if self.credentials and self.credentials.host: return f"{self.credentials.host}:{self.credentials.port}" diff --git a/dlt/destinations/impl/duckdb/configuration.py b/dlt/destinations/impl/duckdb/configuration.py index c85aa71c73..14b4e71cd1 100644 --- a/dlt/destinations/impl/duckdb/configuration.py +++ b/dlt/destinations/impl/duckdb/configuration.py @@ -318,7 +318,7 @@ def __init__( ) self.create_indexes = create_indexes - def physical_destination(self) -> str: + def physical_location(self) -> str: """Returns the database file path or ':memory:'.""" if self.credentials and self.credentials.database: return self.credentials.database diff --git a/dlt/destinations/impl/ducklake/configuration.py b/dlt/destinations/impl/ducklake/configuration.py index e55cefb603..ca23848a26 100644 --- 
a/dlt/destinations/impl/ducklake/configuration.py +++ b/dlt/destinations/impl/ducklake/configuration.py @@ -142,7 +142,7 @@ class DuckLakeClientConfiguration(WithLocalFiles, DestinationClientDwhWithStagin automatic_migration: bool = False """When true, attaches with `AUTOMATIC_MIGRATION true` so DuckDB migrates an older DuckLake catalog schema on attach.""" - def physical_destination(self) -> str: + def physical_location(self) -> str: """Returns credential-free catalog identity plus ducklake name.""" if not self.credentials or not self.credentials.catalog: return "" diff --git a/dlt/destinations/impl/fabric/configuration.py b/dlt/destinations/impl/fabric/configuration.py index 01b52d01e6..7409ec7c89 100644 --- a/dlt/destinations/impl/fabric/configuration.py +++ b/dlt/destinations/impl/fabric/configuration.py @@ -165,7 +165,7 @@ class FabricClientConfiguration(DestinationClientDwhWithStagingConfiguration): Both have UTF-8 encoding. LongAsMax=yes is automatically configured. """ - def physical_destination(self) -> str: + def physical_location(self) -> str: """Returns host:port.""" if self.credentials and self.credentials.host: port = self.credentials.port or 1433 diff --git a/dlt/destinations/impl/filesystem/configuration.py b/dlt/destinations/impl/filesystem/configuration.py index 2ee22bfccf..8506684fe3 100644 --- a/dlt/destinations/impl/filesystem/configuration.py +++ b/dlt/destinations/impl/filesystem/configuration.py @@ -45,7 +45,7 @@ class FilesystemDestinationClientConfiguration(FilesystemConfigurationWithLocalF def resolve_credentials_type(self) -> Type[CredentialsConfiguration]: return super().resolve_credentials_type() - def physical_destination(self) -> str: + def physical_location(self) -> str: """Returns scheme://netloc for remote filesystems, or "" for local.""" if not self.bucket_url: return "" @@ -62,7 +62,7 @@ def fingerprint(self) -> str: # Explicit override to resolve MRO ambiguity: without it, Python picks # FilesystemConfiguration.fingerprint() 
(which hashes the raw bucket URL) # over DestinationClientConfiguration.fingerprint() (which hashes - # physical_destination()). Do not remove. + # physical_location()). Do not remove. return DestinationClientStagingConfiguration.fingerprint(self) def can_join_with(self, other: DestinationClientConfiguration) -> bool: diff --git a/dlt/destinations/impl/lance/configuration.py b/dlt/destinations/impl/lance/configuration.py index a80f8f5b0a..3b12a8d8d3 100644 --- a/dlt/destinations/impl/lance/configuration.py +++ b/dlt/destinations/impl/lance/configuration.py @@ -356,7 +356,7 @@ def make_namespace(self) -> "LanceNamespace": props.update(self.credentials.to_namespace_properties()) return connect(self.catalog_type, props) - def physical_destination(self) -> str: + def physical_location(self) -> str: """Returns the resolved Lance catalog root.""" if ( isinstance(self.credentials, DirectoryCatalogCredentials) @@ -370,9 +370,9 @@ def can_join_with(self, other: DestinationClientConfiguration) -> bool: if not isinstance(other, LanceClientConfiguration): return False - self_phys = self.physical_destination() - other_phys = other.physical_destination() - if not self_phys or not other_phys or self_phys != other_phys: + self_loc = self.physical_location() + other_loc = other.physical_location() + if not self_loc or not other_loc or self_loc != other_loc: return False return self.dataset_name == other.dataset_name diff --git a/dlt/destinations/impl/lancedb/configuration.py b/dlt/destinations/impl/lancedb/configuration.py index e22b61f096..711bfbc059 100644 --- a/dlt/destinations/impl/lancedb/configuration.py +++ b/dlt/destinations/impl/lancedb/configuration.py @@ -191,7 +191,7 @@ def on_resolved(self) -> None: # TODO: move uri back to credentials to make it more like other connections self.credentials.uri = self.lance_uri - def physical_destination(self) -> str: + def physical_location(self) -> str: """Returns the resolved LanceDB URI, or "" for external native clients.""" if not 
self.lance_uri or self.lance_uri == ":external:": return "" @@ -213,9 +213,9 @@ def can_join_with(self, other: DestinationClientConfiguration) -> bool: if not isinstance(other, LanceDBClientConfiguration): return False - self_phys = self.physical_destination() - other_phys = other.physical_destination() - if not self_phys or not other_phys or self_phys != other_phys: + self_loc = self.physical_location() + other_loc = other.physical_location() + if not self_loc or not other_loc or self_loc != other_loc: return False return self.dataset_separator == other.dataset_separator diff --git a/dlt/destinations/impl/motherduck/configuration.py b/dlt/destinations/impl/motherduck/configuration.py index 640746c690..209d8da9b5 100644 --- a/dlt/destinations/impl/motherduck/configuration.py +++ b/dlt/destinations/impl/motherduck/configuration.py @@ -126,7 +126,7 @@ class MotherDuckClientConfiguration(DestinationClientDwhWithStagingConfiguration False # should unique indexes be created, this slows loading down massively ) - def physical_destination(self) -> str: + def physical_location(self) -> str: """Returns "" because MotherDuck has no non-secret account identity.""" return "" diff --git a/dlt/destinations/impl/mssql/configuration.py b/dlt/destinations/impl/mssql/configuration.py index 97314deb41..c4c14d7072 100644 --- a/dlt/destinations/impl/mssql/configuration.py +++ b/dlt/destinations/impl/mssql/configuration.py @@ -134,7 +134,7 @@ class MsSqlClientConfiguration(DestinationClientDwhWithStagingConfiguration): create_indexes: bool = False has_case_sensitive_identifiers: bool = False - def physical_destination(self) -> str: + def physical_location(self) -> str: """Returns host:port.""" if self.credentials and self.credentials.host: port = self.credentials.port or 1433 diff --git a/dlt/destinations/impl/postgres/configuration.py b/dlt/destinations/impl/postgres/configuration.py index b7232bf4f4..94190a6aa7 100644 --- a/dlt/destinations/impl/postgres/configuration.py +++ 
b/dlt/destinations/impl/postgres/configuration.py @@ -48,8 +48,8 @@ class PostgresClientConfiguration(DestinationClientDwhWithStagingConfiguration): csv_format: Optional[CsvFormatConfiguration] = None """Optional csv format configuration""" - def physical_destination(self) -> str: - """Returns host:port as the physical destination identifier.""" + def physical_location(self) -> str: + """Returns host:port as the physical location identifier.""" if self.credentials and self.credentials.host: port = self.credentials.port or 5432 return f"{self.credentials.host}:{port}" @@ -62,9 +62,9 @@ def can_join_with(self, other: DestinationClientConfiguration) -> bool: if self.destination_type != other.destination_type: return False - self_phys = self.physical_destination() - other_phys = other.physical_destination() - if not self_phys or not other_phys or self_phys != other_phys: + self_loc = self.physical_location() + other_loc = other.physical_location() + if not self_loc or not other_loc or self_loc != other_loc: return False self_db = self.credentials.database if self.credentials else None diff --git a/dlt/destinations/impl/qdrant/configuration.py b/dlt/destinations/impl/qdrant/configuration.py index d388d5d87d..0795d1c4f2 100644 --- a/dlt/destinations/impl/qdrant/configuration.py +++ b/dlt/destinations/impl/qdrant/configuration.py @@ -148,7 +148,7 @@ def on_resolved(self) -> None: if self.qd_path and not os.path.isabs(self.qd_path): self.qd_path = self.make_location(self.qd_path, "%s.qdrant") - def physical_destination(self) -> str: + def physical_location(self) -> str: """Returns the Qdrant connection location.""" return self.qd_location or "" diff --git a/dlt/destinations/impl/redshift/configuration.py b/dlt/destinations/impl/redshift/configuration.py index bef2ee4b2c..d2be73b54f 100644 --- a/dlt/destinations/impl/redshift/configuration.py +++ b/dlt/destinations/impl/redshift/configuration.py @@ -27,7 +27,7 @@ class 
RedshiftClientConfiguration(PostgresClientConfiguration): staging_iam_role: Optional[str] = None has_case_sensitive_identifiers: bool = False - def physical_destination(self) -> str: + def physical_location(self) -> str: """Returns host:port.""" if self.credentials and self.credentials.host: port = self.credentials.port or 5439 diff --git a/dlt/destinations/impl/snowflake/configuration.py b/dlt/destinations/impl/snowflake/configuration.py index 6eeea1e50b..d5c21dd34d 100644 --- a/dlt/destinations/impl/snowflake/configuration.py +++ b/dlt/destinations/impl/snowflake/configuration.py @@ -178,7 +178,7 @@ class SnowflakeClientConfiguration(DestinationClientDwhWithStagingConfiguration) use_decfloat: bool = False """Whether to use DECFLOAT type for unbound decimals instead of DECIMAL""" - def physical_destination(self) -> str: + def physical_location(self) -> str: """Returns the account host.""" if self.credentials and self.credentials.host: return self.credentials.host diff --git a/dlt/destinations/impl/sqlalchemy/configuration.py b/dlt/destinations/impl/sqlalchemy/configuration.py index 9fbd071c9c..c0ff7b4994 100644 --- a/dlt/destinations/impl/sqlalchemy/configuration.py +++ b/dlt/destinations/impl/sqlalchemy/configuration.py @@ -266,7 +266,7 @@ def on_resolved(self) -> None: self.make_location(db or None, SQLITE_DB_NAME_PAT) ) - def physical_destination(self) -> str: + def physical_location(self) -> str: """Returns sqlite path for sqlite, otherwise host:port.""" if not self.credentials: return "" @@ -303,27 +303,27 @@ def can_join_with(self, other: DestinationClientConfiguration) -> bool: return False if self_dialect == "sqlite": - self_phys = self.physical_destination() - other_phys = other.physical_destination() - return bool(self_phys and other_phys and self_phys == other_phys) + self_loc = self.physical_location() + other_loc = other.physical_location() + return bool(self_loc and other_loc and self_loc == other_loc) if self_dialect == "postgresql": - self_phys = 
self.physical_destination() - other_phys = other.physical_destination() - if not self_phys or not other_phys or self_phys != other_phys: + self_loc = self.physical_location() + other_loc = other.physical_location() + if not self_loc or not other_loc or self_loc != other_loc: return False self_db = self.credentials.database other_db = other.credentials.database return self_db is not None and other_db is not None and self_db == other_db if self_dialect in ("mysql", "mssql", "oracle", "db2"): - self_phys = self.physical_destination() - other_phys = other.physical_destination() - return bool(self_phys and other_phys and self_phys == other_phys) + self_loc = self.physical_location() + other_loc = other.physical_location() + return bool(self_loc and other_loc and self_loc == other_loc) - self_phys = self.physical_destination() - other_phys = other.physical_destination() - if not self_phys or not other_phys or self_phys != other_phys: + self_loc = self.physical_location() + other_loc = other.physical_location() + if not self_loc or not other_loc or self_loc != other_loc: return False self_db = self.credentials.database other_db = other.credentials.database diff --git a/dlt/destinations/impl/weaviate/configuration.py b/dlt/destinations/impl/weaviate/configuration.py index a75de83cbf..397637839b 100644 --- a/dlt/destinations/impl/weaviate/configuration.py +++ b/dlt/destinations/impl/weaviate/configuration.py @@ -64,7 +64,7 @@ class WeaviateClientConfiguration(DestinationClientDwhConfiguration): } ) - def physical_destination(self) -> str: + def physical_location(self) -> str: """Returns the host part of the connection URL.""" if self.credentials and self.credentials.url: return urlparse(self.credentials.url).hostname or "" diff --git a/tests/destinations/test_join_compatibility.py b/tests/destinations/test_join_compatibility.py index 096c266610..4e149f7552 100644 --- a/tests/destinations/test_join_compatibility.py +++ b/tests/destinations/test_join_compatibility.py @@ -13,7 
+13,7 @@ GcpServiceAccountCredentials, ) from dlt.common.destination.client import DestinationClientConfiguration -from dlt.dataset.dataset import Dataset, is_same_physical_destination +from dlt.dataset.dataset import Dataset, is_same_physical_location from dlt.destinations.impl.postgres.configuration import ( PostgresClientConfiguration, PostgresCredentials, @@ -92,17 +92,17 @@ class _PhysicalDestinationConfig(DestinationClientConfiguration): - def __init__(self, physical_destination: str = "") -> None: + def __init__(self, physical_location: str = "") -> None: super().__init__() - self._physical_destination = physical_destination + self._physical_location = physical_location - def physical_destination(self) -> str: - return self._physical_destination + def physical_location(self) -> str: + return self._physical_location class _StringyPhysicalDestinationConfig(_PhysicalDestinationConfig): - def __init__(self, physical_destination: str, display_value: str) -> None: - super().__init__(physical_destination) + def __init__(self, physical_location: str, display_value: str) -> None: + super().__init__(physical_location) self._display_value = display_value def __str__(self) -> str: @@ -211,30 +211,30 @@ def _lance_config(catalog_root: str, dataset_name: str = "dataset") -> LanceClie # Base DestinationClientConfiguration contract -def test_base_fingerprint_derived_from_physical_destination() -> None: +def test_base_fingerprint_derived_from_physical_location() -> None: config = _PhysicalDestinationConfig("test-host:5432") assert config.fingerprint() == digest128("test-host:5432") -def test_base_fingerprint_empty_when_physical_destination_empty() -> None: +def test_base_fingerprint_empty_when_physical_location_empty() -> None: config = DestinationClientConfiguration() - assert config.physical_destination() == "" + assert config.physical_location() == "" assert config.fingerprint() == "" -def test_base_can_join_with_default_false_when_physical_destinations_differ() -> None: 
+def test_base_can_join_with_default_false_when_physical_locations_differ() -> None: config1 = _PhysicalDestinationConfig("host1") config2 = _PhysicalDestinationConfig("host2") assert_not_joinable(config1, config2) -def test_base_can_join_with_default_true_when_same_physical_destination() -> None: +def test_base_can_join_with_default_true_when_same_physical_location() -> None: config1 = _PhysicalDestinationConfig("host1") config2 = _PhysicalDestinationConfig("host1") assert_joinable(config1, config2) -def test_base_can_join_with_default_false_when_empty_physical_destination() -> None: +def test_base_can_join_with_default_false_when_empty_physical_location() -> None: config1 = DestinationClientConfiguration() config2 = _PhysicalDestinationConfig("host1") assert_not_joinable(config1, config2) @@ -247,16 +247,16 @@ def test_base_can_join_with_returns_false_for_non_config() -> None: assert not config.can_join_with(42) # type: ignore[arg-type] -def test_is_same_physical_destination_delegates_to_can_join_with() -> None: +def test_is_same_physical_location_delegates_to_can_join_with() -> None: config1 = _StringyPhysicalDestinationConfig("host1", "first-display") config2 = _StringyPhysicalDestinationConfig("host1", "second-display") assert str(config1) != str(config2) - assert is_same_physical_destination( + assert is_same_physical_location( cast(Dataset, _DatasetStub(config1)), cast(Dataset, _DatasetStub(config2)) ) -# physical_destination() extraction across destinations +# physical_location() extraction across destinations PHYSICAL_DEST_CASES = [ # Postgres: host:port format @@ -429,8 +429,8 @@ def test_is_same_physical_destination_delegates_to_can_join_with() -> None: @pytest.mark.parametrize("factory,expected", PHYSICAL_DEST_CASES) -def test_physical_destination(factory: ConfigFactory, expected: str) -> None: - assert factory().physical_destination() == expected +def test_physical_location(factory: ConfigFactory, expected: str) -> None: + assert 
factory().physical_location() == expected @pytest.mark.parametrize( @@ -822,16 +822,16 @@ def test_can_join_with_matrix(f1: ConfigFactory, f2: ConfigFactory, expected: bo def test_cross_type_rejection(f1: ConfigFactory, f2: ConfigFactory) -> None: c1, c2 = f1(), f2() if isinstance(c2, _PhysicalDestinationConfig): - c2._physical_destination = c1.physical_destination() + c2._physical_location = c1.physical_location() assert_not_joinable(c1, c2) -def test_cross_type_different_physical_destinations() -> None: +def test_cross_type_different_physical_locations() -> None: sf = SnowflakeClientConfiguration( credentials=SnowflakeCredentials("snowflake://u:p@a1.snowflake.com/db") ) bq = BigQueryClientConfiguration(credentials=GcpServiceAccountCredentials(project_id="p2")) - assert sf.physical_destination() != bq.physical_destination() + assert sf.physical_location() != bq.physical_location() assert_not_joinable(sf, bq) @@ -856,18 +856,18 @@ def test_filesystem_cannot_join_with_non_filesystem() -> None: def test_filesystem_fingerprint_empty_for_local() -> None: c = FilesystemDestinationClientConfiguration(bucket_url="/local/p") - assert c.physical_destination() == "" + assert c.physical_location() == "" assert c.fingerprint() == "" # MotherDuck token-based joinability -def test_motherduck_token_not_exposed_as_physical_destination() -> None: +def test_motherduck_token_not_exposed_as_physical_location() -> None: md = MotherDuckClientConfiguration( credentials=MotherDuckCredentials("md:db?motherduck_token=token") ) - assert md.physical_destination() == "" + assert md.physical_location() == "" def test_motherduck_fingerprint_hashes_token() -> None: @@ -878,7 +878,7 @@ def test_motherduck_fingerprint_hashes_token() -> None: def test_motherduck_can_join_with_same_token_without_exposing_location() -> None: - """Same token can join without exposing token via physical destination.""" + """Same token can join without exposing token via physical location.""" c1 = 
MotherDuckClientConfiguration( credentials=MotherDuckCredentials("md:db?motherduck_token=token") ) @@ -1019,21 +1019,21 @@ def test_lance_and_lancedb_cannot_join_with_each_other() -> None: assert_not_joinable(lance, lancedb) -def test_weaviate_physical_destination_but_not_joinable() -> None: +def test_weaviate_physical_location_but_not_joinable() -> None: c1 = WeaviateClientConfiguration( credentials=WeaviateCredentials(url="https://cluster.weaviate.cloud") ) c2 = WeaviateClientConfiguration( credentials=WeaviateCredentials(url="https://cluster.weaviate.cloud") ) - assert c1.physical_destination() == "cluster.weaviate.cloud" + assert c1.physical_location() == "cluster.weaviate.cloud" assert c1.fingerprint() == digest128("cluster.weaviate.cloud") assert_not_joinable(c1, c2) -def test_qdrant_physical_destination_but_not_joinable() -> None: +def test_qdrant_physical_location_but_not_joinable() -> None: c1 = QdrantClientConfiguration(qd_location="https://cluster.qdrant.io") c2 = QdrantClientConfiguration(qd_location="https://cluster.qdrant.io") - assert c1.physical_destination() == "https://cluster.qdrant.io" + assert c1.physical_location() == "https://cluster.qdrant.io" assert c1.fingerprint() == digest128("https://cluster.qdrant.io") assert_not_joinable(c1, c2) diff --git a/tests/load/ducklake/test_ducklake_client.py b/tests/load/ducklake/test_ducklake_client.py index 20d1ef5ae9..231f8e637a 100644 --- a/tests/load/ducklake/test_ducklake_client.py +++ b/tests/load/ducklake/test_ducklake_client.py @@ -103,9 +103,9 @@ def test_ducklake_configuration_default() -> None: # file url assert credentials.storage.bucket_url.startswith("file://") # fingerprint derived from catalog identity + ducklake name - expected_phys = f"sqlite://{local_dir / 'ducklake.sqlite'}#{DEFAULT_DUCKLAKE_NAME}" - assert configuration.physical_destination() == expected_phys - assert configuration.fingerprint() == digest128(expected_phys) + expected_loc = f"sqlite://{local_dir / 
'ducklake.sqlite'}#{DEFAULT_DUCKLAKE_NAME}" + assert configuration.physical_location() == expected_loc + assert configuration.fingerprint() == digest128(expected_loc) def test_ducklake_configuration_duckdb_catalog() -> None: @@ -123,9 +123,9 @@ def test_ducklake_configuration_duckdb_catalog() -> None: assert credentials.ducklake_name == DEFAULT_DUCKLAKE_NAME conn_str = credentials.catalog.to_native_representation() assert conn_str.endswith(str(local_dir / "ducklake.duckdb")) - expected_phys = f"duckdb://{local_dir / 'ducklake.duckdb'}#{DEFAULT_DUCKLAKE_NAME}" - assert configuration.physical_destination() == expected_phys - assert configuration.fingerprint() == digest128(expected_phys) + expected_loc = f"duckdb://{local_dir / 'ducklake.duckdb'}#{DEFAULT_DUCKLAKE_NAME}" + assert configuration.physical_location() == expected_loc + assert configuration.fingerprint() == digest128(expected_loc) def test_ducklake_configuration_ducklake_name() -> None: @@ -143,9 +143,9 @@ def test_ducklake_configuration_ducklake_name() -> None: assert conn_str.endswith(str(local_dir / "my_ducklake.sqlite")) assert credentials.storage_url == str(local_dir / "my_ducklake.files") # fingerprint derived from catalog identity + ducklake name - expected_phys = f"sqlite://{local_dir / 'my_ducklake.sqlite'}#my_ducklake" - assert configuration.physical_destination() == expected_phys - assert configuration.fingerprint() == digest128(expected_phys) + expected_loc = f"sqlite://{local_dir / 'my_ducklake.sqlite'}#my_ducklake" + assert configuration.physical_location() == expected_loc + assert configuration.fingerprint() == digest128(expected_loc) def test_ducklake_configuration_destination_name() -> None: @@ -163,9 +163,9 @@ def test_ducklake_configuration_destination_name() -> None: assert conn_str.endswith(str(local_dir / "ducklake.sqlite")) assert credentials.storage_url == str(local_dir / "ducklake.files") # fingerprint derived from catalog identity + ducklake name - expected_phys = 
f"sqlite://{local_dir / 'ducklake.sqlite'}#{DEFAULT_DUCKLAKE_NAME}" - assert configuration.physical_destination() == expected_phys - assert configuration.fingerprint() == digest128(expected_phys) + expected_loc = f"sqlite://{local_dir / 'ducklake.sqlite'}#{DEFAULT_DUCKLAKE_NAME}" + assert configuration.physical_location() == expected_loc + assert configuration.fingerprint() == digest128(expected_loc) def test_ducklake_configuration_pipeline_name() -> None: @@ -212,9 +212,9 @@ def test_ducklake_configuration_storage_credentials() -> None: assert credentials.storage_url == "s3://dlt-ci-test-bucket/lake" # fingerprint derived from remote catalog identity + ducklake name assert ( - configuration.physical_destination() == "postgresql://localhost:5432/dlt_data#my_ducklake" + configuration.physical_location() == "postgresql://localhost:5432/dlt_data#my_ducklake" ) - assert configuration.fingerprint() == digest128(configuration.physical_destination()) + assert configuration.fingerprint() == digest128(configuration.physical_location()) def test_ducklake_configuration_catalog_credentials() -> None: From 5300cbd0553d3069e90c89f9f788f313170834cf Mon Sep 17 00:00:00 2001 From: travior Date: Wed, 27 May 2026 12:50:18 +0200 Subject: [PATCH 06/16] fix duplicate fabric test --- tests/destinations/test_join_compatibility.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/tests/destinations/test_join_compatibility.py b/tests/destinations/test_join_compatibility.py index 4e149f7552..040416d099 100644 --- a/tests/destinations/test_join_compatibility.py +++ b/tests/destinations/test_join_compatibility.py @@ -1,6 +1,6 @@ """Tests for destination configuration-level join-compatibility semantics.""" -from typing import Callable, cast +from typing import Callable, cast, Optional from typing_extensions import TypeAlias @@ -160,12 +160,14 @@ def _ducklake_creds(catalog_str: str, name: str = DEFAULT_DUCKLAKE_NAME) -> Duck ) -def _fabric_creds(host: str, 
database: str) -> FabricCredentials: +def _fabric_creds(host: str, database: str, port: Optional[int] = None) -> FabricCredentials: """Build Fabric credentials.""" # Fabric is normally configured via structured fields, not a connection string. credentials = FabricCredentials() credentials.host = host credentials.database = database + if port is not None: + credentials.port = port return credentials @@ -406,7 +408,7 @@ def test_is_same_physical_location_delegates_to_can_join_with() -> None: # Fabric pytest.param( lambda: FabricClientConfiguration( - credentials=_fabric_creds("h.fabric.microsoft.com", "db") + credentials=_fabric_creds("h.fabric.microsoft.com", "db", port=1433) ), "h.fabric.microsoft.com:1433", id="fabric_port", @@ -418,6 +420,11 @@ def test_is_same_physical_location_delegates_to_can_join_with() -> None: "h.fabric.microsoft.com:1433", id="fabric_default_port", ), + pytest.param( + lambda: FabricClientConfiguration(credentials=FabricCredentials()), + "", + id="fabric_no_host", + ), pytest.param( lambda: MotherDuckClientConfiguration( credentials=MotherDuckCredentials("md:db?motherduck_token=token") From f0fc271d5708b6d98e46058be0b0d3a2ff58138a Mon Sep 17 00:00:00 2001 From: travior Date: Mon, 1 Jun 2026 11:18:23 +0200 Subject: [PATCH 07/16] revert fingerprint to legacy behavior + add tests --- dlt/dataset/dataset.py | 19 +++-- .../impl/bigquery/configuration.py | 14 +++- .../impl/clickhouse/configuration.py | 7 ++ .../impl/databricks/configuration.py | 13 ++- dlt/destinations/impl/dremio/configuration.py | 13 ++- .../impl/ducklake/configuration.py | 6 ++ .../impl/filesystem/configuration.py | 15 ++-- dlt/destinations/impl/lance/configuration.py | 4 + .../impl/lancedb/configuration.py | 7 ++ dlt/destinations/impl/mssql/configuration.py | 15 +++- .../impl/postgres/configuration.py | 15 +++- dlt/destinations/impl/qdrant/configuration.py | 13 ++- .../impl/redshift/configuration.py | 13 ++- .../impl/snowflake/configuration.py | 15 +++- 
.../impl/weaviate/configuration.py | 15 +++- .../test_destination_fingerprints.py | 24 ++++++ tests/destinations/test_join_compatibility.py | 83 +++---------------- .../test_athena_configuration.py | 27 ++++++ tests/load/bigquery/test_bigquery_client.py | 8 +- .../bigquery/test_bigquery_configuration.py | 46 ++++++++++ .../test_clickhouse_configuration.py | 38 ++++++--- .../test_clickhouse_table_builder.py | 14 +--- .../test_databricks_configuration.py | 22 ++++- .../load/dremio/test_dremio_configuration.py | 36 ++++++++ .../load/duckdb/test_duckdb_configuration.py | 36 ++++++++ tests/load/ducklake/test_ducklake_client.py | 15 +--- .../ducklake/test_ducklake_configuration.py | 54 ++++++++++++ .../load/fabric/test_fabric_configuration.py | 29 ++++++- .../load/filesystem/test_filesystem_client.py | 29 ++++--- tests/load/lance/test_lance_configuration.py | 26 +++++- tests/load/lancedb/test_config.py | 19 ++++- .../test_motherduck_configuration.py | 45 ++++++++++ tests/load/mssql/test_mssql_configuration.py | 33 +++++++- .../postgres/test_postgres_configuration.py | 36 ++++++++ .../load/qdrant/test_qdrant_configuration.py | 27 ++++++ .../redshift/test_redshift_configuration.py | 40 +++++++++ .../redshift/test_redshift_table_builder.py | 11 +-- .../snowflake/test_snowflake_configuration.py | 37 ++++++--- .../test_sqlalchemy_configuration.py | 26 +++++- .../synapse/test_synapse_configuration.py | 29 ++++++- .../weaviate/test_weaviate_configuration.py | 36 ++++++++ 41 files changed, 811 insertions(+), 199 deletions(-) create mode 100644 tests/destinations/test_destination_fingerprints.py create mode 100644 tests/load/bigquery/test_bigquery_configuration.py create mode 100644 tests/load/dremio/test_dremio_configuration.py create mode 100644 tests/load/duckdb/test_duckdb_configuration.py create mode 100644 tests/load/ducklake/test_ducklake_configuration.py create mode 100644 tests/load/motherduck/test_motherduck_configuration.py create mode 100644 
tests/load/postgres/test_postgres_configuration.py create mode 100644 tests/load/qdrant/test_qdrant_configuration.py create mode 100644 tests/load/redshift/test_redshift_configuration.py create mode 100644 tests/load/weaviate/test_weaviate_configuration.py diff --git a/dlt/dataset/dataset.py b/dlt/dataset/dataset.py index ad3f190a2a..607c917d32 100644 --- a/dlt/dataset/dataset.py +++ b/dlt/dataset/dataset.py @@ -219,12 +219,12 @@ def open_table_client(self) -> SupportsOpenTables: # TODO remove method; need to update `dlthub` to avoid conflict # this is only used by `dlt.hub.transformation` currently - def is_same_physical_location(self, other: dlt.Dataset) -> bool: + def is_same_physical_destination(self, other: dlt.Dataset) -> bool: """ - Returns true if the other dataset is on the same physical location + Returns true if the other dataset is on the same physical destination helpful if we want to run sql queries without extracting the data """ - return is_same_physical_location(self, other) + return is_same_physical_destination(self, other) def query( self, @@ -497,12 +497,13 @@ def get_dataset_sql_client(dataset: dlt.Dataset) -> SqlClientBase[Any]: raise SqlClientNotAvailable("dataset", dataset.dataset_name, client.config.destination_type) -def is_same_physical_location(dataset1: dlt.Dataset, dataset2: dlt.Dataset) -> bool: - """Check if tables from both datasets can be joined in a single query.""" - # NOTE: the name is historical -- this actually checks join compatibility via - # can_join_with(), which may return True even when the physical storage - # locations differ (e.g. filesystem destinations backed by different buckets). - return dataset1.destination_client.config.can_join_with(dataset2.destination_client.config) +def is_same_physical_destination(dataset1: dlt.Dataset, dataset2: dlt.Dataset) -> bool: + """Check if both datasets are at the same physical destination. + + This is done by comparing the fingerprint of both destination configs. 
There + are potential false positive if two different config give access to the same destination. + """ + return str(dataset1.destination_client.config) == str(dataset2.destination_client.config) def _get_dataset_schema_from_destination_using_schema_name( diff --git a/dlt/destinations/impl/bigquery/configuration.py b/dlt/destinations/impl/bigquery/configuration.py index efb02aae8b..c69d736b48 100644 --- a/dlt/destinations/impl/bigquery/configuration.py +++ b/dlt/destinations/impl/bigquery/configuration.py @@ -1,15 +1,17 @@ import dataclasses -from typing import ClassVar, List, Final, Optional, Union +from typing import ClassVar, List, Optional, Union from dlt.common.configuration import configspec from dlt.common.configuration.specs import GcpServiceAccountCredentials, GcpOAuthCredentials - from dlt.common.destination.client import DestinationClientDwhWithStagingConfiguration +from dlt.common.utils import digest128 @configspec class BigQueryClientConfiguration(DestinationClientDwhWithStagingConfiguration): - destination_type: Final[str] = dataclasses.field(default="bigquery", init=False, repr=False, compare=False) # type: ignore + destination_type: str = dataclasses.field( + default="bigquery", init=False, repr=False, compare=False + ) credentials: Union[GcpServiceAccountCredentials, GcpOAuthCredentials] = None location: str = "US" project_id: Optional[str] = None @@ -37,6 +39,12 @@ class BigQueryClientConfiguration(DestinationClientDwhWithStagingConfiguration): def get_location(self) -> str: return self.location + def fingerprint(self) -> str: + """Returns a fingerprint of the credentials project id.""" + if self.credentials and self.credentials.project_id: + return digest128(self.credentials.project_id) + return "" + def physical_location(self) -> str: """Returns configured project id, falling back to credentials.""" project_id = self.project_id diff --git a/dlt/destinations/impl/clickhouse/configuration.py b/dlt/destinations/impl/clickhouse/configuration.py 
index aac5f32bca..8d0e074448 100644 --- a/dlt/destinations/impl/clickhouse/configuration.py +++ b/dlt/destinations/impl/clickhouse/configuration.py @@ -8,6 +8,7 @@ from dlt.common.destination.client import ( DestinationClientDwhWithStagingConfiguration, ) +from dlt.common.utils import digest128 from dlt.destinations.impl.clickhouse.typing import TSecureConnection, TTableEngineType @@ -96,6 +97,12 @@ class ClickHouseClientConfiguration(DestinationClientDwhWithStagingConfiguration "table_engine_type", ] + def fingerprint(self) -> str: + """Returns a fingerprint of the configured host.""" + if self.credentials and self.credentials.host: + return digest128(self.credentials.host) + return "" + def physical_location(self) -> str: """Returns host:port.""" if self.credentials and self.credentials.host: diff --git a/dlt/destinations/impl/databricks/configuration.py b/dlt/destinations/impl/databricks/configuration.py index b8757d383c..79c7cab4c3 100644 --- a/dlt/destinations/impl/databricks/configuration.py +++ b/dlt/destinations/impl/databricks/configuration.py @@ -2,7 +2,7 @@ import dataclasses from copy import deepcopy -from typing import TYPE_CHECKING, Any, Callable, ClassVar, Dict, Final, List, Optional, Union, cast +from typing import TYPE_CHECKING, Any, Callable, ClassVar, Dict, List, Optional, Union, cast from urllib.parse import urlparse from dlt.common import logger @@ -14,6 +14,7 @@ from dlt.common.typing import TSecretStrValue from dlt.common.destination.client import DestinationClientDwhWithStagingConfiguration from dlt.common.configuration.exceptions import ConfigurationValueError +from dlt.common.utils import digest128 from dlt.destinations.impl.databricks.typing import TDatabricksInsertApi if TYPE_CHECKING: @@ -235,7 +236,9 @@ def _coerce_ipc_compression(ipc_compression: Union[str, IPCCompression]) -> IPCC @configspec class DatabricksClientConfiguration(DestinationClientDwhWithStagingConfiguration): - destination_type: Final[str] = 
dataclasses.field(default="databricks", init=False, repr=False, compare=False) # type: ignore[misc] + destination_type: str = dataclasses.field( + default="databricks", init=False, repr=False, compare=False + ) credentials: DatabricksCredentials = None staging_credentials_name: Optional[str] = None "If set, credentials with given name will be used in copy command" @@ -284,6 +287,12 @@ def on_resolved(self) -> None: " `destination.databricks.credentials.client_secret`." ) + def fingerprint(self) -> str: + """Returns a fingerprint of the server hostname.""" + if self.credentials and self.credentials.server_hostname: + return digest128(self.credentials.server_hostname) + return "" + def physical_location(self) -> str: """Returns the server hostname.""" if self.credentials and self.credentials.server_hostname: diff --git a/dlt/destinations/impl/dremio/configuration.py b/dlt/destinations/impl/dremio/configuration.py index 93292afbc6..0e2c403f70 100644 --- a/dlt/destinations/impl/dremio/configuration.py +++ b/dlt/destinations/impl/dremio/configuration.py @@ -1,10 +1,11 @@ import dataclasses -from typing import Final, Optional, Any, Dict, ClassVar, List +from typing import Optional, Any, Dict, ClassVar, List from dlt.common.configuration import configspec from dlt.common.configuration.specs import ConnectionStringCredentials from dlt.common.destination.client import DestinationClientDwhWithStagingConfiguration from dlt.common.typing import TSecretStrValue +from dlt.common.utils import digest128 @configspec(init=False) @@ -31,11 +32,19 @@ def db_kwargs(self) -> Dict[str, Any]: @configspec class DremioClientConfiguration(DestinationClientDwhWithStagingConfiguration): - destination_type: Final[str] = dataclasses.field(default="dremio", init=False, repr=False, compare=False) # type: ignore[misc] + destination_type: str = dataclasses.field( + default="dremio", init=False, repr=False, compare=False + ) credentials: DremioCredentials = None staging_data_source: str = None """The 
name of the staging data source""" + def fingerprint(self) -> str: + """Returns a fingerprint of the configured host.""" + if self.credentials and self.credentials.host: + return digest128(self.credentials.host) + return "" + def physical_location(self) -> str: """Returns host:port.""" if self.credentials and self.credentials.host: diff --git a/dlt/destinations/impl/ducklake/configuration.py b/dlt/destinations/impl/ducklake/configuration.py index ca23848a26..1f783d04c5 100644 --- a/dlt/destinations/impl/ducklake/configuration.py +++ b/dlt/destinations/impl/ducklake/configuration.py @@ -142,6 +142,12 @@ class DuckLakeClientConfiguration(WithLocalFiles, DestinationClientDwhWithStagin automatic_migration: bool = False """When true, attaches with `AUTOMATIC_MIGRATION true` so DuckDB migrates an older DuckLake catalog schema on attach.""" + def fingerprint(self) -> str: + """Returns a fingerprint of the underlying storage.""" + if not self.credentials or self.credentials.storage is None: + return "" + return self.credentials.storage.fingerprint() + def physical_location(self) -> str: """Returns credential-free catalog identity plus ducklake name.""" if not self.credentials or not self.credentials.catalog: diff --git a/dlt/destinations/impl/filesystem/configuration.py b/dlt/destinations/impl/filesystem/configuration.py index 8506684fe3..b61cbbfcb4 100644 --- a/dlt/destinations/impl/filesystem/configuration.py +++ b/dlt/destinations/impl/filesystem/configuration.py @@ -1,7 +1,7 @@ import dataclasses import os -from typing import Dict, Final, Optional, Type +from typing import Dict, Optional, Type from dlt.common.typing import DictStrAny, DictStrOptionalStr @@ -20,8 +20,10 @@ @configspec -class FilesystemDestinationClientConfiguration(FilesystemConfigurationWithLocalFiles, DestinationClientStagingConfiguration): # type: ignore[misc] - destination_type: Final[str] = dataclasses.field( # type: ignore[misc] +class FilesystemDestinationClientConfiguration( # type: ignore[misc] 
+ FilesystemConfigurationWithLocalFiles, DestinationClientStagingConfiguration +): + destination_type: str = dataclasses.field( default="filesystem", init=False, repr=False, compare=False ) current_datetime: Optional[TCurrentDateTime] = None @@ -58,13 +60,6 @@ def physical_location(self) -> str: url = urlparse(self.bucket_url) return f"{url.scheme}://{url.netloc}" - def fingerprint(self) -> str: - # Explicit override to resolve MRO ambiguity: without it, Python picks - # FilesystemConfiguration.fingerprint() (which hashes the raw bucket URL) - # over DestinationClientConfiguration.fingerprint() (which hashes - # physical_location()). Do not remove. - return DestinationClientStagingConfiguration.fingerprint(self) - def can_join_with(self, other: DestinationClientConfiguration) -> bool: """Returns True for any other filesystem destination. diff --git a/dlt/destinations/impl/lance/configuration.py b/dlt/destinations/impl/lance/configuration.py index 3b12a8d8d3..bd8b95e665 100644 --- a/dlt/destinations/impl/lance/configuration.py +++ b/dlt/destinations/impl/lance/configuration.py @@ -356,6 +356,10 @@ def make_namespace(self) -> "LanceNamespace": props.update(self.credentials.to_namespace_properties()) return connect(self.catalog_type, props) + def fingerprint(self) -> str: + """Returns a fingerprint of the configured storage.""" + return self.storage.fingerprint() if self.storage else "" + def physical_location(self) -> str: """Returns the resolved Lance catalog root.""" if ( diff --git a/dlt/destinations/impl/lancedb/configuration.py b/dlt/destinations/impl/lancedb/configuration.py index 711bfbc059..5ca152643e 100644 --- a/dlt/destinations/impl/lancedb/configuration.py +++ b/dlt/destinations/impl/lancedb/configuration.py @@ -16,6 +16,7 @@ from dlt.common.pendulum import timedelta from dlt.common.storages.configuration import FilesystemConfiguration, WithLocalFiles from dlt.common.typing import TSecretStrValue, Annotated +from dlt.common.utils import digest128 from 
dlt.destinations.impl.lancedb.warnings import uri_on_credentials_deprecated if TYPE_CHECKING: @@ -191,6 +192,12 @@ def on_resolved(self) -> None: # TODO: move uri back to credentials to make it more like other connections self.credentials.uri = self.lance_uri + def fingerprint(self) -> str: + """Returns a fingerprint of the LanceDB URI.""" + if self.lance_uri: + return digest128(self.lance_uri) + return "" + def physical_location(self) -> str: """Returns the resolved LanceDB URI, or "" for external native clients.""" if not self.lance_uri or self.lance_uri == ":external:": diff --git a/dlt/destinations/impl/mssql/configuration.py b/dlt/destinations/impl/mssql/configuration.py index c4c14d7072..e838523421 100644 --- a/dlt/destinations/impl/mssql/configuration.py +++ b/dlt/destinations/impl/mssql/configuration.py @@ -1,5 +1,5 @@ import dataclasses -from typing import Final, ClassVar, Any, List, Dict, Optional +from typing import ClassVar, Any, List, Dict, Optional from dlt.common.configuration import configspec from dlt.common.configuration.specs import ConnectionStringCredentials @@ -7,6 +7,7 @@ from dlt.common.exceptions import SystemConfigurationException from dlt.common.destination.client import DestinationClientDwhWithStagingConfiguration +from dlt.common.utils import digest128 def escape_mssql_odbc_value(value: Optional[str]) -> str: @@ -50,7 +51,7 @@ def build_odbc_dsn(params: Dict[str, Any]) -> str: @configspec(init=False) class MsSqlCredentials(ConnectionStringCredentials): - drivername: Final[str] = dataclasses.field(default="mssql", init=False, repr=False, compare=False) # type: ignore + drivername: str = dataclasses.field(default="mssql", init=False, repr=False, compare=False) database: str = None username: str = None password: TSecretStrValue = None @@ -128,12 +129,20 @@ def to_odbc_dsn(self) -> str: @configspec class MsSqlClientConfiguration(DestinationClientDwhWithStagingConfiguration): - destination_type: Final[str] = 
dataclasses.field(default="mssql", init=False, repr=False, compare=False) # type: ignore + destination_type: str = dataclasses.field( + default="mssql", init=False, repr=False, compare=False + ) credentials: MsSqlCredentials = None create_indexes: bool = False has_case_sensitive_identifiers: bool = False + def fingerprint(self) -> str: + """Returns a fingerprint of the configured host.""" + if self.credentials and self.credentials.host: + return digest128(self.credentials.host) + return "" + def physical_location(self) -> str: """Returns host:port.""" if self.credentials and self.credentials.host: diff --git a/dlt/destinations/impl/postgres/configuration.py b/dlt/destinations/impl/postgres/configuration.py index 94190a6aa7..384574d9d4 100644 --- a/dlt/destinations/impl/postgres/configuration.py +++ b/dlt/destinations/impl/postgres/configuration.py @@ -1,10 +1,11 @@ import dataclasses -from typing import Dict, Final, ClassVar, Any, List, Optional +from typing import Dict, ClassVar, Any, List, Optional from dlt.common.destination.configuration import CsvFormatConfiguration from dlt.common.configuration import configspec from dlt.common.configuration.specs import ConnectionStringCredentials from dlt.common.typing import TSecretStrValue +from dlt.common.utils import digest128 from dlt.common.destination.client import ( DestinationClientConfiguration, @@ -14,7 +15,7 @@ @configspec(init=False) class PostgresCredentials(ConnectionStringCredentials): - drivername: Final[str] = dataclasses.field(default="postgresql", init=False, repr=False, compare=False) # type: ignore + drivername: str = dataclasses.field(default="postgresql", init=False, repr=False, compare=False) database: str = None username: str = None password: TSecretStrValue = None @@ -40,7 +41,9 @@ def get_query(self) -> Dict[str, Any]: @configspec class PostgresClientConfiguration(DestinationClientDwhWithStagingConfiguration): - destination_type: Final[str] = dataclasses.field(default="postgres", init=False, 
repr=False, compare=False) # type: ignore + destination_type: str = dataclasses.field( + default="postgres", init=False, repr=False, compare=False + ) credentials: PostgresCredentials = None create_indexes: bool = True @@ -48,6 +51,12 @@ class PostgresClientConfiguration(DestinationClientDwhWithStagingConfiguration): csv_format: Optional[CsvFormatConfiguration] = None """Optional csv format configuration""" + def fingerprint(self) -> str: + """Returns a fingerprint of the configured host.""" + if self.credentials and self.credentials.host: + return digest128(self.credentials.host) + return "" + def physical_location(self) -> str: """Returns host:port as the physical location identifier.""" if self.credentials and self.credentials.host: diff --git a/dlt/destinations/impl/qdrant/configuration.py b/dlt/destinations/impl/qdrant/configuration.py index 0795d1c4f2..06ebb1122e 100644 --- a/dlt/destinations/impl/qdrant/configuration.py +++ b/dlt/destinations/impl/qdrant/configuration.py @@ -1,6 +1,6 @@ import os import dataclasses -from typing import Optional, Final, Any +from typing import Optional, Any from typing_extensions import Annotated, TYPE_CHECKING from dlt.common.configuration import configspec, NotResolved @@ -13,6 +13,7 @@ DestinationClientDwhConfiguration, ) from dlt.common.storages.configuration import WithLocalFiles +from dlt.common.utils import digest128 from dlt.destinations.impl.qdrant.exceptions import InvalidInMemoryQdrantCredentials from dlt.destinations.impl.qdrant.warnings import location_on_credentials_deprecated @@ -72,7 +73,9 @@ class QdrantClientOptions(BaseConfiguration): @configspec class QdrantClientConfiguration(WithLocalFiles, DestinationClientDwhConfiguration): - destination_type: Final[str] = dataclasses.field(default="qdrant", init=False, repr=False, compare=False) # type: ignore + destination_type: str = dataclasses.field( + default="qdrant", init=False, repr=False, compare=False + ) credentials: QdrantCredentials = None "Qdrant 
connection credentials" qd_location: Optional[str] = None @@ -148,6 +151,12 @@ def on_resolved(self) -> None: if self.qd_path and not os.path.isabs(self.qd_path): self.qd_path = self.make_location(self.qd_path, "%s.qdrant") + def fingerprint(self) -> str: + """Returns a fingerprint of the connection location.""" + if self.qd_location: + return digest128(self.qd_location) + return "" + def physical_location(self) -> str: """Returns the Qdrant connection location.""" return self.qd_location or "" diff --git a/dlt/destinations/impl/redshift/configuration.py b/dlt/destinations/impl/redshift/configuration.py index d2be73b54f..a756a7cff2 100644 --- a/dlt/destinations/impl/redshift/configuration.py +++ b/dlt/destinations/impl/redshift/configuration.py @@ -1,8 +1,9 @@ import dataclasses -from typing import Final, Optional +from typing import Optional from dlt.common.typing import TSecretStrValue from dlt.common.configuration import configspec +from dlt.common.utils import digest128 from dlt.destinations.impl.postgres.configuration import ( PostgresCredentials, @@ -21,12 +22,20 @@ class RedshiftCredentials(PostgresCredentials): @configspec class RedshiftClientConfiguration(PostgresClientConfiguration): - destination_type: Final[str] = dataclasses.field(default="redshift", init=False, repr=False, compare=False) # type: ignore + destination_type: str = dataclasses.field( + default="redshift", init=False, repr=False, compare=False + ) credentials: RedshiftCredentials = None staging_iam_role: Optional[str] = None has_case_sensitive_identifiers: bool = False + def fingerprint(self) -> str: + """Returns a fingerprint of the configured host.""" + if self.credentials and self.credentials.host: + return digest128(self.credentials.host) + return "" + def physical_location(self) -> str: """Returns host:port.""" if self.credentials and self.credentials.host: diff --git a/dlt/destinations/impl/snowflake/configuration.py b/dlt/destinations/impl/snowflake/configuration.py index 
d5c21dd34d..704f5589a5 100644 --- a/dlt/destinations/impl/snowflake/configuration.py +++ b/dlt/destinations/impl/snowflake/configuration.py @@ -1,7 +1,7 @@ import dataclasses import os from pathlib import Path -from typing import Final, Optional, Any, Dict, ClassVar, List +from typing import Optional, Any, Dict, ClassVar, List from dlt.common.destination.configuration import CsvFormatConfiguration from dlt.common.libs.cryptography import decode_private_key @@ -10,6 +10,7 @@ from dlt.common.configuration.exceptions import ConfigurationValueError from dlt.common.configuration import configspec from dlt.common.destination.client import DestinationClientDwhWithStagingConfiguration +from dlt.common.utils import digest128 from dlt.destinations.impl.snowflake.utils import ( read_snowflake_session_token, snowflake_session_token_available, @@ -21,7 +22,7 @@ @configspec(init=False) class SnowflakeCredentialsWithoutDefaults(ConnectionStringCredentials): - drivername: Final[str] = dataclasses.field(default="snowflake", init=False, repr=False, compare=False) # type: ignore[misc] + drivername: str = dataclasses.field(default="snowflake", init=False, repr=False, compare=False) database: str = None host: str = None """Snowflake account identifier, e.g. 
`kgiotue-wn98412`""" @@ -152,7 +153,9 @@ def _ensure_fresh_token(self) -> None: @configspec class SnowflakeClientConfiguration(DestinationClientDwhWithStagingConfiguration): - destination_type: Final[str] = dataclasses.field(default="snowflake", init=False, repr=False, compare=False) # type: ignore[misc] + destination_type: str = dataclasses.field( + default="snowflake", init=False, repr=False, compare=False + ) credentials: SnowflakeCredentials = None stage_name: Optional[str] = None @@ -178,6 +181,12 @@ class SnowflakeClientConfiguration(DestinationClientDwhWithStagingConfiguration) use_decfloat: bool = False """Whether to use DECFLOAT type for unbound decimals instead of DECIMAL""" + def fingerprint(self) -> str: + """Returns a fingerprint of the account host.""" + if self.credentials and self.credentials.host: + return digest128(self.credentials.host) + return "" + def physical_location(self) -> str: """Returns the account host.""" if self.credentials and self.credentials.host: diff --git a/dlt/destinations/impl/weaviate/configuration.py b/dlt/destinations/impl/weaviate/configuration.py index 397637839b..57c72eaad9 100644 --- a/dlt/destinations/impl/weaviate/configuration.py +++ b/dlt/destinations/impl/weaviate/configuration.py @@ -1,5 +1,5 @@ import dataclasses -from typing import Dict, Literal, Optional, Final +from typing import Dict, Literal, Optional from typing_extensions import Annotated from urllib.parse import urlparse @@ -9,6 +9,7 @@ DestinationClientConfiguration, DestinationClientDwhConfiguration, ) +from dlt.common.utils import digest128 TWeaviateBatchConsistency = Literal["ONE", "QUORUM", "ALL"] TWeaviateConnectionType = Literal["cloud", "local", "custom"] @@ -31,7 +32,9 @@ def __str__(self) -> str: @configspec class WeaviateClientConfiguration(DestinationClientDwhConfiguration): - destination_type: Final[str] = dataclasses.field(default="weaviate", init=False, repr=False, compare=False) # type: ignore + destination_type: str = dataclasses.field( 
+ default="weaviate", init=False, repr=False, compare=False + ) # make it optional so empty dataset is allowed dataset_name: Annotated[Optional[str], NotResolved()] = dataclasses.field( default=None, init=False, repr=False, compare=False @@ -64,6 +67,14 @@ class WeaviateClientConfiguration(DestinationClientDwhConfiguration): } ) + def fingerprint(self) -> str: + """Returns a fingerprint of the connection host.""" + if self.credentials and self.credentials.url: + hostname = urlparse(self.credentials.url).hostname + if hostname: + return digest128(hostname) + return "" + def physical_location(self) -> str: """Returns the host part of the connection URL.""" if self.credentials and self.credentials.url: diff --git a/tests/destinations/test_destination_fingerprints.py b/tests/destinations/test_destination_fingerprints.py new file mode 100644 index 0000000000..7acb017027 --- /dev/null +++ b/tests/destinations/test_destination_fingerprints.py @@ -0,0 +1,24 @@ +from dlt.common.destination.client import DestinationClientConfiguration +from dlt.common.utils import digest128 + + +class _PhysicalDestinationConfig(DestinationClientConfiguration): + def __init__(self, physical_location: str) -> None: + super().__init__() + self._physical_location = physical_location + + def physical_location(self) -> str: + return self._physical_location + + +def test_base_fingerprint_hashes_non_empty_physical_location() -> None: + config = _PhysicalDestinationConfig("test-host:5432") + + assert config.fingerprint() == digest128("test-host:5432") + + +def test_base_fingerprint_returns_empty_string_without_physical_location() -> None: + config = DestinationClientConfiguration() + + assert config.physical_location() == "" + assert config.fingerprint() == "" diff --git a/tests/destinations/test_join_compatibility.py b/tests/destinations/test_join_compatibility.py index 040416d099..68b635982c 100644 --- a/tests/destinations/test_join_compatibility.py +++ 
b/tests/destinations/test_join_compatibility.py @@ -6,14 +6,14 @@ import pytest -from dlt.common.utils import digest128 from dlt.common.configuration.specs import ( AwsCredentials, ConnectionStringCredentials, GcpServiceAccountCredentials, ) from dlt.common.destination.client import DestinationClientConfiguration -from dlt.dataset.dataset import Dataset, is_same_physical_location +from dlt.common.storages import FilesystemConfigurationWithLocalFiles +from dlt.dataset.dataset import Dataset, is_same_physical_destination from dlt.destinations.impl.postgres.configuration import ( PostgresClientConfiguration, PostgresCredentials, @@ -152,11 +152,18 @@ def _athena_config(region: str, catalog: str = "awsdatacatalog") -> AthenaClient ) -def _ducklake_creds(catalog_str: str, name: str = DEFAULT_DUCKLAKE_NAME) -> DuckLakeCredentials: +def _ducklake_creds( + catalog_str: str, + name: str = DEFAULT_DUCKLAKE_NAME, + storage_url: Optional[str] = None, +) -> DuckLakeCredentials: """Build DuckLake credentials.""" return DuckLakeCredentials( ducklake_name=name, catalog=ConnectionStringCredentials(catalog_str), + storage=( + FilesystemConfigurationWithLocalFiles(bucket_url=storage_url) if storage_url else None + ), ) @@ -212,18 +219,7 @@ def _lance_config(catalog_root: str, dataset_name: str = "dataset") -> LanceClie return c -# Base DestinationClientConfiguration contract -def test_base_fingerprint_derived_from_physical_location() -> None: - config = _PhysicalDestinationConfig("test-host:5432") - assert config.fingerprint() == digest128("test-host:5432") - - -def test_base_fingerprint_empty_when_physical_location_empty() -> None: - config = DestinationClientConfiguration() - assert config.physical_location() == "" - assert config.fingerprint() == "" - - +# Base DestinationClientConfiguration join contract def test_base_can_join_with_default_false_when_physical_locations_differ() -> None: config1 = _PhysicalDestinationConfig("host1") config2 = _PhysicalDestinationConfig("host2") @@ 
-253,7 +249,7 @@ def test_is_same_physical_location_delegates_to_can_join_with() -> None: config1 = _StringyPhysicalDestinationConfig("host1", "first-display") config2 = _StringyPhysicalDestinationConfig("host1", "second-display") assert str(config1) != str(config2) - assert is_same_physical_location( + assert is_same_physical_destination( cast(Dataset, _DatasetStub(config1)), cast(Dataset, _DatasetStub(config2)) ) @@ -440,46 +436,6 @@ def test_physical_location(factory: ConfigFactory, expected: str) -> None: assert factory().physical_location() == expected -@pytest.mark.parametrize( - "factory,expected_fp", - [ - pytest.param( - lambda: PostgresClientConfiguration(credentials=PostgresCredentials("postgresql://h")), - digest128("h:5432"), - id="pg", - ), - pytest.param( - lambda: SnowflakeClientConfiguration( - credentials=SnowflakeCredentials("snowflake://u:p@h/db") - ), - digest128("h"), - id="sf", - ), - pytest.param( - lambda: BigQueryClientConfiguration( - credentials=GcpServiceAccountCredentials(project_id="p") - ), - digest128("p"), - id="bq", - ), - pytest.param( - lambda: FilesystemDestinationClientConfiguration(bucket_url="s3://b/p"), - digest128("s3://b"), - id="fs", - ), - pytest.param( - lambda: MotherDuckClientConfiguration( - credentials=MotherDuckCredentials("md:db?motherduck_token=token") - ), - digest128("token"), - id="md_token_hash", - ), - ], -) -def test_fingerprint(factory: ConfigFactory, expected_fp: str) -> None: - assert factory().fingerprint() == expected_fp - - # can_join_with() matrices (symmetric) MSSQL_JOIN_CASES = [ @@ -861,12 +817,6 @@ def test_filesystem_cannot_join_with_non_filesystem() -> None: assert_not_joinable(c, other) -def test_filesystem_fingerprint_empty_for_local() -> None: - c = FilesystemDestinationClientConfiguration(bucket_url="/local/p") - assert c.physical_location() == "" - assert c.fingerprint() == "" - - # MotherDuck token-based joinability @@ -877,13 +827,6 @@ def 
test_motherduck_token_not_exposed_as_physical_location() -> None: assert md.physical_location() == "" -def test_motherduck_fingerprint_hashes_token() -> None: - md = MotherDuckClientConfiguration( - credentials=MotherDuckCredentials("md:db?motherduck_token=token") - ) - assert md.fingerprint() == digest128("token") - - def test_motherduck_can_join_with_same_token_without_exposing_location() -> None: """Same token can join without exposing token via physical location.""" c1 = MotherDuckClientConfiguration( @@ -1034,7 +977,6 @@ def test_weaviate_physical_location_but_not_joinable() -> None: credentials=WeaviateCredentials(url="https://cluster.weaviate.cloud") ) assert c1.physical_location() == "cluster.weaviate.cloud" - assert c1.fingerprint() == digest128("cluster.weaviate.cloud") assert_not_joinable(c1, c2) @@ -1042,5 +984,4 @@ def test_qdrant_physical_location_but_not_joinable() -> None: c1 = QdrantClientConfiguration(qd_location="https://cluster.qdrant.io") c2 = QdrantClientConfiguration(qd_location="https://cluster.qdrant.io") assert c1.physical_location() == "https://cluster.qdrant.io" - assert c1.fingerprint() == digest128("https://cluster.qdrant.io") assert_not_joinable(c1, c2) diff --git a/tests/load/athena_iceberg/test_athena_configuration.py b/tests/load/athena_iceberg/test_athena_configuration.py index 16253c2fac..4a7bc38a19 100644 --- a/tests/load/athena_iceberg/test_athena_configuration.py +++ b/tests/load/athena_iceberg/test_athena_configuration.py @@ -1,6 +1,10 @@ from typing import cast +import pytest + +from dlt.common.configuration.specs import AwsCredentials from dlt.common.typing import StrAny +from dlt.common.utils import digest128 from dlt.destinations.impl.athena.configuration import ( AthenaClientConfiguration, DEFAULT_AWS_DATA_CATALOG, @@ -9,6 +13,29 @@ from tests.load.utils import S3_TABLES_CATALOG, cm_yield_client +@pytest.mark.parametrize( + "config,expected_fingerprint", + [ + pytest.param(AthenaClientConfiguration(), "", id="empty"), + 
pytest.param( + AthenaClientConfiguration(credentials=AwsCredentials(region_name="us-west-2")), + digest128(f"us-west-2/{DEFAULT_AWS_DATA_CATALOG}"), + id="default_catalog", + ), + pytest.param( + AthenaClientConfiguration( + credentials=AwsCredentials(region_name="us-west-2"), + aws_data_catalog="custom_catalog", + ), + digest128("us-west-2/custom_catalog"), + id="custom_catalog", + ), + ], +) +def test_athena_fingerprint(config: AthenaClientConfiguration, expected_fingerprint: str) -> None: + assert config.fingerprint() == expected_fingerprint + + def test_s3_tables_naming_convention_setting() -> None: # naming convention should be adjusted to `s3_tables` when using S3 Tables Catalog config = {"aws_data_catalog": S3_TABLES_CATALOG} diff --git a/tests/load/bigquery/test_bigquery_client.py b/tests/load/bigquery/test_bigquery_client.py index 4cb460268e..fff23f3995 100644 --- a/tests/load/bigquery/test_bigquery_client.py +++ b/tests/load/bigquery/test_bigquery_client.py @@ -19,7 +19,7 @@ from dlt.common.configuration.specs.exceptions import InvalidGoogleNativeCredentialsType from dlt.common.schema.utils import new_table from dlt.common.storages import FileStorage -from dlt.common.utils import digest128, uniq_id, custom_environ +from dlt.common.utils import custom_environ, uniq_id from dlt.common.destination.client import RunnableLoadJob from dlt.destinations.impl.bigquery.bigquery import ( BigQueryClient, @@ -243,7 +243,6 @@ def test_bigquery_configuration() -> None: assert config.http_timeout == 15.0 assert config.retry_deadline == 60.0 assert config.file_upload_timeout == 1800.0 - assert config.fingerprint() == digest128("chat-analytics-rasa-ci") assert config.ignore_unknown_values is False # credential location is deprecated @@ -269,11 +268,6 @@ def test_bigquery_configuration() -> None: ) assert config.file_upload_timeout == 20000.0 - # default fingerprint is empty - assert ( - BigQueryClientConfiguration()._bind_dataset_name(dataset_name="dataset").fingerprint() 
== "" - ) - def test_bigquery_different_project_id(bigquery_project_id) -> None: """Test scenario when bigquery project_id different from gcp credentials project_id.""" diff --git a/tests/load/bigquery/test_bigquery_configuration.py b/tests/load/bigquery/test_bigquery_configuration.py new file mode 100644 index 0000000000..2526b7a9f2 --- /dev/null +++ b/tests/load/bigquery/test_bigquery_configuration.py @@ -0,0 +1,46 @@ +from typing import Optional + +import pytest + +from dlt.common.configuration.specs import GcpServiceAccountCredentials +from dlt.common.utils import digest128 +from dlt.destinations.impl.bigquery.bigquery import BigQueryClientConfiguration + +# mark all tests as essential, do not remove +pytestmark = pytest.mark.essential + + +def _credentials(project_id: str) -> GcpServiceAccountCredentials: + credentials = GcpServiceAccountCredentials() + credentials.project_id = project_id + return credentials + + +@pytest.mark.parametrize( + "credentials_project_id,expected_fingerprint", + [ + pytest.param(None, "", id="empty"), + pytest.param( + "credentials-project", + digest128("credentials-project"), + id="legacy_credentials_project_id", + ), + ], +) +def test_bigquery_fingerprint( + credentials_project_id: Optional[str], expected_fingerprint: str +) -> None: + credentials = _credentials(credentials_project_id) if credentials_project_id else None + config = BigQueryClientConfiguration(credentials=credentials) + + assert config.fingerprint() == expected_fingerprint + + +def test_bigquery_fingerprint_uses_credentials_project_id_not_config_project_id() -> None: + config = BigQueryClientConfiguration( + credentials=_credentials("credentials-project"), + project_id="configured-project", + ) + + assert config.physical_location() == "configured-project" + assert config.fingerprint() == digest128("credentials-project") diff --git a/tests/load/clickhouse/test_clickhouse_configuration.py b/tests/load/clickhouse/test_clickhouse_configuration.py index 
2b058a7598..dac2914bd1 100644 --- a/tests/load/clickhouse/test_clickhouse_configuration.py +++ b/tests/load/clickhouse/test_clickhouse_configuration.py @@ -76,17 +76,33 @@ def test_clickhouse_factory_select_sequential_consistency() -> None: assert "select_sequential_consistency" not in dest.config_params -def test_clickhouse_configuration() -> None: - # def empty fingerprint - assert ClickHouseClientConfiguration().fingerprint() == "" - # based on host - config = resolve_configuration( - ClickHouseCredentials(), - explicit_value="clickhouse://user1:pass1@host1:9000/db1", - ) - assert ClickHouseClientConfiguration(credentials=config).fingerprint() == digest128( - "host1:9000" - ) +@pytest.mark.parametrize( + "connection_string,expected_fingerprint", + [ + pytest.param("", "", id="empty"), + pytest.param( + "clickhouse://user1:pass1@host1:9000/db1", + digest128("host1"), + id="legacy_host_only_custom_port", + ), + pytest.param( + "clickhouse://user1:pass1@host1:9440/db1", + digest128("host1"), + id="legacy_host_only_default_port", + ), + ], +) +def test_clickhouse_fingerprint(connection_string: str, expected_fingerprint: str) -> None: + if connection_string: + credentials = resolve_configuration( + ClickHouseCredentials(), + explicit_value=connection_string, + ) + config = ClickHouseClientConfiguration(credentials=credentials) + else: + config = ClickHouseClientConfiguration() + + assert config.fingerprint() == expected_fingerprint def test_clickhouse_connection_settings(client: ClickHouseClient) -> None: diff --git a/tests/load/clickhouse/test_clickhouse_table_builder.py b/tests/load/clickhouse/test_clickhouse_table_builder.py index 04152bedb5..3a5cc85b30 100644 --- a/tests/load/clickhouse/test_clickhouse_table_builder.py +++ b/tests/load/clickhouse/test_clickhouse_table_builder.py @@ -4,12 +4,11 @@ import pytest from dlt.common.configuration import resolve_configuration -from dlt.common.utils import custom_environ, digest128 -from dlt.common.utils import uniq_id 
+from dlt.common.utils import custom_environ, uniq_id from dlt.destinations.impl.clickhouse.clickhouse import ClickHouseClient, ClickHouseMergeJob from dlt.destinations.impl.clickhouse.configuration import ( - ClickHouseCredentials, ClickHouseClientConfiguration, + ClickHouseCredentials, ) from dlt.common.schema.utils import new_table, pipeline_state_table from tests.load.clickhouse.utils import clickhouse_client @@ -33,15 +32,6 @@ def test_clickhouse_configuration() -> None: assert C.database == "mydb" assert C.password == "fuss_do_rah" - # Check fingerprint. - assert ClickHouseClientConfiguration().fingerprint() == "" - # Based on host. - c = resolve_configuration( - ClickHouseCredentials(), - explicit_value="clickhouse://user1:pass@host1/db1", - ) - assert ClickHouseClientConfiguration(credentials=c).fingerprint() == digest128("host1:9440") - def test_clickhouse_create_table(clickhouse_client: ClickHouseClient) -> None: statements = clickhouse_client._get_table_update_sql("event_test_table", TABLE_UPDATE, False) diff --git a/tests/load/databricks/test_databricks_configuration.py b/tests/load/databricks/test_databricks_configuration.py index 7413baccf2..a54b19ad9f 100644 --- a/tests/load/databricks/test_databricks_configuration.py +++ b/tests/load/databricks/test_databricks_configuration.py @@ -364,6 +364,25 @@ def sdk_authenticator(): assert creds.access_token is sdk_authenticator +@pytest.mark.parametrize( + "credentials,expected_fingerprint", + [ + pytest.param(None, "", id="empty"), + pytest.param( + DatabricksCredentials(server_hostname="workspace.cloud.databricks.com"), + digest128("workspace.cloud.databricks.com"), + id="legacy_server_hostname", + ), + ], +) +def test_databricks_fingerprint( + credentials: Optional[DatabricksCredentials], expected_fingerprint: str +) -> None: + config = DatabricksClientConfiguration(credentials=credentials) + + assert config.fingerprint() == expected_fingerprint + + @pytest.mark.parametrize("auth_type", ("pat", "oauth2")) 
def test_default_credentials(auth_type: str) -> None: # create minimal default env @@ -398,9 +417,6 @@ def test_default_credentials(auth_type: str) -> None: with bricks.client(Schema("schema"), config) as client: assert not client.is_storage_initialized() - # check fingerprint not default - assert config.fingerprint() != digest128("") - def test_oauth2_credentials() -> None: dlt.secrets["destination.databricks.credentials.access_token"] = "" diff --git a/tests/load/dremio/test_dremio_configuration.py b/tests/load/dremio/test_dremio_configuration.py new file mode 100644 index 0000000000..efe0062e24 --- /dev/null +++ b/tests/load/dremio/test_dremio_configuration.py @@ -0,0 +1,36 @@ +import pytest + +from dlt.common.utils import digest128 +from dlt.destinations.impl.dremio.configuration import ( + DremioClientConfiguration, + DremioCredentials, +) + +# mark all tests as essential, do not remove +pytestmark = pytest.mark.essential + + +@pytest.mark.parametrize( + "connection_string,expected_fingerprint", + [ + pytest.param("", "", id="empty"), + pytest.param( + "grpc://user1:pass1@host1:32010/db1", + digest128("host1"), + id="legacy_host_only_default_port", + ), + pytest.param( + "grpc://user1:pass1@host1:32011/db1", + digest128("host1"), + id="legacy_host_only_custom_port", + ), + ], +) +def test_dremio_fingerprint(connection_string: str, expected_fingerprint: str) -> None: + if connection_string: + credentials = DremioCredentials(connection_string) + config = DremioClientConfiguration(credentials=credentials) + else: + config = DremioClientConfiguration() + + assert config.fingerprint() == expected_fingerprint diff --git a/tests/load/duckdb/test_duckdb_configuration.py b/tests/load/duckdb/test_duckdb_configuration.py new file mode 100644 index 0000000000..e7d1981cca --- /dev/null +++ b/tests/load/duckdb/test_duckdb_configuration.py @@ -0,0 +1,36 @@ +from typing import Optional + +import pytest + +from dlt.common.utils import digest128 +from 
dlt.destinations.impl.duckdb.configuration import ( + DuckDbClientConfiguration, + DuckDbCredentials, +) + +# mark all tests as essential, do not remove +pytestmark = pytest.mark.essential + + +@pytest.mark.parametrize( + "credentials,expected_fingerprint", + [ + pytest.param(None, "", id="empty"), + pytest.param( + DuckDbCredentials(":memory:"), + digest128(":memory:"), + id="memory_database", + ), + pytest.param( + DuckDbCredentials("local.duckdb"), + digest128("local.duckdb"), + id="database_path", + ), + ], +) +def test_duckdb_fingerprint( + credentials: Optional[DuckDbCredentials], expected_fingerprint: str +) -> None: + config = DuckDbClientConfiguration(credentials=credentials) + + assert config.fingerprint() == expected_fingerprint diff --git a/tests/load/ducklake/test_ducklake_client.py b/tests/load/ducklake/test_ducklake_client.py index 231f8e637a..0d0852990c 100644 --- a/tests/load/ducklake/test_ducklake_client.py +++ b/tests/load/ducklake/test_ducklake_client.py @@ -8,8 +8,6 @@ from dlt.common.configuration.exceptions import ConfigFieldMissingException, ConfigurationValueError from dlt.common.configuration.resolve import resolve_configuration from dlt.common.configuration.specs.connection_string_credentials import ConnectionStringCredentials -from dlt.common.utils import digest128 -from dlt.destinations.impl.ducklake.sql_client import DuckLakeSqlClient from dlt.destinations.impl.ducklake.configuration import ( DuckLakeCredentials, DuckLakeClientConfiguration, @@ -102,10 +100,8 @@ def test_ducklake_configuration_default() -> None: assert credentials.storage_url == str(local_dir / "ducklake.files") # file url assert credentials.storage.bucket_url.startswith("file://") - # fingerprint derived from catalog identity + ducklake name expected_loc = f"sqlite://{local_dir / 'ducklake.sqlite'}#{DEFAULT_DUCKLAKE_NAME}" assert configuration.physical_location() == expected_loc - assert configuration.fingerprint() == digest128(expected_loc) def 
test_ducklake_configuration_duckdb_catalog() -> None: @@ -125,7 +121,6 @@ def test_ducklake_configuration_duckdb_catalog() -> None: assert conn_str.endswith(str(local_dir / "ducklake.duckdb")) expected_loc = f"duckdb://{local_dir / 'ducklake.duckdb'}#{DEFAULT_DUCKLAKE_NAME}" assert configuration.physical_location() == expected_loc - assert configuration.fingerprint() == digest128(expected_loc) def test_ducklake_configuration_ducklake_name() -> None: @@ -142,10 +137,8 @@ def test_ducklake_configuration_ducklake_name() -> None: conn_str = credentials.catalog.to_native_representation() assert conn_str.endswith(str(local_dir / "my_ducklake.sqlite")) assert credentials.storage_url == str(local_dir / "my_ducklake.files") - # fingerprint derived from catalog identity + ducklake name expected_loc = f"sqlite://{local_dir / 'my_ducklake.sqlite'}#my_ducklake" assert configuration.physical_location() == expected_loc - assert configuration.fingerprint() == digest128(expected_loc) def test_ducklake_configuration_destination_name() -> None: @@ -162,10 +155,8 @@ def test_ducklake_configuration_destination_name() -> None: conn_str = credentials.catalog.to_native_representation() assert conn_str.endswith(str(local_dir / "ducklake.sqlite")) assert credentials.storage_url == str(local_dir / "ducklake.files") - # fingerprint derived from catalog identity + ducklake name expected_loc = f"sqlite://{local_dir / 'ducklake.sqlite'}#{DEFAULT_DUCKLAKE_NAME}" assert configuration.physical_location() == expected_loc - assert configuration.fingerprint() == digest128(expected_loc) def test_ducklake_configuration_pipeline_name() -> None: @@ -210,11 +201,7 @@ def test_ducklake_configuration_storage_credentials() -> None: ) # NOTE: dataset folders will be created in /lake/ assert credentials.storage_url == "s3://dlt-ci-test-bucket/lake" - # fingerprint derived from remote catalog identity + ducklake name - assert ( - configuration.physical_location() == 
"postgresql://localhost:5432/dlt_data#my_ducklake" - ) - assert configuration.fingerprint() == digest128(configuration.physical_location()) + assert configuration.physical_location() == "postgresql://localhost:5432/dlt_data#my_ducklake" def test_ducklake_configuration_catalog_credentials() -> None: diff --git a/tests/load/ducklake/test_ducklake_configuration.py b/tests/load/ducklake/test_ducklake_configuration.py new file mode 100644 index 0000000000..ae69ef4c99 --- /dev/null +++ b/tests/load/ducklake/test_ducklake_configuration.py @@ -0,0 +1,54 @@ +from typing import Optional + +import pytest + +from dlt.common.utils import digest128 +from dlt.destinations.impl.ducklake.configuration import ( + DuckLakeClientConfiguration, + DuckLakeCredentials, +) + +# mark all tests as essential, do not remove +pytestmark = pytest.mark.essential + + +@pytest.mark.parametrize( + "credentials,expected_fingerprint", + [ + pytest.param(None, "", id="empty"), + pytest.param( + DuckLakeCredentials(storage="ducklake.files"), + digest128(""), + id="storage_local", + ), + pytest.param( + DuckLakeCredentials( + "my_ducklake", + catalog="postgresql://loader:loader@localhost:5432/dlt_data", + storage="s3://dlt-ci-test-bucket/lake", + ), + digest128("s3://dlt-ci-test-bucket"), + id="storage_remote_bucket_only", + ), + ], +) +def test_ducklake_fingerprint( + credentials: Optional[DuckLakeCredentials], expected_fingerprint: str +) -> None: + config = DuckLakeClientConfiguration(credentials=credentials) + + assert config.fingerprint() == expected_fingerprint + + +def test_ducklake_fingerprint_uses_storage_not_physical_location() -> None: + config = DuckLakeClientConfiguration( + credentials=DuckLakeCredentials( + "my_ducklake", + catalog="postgresql://loader:loader@localhost:5432/dlt_data", + storage="s3://dlt-ci-test-bucket/lake", + ) + ) + + assert config.physical_location() == "postgresql://localhost:5432/dlt_data#my_ducklake" + assert config.fingerprint() == 
digest128("s3://dlt-ci-test-bucket") + assert config.fingerprint() != digest128(config.physical_location()) diff --git a/tests/load/fabric/test_fabric_configuration.py b/tests/load/fabric/test_fabric_configuration.py index 329d72c5ac..e69c71bde7 100644 --- a/tests/load/fabric/test_fabric_configuration.py +++ b/tests/load/fabric/test_fabric_configuration.py @@ -1,10 +1,13 @@ """Tests for Microsoft Fabric Warehouse destination configuration""" + import os +from typing import Optional + import pytest from dlt.common.configuration import resolve_configuration from dlt.common.schema import Schema - +from dlt.common.utils import digest128 from dlt.destinations.impl.fabric.factory import fabric from dlt.destinations.impl.fabric.configuration import ( FabricCredentials, @@ -80,6 +83,30 @@ def test_fabric_configuration_defaults() -> None: assert config.destination_type == "fabric" +@pytest.mark.parametrize( + "host,port,expected_fingerprint", + [ + pytest.param(None, None, "", id="empty"), + pytest.param("host1", 1433, digest128("host1:1433"), id="host_default_port"), + pytest.param("host1", 1444, digest128("host1:1444"), id="host_custom_port"), + ], +) +def test_fabric_fingerprint( + host: Optional[str], port: Optional[int], expected_fingerprint: str +) -> None: + if host: + credentials = FabricCredentials() + credentials.host = host + if port is not None: + credentials.port = port + else: + credentials = None + + config = FabricClientConfiguration(credentials=credentials) + + assert config.fingerprint() == expected_fingerprint + + def test_fabric_configuration_custom_collation() -> None: """Test Fabric configuration with custom collation""" config = FabricClientConfiguration() diff --git a/tests/load/filesystem/test_filesystem_client.py b/tests/load/filesystem/test_filesystem_client.py index 41dd779c8d..d30cd060ea 100644 --- a/tests/load/filesystem/test_filesystem_client.py +++ b/tests/load/filesystem/test_filesystem_client.py @@ -1,4 +1,4 @@ -from typing import List, 
Union, Dict +from typing import Dict, List, Optional, Union import posixpath import os import json @@ -76,17 +76,24 @@ def _client_factory(fs: filesystem) -> FilesystemClient: @pytest.mark.parametrize( - "url, exp", - ( - (None, ""), - ("/path/path2", ""), - ("file:///home/ducklake.d", digest128("file://")), - ("s3://cool", digest128("s3://cool")), - ("s3://cool.domain/path/path2", digest128("s3://cool.domain")), - ), + "bucket_url,expected_fingerprint", + [ + pytest.param(None, "", id="empty"), + pytest.param("/path/path2", digest128(""), id="local_absolute_path"), + pytest.param("lake", digest128(""), id="local_relative_path"), + pytest.param("file:///home/ducklake.d", digest128("file://"), id="file_scheme"), + pytest.param("s3://cool", digest128("s3://cool"), id="remote_bucket"), + pytest.param( + "s3://cool.domain/path/path2", + digest128("s3://cool.domain"), + id="remote_bucket_path_ignored", + ), + ], ) -def test_filesystem_destination_configuration(url, exp) -> None: - assert FilesystemDestinationClientConfiguration(bucket_url=url).fingerprint() == exp +def test_filesystem_fingerprint(bucket_url: Optional[str], expected_fingerprint: str) -> None: + config = FilesystemDestinationClientConfiguration(bucket_url=bucket_url) + + assert config.fingerprint() == expected_fingerprint def test_filesystem_factory_buckets(with_gdrive_buckets_env: str) -> None: diff --git a/tests/load/lance/test_lance_configuration.py b/tests/load/lance/test_lance_configuration.py index 77a1868293..12729f026d 100644 --- a/tests/load/lance/test_lance_configuration.py +++ b/tests/load/lance/test_lance_configuration.py @@ -11,7 +11,7 @@ from dlt.common.configuration.specs.mixins import WithObjectStoreRsCredentials from dlt.common.known_env import DLT_LOCAL_DIR from dlt.common.runtime.run_context import active -from dlt.common.utils import uniq_id +from dlt.common.utils import digest128, uniq_id from dlt.destinations.impl.lance.configuration import ( DEFAULT_LANCE_BUCKET_URL, @@ -65,6 +65,30 
@@ def test_lance_storage_configuration_namespace_uri() -> None: assert config.namespace_uri == f"{local_dir_uri}/foo/bar" +@pytest.mark.parametrize( + "storage,expected_fingerprint", + [ + pytest.param(None, "", id="empty"), + pytest.param( + LanceStorageConfiguration(bucket_url="data/lance"), + digest128(""), + id="storage_local", + ), + pytest.param( + LanceStorageConfiguration(bucket_url="s3://my-bucket/path"), + digest128("s3://my-bucket"), + id="storage_remote_bucket_only", + ), + ], +) +def test_lance_fingerprint( + storage: Optional[LanceStorageConfiguration], expected_fingerprint: str +) -> None: + config = LanceClientConfiguration(storage=storage) + + assert config.fingerprint() == expected_fingerprint + + def test_lance_storage_configuration_options() -> None: CREDS_PROVIDED_OPTS = {"creds_opt": "foo", "another_creds_opt": "bar"} USER_PROVIDED_OPTS = {"user_opt": "foo", "another_user_opt": "bar"} diff --git a/tests/load/lancedb/test_config.py b/tests/load/lancedb/test_config.py index 55f1fe9787..e9e0f8e6d7 100644 --- a/tests/load/lancedb/test_config.py +++ b/tests/load/lancedb/test_config.py @@ -1,12 +1,12 @@ import os -from typing import Iterator +from typing import Iterator, Optional import pytest import dlt from dlt.common.configuration import resolve_configuration from dlt.common.known_env import DLT_LOCAL_DIR -from dlt.common.utils import uniq_id +from dlt.common.utils import digest128, uniq_id from dlt.destinations.impl.lancedb.configuration import ( LanceDBClientConfiguration, @@ -32,6 +32,21 @@ def test_lancedb_configuration() -> None: assert config.embedding_model == "text-embedding-3-small" +@pytest.mark.parametrize( + "lance_uri,expected_fingerprint", + [ + pytest.param(None, "", id="empty"), + pytest.param("local.db", digest128("local.db"), id="raw_local_uri"), + pytest.param("db://host/unknown", digest128("db://host/unknown"), id="raw_cloud_uri"), + pytest.param(":external:", digest128(":external:"), id="external_native_client"), + ], +) 
+def test_lancedb_fingerprint(lance_uri: Optional[str], expected_fingerprint: str) -> None: + config = LanceDBClientConfiguration(lance_uri=lance_uri) + + assert config.fingerprint() == expected_fingerprint + + def test_lancedb_follows_local_dir() -> None: local_dir = os.path.join(get_test_storage_root(), uniq_id()) os.makedirs(local_dir) diff --git a/tests/load/motherduck/test_motherduck_configuration.py b/tests/load/motherduck/test_motherduck_configuration.py new file mode 100644 index 0000000000..0ffe37de82 --- /dev/null +++ b/tests/load/motherduck/test_motherduck_configuration.py @@ -0,0 +1,45 @@ +import pytest + +from dlt.common.utils import digest128 +from dlt.destinations.impl.motherduck.configuration import ( + MotherDuckClientConfiguration, + MotherDuckCredentials, +) + +# mark all tests as essential, do not remove +pytestmark = pytest.mark.essential + + +@pytest.mark.parametrize( + "connection_string,expected_fingerprint", + [ + pytest.param("", "", id="empty"), + pytest.param( + "md:///dlt_data?token=TOKEN", + digest128("TOKEN"), + id="legacy_token_query_param", + ), + pytest.param( + "md:///dlt_data?motherduck_token=TOKEN", + digest128("TOKEN"), + id="legacy_motherduck_token_query_param", + ), + ], +) +def test_motherduck_fingerprint(connection_string: str, expected_fingerprint: str) -> None: + if connection_string: + credentials = MotherDuckCredentials(connection_string) + config = MotherDuckClientConfiguration(credentials=credentials) + else: + config = MotherDuckClientConfiguration() + + assert config.fingerprint() == expected_fingerprint + + +def test_motherduck_fingerprint_uses_token_not_physical_location() -> None: + config = MotherDuckClientConfiguration( + credentials=MotherDuckCredentials("md:///dlt_data?token=TOKEN") + ) + + assert config.physical_location() == "" + assert config.fingerprint() == digest128("TOKEN") diff --git a/tests/load/mssql/test_mssql_configuration.py b/tests/load/mssql/test_mssql_configuration.py index 
4c2b20bfa9..d261391edc 100644 --- a/tests/load/mssql/test_mssql_configuration.py +++ b/tests/load/mssql/test_mssql_configuration.py @@ -1,13 +1,14 @@ import os + import pyodbc import pytest -from dlt.common.configuration import resolve_configuration, ConfigFieldMissingException +from dlt.common.configuration import ConfigFieldMissingException, resolve_configuration from dlt.common.exceptions import SystemConfigurationException from dlt.common.schema import Schema - +from dlt.common.utils import digest128 from dlt.destinations import mssql -from dlt.destinations.impl.mssql.configuration import MsSqlCredentials, MsSqlClientConfiguration +from dlt.destinations.impl.mssql.configuration import MsSqlClientConfiguration, MsSqlCredentials # mark all tests as essential, do not remove pytestmark = pytest.mark.essential @@ -55,6 +56,32 @@ def test_mssql_credentials_defaults() -> None: assert creds.port == 1433 +@pytest.mark.parametrize( + "connection_string,expected_fingerprint", + [ + pytest.param("", "", id="empty"), + pytest.param( + "mssql://user1:pass1@host1:1433/db1", + digest128("host1"), + id="legacy_host_only_default_port", + ), + pytest.param( + "mssql://user1:pass1@host1:1434/db1", + digest128("host1"), + id="legacy_host_only_custom_port", + ), + ], +) +def test_mssql_fingerprint(connection_string: str, expected_fingerprint: str) -> None: + if connection_string: + credentials = MsSqlCredentials(connection_string) + config = MsSqlClientConfiguration(credentials=credentials) + else: + config = MsSqlClientConfiguration() + + assert config.fingerprint() == expected_fingerprint + + def test_parse_native_representation() -> None: # Case: unsupported driver specified. 
with pytest.raises(SystemConfigurationException): diff --git a/tests/load/postgres/test_postgres_configuration.py b/tests/load/postgres/test_postgres_configuration.py new file mode 100644 index 0000000000..be977c69a2 --- /dev/null +++ b/tests/load/postgres/test_postgres_configuration.py @@ -0,0 +1,36 @@ +import pytest + +from dlt.common.utils import digest128 +from dlt.destinations.impl.postgres.configuration import ( + PostgresClientConfiguration, + PostgresCredentials, +) + +# mark all tests as essential, do not remove +pytestmark = pytest.mark.essential + + +@pytest.mark.parametrize( + "connection_string,expected_fingerprint", + [ + pytest.param("", "", id="empty"), + pytest.param( + "postgres://user1:pass1@host1:5432/db1", + digest128("host1"), + id="legacy_host_only_default_port", + ), + pytest.param( + "postgres://user1:pass1@host1:5433/db1", + digest128("host1"), + id="legacy_host_only_custom_port", + ), + ], +) +def test_postgres_fingerprint(connection_string: str, expected_fingerprint: str) -> None: + if connection_string: + credentials = PostgresCredentials(connection_string) + config = PostgresClientConfiguration(credentials=credentials) + else: + config = PostgresClientConfiguration() + + assert config.fingerprint() == expected_fingerprint diff --git a/tests/load/qdrant/test_qdrant_configuration.py b/tests/load/qdrant/test_qdrant_configuration.py new file mode 100644 index 0000000000..9618c6af42 --- /dev/null +++ b/tests/load/qdrant/test_qdrant_configuration.py @@ -0,0 +1,27 @@ +from typing import Optional + +import pytest + +from dlt.common.utils import digest128 +from dlt.destinations.impl.qdrant.configuration import QdrantClientConfiguration + +# mark all tests as essential, do not remove +pytestmark = pytest.mark.essential + + +@pytest.mark.parametrize( + "qd_location,expected_fingerprint", + [ + pytest.param(None, "", id="empty"), + pytest.param(":memory:", digest128(":memory:"), id="raw_memory_location"), + pytest.param( + 
"https://qdrant.example.com:6333/path", + digest128("https://qdrant.example.com:6333/path"), + id="raw_url_location", + ), + ], +) +def test_qdrant_fingerprint(qd_location: Optional[str], expected_fingerprint: str) -> None: + config = QdrantClientConfiguration(qd_location=qd_location) + + assert config.fingerprint() == expected_fingerprint diff --git a/tests/load/redshift/test_redshift_configuration.py b/tests/load/redshift/test_redshift_configuration.py new file mode 100644 index 0000000000..66486c9151 --- /dev/null +++ b/tests/load/redshift/test_redshift_configuration.py @@ -0,0 +1,40 @@ +import pytest + +from dlt.common.configuration import resolve_configuration +from dlt.common.utils import digest128 +from dlt.destinations.impl.redshift.configuration import ( + RedshiftClientConfiguration, + RedshiftCredentials, +) + +# mark all tests as essential, do not remove +pytestmark = pytest.mark.essential + + +@pytest.mark.parametrize( + "connection_string,expected_fingerprint", + [ + pytest.param("", "", id="empty"), + pytest.param( + "postgres://user1:pass1@host1:5439/db1", + digest128("host1"), + id="legacy_host_only_default_port", + ), + pytest.param( + "postgres://user1:pass1@host1:1234/db1", + digest128("host1"), + id="legacy_host_only_custom_port", + ), + ], +) +def test_redshift_fingerprint(connection_string: str, expected_fingerprint: str) -> None: + if connection_string: + credentials = resolve_configuration( + RedshiftCredentials(), + explicit_value=connection_string, + ) + config = RedshiftClientConfiguration(credentials=credentials) + else: + config = RedshiftClientConfiguration() + + assert config.fingerprint() == expected_fingerprint diff --git a/tests/load/redshift/test_redshift_table_builder.py b/tests/load/redshift/test_redshift_table_builder.py index bc33f4f16f..bfb4cf2ea6 100644 --- a/tests/load/redshift/test_redshift_table_builder.py +++ b/tests/load/redshift/test_redshift_table_builder.py @@ -2,7 +2,7 @@ import sqlfluff from copy import deepcopy 
-from dlt.common.utils import uniq_id, custom_environ, digest128 +from dlt.common.utils import custom_environ, uniq_id from dlt.common.schema import Schema, utils from dlt.common.configuration import resolve_configuration @@ -57,15 +57,6 @@ def test_redshift_configuration() -> None: assert C.database == "UPPER_CASE_DATABASE" assert C.password == "pass" - # check fingerprint - assert RedshiftClientConfiguration().fingerprint() == "" - # based on host - c = resolve_configuration( - RedshiftCredentials(), - explicit_value="postgres://user1:pass@host1/db1?warehouse=warehouse1&role=role1", - ) - assert RedshiftClientConfiguration(credentials=c).fingerprint() == digest128("host1:5439") - def test_create_table(client: RedshiftClient) -> None: assert client.capabilities.generates_case_sensitive_identifiers() is False diff --git a/tests/load/snowflake/test_snowflake_configuration.py b/tests/load/snowflake/test_snowflake_configuration.py index fcb3adaaab..70ee6a5698 100644 --- a/tests/load/snowflake/test_snowflake_configuration.py +++ b/tests/load/snowflake/test_snowflake_configuration.py @@ -4,10 +4,12 @@ skip_if_not_active("snowflake") import os -import pytest from pathlib import Path +from typing import Optional from unittest.mock import patch +import pytest + from dlt.common.configuration.utils import add_config_to_env from tests.utils import TEST_DICT_CONFIG_PROVIDER, get_test_storage_root, test_storage @@ -429,12 +431,27 @@ def test_snowflake_provided_oauth_token(test_storage: FileStorage) -> None: assert creds.token != "SNOW_TOK" -def test_snowflake_configuration() -> None: - # def empty fingerprint - assert SnowflakeClientConfiguration().fingerprint() == "" - # based on host - c = resolve_configuration( - SnowflakeCredentials(), - explicit_value="snowflake://user1:pass@host1/db1?warehouse=warehouse1&role=role1", - ) - assert SnowflakeClientConfiguration(credentials=c).fingerprint() == digest128("host1") +@pytest.mark.parametrize( + 
"connection_string,expected_fingerprint", + [ + pytest.param("", "", id="empty"), + pytest.param( + "snowflake://user1:pass@host1/db1?warehouse=warehouse1&role=role1", + digest128("host1"), + id="legacy_host_only", + ), + ], +) +def test_snowflake_fingerprint(connection_string: str, expected_fingerprint: str) -> None: + credentials: Optional[SnowflakeCredentials] + if connection_string: + credentials = resolve_configuration( + SnowflakeCredentials(), + explicit_value=connection_string, + ) + else: + credentials = None + + config = SnowflakeClientConfiguration(credentials=credentials) + + assert config.fingerprint() == expected_fingerprint diff --git a/tests/load/sqlalchemy/test_sqlalchemy_configuration.py b/tests/load/sqlalchemy/test_sqlalchemy_configuration.py index f4e64f5d32..112877c9d2 100644 --- a/tests/load/sqlalchemy/test_sqlalchemy_configuration.py +++ b/tests/load/sqlalchemy/test_sqlalchemy_configuration.py @@ -9,7 +9,7 @@ from dlt.common.configuration import resolve_configuration from dlt.common.known_env import DLT_LOCAL_DIR -from dlt.common.utils import uniq_id +from dlt.common.utils import digest128, uniq_id from dlt.destinations import sqlalchemy as dlt_sqlalchemy from dlt.destinations.impl.sqlalchemy.configuration import ( SqlalchemyClientConfiguration, @@ -19,6 +19,30 @@ from tests.utils import get_test_storage_root +@pytest.mark.parametrize( + "credentials,expected_fingerprint", + [ + pytest.param(None, "", id="empty"), + pytest.param( + SqlalchemyCredentials("sqlite:///:memory:"), + digest128(":memory:"), + id="sqlite_memory", + ), + pytest.param( + SqlalchemyCredentials("postgresql://user1:pass1@host1:5432/db1"), + digest128("host1:5432"), + id="postgres_host_port", + ), + ], +) +def test_sqlalchemy_fingerprint( + credentials: Optional[SqlalchemyCredentials], expected_fingerprint: str +) -> None: + config = SqlalchemyClientConfiguration(credentials=credentials) + + assert config.fingerprint() == expected_fingerprint + + @pytest.mark.parametrize( 
"database,query,expected", [ diff --git a/tests/load/synapse/test_synapse_configuration.py b/tests/load/synapse/test_synapse_configuration.py index 8aaea03b0f..ac052ffd8c 100644 --- a/tests/load/synapse/test_synapse_configuration.py +++ b/tests/load/synapse/test_synapse_configuration.py @@ -1,10 +1,11 @@ import os + import pytest from dlt.common.configuration import resolve_configuration from dlt.common.exceptions import SystemConfigurationException from dlt.common.schema import Schema - +from dlt.common.utils import digest128 from dlt.destinations import synapse from dlt.destinations.impl.synapse.configuration import ( SynapseClientConfiguration, @@ -23,6 +24,32 @@ def test_synapse_configuration() -> None: assert c.staging_use_msi is False +@pytest.mark.parametrize( + "connection_string,expected_fingerprint", + [ + pytest.param("", "", id="empty"), + pytest.param( + "synapse://user1:pass1@host1:1433/db1", + digest128("host1"), + id="legacy_host_only_default_port", + ), + pytest.param( + "synapse://user1:pass1@host1:1434/db1", + digest128("host1"), + id="legacy_host_only_custom_port", + ), + ], +) +def test_synapse_fingerprint(connection_string: str, expected_fingerprint: str) -> None: + if connection_string: + credentials = SynapseCredentials(connection_string) + config = SynapseClientConfiguration(credentials=credentials) + else: + config = SynapseClientConfiguration() + + assert config.fingerprint() == expected_fingerprint + + def test_synapse_factory() -> None: schema = Schema("schema") dest = synapse() diff --git a/tests/load/weaviate/test_weaviate_configuration.py b/tests/load/weaviate/test_weaviate_configuration.py new file mode 100644 index 0000000000..4f7496ccf5 --- /dev/null +++ b/tests/load/weaviate/test_weaviate_configuration.py @@ -0,0 +1,36 @@ +from typing import Optional + +import pytest + +from dlt.common.utils import digest128 +from dlt.destinations.impl.weaviate.configuration import ( + WeaviateClientConfiguration, + WeaviateCredentials, +) + +# 
mark all tests as essential, do not remove +pytestmark = pytest.mark.essential + + +@pytest.mark.parametrize( + "credentials,expected_fingerprint", + [ + pytest.param(None, "", id="empty"), + pytest.param( + WeaviateCredentials(url="https://weaviate.example.com:8080/v1"), + digest128("weaviate.example.com"), + id="hostname_only_url", + ), + pytest.param( + WeaviateCredentials(url="http://localhost:8080"), + digest128("localhost"), + id="hostname_only_localhost", + ), + ], +) +def test_weaviate_fingerprint( + credentials: Optional[WeaviateCredentials], expected_fingerprint: str +) -> None: + config = WeaviateClientConfiguration(credentials=credentials) + + assert config.fingerprint() == expected_fingerprint From 4d2fcdd6cf8e9ac7c459517f2d298dfe6bdfcb04 Mon Sep 17 00:00:00 2001 From: travior Date: Wed, 3 Jun 2026 12:19:49 +0200 Subject: [PATCH 08/16] deprecate is_same_physical_destination --- dlt/dataset/dataset.py | 14 ++++++++------ dlt/dataset/relation.py | 2 +- tests/destinations/test_join_compatibility.py | 9 ++++++--- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/dlt/dataset/dataset.py b/dlt/dataset/dataset.py index 607c917d32..81ab840d6c 100644 --- a/dlt/dataset/dataset.py +++ b/dlt/dataset/dataset.py @@ -28,6 +28,7 @@ from dlt.common.destination.client import JobClientBase, SupportsOpenTables, WithStateSync from dlt.common.schema import Schema from dlt.common.typing import Self +from dlt.common.warnings import Dlt100DeprecationWarning, deprecated from dlt.common.schema.typing import ( C_DLT_LOAD_ID, C_DLT_LOADS_TABLE_LOAD_ID, @@ -497,13 +498,14 @@ def get_dataset_sql_client(dataset: dlt.Dataset) -> SqlClientBase[Any]: raise SqlClientNotAvailable("dataset", dataset.dataset_name, client.config.destination_type) +@deprecated( + "Use `destination_client.config.can_join_with(other.destination_client.config)` instead.", + category=Dlt100DeprecationWarning, + stacklevel=2, +) def is_same_physical_destination(dataset1: dlt.Dataset, dataset2: 
dlt.Dataset) -> bool: - """Check if both datasets are at the same physical destination. - - This is done by comparing the fingerprint of both destination configs. There - are potential false positive if two different config give access to the same destination. - """ - return str(dataset1.destination_client.config) == str(dataset2.destination_client.config) + """Check if both datasets are at the same physical destination.""" + return dataset1.destination_client.config.can_join_with(dataset2.destination_client.config) def _get_dataset_schema_from_destination_using_schema_name( diff --git a/dlt/dataset/relation.py b/dlt/dataset/relation.py index 995f9bbef3..8f477e9ef0 100644 --- a/dlt/dataset/relation.py +++ b/dlt/dataset/relation.py @@ -418,7 +418,7 @@ def join( if isinstance(other, dlt.Relation): # TODO: remove once we allow cross-dataset joins if not ( - self._dataset.is_same_physical_location(other._dataset) + self._dataset.is_same_physical_destination(other._dataset) and self._dataset.dataset_name == other._dataset.dataset_name ): raise ValueError( diff --git a/tests/destinations/test_join_compatibility.py b/tests/destinations/test_join_compatibility.py index 68b635982c..14b47353d7 100644 --- a/tests/destinations/test_join_compatibility.py +++ b/tests/destinations/test_join_compatibility.py @@ -13,6 +13,7 @@ ) from dlt.common.destination.client import DestinationClientConfiguration from dlt.common.storages import FilesystemConfigurationWithLocalFiles +from dlt.common.warnings import Dlt100DeprecationWarning from dlt.dataset.dataset import Dataset, is_same_physical_destination from dlt.destinations.impl.postgres.configuration import ( PostgresClientConfiguration, @@ -249,9 +250,11 @@ def test_is_same_physical_location_delegates_to_can_join_with() -> None: config1 = _StringyPhysicalDestinationConfig("host1", "first-display") config2 = _StringyPhysicalDestinationConfig("host1", "second-display") assert str(config1) != str(config2) - assert 
is_same_physical_destination( - cast(Dataset, _DatasetStub(config1)), cast(Dataset, _DatasetStub(config2)) - ) + + with pytest.warns(Dlt100DeprecationWarning, match="can_join_with"): + assert is_same_physical_destination( + cast(Dataset, _DatasetStub(config1)), cast(Dataset, _DatasetStub(config2)) + ) # physical_location() extraction across destinations From be6460597e76d935d86194c4214ab890bd109bc1 Mon Sep 17 00:00:00 2001 From: travior Date: Wed, 3 Jun 2026 13:41:37 +0200 Subject: [PATCH 09/16] return absolute path for local file destinations --- dlt/destinations/impl/filesystem/configuration.py | 9 +++++---- tests/destinations/test_join_compatibility.py | 13 ++++++++++--- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/dlt/destinations/impl/filesystem/configuration.py b/dlt/destinations/impl/filesystem/configuration.py index b61cbbfcb4..2cbc6e37bc 100644 --- a/dlt/destinations/impl/filesystem/configuration.py +++ b/dlt/destinations/impl/filesystem/configuration.py @@ -2,6 +2,7 @@ import os from typing import Dict, Optional, Type +from urllib.parse import urlparse from dlt.common.typing import DictStrAny, DictStrOptionalStr @@ -48,16 +49,16 @@ def resolve_credentials_type(self) -> Type[CredentialsConfiguration]: return super().resolve_credentials_type() def physical_location(self) -> str: - """Returns scheme://netloc for remote filesystems, or "" for local.""" + """Returns scheme://netloc for remote filesystems, or the absolute local path.""" if not self.bucket_url: return "" if self.is_local_path(self.bucket_url): - return "" - - from urllib.parse import urlparse + return self.make_local_path(self.make_file_url(self.bucket_url)) url = urlparse(self.bucket_url) + if url.scheme == "file": + return self.make_local_path(self.bucket_url) return f"{url.scheme}://{url.netloc}" def can_join_with(self, other: DestinationClientConfiguration) -> bool: diff --git a/tests/destinations/test_join_compatibility.py b/tests/destinations/test_join_compatibility.py 
index 14b47353d7..4529674cdc 100644 --- a/tests/destinations/test_join_compatibility.py +++ b/tests/destinations/test_join_compatibility.py @@ -1,6 +1,7 @@ """Tests for destination configuration-level join-compatibility semantics.""" -from typing import Callable, cast, Optional +import os +from typing import Callable, cast, Optional, Union from typing_extensions import TypeAlias @@ -12,6 +13,7 @@ GcpServiceAccountCredentials, ) from dlt.common.destination.client import DestinationClientConfiguration +from dlt.common.runtime.run_context import active from dlt.common.storages import FilesystemConfigurationWithLocalFiles from dlt.common.warnings import Dlt100DeprecationWarning from dlt.dataset.dataset import Dataset, is_same_physical_destination @@ -90,6 +92,7 @@ ConfigFactory: TypeAlias = Callable[[], DestinationClientConfiguration] +ExpectedLocation: TypeAlias = Union[str, Callable[[], str]] class _PhysicalDestinationConfig(DestinationClientConfiguration): @@ -382,7 +385,9 @@ def test_is_same_physical_location_delegates_to_can_join_with() -> None: id="fs_remote", ), pytest.param( - lambda: FilesystemDestinationClientConfiguration(bucket_url="/local/p"), "", id="fs_local" + lambda: FilesystemDestinationClientConfiguration(bucket_url="local/p"), + lambda: os.path.join(os.path.abspath(active().local_dir), "local/p"), + id="fs_local", ), # DuckLake pytest.param( @@ -435,7 +440,9 @@ def test_is_same_physical_location_delegates_to_can_join_with() -> None: @pytest.mark.parametrize("factory,expected", PHYSICAL_DEST_CASES) -def test_physical_location(factory: ConfigFactory, expected: str) -> None: +def test_physical_location(factory: ConfigFactory, expected: ExpectedLocation) -> None: + if callable(expected): + expected = expected() assert factory().physical_location() == expected From 2a6cbf6679c8d2c8b01bdeb1602b2b0f310c74f4 Mon Sep 17 00:00:00 2001 From: travior Date: Wed, 3 Jun 2026 13:54:40 +0200 Subject: [PATCH 10/16] fix incorrect path separator on windows tests --- 
tests/destinations/test_join_compatibility.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/destinations/test_join_compatibility.py b/tests/destinations/test_join_compatibility.py index 4529674cdc..b12e3ad196 100644 --- a/tests/destinations/test_join_compatibility.py +++ b/tests/destinations/test_join_compatibility.py @@ -386,7 +386,7 @@ def test_is_same_physical_location_delegates_to_can_join_with() -> None: ), pytest.param( lambda: FilesystemDestinationClientConfiguration(bucket_url="local/p"), - lambda: os.path.join(os.path.abspath(active().local_dir), "local/p"), + lambda: os.path.join(os.path.abspath(active().local_dir), "local", "p"), id="fs_local", ), # DuckLake From 9750ca5b8b9c5157282bd74ace4ce01888237224 Mon Sep 17 00:00:00 2001 From: travior Date: Thu, 4 Jun 2026 10:20:49 +0200 Subject: [PATCH 11/16] restore legacy fingerprint semantics for duckdb --- dlt/common/destination/client.py | 9 ++++----- dlt/destinations/impl/athena/configuration.py | 8 ++++++++ dlt/destinations/impl/fabric/configuration.py | 8 ++++++++ dlt/destinations/impl/sqlalchemy/configuration.py | 8 ++++++++ tests/destinations/test_destination_fingerprints.py | 5 ++--- tests/load/duckdb/test_duckdb_configuration.py | 5 ++--- 6 files changed, 32 insertions(+), 11 deletions(-) diff --git a/dlt/common/destination/client.py b/dlt/common/destination/client.py index 9711cf78bd..939b888251 100644 --- a/dlt/common/destination/client.py +++ b/dlt/common/destination/client.py @@ -59,7 +59,6 @@ from dlt.common.storages.load_storage import ParsedLoadJobFileName from dlt.common.storages.load_package import LoadJobInfo, TPipelineStateDoc from dlt.common.typing import is_optional_type -from dlt.common.utils import digest128 TDestinationDwhClient = TypeVar("TDestinationDwhClient", bound="DestinationClientDwhConfiguration") @@ -168,11 +167,11 @@ def physical_location(self) -> str: """Returns a non-secret physical location identity, or "" when unavailable.""" return "" + # TODO: 
If we ever clean up fingerprinting across all destinations, consider making + # the default `digest128(self.physical_location())`. This will break telemetry + # semantics, so it must be a deliberate cutover. def fingerprint(self) -> str: - """Returns a hash of physical_location(), or "" when unavailable.""" - phys_loc = self.physical_location() - if phys_loc: - return digest128(phys_loc) + """Returns a destination fingerprint derived from selected configuration fields.""" return "" def can_join_with(self, other: "DestinationClientConfiguration") -> bool: diff --git a/dlt/destinations/impl/athena/configuration.py b/dlt/destinations/impl/athena/configuration.py index 0406bfabbd..1bd66e08e4 100644 --- a/dlt/destinations/impl/athena/configuration.py +++ b/dlt/destinations/impl/athena/configuration.py @@ -4,6 +4,7 @@ from dlt.common.configuration import configspec from dlt.common.destination.client import DestinationClientDwhWithStagingConfiguration from dlt.common.configuration.specs import AwsCredentials +from dlt.common.utils import digest128 from dlt.destinations.impl.athena.utils import is_s3_tables_catalog @@ -71,6 +72,13 @@ def physical_location(self) -> str: return f"{region}/{catalog}" return "" + def fingerprint(self) -> str: + """Returns a fingerprint of the physical Athena location.""" + physical_location = self.physical_location() + if physical_location: + return digest128(physical_location) + return "" + def __str__(self) -> str: """Return displayable destination location""" if self.staging_config: diff --git a/dlt/destinations/impl/fabric/configuration.py b/dlt/destinations/impl/fabric/configuration.py index 7409ec7c89..37c34a3582 100644 --- a/dlt/destinations/impl/fabric/configuration.py +++ b/dlt/destinations/impl/fabric/configuration.py @@ -5,6 +5,7 @@ from dlt.common.configuration.specs import AzureServicePrincipalCredentials from dlt.common.destination.client import DestinationClientDwhWithStagingConfiguration from dlt.common.exceptions import 
MissingDependencyException +from dlt.common.utils import digest128 from dlt import version _AZURE_STORAGE_EXTRA = f"{version.DLT_PKG_NAME}[az]" @@ -172,5 +173,12 @@ def physical_location(self) -> str: return f"{self.credentials.host}:{port}" return "" + def fingerprint(self) -> str: + """Returns a fingerprint of the physical Fabric location.""" + physical_location = self.physical_location() + if physical_location: + return digest128(physical_location) + return "" + __all__ = ["FabricCredentials", "FabricClientConfiguration"] diff --git a/dlt/destinations/impl/sqlalchemy/configuration.py b/dlt/destinations/impl/sqlalchemy/configuration.py index c0ff7b4994..4902cbf98f 100644 --- a/dlt/destinations/impl/sqlalchemy/configuration.py +++ b/dlt/destinations/impl/sqlalchemy/configuration.py @@ -14,6 +14,7 @@ ) from dlt.common.storages.configuration import WithLocalFiles from dlt.common.typing import Annotated +from dlt.common.utils import digest128 from dlt.common.warnings import DltDeprecationWarning if TYPE_CHECKING: @@ -288,6 +289,13 @@ def physical_location(self) -> str: return host return "" + def fingerprint(self) -> str: + """Returns a fingerprint of the physical SQLAlchemy location.""" + physical_location = self.physical_location() + if physical_location: + return digest128(physical_location) + return "" + def can_join_with(self, other: DestinationClientConfiguration) -> bool: """Returns True when dialect-specific destination identities match.""" if not isinstance(other, SqlalchemyClientConfiguration): diff --git a/tests/destinations/test_destination_fingerprints.py b/tests/destinations/test_destination_fingerprints.py index 7acb017027..8788cbb1a9 100644 --- a/tests/destinations/test_destination_fingerprints.py +++ b/tests/destinations/test_destination_fingerprints.py @@ -1,5 +1,4 @@ from dlt.common.destination.client import DestinationClientConfiguration -from dlt.common.utils import digest128 class _PhysicalDestinationConfig(DestinationClientConfiguration): @@ 
-11,10 +10,10 @@ def physical_location(self) -> str: return self._physical_location -def test_base_fingerprint_hashes_non_empty_physical_location() -> None: +def test_base_fingerprint_ignores_physical_location() -> None: config = _PhysicalDestinationConfig("test-host:5432") - assert config.fingerprint() == digest128("test-host:5432") + assert config.fingerprint() == "" def test_base_fingerprint_returns_empty_string_without_physical_location() -> None: diff --git a/tests/load/duckdb/test_duckdb_configuration.py b/tests/load/duckdb/test_duckdb_configuration.py index e7d1981cca..9056524e46 100644 --- a/tests/load/duckdb/test_duckdb_configuration.py +++ b/tests/load/duckdb/test_duckdb_configuration.py @@ -2,7 +2,6 @@ import pytest -from dlt.common.utils import digest128 from dlt.destinations.impl.duckdb.configuration import ( DuckDbClientConfiguration, DuckDbCredentials, @@ -18,12 +17,12 @@ pytest.param(None, "", id="empty"), pytest.param( DuckDbCredentials(":memory:"), - digest128(":memory:"), + "", id="memory_database", ), pytest.param( DuckDbCredentials("local.duckdb"), - digest128("local.duckdb"), + "", id="database_path", ), ], From e25aeb1198551cf860ea6b03919159f66c31a565 Mon Sep 17 00:00:00 2001 From: rudolfix Date: Fri, 5 Jun 2026 21:43:36 +0200 Subject: [PATCH 12/16] uses location for filesystem read check, write always false --- .../impl/filesystem/configuration.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/dlt/destinations/impl/filesystem/configuration.py b/dlt/destinations/impl/filesystem/configuration.py index 2cbc6e37bc..5bcc890acf 100644 --- a/dlt/destinations/impl/filesystem/configuration.py +++ b/dlt/destinations/impl/filesystem/configuration.py @@ -61,17 +61,20 @@ def physical_location(self) -> str: return self.make_local_path(self.bucket_url) return f"{url.scheme}://{url.netloc}" - def can_join_with(self, other: DestinationClientConfiguration) -> bool: - """Returns True for any other filesystem destination. 
- - Filesystem tables are queried through a local engine (e.g. DuckDB) that - can access multiple storage backends in a single query, so join - compatibility is determined by the engine, not by the storage location. + def can_write_from(self, other: DestinationClientConfiguration) -> bool: + """Filesystem does not have an engine that can write. `dlt` is that engine, + and returning False here enforces its usage. """ - if isinstance(other, FilesystemDestinationClientConfiguration): - return True return False + def can_read_from(self, other: DestinationClientConfiguration) -> bool: + # filesystem tables are queried through a local engine (e.g. DuckDB) that + # can access multiple storage backends in a single query, so join + # compatibility is determined by the engine, not by the storage location. + + # until auto ATTACH is implemented, storage location must be used + return super().can_read_from(other) + def on_resolved(self) -> None: # Validate layout and show unused placeholders _, layout_placeholders = check_layout(self.layout, self.extra_placeholders) From fb074717f41fb08aee651eda3e7c65d28fe692b8 Mon Sep 17 00:00:00 2001 From: rudolfix Date: Fri, 5 Jun 2026 21:44:20 +0200 Subject: [PATCH 13/16] uses schema name to compute ducklake location --- dlt/destinations/impl/motherduck/configuration.py | 7 ++++++- tests/load/ducklake/test_ducklake_client.py | 10 +++++----- tests/load/ducklake/test_ducklake_configuration.py | 2 +- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/dlt/destinations/impl/motherduck/configuration.py b/dlt/destinations/impl/motherduck/configuration.py index 209d8da9b5..1aae0c665f 100644 --- a/dlt/destinations/impl/motherduck/configuration.py +++ b/dlt/destinations/impl/motherduck/configuration.py @@ -136,7 +136,7 @@ def fingerprint(self) -> str: return digest128(self.credentials.password) return "" - def can_join_with(self, other: DestinationClientConfiguration) -> bool: + def can_read_from(self, other: 
DestinationClientConfiguration) -> bool: """Returns True for MotherDuck configs with the same token.""" if not isinstance(other, MotherDuckClientConfiguration): return False @@ -149,6 +149,11 @@ def can_join_with(self, other: DestinationClientConfiguration) -> bool: return self_token == other_token + def can_write_from(self, other: "DestinationClientConfiguration") -> bool: + # motherduck will be able to write from any attached duckdb + # until ATTACH is implemented we require the same token which is used as identity + return super().can_read_from(other) + class MotherDuckCatalogMissing(NativeValueError): pass diff --git a/tests/load/ducklake/test_ducklake_client.py b/tests/load/ducklake/test_ducklake_client.py index 0d0852990c..d444a939ad 100644 --- a/tests/load/ducklake/test_ducklake_client.py +++ b/tests/load/ducklake/test_ducklake_client.py @@ -100,7 +100,7 @@ def test_ducklake_configuration_default() -> None: assert credentials.storage_url == str(local_dir / "ducklake.files") # file url assert credentials.storage.bucket_url.startswith("file://") - expected_loc = f"sqlite://{local_dir / 'ducklake.sqlite'}#{DEFAULT_DUCKLAKE_NAME}" + expected_loc = f"sqlite://{local_dir / 'ducklake.sqlite'}" assert configuration.physical_location() == expected_loc @@ -119,7 +119,7 @@ def test_ducklake_configuration_duckdb_catalog() -> None: assert credentials.ducklake_name == DEFAULT_DUCKLAKE_NAME conn_str = credentials.catalog.to_native_representation() assert conn_str.endswith(str(local_dir / "ducklake.duckdb")) - expected_loc = f"duckdb://{local_dir / 'ducklake.duckdb'}#{DEFAULT_DUCKLAKE_NAME}" + expected_loc = f"duckdb://{local_dir / 'ducklake.duckdb'}" assert configuration.physical_location() == expected_loc @@ -137,7 +137,7 @@ def test_ducklake_configuration_ducklake_name() -> None: conn_str = credentials.catalog.to_native_representation() assert conn_str.endswith(str(local_dir / "my_ducklake.sqlite")) assert credentials.storage_url == str(local_dir / "my_ducklake.files") - 
expected_loc = f"sqlite://{local_dir / 'my_ducklake.sqlite'}#my_ducklake" + expected_loc = f"sqlite://{local_dir / 'my_ducklake.sqlite'}" assert configuration.physical_location() == expected_loc @@ -155,7 +155,7 @@ def test_ducklake_configuration_destination_name() -> None: conn_str = credentials.catalog.to_native_representation() assert conn_str.endswith(str(local_dir / "ducklake.sqlite")) assert credentials.storage_url == str(local_dir / "ducklake.files") - expected_loc = f"sqlite://{local_dir / 'ducklake.sqlite'}#{DEFAULT_DUCKLAKE_NAME}" + expected_loc = f"sqlite://{local_dir / 'ducklake.sqlite'}" assert configuration.physical_location() == expected_loc @@ -201,7 +201,7 @@ def test_ducklake_configuration_storage_credentials() -> None: ) # NOTE: dataset folders will be created in /lake/ assert credentials.storage_url == "s3://dlt-ci-test-bucket/lake" - assert configuration.physical_location() == "postgresql://localhost:5432/dlt_data#my_ducklake" + assert configuration.physical_location() == "postgres://localhost:5432/dlt_data#my_ducklake" def test_ducklake_configuration_catalog_credentials() -> None: diff --git a/tests/load/ducklake/test_ducklake_configuration.py b/tests/load/ducklake/test_ducklake_configuration.py index ae69ef4c99..d66a94edbb 100644 --- a/tests/load/ducklake/test_ducklake_configuration.py +++ b/tests/load/ducklake/test_ducklake_configuration.py @@ -49,6 +49,6 @@ def test_ducklake_fingerprint_uses_storage_not_physical_location() -> None: ) ) - assert config.physical_location() == "postgresql://localhost:5432/dlt_data#my_ducklake" + assert config.physical_location() == "postgres://localhost:5432/dlt_data#my_ducklake" assert config.fingerprint() == digest128("s3://dlt-ci-test-bucket") assert config.fingerprint() != digest128(config.physical_location()) From 0b10ab246a26c2f43c5cdcdf86737b1a41272d76 Mon Sep 17 00:00:00 2001 From: rudolfix Date: Fri, 5 Jun 2026 21:45:22 +0200 Subject: [PATCH 14/16] comutes lance data location from catalog location 
over storage location --- dlt/destinations/impl/lance/configuration.py | 27 +++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/dlt/destinations/impl/lance/configuration.py b/dlt/destinations/impl/lance/configuration.py index bd8b95e665..dbff3f5f85 100644 --- a/dlt/destinations/impl/lance/configuration.py +++ b/dlt/destinations/impl/lance/configuration.py @@ -361,15 +361,34 @@ def fingerprint(self) -> str: return self.storage.fingerprint() if self.storage else "" def physical_location(self) -> str: - """Returns the resolved Lance catalog root.""" + """Returns the Lance catalog root which identifies the namespace.""" + # for `rest` catalogs the location is the namespace server uri + if self.catalog_type == "rest": + if isinstance(self.credentials, RestCatalogCredentials) and self.credentials.uri: + return f"rest:{self.credentials.uri.rstrip('/')}" + return "" + + # for `dir` catalogs the explicit manifest root takes precedence + catalog_root: Optional[str] = None if ( isinstance(self.credentials, DirectoryCatalogCredentials) and self.credentials.bucket_url ): - return f"{self.catalog_type}:{self.credentials.bucket_url.rstrip('/')}" + catalog_root = self.credentials.bucket_url + elif self.storage and self.storage.bucket_url: + # same fallback as on_resolved: catalog colocates with data storage + catalog_root = self.storage.namespace_uri + if catalog_root: + return f"{self.catalog_type}:{catalog_root.rstrip('/')}" return "" - def can_join_with(self, other: DestinationClientConfiguration) -> bool: + def can_write_from(self, other: DestinationClientConfiguration) -> bool: + """Lance does not have an engine that can write. `dlt` is that engine, + and returning False here enforces its usage. 
+ """ + return False + + def can_read_from(self, other: DestinationClientConfiguration) -> bool: """Returns True for the same Lance catalog and bound dlt dataset.""" if not isinstance(other, LanceClientConfiguration): return False @@ -379,4 +398,6 @@ def can_join_with(self, other: DestinationClientConfiguration) -> bool: if not self_loc or not other_loc or self_loc != other_loc: return False + # TODO: remove the dataset check when cross dataset joins are implemented. any + # dataset (namespace) under the same catalog root is readable via the same ATTACH return self.dataset_name == other.dataset_name From f30acc97ad5acafa1de97aeb5a7a2de9eb135b85 Mon Sep 17 00:00:00 2001 From: rudolfix Date: Fri, 5 Jun 2026 21:45:57 +0200 Subject: [PATCH 15/16] splits abilty to read from (join) from writing from (select from into) --- dlt/common/destination/client.py | 13 +- dlt/dataset/dataset.py | 2 +- .../impl/ducklake/configuration.py | 36 ++- .../impl/lancedb/configuration.py | 17 +- .../impl/postgres/configuration.py | 2 +- dlt/destinations/impl/qdrant/configuration.py | 9 +- .../impl/sqlalchemy/configuration.py | 65 ++--- .../impl/weaviate/configuration.py | 2 +- tests/destinations/test_join_compatibility.py | 254 ++++++++++++++++-- .../load/pipeline/test_join_compatibility.py | 21 +- .../test_sqlalchemy_configuration.py | 13 +- 11 files changed, 337 insertions(+), 97 deletions(-) diff --git a/dlt/common/destination/client.py b/dlt/common/destination/client.py index 939b888251..9342e843b6 100644 --- a/dlt/common/destination/client.py +++ b/dlt/common/destination/client.py @@ -174,8 +174,10 @@ def fingerprint(self) -> str: """Returns a destination fingerprint derived from selected configuration fields.""" return "" - def can_join_with(self, other: "DestinationClientConfiguration") -> bool: - """Returns True for same-type destinations with the same non-empty identity.""" + def can_read_from(self, other: "DestinationClientConfiguration") -> bool: + """Returns True if `self` can 
read data from `other`. + In case of SQL engines it is an ability to SELECT / JOIN + """ if not isinstance(other, DestinationClientConfiguration): return False if self.destination_type != other.destination_type: @@ -186,6 +188,13 @@ def can_join_with(self, other: "DestinationClientConfiguration") -> bool: return True return False + def can_write_from(self, other: "DestinationClientConfiguration") -> bool: + """Returns true if `self` can write data from `other` + In case of SQL engines it is an ability to INSERT FROM + """ + # in most destinations, ability to read is also the same as abilty to write + return self.can_read_from(other) + def __str__(self) -> str: """Return displayable destination location""" return str(self.credentials) diff --git a/dlt/dataset/dataset.py b/dlt/dataset/dataset.py index 81ab840d6c..a23cd44f2e 100644 --- a/dlt/dataset/dataset.py +++ b/dlt/dataset/dataset.py @@ -505,7 +505,7 @@ def get_dataset_sql_client(dataset: dlt.Dataset) -> SqlClientBase[Any]: ) def is_same_physical_destination(dataset1: dlt.Dataset, dataset2: dlt.Dataset) -> bool: """Check if both datasets are at the same physical destination.""" - return dataset1.destination_client.config.can_join_with(dataset2.destination_client.config) + return dataset1.destination_client.config.can_read_from(dataset2.destination_client.config) def _get_dataset_schema_from_destination_using_schema_name( diff --git a/dlt/destinations/impl/ducklake/configuration.py b/dlt/destinations/impl/ducklake/configuration.py index 1f783d04c5..b217c51c7c 100644 --- a/dlt/destinations/impl/ducklake/configuration.py +++ b/dlt/destinations/impl/ducklake/configuration.py @@ -149,23 +149,37 @@ def fingerprint(self) -> str: return self.credentials.storage.fingerprint() def physical_location(self) -> str: - """Returns credential-free catalog identity plus ducklake name.""" + """Returns credential-free catalog identity which locates the ducklake.""" if not self.credentials or not self.credentials.catalog: return "" 
catalog = self.credentials.catalog - ducklake_name = self.credentials.ducklake_name or DEFAULT_DUCKLAKE_NAME + drivername = catalog.drivername or "" + # attach statement converts `postgresql` to duckdb-known `postgres` + if drivername == "postgresql": + drivername = "postgres" - if catalog.host: - port_str = f":{catalog.port}" if catalog.port else "" - db_str = f"/{catalog.database}" if catalog.database else "" - catalog_id = f"{catalog.drivername}://{catalog.host}{port_str}{db_str}" - elif catalog.database: - catalog_id = f"{catalog.drivername}://{catalog.database}" - else: - catalog_id = catalog.drivername or "unknown" + # TODO: motherduck catalog has no non-secret account identity + if drivername == "md": + return "" - return f"{catalog_id}#{ducklake_name}" + # file catalogs: the database file is the lake, attach name is just an alias + if drivername in ("duckdb", "sqlite"): + if catalog.database: + return f"{drivername}://{catalog.database}" + return "" + + # sql catalogs host one lake per metadata schema which defaults to ducklake name + if catalog.host and catalog.database: + metadata_schema = ( + self.credentials.metadata_schema + or self.credentials.ducklake_name + or DEFAULT_DUCKLAKE_NAME + ) + # NOTE: ports must be specified (or not) consistently across configs to match + port_str = f":{catalog.port}" if catalog.port else "" + return f"{drivername}://{catalog.host}{port_str}/{catalog.database}#{metadata_schema}" + return "" def on_resolved(self) -> None: # redirect local catalog database file to `local_dir` diff --git a/dlt/destinations/impl/lancedb/configuration.py b/dlt/destinations/impl/lancedb/configuration.py index 5ca152643e..9ae4f0e43b 100644 --- a/dlt/destinations/impl/lancedb/configuration.py +++ b/dlt/destinations/impl/lancedb/configuration.py @@ -215,14 +215,19 @@ def physical_location(self) -> str: return self.lance_uri - def can_join_with(self, other: DestinationClientConfiguration) -> bool: - """Returns True for the same LanceDB URI and 
table naming layout.""" + def can_write_from(self, other: DestinationClientConfiguration) -> bool: + """LanceDB does not have an engine that can write. `dlt` is that engine, + and returning False here enforces its usage. + """ + return False + + def can_read_from(self, other: DestinationClientConfiguration) -> bool: + """Returns True for the same LanceDB URI.""" if not isinstance(other, LanceDBClientConfiguration): return False + # any table at the same location can be read via the same ATTACH (lance extension), + # `dataset_separator` only affects table naming and does not limit readability self_loc = self.physical_location() other_loc = other.physical_location() - if not self_loc or not other_loc or self_loc != other_loc: - return False - - return self.dataset_separator == other.dataset_separator + return bool(self_loc and other_loc and self_loc == other_loc) diff --git a/dlt/destinations/impl/postgres/configuration.py b/dlt/destinations/impl/postgres/configuration.py index 384574d9d4..feda982442 100644 --- a/dlt/destinations/impl/postgres/configuration.py +++ b/dlt/destinations/impl/postgres/configuration.py @@ -64,7 +64,7 @@ def physical_location(self) -> str: return f"{self.credentials.host}:{port}" return "" - def can_join_with(self, other: DestinationClientConfiguration) -> bool: + def can_read_from(self, other: DestinationClientConfiguration) -> bool: """Returns True for the same Postgres host:port and database.""" if not isinstance(other, PostgresClientConfiguration): return False diff --git a/dlt/destinations/impl/qdrant/configuration.py b/dlt/destinations/impl/qdrant/configuration.py index 06ebb1122e..ca7d24f704 100644 --- a/dlt/destinations/impl/qdrant/configuration.py +++ b/dlt/destinations/impl/qdrant/configuration.py @@ -161,7 +161,14 @@ def physical_location(self) -> str: """Returns the Qdrant connection location.""" return self.qd_location or "" - def can_join_with(self, other: DestinationClientConfiguration) -> bool: + # TODO: qdrant supports 
cross collection (and cross instance) writes via point streaming + # (scroll -> upsert, see `qdrant_client.migrate`). this is not SQL so it requires a + # qdrant specific model job and a non-SQL transformation input to be useful. + def can_write_from(self, other: DestinationClientConfiguration) -> bool: + """Qdrant cannot execute SQL models.""" + return False + + def can_read_from(self, other: DestinationClientConfiguration) -> bool: """Qdrant does not support dlt SQL joins.""" return False diff --git a/dlt/destinations/impl/sqlalchemy/configuration.py b/dlt/destinations/impl/sqlalchemy/configuration.py index 4902cbf98f..5c09a302c2 100644 --- a/dlt/destinations/impl/sqlalchemy/configuration.py +++ b/dlt/destinations/impl/sqlalchemy/configuration.py @@ -268,25 +268,23 @@ def on_resolved(self) -> None: ) def physical_location(self) -> str: - """Returns sqlite path for sqlite, otherwise host:port.""" + """Returns sqlite database path for sqlite, otherwise host:port.""" if not self.credentials: return "" - drivername = self.credentials.drivername or "" - database = self.credentials.database - host = self.credentials.host - port = self.credentials.port - - if drivername == "sqlite": - if SqlalchemyCredentials.is_memory_database(database, self.credentials.query): - return ":memory:" - return database or "" - - if host: - # Default-vs-explicit port mismatches may reject otherwise valid joins. 
- if port: - return f"{host}:{port}" - return host + if self.get_backend_name() == "sqlite": + # each in-memory database is a separate database + if SqlalchemyCredentials.is_memory_database( + self.credentials.database, self.credentials.query + ): + return "" + return self.credentials.database or "" + + if self.credentials.host: + # NOTE: default-vs-explicit port mismatches may reject otherwise valid joins + if self.credentials.port: + return f"{self.credentials.host}:{self.credentials.port}" + return self.credentials.host return "" def fingerprint(self) -> str: @@ -296,7 +294,7 @@ def fingerprint(self) -> str: return digest128(physical_location) return "" - def can_join_with(self, other: DestinationClientConfiguration) -> bool: + def can_read_from(self, other: DestinationClientConfiguration) -> bool: """Returns True when dialect-specific destination identities match.""" if not isinstance(other, SqlalchemyClientConfiguration): return False @@ -304,35 +302,22 @@ def can_join_with(self, other: DestinationClientConfiguration) -> bool: if not self.credentials or not other.credentials: return False - self_dialect = (self.credentials.drivername or "").lower() - other_dialect = (other.credentials.drivername or "").lower() - - if self_dialect != other_dialect: + self_backend = self.get_backend_name() + if not self_backend or self_backend != other.get_backend_name(): return False - if self_dialect == "sqlite": - self_loc = self.physical_location() - other_loc = other.physical_location() - return bool(self_loc and other_loc and self_loc == other_loc) - - if self_dialect == "postgresql": - self_loc = self.physical_location() - other_loc = other.physical_location() - if not self_loc or not other_loc or self_loc != other_loc: - return False - self_db = self.credentials.database - other_db = other.credentials.database - return self_db is not None and other_db is not None and self_db == other_db - - if self_dialect in ("mysql", "mssql", "oracle", "db2"): - self_loc = 
self.physical_location() - other_loc = other.physical_location() - return bool(self_loc and other_loc and self_loc == other_loc) - self_loc = self.physical_location() other_loc = other.physical_location() if not self_loc or not other_loc or self_loc != other_loc: return False + + # sqlite: the database file is the location. mysql and mssql can query across + # databases on the same server (database is schema-like / 3-part names) + if self_backend in ("sqlite", "mysql", "mssql"): + return True + + # remaining dialects (postgresql, oracle, db2, unknown) bind a connection to a single + # database: oracle needs db links and db2 needs federation to query across databases self_db = self.credentials.database other_db = other.credentials.database return self_db is not None and other_db is not None and self_db == other_db diff --git a/dlt/destinations/impl/weaviate/configuration.py b/dlt/destinations/impl/weaviate/configuration.py index 57c72eaad9..d710cc1f5c 100644 --- a/dlt/destinations/impl/weaviate/configuration.py +++ b/dlt/destinations/impl/weaviate/configuration.py @@ -81,6 +81,6 @@ def physical_location(self) -> str: return urlparse(self.credentials.url).hostname or "" return "" - def can_join_with(self, other: DestinationClientConfiguration) -> bool: + def can_read_from(self, other: DestinationClientConfiguration) -> bool: """Weaviate does not support dlt SQL joins.""" return False diff --git a/tests/destinations/test_join_compatibility.py b/tests/destinations/test_join_compatibility.py index b12e3ad196..122992b7c7 100644 --- a/tests/destinations/test_join_compatibility.py +++ b/tests/destinations/test_join_compatibility.py @@ -80,9 +80,11 @@ LanceDBCredentials, ) from dlt.destinations.impl.lance.configuration import ( + DEFAULT_LANCE_NAMESPACE_NAME, DirectoryCatalogCredentials, LanceClientConfiguration, LanceStorageConfiguration, + RestCatalogCredentials, ) from dlt.destinations.impl.qdrant.configuration import QdrantClientConfiguration from 
dlt.destinations.impl.weaviate.configuration import ( @@ -126,15 +128,15 @@ def __init__(self, config: DestinationClientConfiguration) -> None: def assert_joinable( config1: DestinationClientConfiguration, config2: DestinationClientConfiguration ) -> None: - assert config1.can_join_with(config2) - assert config2.can_join_with(config1) + assert config1.can_read_from(config2) + assert config2.can_read_from(config1) def assert_not_joinable( config1: DestinationClientConfiguration, config2: DestinationClientConfiguration ) -> None: - assert not config1.can_join_with(config2) - assert not config2.can_join_with(config1) + assert not config1.can_read_from(config2) + assert not config2.can_read_from(config1) def assert_join_result( @@ -160,10 +162,12 @@ def _ducklake_creds( catalog_str: str, name: str = DEFAULT_DUCKLAKE_NAME, storage_url: Optional[str] = None, + metadata_schema: Optional[str] = None, ) -> DuckLakeCredentials: """Build DuckLake credentials.""" return DuckLakeCredentials( ducklake_name=name, + metadata_schema=metadata_schema, catalog=ConnectionStringCredentials(catalog_str), storage=( FilesystemConfigurationWithLocalFiles(bucket_url=storage_url) if storage_url else None @@ -223,6 +227,30 @@ def _lance_config(catalog_root: str, dataset_name: str = "dataset") -> LanceClie return c +def _lance_rest_config( + uri: Optional[str], dataset_name: str = "dataset" +) -> LanceClientConfiguration: + """Build Lance config with REST namespace catalog.""" + c = LanceClientConfiguration( + catalog_type="rest", + credentials=RestCatalogCredentials(uri=uri), + ) + c._bind_dataset_name(dataset_name) + return c + + +def _lance_multi_base_config( + catalog_root: Optional[str], storage_root: str, dataset_name: str = "dataset" +) -> LanceClientConfiguration: + """Build Lance config with manifest catalog and data storage in separate locations.""" + c = LanceClientConfiguration( + credentials=DirectoryCatalogCredentials(bucket_url=catalog_root) if catalog_root else None, + 
storage=LanceStorageConfiguration(bucket_url=storage_root), + ) + c._bind_dataset_name(dataset_name) + return c + + # Base DestinationClientConfiguration join contract def test_base_can_join_with_default_false_when_physical_locations_differ() -> None: config1 = _PhysicalDestinationConfig("host1") @@ -244,9 +272,9 @@ def test_base_can_join_with_default_false_when_empty_physical_location() -> None def test_base_can_join_with_returns_false_for_non_config() -> None: config = _PhysicalDestinationConfig("host1") - assert not config.can_join_with("not a config") # type: ignore[arg-type] - assert not config.can_join_with(None) - assert not config.can_join_with(42) # type: ignore[arg-type] + assert not config.can_read_from("not a config") # type: ignore[arg-type] + assert not config.can_read_from(None) + assert not config.can_read_from(42) # type: ignore[arg-type] def test_is_same_physical_location_delegates_to_can_join_with() -> None: @@ -389,25 +417,38 @@ def test_is_same_physical_location_delegates_to_can_join_with() -> None: lambda: os.path.join(os.path.abspath(active().local_dir), "local", "p"), id="fs_local", ), - # DuckLake + # DuckLake: sql catalogs host one lake per metadata schema (defaults to ducklake name) pytest.param( lambda: DuckLakeClientConfiguration( - credentials=_ducklake_creds("pg://u@h:5432/db", "lake") + credentials=_ducklake_creds("postgresql://u@h:5432/db", "lake") ), - "pg://h:5432/db#lake", + "postgres://h:5432/db#lake", id="dl_remote_cat", ), + pytest.param( + lambda: DuckLakeClientConfiguration(credentials=_ducklake_creds("postgres://u@h:5432/db")), + f"postgres://h:5432/db#{DEFAULT_DUCKLAKE_NAME}", + id="dl_remote_cat_default_name", + ), + pytest.param( + lambda: DuckLakeClientConfiguration( + credentials=_ducklake_creds("postgres://u@h:5432/db", "lake", metadata_schema="meta") + ), + "postgres://h:5432/db#meta", + id="dl_remote_cat_explicit_metadata_schema", + ), + # DuckLake: file catalogs are the lake themselves, attach name is just an alias 
pytest.param( lambda: DuckLakeClientConfiguration( credentials=_ducklake_creds("sqlite:///cat.sqlite", "lake") ), - "sqlite://cat.sqlite#lake", + "sqlite://cat.sqlite", id="dl_local_cat", ), pytest.param( - lambda: DuckLakeClientConfiguration(credentials=_ducklake_creds("sqlite:///cat.sqlite")), - f"sqlite://cat.sqlite#{DEFAULT_DUCKLAKE_NAME}", - id="dl_default_name", + lambda: DuckLakeClientConfiguration(credentials=_ducklake_creds("md:///md_db", "lake")), + "", + id="dl_md_cat_no_identity", ), # Fabric pytest.param( @@ -623,6 +664,7 @@ def test_physical_location(factory: ConfigFactory, expected: ExpectedLocation) - True, id="dl_same_cat_name", ), + # the file is the lake, attach name does not matter pytest.param( lambda: DuckLakeClientConfiguration( credentials=_ducklake_creds("sqlite:///cat.sqlite", "lake1") @@ -630,8 +672,47 @@ def test_physical_location(factory: ConfigFactory, expected: ExpectedLocation) - lambda: DuckLakeClientConfiguration( credentials=_ducklake_creds("sqlite:///cat.sqlite", "lake2") ), + True, + id="dl_file_cat_diff_name", + ), + # sql catalogs: different name means different metadata schema, so a different lake + pytest.param( + lambda: DuckLakeClientConfiguration( + credentials=_ducklake_creds("postgres://u@h:5432/db", "lake1") + ), + lambda: DuckLakeClientConfiguration( + credentials=_ducklake_creds("postgres://u@h:5432/db", "lake2") + ), + False, + id="dl_sql_cat_diff_name", + ), + # sql catalogs: explicit metadata schema overrides the name + pytest.param( + lambda: DuckLakeClientConfiguration( + credentials=_ducklake_creds("postgres://u@h:5432/db", "lake1", metadata_schema="meta") + ), + lambda: DuckLakeClientConfiguration( + credentials=_ducklake_creds("postgres://u@h:5432/db", "lake2", metadata_schema="meta") + ), + True, + id="dl_sql_cat_same_metadata_schema", + ), + pytest.param( + lambda: DuckLakeClientConfiguration( + credentials=_ducklake_creds("postgres://u@h:5432/db", "lake", metadata_schema="meta1") + ), + lambda: 
DuckLakeClientConfiguration( + credentials=_ducklake_creds("postgres://u@h:5432/db", "lake", metadata_schema="meta2") + ), False, - id="dl_same_cat_diff_name", + id="dl_sql_cat_diff_metadata_schema", + ), + # md catalogs have no non-secret identity + pytest.param( + lambda: DuckLakeClientConfiguration(credentials=_ducklake_creds("md:///md_db", "lake")), + lambda: DuckLakeClientConfiguration(credentials=_ducklake_creds("md:///md_db", "lake")), + False, + id="dl_md_cat_not_joinable", ), ] @@ -811,17 +892,38 @@ def test_cross_type_different_physical_locations() -> None: # Filesystem special cases -def test_filesystem_joinability_is_engine_based_not_location_based() -> None: - c1 = FilesystemDestinationClientConfiguration(bucket_url="s3://b1/p") - c2 = FilesystemDestinationClientConfiguration(bucket_url="s3://b2/p") - c3 = FilesystemDestinationClientConfiguration(bucket_url="/local/p") - c4 = FilesystemDestinationClientConfiguration(bucket_url="gs://b/p") - assert_joinable(c1, c2) - assert_joinable(c1, c3) - assert_joinable(c1, c4) +# NOTE: reading across different filesystem locations requires auto ATTACH in the +# duckdb view layer; until then only the same storage location is readable +@pytest.mark.parametrize( + "url1,url2,expected", + [ + pytest.param("s3://b/p1", "s3://b/p2", True, id="same_bucket_different_prefix"), + pytest.param("s3://b1/p", "s3://b2/p", False, id="different_bucket"), + pytest.param("s3://b/p", "gs://b/p", False, id="different_scheme_same_bucket"), + pytest.param("/local/p", "/local/p", True, id="same_local_path"), + pytest.param("/local/p1", "/local/p2", False, id="different_local_path"), + pytest.param("s3://b/p", "/local/p", False, id="remote_vs_local"), + ], +) +def test_filesystem_can_read_from_same_location(url1: str, url2: str, expected: bool) -> None: + c1 = FilesystemDestinationClientConfiguration(bucket_url=url1) + c2 = FilesystemDestinationClientConfiguration(bucket_url=url2) + assert_join_result(c1, c2, expected) + + +def 
test_filesystem_can_never_write() -> None: + """dlt is the only engine that writes to filesystem, so SQL write is never possible.""" + c1 = FilesystemDestinationClientConfiguration(bucket_url="s3://b/p") + c2 = FilesystemDestinationClientConfiguration(bucket_url="s3://b/p") + # same location is readable but not writable + assert c1.can_read_from(c2) + assert not c1.can_write_from(c2) + assert not c2.can_write_from(c1) + # not even from itself + assert not c1.can_write_from(c1) -def test_filesystem_cannot_join_with_non_filesystem() -> None: +def test_filesystem_cannot_read_from_non_filesystem() -> None: c = FilesystemDestinationClientConfiguration(bucket_url="s3://b/p") other = _PhysicalDestinationConfig("s3://b") assert_not_joinable(c, other) @@ -897,6 +999,25 @@ def test_motherduck_can_join_with_non_motherduck() -> None: pytest.param("postgresql://u@h:5432/db", "mysql://u@h:3306/db", False, id="diff_dialects"), pytest.param("unknown://u@h:1234/db", "unknown://u@h:1234/db", True, id="unknown_same"), pytest.param("unknown://u@h:1234/db1", "unknown://u@h:1234/db2", False, id="unknown_diff_db"), + # dbapi driver suffix does not change the backend identity + pytest.param( + "mysql+pymysql://u@h:3306/db1", "mysql+mysqldb://u@h:3306/db2", True, id="mysql_dbapi" + ), + pytest.param( + "postgresql+psycopg2://u@h:5432/db", "postgresql://u@h:5432/db", True, id="pg_dbapi" + ), + # each in-memory database is a separate database + pytest.param("sqlite:///:memory:", "sqlite:///:memory:", False, id="sqlite_memory"), + # mssql can query across databases via 3-part names + pytest.param( + "mssql+pyodbc://u@h:1433/db1", "mssql+pyodbc://u@h:1433/db2", True, id="mssql_diff_db" + ), + # oracle (db links) and db2 (federation) cannot query across databases + pytest.param("oracle://u@h:1521/svc", "oracle://u@h:1521/svc", True, id="oracle_same_service"), + pytest.param( + "oracle://u@h:1521/svc1", "oracle://u@h:1521/svc2", False, id="oracle_diff_service" + ), + 
pytest.param("db2://u@h:50000/db1", "db2://u@h:50000/db2", False, id="db2_diff_db"), ] @@ -914,7 +1035,7 @@ def test_sqlalchemy_can_join_with(conn1: str, conn2: str, expected: bool) -> Non lambda: _lancedb_config("/tmp/db.lancedb"), lambda: _lancedb_config("/tmp/db.lancedb"), True, - id="same_uri_dataset_separator", + id="same_uri", ), pytest.param( lambda: _lancedb_config("/tmp/db1.lancedb"), @@ -928,11 +1049,13 @@ def test_sqlalchemy_can_join_with(conn1: str, conn2: str, expected: bool) -> Non True, id="different_dataset_same_uri", ), + # any table at the same location is readable via the same ATTACH, + # separator only affects table naming pytest.param( lambda: _lancedb_config("/tmp/db.lancedb", dataset_separator="___"), lambda: _lancedb_config("/tmp/db.lancedb", dataset_separator="__"), - False, - id="different_separator", + True, + id="different_separator_same_uri", ), pytest.param( lambda: _lancedb_config(":external:"), @@ -946,6 +1069,18 @@ def test_lancedb_can_join_with(f1: ConfigFactory, f2: ConfigFactory, expected: b assert_join_result(f1(), f2(), expected) +def test_lancedb_can_never_write() -> None: + """dlt is the only engine that writes to LanceDB, so SQL write is never possible.""" + c1 = _lancedb_config("/tmp/db.lancedb") + c2 = _lancedb_config("/tmp/db.lancedb") + # same location is readable but not writable + assert c1.can_read_from(c2) + assert not c1.can_write_from(c2) + assert not c2.can_write_from(c1) + # not even from itself + assert not c1.can_write_from(c1) + + @pytest.mark.parametrize( "f1,f2,expected", [ @@ -961,11 +1096,30 @@ def test_lancedb_can_join_with(f1: ConfigFactory, f2: ConfigFactory, expected: b False, id="different_catalog", ), + # TODO: flip to True when cross dataset joins are implemented pytest.param( lambda: _lance_config("file:///tmp/lance", dataset_name="dataset1"), lambda: _lance_config("file:///tmp/lance", dataset_name="dataset2"), False, - id="different_dataset", + id="different_dataset_same_catalog", + ), + 
pytest.param( + lambda: _lance_multi_base_config("s3://catalogs/manifest", "s3://data1/lake"), + lambda: _lance_multi_base_config("s3://catalogs/manifest", "s3://data2/lake"), + True, + id="same_catalog_different_data_storage", + ), + pytest.param( + lambda: _lance_rest_config("http://127.0.0.1:2333"), + lambda: _lance_rest_config("http://127.0.0.1:2333/"), + True, + id="same_rest_namespace", + ), + pytest.param( + lambda: _lance_rest_config("http://127.0.0.1:2333"), + lambda: _lance_rest_config("http://other:2333"), + False, + id="different_rest_namespace", ), ], ) @@ -973,6 +1127,49 @@ def test_lance_can_join_with(f1: ConfigFactory, f2: ConfigFactory, expected: boo assert_join_result(f1(), f2(), expected) +@pytest.mark.parametrize( + "factory,expected", + [ + pytest.param( + lambda: _lance_config("file:///tmp/lance"), + "dir:file:///tmp/lance", + id="explicit_dir_catalog", + ), + pytest.param( + lambda: _lance_multi_base_config("s3://catalogs/manifest", "s3://data/lake"), + "dir:s3://catalogs/manifest", + id="catalog_takes_precedence_over_storage", + ), + pytest.param( + lambda: _lance_multi_base_config(None, "s3://data/lake"), + f"dir:s3://data/lake/{DEFAULT_LANCE_NAMESPACE_NAME}", + id="falls_back_to_storage_namespace", + ), + pytest.param( + lambda: _lance_rest_config("http://127.0.0.1:2333/"), + "rest:http://127.0.0.1:2333", + id="rest_namespace_uri", + ), + pytest.param(lambda: _lance_rest_config(None), "", id="rest_without_uri"), + pytest.param(lambda: LanceClientConfiguration(), "", id="empty"), + ], +) +def test_lance_physical_location(factory: ConfigFactory, expected: str) -> None: + assert factory().physical_location() == expected + + +def test_lance_can_never_write() -> None: + """dlt is the only engine that writes to Lance, so SQL write is never possible.""" + c1 = _lance_config("file:///tmp/lance") + c2 = _lance_config("file:///tmp/lance") + # same catalog and dataset is readable but not writable + assert c1.can_read_from(c2) + assert not 
c1.can_write_from(c2) + assert not c2.can_write_from(c1) + # not even from itself + assert not c1.can_write_from(c1) + + def test_lance_and_lancedb_cannot_join_with_each_other() -> None: lance = _lance_config("file:///tmp/lance") lancedb = _lancedb_config("file:///tmp/lance") @@ -995,3 +1192,4 @@ def test_qdrant_physical_location_but_not_joinable() -> None: c2 = QdrantClientConfiguration(qd_location="https://cluster.qdrant.io") assert c1.physical_location() == "https://cluster.qdrant.io" assert_not_joinable(c1, c2) + assert not c1.can_write_from(c2) diff --git a/tests/load/pipeline/test_join_compatibility.py b/tests/load/pipeline/test_join_compatibility.py index 0ed405c047..1883ea38ba 100644 --- a/tests/load/pipeline/test_join_compatibility.py +++ b/tests/load/pipeline/test_join_compatibility.py @@ -131,7 +131,11 @@ def _run_two_pipeline_check( first_destination: Optional[TDestinationReferenceArg], second_destination: Optional[TDestinationReferenceArg], expected: bool, + expected_write: Optional[bool] = None, ) -> None: + # by default SQL write capability follows read capability + if expected_write is None: + expected_write = expected test_id = uniq_id() first_pipeline = destination_config.setup_pipeline( "join_first_" + test_id, @@ -159,8 +163,10 @@ def _run_two_pipeline_check( first_config = first_pipeline.dataset().destination_client.config second_config = second_pipeline.dataset().destination_client.config - assert first_config.can_join_with(second_config) is expected - assert second_config.can_join_with(first_config) is expected + assert first_config.can_read_from(second_config) is expected + assert second_config.can_read_from(first_config) is expected + assert first_config.can_write_from(second_config) is expected_write + assert second_config.can_write_from(first_config) is expected_write @pytest.mark.parametrize( @@ -176,7 +182,11 @@ def test_same_database_join_compatibility( first_destination, second_destination = _make_same_database_destinations( 
destination_config, tmp_path, test_id ) - _run_two_pipeline_check(destination_config, first_destination, second_destination, True) + # filesystem at the same location is readable but dlt is the only writing engine + expected_write = False if destination_config.destination_type == "filesystem" else None + _run_two_pipeline_check( + destination_config, first_destination, second_destination, True, expected_write + ) @pytest.mark.parametrize( @@ -184,14 +194,15 @@ def test_same_database_join_compatibility( FILESYSTEM_DIFFERENT_LOCATION_JOIN_COMPATIBILITY_CONFIGS, ids=lambda x: x.name, ) -def test_filesystem_different_location_join_compatibility( +def test_filesystem_different_location_not_compatible( destination_config: DestinationTestConfiguration, tmp_path: Path, ) -> None: + # reading across filesystem locations requires auto ATTACH in the duckdb view layer first_destination, second_destination = _make_filesystem_different_location_destinations( tmp_path, uniq_id() ) - _run_two_pipeline_check(destination_config, first_destination, second_destination, True) + _run_two_pipeline_check(destination_config, first_destination, second_destination, False) @pytest.mark.parametrize( diff --git a/tests/load/sqlalchemy/test_sqlalchemy_configuration.py b/tests/load/sqlalchemy/test_sqlalchemy_configuration.py index 112877c9d2..7dd77f452a 100644 --- a/tests/load/sqlalchemy/test_sqlalchemy_configuration.py +++ b/tests/load/sqlalchemy/test_sqlalchemy_configuration.py @@ -23,16 +23,27 @@ "credentials,expected_fingerprint", [ pytest.param(None, "", id="empty"), + # in-memory databases have no identity pytest.param( SqlalchemyCredentials("sqlite:///:memory:"), - digest128(":memory:"), + "", id="sqlite_memory", ), + pytest.param( + SqlalchemyCredentials("sqlite:////data/db.sqlite"), + digest128("/data/db.sqlite"), + id="sqlite_file", + ), pytest.param( SqlalchemyCredentials("postgresql://user1:pass1@host1:5432/db1"), digest128("host1:5432"), id="postgres_host_port", ), + pytest.param( + 
SqlalchemyCredentials("mysql+pymysql://user1:pass1@host1:3306/db1"), + digest128("host1:3306"), + id="mysql_dbapi_host_port", + ), ], ) def test_sqlalchemy_fingerprint( From a732ce997b7df4818571c542e6456d5ccfc4c4ee Mon Sep 17 00:00:00 2001 From: rudolfix Date: Fri, 5 Jun 2026 22:05:45 +0200 Subject: [PATCH 16/16] athena casefold fix --- dlt/destinations/impl/athena/configuration.py | 3 ++- tests/destinations/test_join_compatibility.py | 13 +++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/dlt/destinations/impl/athena/configuration.py b/dlt/destinations/impl/athena/configuration.py index 1bd66e08e4..b18ff77d85 100644 --- a/dlt/destinations/impl/athena/configuration.py +++ b/dlt/destinations/impl/athena/configuration.py @@ -63,7 +63,8 @@ def _is_s3_tables_catalog(self) -> bool: def physical_location(self) -> str: """Returns region/catalog, or "" when region is unavailable.""" - catalog = self.aws_data_catalog or DEFAULT_AWS_DATA_CATALOG + # athena catalog names are case-insensitive, AWS docs spell the default `AwsDataCatalog` + catalog = (self.aws_data_catalog or DEFAULT_AWS_DATA_CATALOG).lower() region = None if self.credentials: region = self.credentials.region_name diff --git a/tests/destinations/test_join_compatibility.py b/tests/destinations/test_join_compatibility.py index 122992b7c7..674d71c318 100644 --- a/tests/destinations/test_join_compatibility.py +++ b/tests/destinations/test_join_compatibility.py @@ -395,6 +395,12 @@ def test_is_same_physical_location_delegates_to_can_join_with() -> None: "eu-central-1/awsdatacatalog", id="athena_default_catalog", ), + # catalog names are case-insensitive, AWS docs spell the default `AwsDataCatalog` + pytest.param( + lambda: _athena_config("eu-central-1", "AwsDataCatalog"), + "eu-central-1/awsdatacatalog", + id="athena_catalog_casefolded", + ), # Dremio pytest.param( lambda: DremioClientConfiguration(credentials=DremioCredentials("grpc://h")), @@ -795,6 +801,13 @@ def 
test_physical_location(factory: ConfigFactory, expected: ExpectedLocation) - False, id="athena_diff_catalog", ), + # catalog names are case-insensitive + pytest.param( + lambda: _athena_config("us-west-2", "AwsDataCatalog"), + lambda: _athena_config("us-west-2", "awsdatacatalog"), + True, + id="athena_catalog_case_insensitive", + ), pytest.param( lambda: AthenaClientConfiguration( credentials=AwsCredentials(),