diff --git a/Makefile b/Makefile index f77ecd4203..bd9ef80e24 100644 --- a/Makefile +++ b/Makefile @@ -225,7 +225,8 @@ TEST_COMMON_CORE_PATHS = \ tests/load/test_dummy_client.py \ tests/extract/test_extract.py \ tests/extract/test_sources.py \ - tests/pipeline/test_pipeline_state.py + tests/pipeline/test_pipeline_state.py \ + --ignore tests/normalize/test_normalize_arrow.py test-common-core: $(call RUN_XDIST_SAFE_SPLIT,$(TEST_COMMON_CORE_PATHS)) @@ -257,7 +258,7 @@ test-pipeline-min: install-pipeline-arrow: uv sync $(UV_SYNC_ARGS) --extra duckdb --extra cli --extra parquet -TEST_PIPELINE_ARROW_PATHS = tests/pipeline/test_pipeline_extra.py +TEST_PIPELINE_ARROW_PATHS = tests/pipeline/test_pipeline_extra.py tests/normalize/test_normalize_arrow.py test-pipeline-arrow: PYTEST_TARGET_ARGS = -k arrow test-pipeline-arrow: diff --git a/dlt/common/destination/client.py b/dlt/common/destination/client.py index db3f3f28fa..53314d02f0 100644 --- a/dlt/common/destination/client.py +++ b/dlt/common/destination/client.py @@ -596,6 +596,7 @@ def update_stored_schema( self, only_tables: Iterable[str] = None, expected_update: TSchemaTables = None, + force: bool = False, ) -> Optional[TSchemaTables]: """Updates storage to the current schema. @@ -605,6 +606,7 @@ def update_stored_schema( Args: only_tables (Sequence[str], optional): Updates only listed tables. Defaults to None. expected_update (TSchemaTables, optional): Update that is expected to be applied to the destination + force (bool): force full schema migration regardless of previous updates Returns: Optional[TSchemaTables]: Returns an update that was applied at the destination. 
""" diff --git a/dlt/common/schema/typing.py b/dlt/common/schema/typing.py index a7b58f2c45..98f1f7e0b7 100644 --- a/dlt/common/schema/typing.py +++ b/dlt/common/schema/typing.py @@ -24,6 +24,7 @@ TColumnNames, TypedDict, get_args, + NotRequired, ) if TYPE_CHECKING: @@ -367,6 +368,7 @@ class _TTableSchemaBase(TTableProcessingHints, total=False): resource: Optional[str] table_format: Optional[TTableFormat] file_format: Optional[TFileFormat] + variant_name: NotRequired[str] class TTableSchema(_TTableSchemaBase, total=False): diff --git a/dlt/common/storages/data_item_storage.py b/dlt/common/storages/data_item_storage.py index 7ed4736330..cc27d0b416 100644 --- a/dlt/common/storages/data_item_storage.py +++ b/dlt/common/storages/data_item_storage.py @@ -1,4 +1,4 @@ -from typing import Dict, Any, List +from typing import Dict, Any, List, Tuple from abc import ABC, abstractmethod from dlt.common import logger @@ -22,9 +22,7 @@ def __init__(self, writer_spec: FileWriterSpec, *args: Any) -> None: def _get_writer( self, load_id: str, schema_name: str, table_name: str ) -> BufferedDataWriter[DataWriter]: - # unique writer id - writer_id = f"{load_id}.{schema_name}.{table_name}" - writer = self.buffered_writers.get(writer_id, None) + writer_id, writer = self.get_active_writer(load_id, schema_name, table_name) if not writer: # assign a writer for each table kwargs = {} @@ -35,6 +33,13 @@ def _get_writer( self.buffered_writers[writer_id] = writer return writer + def get_active_writer( + self, load_id: str, schema_name: str, table_name: str + ) -> Tuple[str, BufferedDataWriter[DataWriter]]: + # unique writer id + writer_id = f"{load_id}.{schema_name}.{table_name}" + return writer_id, self.buffered_writers.get(writer_id, None) + def write_data_item( self, load_id: str, diff --git a/dlt/common/storages/load_package.py b/dlt/common/storages/load_package.py index 3fce116b98..2b08c810e5 100644 --- a/dlt/common/storages/load_package.py +++ b/dlt/common/storages/load_package.py @@ 
-604,6 +604,38 @@ def remove_completed_jobs(self, load_id: str) -> None: recursively=True, ) + def is_empty_package(self, load_id: str) -> bool: + """Package is empty if it does not contain any jobs or refresh commands (tables to + truncate / drop) in package state. A package that is being processed (applied schema + update already written) is never considered empty.""" + applied_schema_update_file = os.path.join( + self.get_package_path(load_id), PackageStorage.APPLIED_SCHEMA_UPDATES_FILE_NAME + ) + if self.storage.has_file(applied_schema_update_file): + return False + package_state = self.get_load_package_state(load_id) + dropped_tables = package_state.get("dropped_tables", []) + truncated_tables = package_state.get("truncated_tables", []) + return ( + len(dropped_tables) == 0 + and len(truncated_tables) == 0 + and len(self.list_new_jobs(load_id)) == 0 + ) + + def get_schema_update_file(self, load_id: str) -> Optional[TSchemaTables]: + """Reads the update file from load package `load_id` and returns its content. 
+ Returns none if update file is already processed + """ + package_path = self.get_package_path(load_id) + if not self.storage.has_folder(package_path): + raise FileNotFoundError(package_path) + schema_update_file = os.path.join(package_path, PackageStorage.SCHEMA_UPDATES_FILE_NAME) + if self.storage.has_file(schema_update_file): + schema_update: TSchemaTables = json.loads(self.storage.load(schema_update_file)) + return schema_update + else: + return None + def delete_package(self, load_id: str, not_exists_ok: bool = False) -> None: package_path = self.get_package_path(load_id) if not self.storage.has_folder(package_path): diff --git a/dlt/common/storages/load_storage.py b/dlt/common/storages/load_storage.py index 5e8a755016..168dfc58d4 100644 --- a/dlt/common/storages/load_storage.py +++ b/dlt/common/storages/load_storage.py @@ -130,18 +130,7 @@ def list_failed_jobs_in_loaded_package(self, load_id: str) -> Sequence[LoadJobIn return self.loaded_packages.list_failed_jobs_infos(load_id) def begin_schema_update(self, load_id: str) -> Optional[TSchemaTables]: - """Reads the update file from load package `load_id` and returns its content. 
- Returns none if update file is already processed (deleted in commit_schema_update) - """ - package_path = self.get_normalized_package_path(load_id) - if not self.storage.has_folder(package_path): - raise FileNotFoundError(package_path) - schema_update_file = join(package_path, PackageStorage.SCHEMA_UPDATES_FILE_NAME) - if self.storage.has_file(schema_update_file): - schema_update: TSchemaTables = json.loads(self.storage.load(schema_update_file)) - return schema_update - else: - return None + return self.normalized_packages.get_schema_update_file(load_id) def commit_schema_update(self, load_id: str, applied_update: TSchemaTables) -> None: """Marks schema update as processed by removing schema update file and saving the applied diff --git a/dlt/destinations/impl/athena/athena.py b/dlt/destinations/impl/athena/athena.py index f2f296d31c..23f8741b2d 100644 --- a/dlt/destinations/impl/athena/athena.py +++ b/dlt/destinations/impl/athena/athena.py @@ -399,8 +399,11 @@ def update_stored_schema( self, only_tables: Iterable[str] = None, expected_update: TSchemaTables = None, + force: bool = False, ) -> Optional[TSchemaTables]: - applied_update = super().update_stored_schema(only_tables, expected_update=expected_update) + applied_update = super().update_stored_schema( + only_tables, expected_update=expected_update, force=force + ) # here we could apply tags only if any migration happened, right now we do it on each run # NOTE: tags are applied before any data is loaded if ( diff --git a/dlt/destinations/impl/destination/destination.py b/dlt/destinations/impl/destination/destination.py index e655e2bc0f..4cf1e67e20 100644 --- a/dlt/destinations/impl/destination/destination.py +++ b/dlt/destinations/impl/destination/destination.py @@ -49,8 +49,9 @@ def update_stored_schema( self, only_tables: Iterable[str] = None, expected_update: TSchemaTables = None, + force: bool = False, ) -> Optional[TSchemaTables]: - return super().update_stored_schema(only_tables, expected_update) + 
return super().update_stored_schema(only_tables, expected_update, force) def create_load_job( self, table: PreparedTableSchema, file_path: str, load_id: str, restore: bool = False diff --git a/dlt/destinations/impl/duckdb/sql_client.py b/dlt/destinations/impl/duckdb/sql_client.py index dabae6fdc7..76927162db 100644 --- a/dlt/destinations/impl/duckdb/sql_client.py +++ b/dlt/destinations/impl/duckdb/sql_client.py @@ -609,6 +609,9 @@ def create_views_for_tables(self, tables: Dict[str, str]) -> None: ``UNION ALL BY NAME``. """ existing_tables = set(tname[0] for tname in self._conn.execute("SHOW TABLES").fetchall()) + + # TODO: existing table schemas and sql statements can be cached so we do not have to recompute everything + # with every query tables_with_data: set[str] = set() for s in self.schemas.values(): tables_with_data.update(s.dlt_table_names()) @@ -691,8 +694,8 @@ def execute_query(self, query: AnyStr, *args: Any, **kwargs: Any) -> Iterator[DB if not table.this: continue schema = table.db - # add only tables from the dataset schema - if schema or schema.lower() != self.dataset_name.lower(): + # add only tables that do not have schema prefix or schema prefix is actual dataset + if not schema or schema.lower() == self.dataset_name.lower(): load_tables[table.name] = table.name if load_tables: diff --git a/dlt/destinations/impl/dummy/dummy.py b/dlt/destinations/impl/dummy/dummy.py index ba9d55bbed..b79699c328 100644 --- a/dlt/destinations/impl/dummy/dummy.py +++ b/dlt/destinations/impl/dummy/dummy.py @@ -155,8 +155,9 @@ def update_stored_schema( self, only_tables: Iterable[str] = None, expected_update: TSchemaTables = None, + force: bool = False, ) -> Optional[TSchemaTables]: - applied_update = super().update_stored_schema(only_tables, expected_update) + applied_update = super().update_stored_schema(only_tables, expected_update, force) if self.config.fail_schema_update: raise DestinationTransientException( "Raise on schema update due to `fail_schema_update` config 
flag" diff --git a/dlt/destinations/impl/filesystem/filesystem.py b/dlt/destinations/impl/filesystem/filesystem.py index e371692e02..ad502d1268 100644 --- a/dlt/destinations/impl/filesystem/filesystem.py +++ b/dlt/destinations/impl/filesystem/filesystem.py @@ -823,17 +823,24 @@ def update_stored_schema( self, only_tables: Iterable[str] = None, expected_update: TSchemaTables = None, + force: bool = False, ) -> TSchemaTables: - applied_update = super().update_stored_schema(only_tables, expected_update) + applied_update = super().update_stored_schema(only_tables, expected_update, force) # don't store schema when used as staging if not self.config.as_staging_destination: # check if schema with hash exists current_hash = self.schema.stored_version_hash - if not self._get_stored_schema_by_hash_or_newest(current_hash): + stored = self._get_stored_schema_by_hash_or_newest(current_hash) + if not stored or force: logger.info( f"Schema with hash {self.schema.stored_version_hash} not found in the storage." 
" upgrading" + if not stored + else ( + f"Schema with hash {self.schema.stored_version_hash} found in storage but" + " update is enforced (tables to truncate/drop), ensuring table dirs" + ) ) # create destination dirs for all tables # TODO: find only tables with changes @@ -841,7 +848,9 @@ def update_stored_schema( dirs_to_create = self.get_table_dirs(table_names) for _, directory in zip(table_names, dirs_to_create): self.fs_client.makedirs(directory, exist_ok=True) - self._update_schema_in_storage(self.schema) + # do not write a duplicate schema file when the hash is already stored + if not stored: + self._update_schema_in_storage(self.schema) # we assume that expected_update == applied_update so table schemas in dest were not # externally changed diff --git a/dlt/destinations/impl/lance/configuration.py b/dlt/destinations/impl/lance/configuration.py index e2ab9f4a62..4e8541f8e5 100644 --- a/dlt/destinations/impl/lance/configuration.py +++ b/dlt/destinations/impl/lance/configuration.py @@ -271,6 +271,10 @@ class LanceClientConfiguration(WithLocalFiles, DestinationClientDwhConfiguration destination_type: Final[str] = dataclasses.field( # type: ignore default="lance", init=False, repr=False, compare=False ) + # dataset_name is optional: when not set tables are created in the root namespace + dataset_name: Final[Optional[str]] = dataclasses.field( # type: ignore + default=None, init=False, repr=False, compare=False + ) catalog_type: LanceCatalogType = "dir" CATALOG_CREDENTIALS: ClassVar[Dict[LanceCatalogType, Any]] = { @@ -293,6 +297,10 @@ class LanceClientConfiguration(WithLocalFiles, DestinationClientDwhConfiguration """Name of branch to use for read/write table operations. Uses `main` branch if not set.""" embeddings: Optional[LanceEmbeddingsConfiguration] = None """Optional embeddings configuration to add a vector embedding column.""" + always_refresh_views: bool = False + """Recreate the duckdb scanner views on each `dataset()` read. 
New rows are visible without + this (lance reads the latest dataset version on each scan); enable it to also pick up schema + changes (new columns) through an already-open connection.""" @property def storage_options(self) -> Optional[Dict[str, str]]: diff --git a/dlt/destinations/impl/lance/exceptions.py b/dlt/destinations/impl/lance/exceptions.py index fd157df8f1..d1bd5e236b 100644 --- a/dlt/destinations/impl/lance/exceptions.py +++ b/dlt/destinations/impl/lance/exceptions.py @@ -3,6 +3,7 @@ from typing import Any, List from dlt.common.destination.exceptions import ( + DestinationException, DestinationUndefinedEntity, DestinationTerminalException, DestinationTransientException, @@ -48,6 +49,9 @@ def raise_destination_error(f: TFun) -> TFun: def _wrap(self: JobClientBase, *args: Any, **kwargs: Any) -> Any: try: return f(self, *args, **kwargs) + except DestinationException: + # already converted (eg. raised by a nested decorated call) + raise except Exception as e: if is_lance_undefined_entity_exception(e): raise DestinationUndefinedEntity(e) from e diff --git a/dlt/destinations/impl/lance/factory.py b/dlt/destinations/impl/lance/factory.py index 930ad76231..a6c410c739 100644 --- a/dlt/destinations/impl/lance/factory.py +++ b/dlt/destinations/impl/lance/factory.py @@ -49,6 +49,7 @@ def _raw_capabilities(self) -> DestinationCapabilitiesContext: caps.supports_ddl_transactions = False caps.decimal_precision = (38, 18) + caps.wei_precision = (38, 0) caps.timestamp_precision = 6 caps.supported_replace_strategies = ["truncate-and-insert"] diff --git a/dlt/destinations/impl/lance/lance_client.py b/dlt/destinations/impl/lance/lance_client.py index 1f9a406f9b..881f6c325f 100644 --- a/dlt/destinations/impl/lance/lance_client.py +++ b/dlt/destinations/impl/lance/lance_client.py @@ -1,5 +1,6 @@ from __future__ import annotations +from copy import copy from types import TracebackType from typing import ( Dict, @@ -123,27 +124,40 @@ def sql_client(self) -> SqlClientBase[Any]: 
def sql_client(self, client: SqlClientBase[Any]) -> None: self._sql_client = client + def make_namespace_id(self) -> List[str]: + """Returns namespace `id` for the dataset. Empty (root namespace) when `dataset_name` is + not set.""" + return [] if self.dataset_name is None else [self.dataset_name] + @raise_destination_error def list_dataset_namespace_tables(self) -> List[str]: - return self.namespace.list_tables(ListTablesRequest(id=[self.dataset_name])).tables + return self.namespace.list_tables(ListTablesRequest(id=self.make_namespace_id())).tables @raise_destination_error def create_dataset_namespace(self) -> None: - """Creates child namespace for dataset in root namespace.""" - self.namespace.create_namespace(CreateNamespaceRequest(id=[self.dataset_name])) + """Creates child namespace for dataset in root namespace. No-op for the root namespace + (`dataset_name` not set) which always exists.""" + if self.dataset_name is None: + return + self.namespace.create_namespace(CreateNamespaceRequest(id=self.make_namespace_id())) @raise_destination_error def drop_dataset_namespace(self) -> None: - """Drops dataset namespace after removing all its tables.""" + """Drops dataset namespace after removing all its tables""" for table in self.list_dataset_namespace_tables(): self.namespace.drop_table(DropTableRequest(id=self.make_table_id(table))) - self.namespace.drop_namespace(DropNamespaceRequest(id=[self.dataset_name])) + # for the root namespace (`dataset_name` not set) only the tables are dropped + if self.dataset_name is not None: + self.namespace.drop_namespace(DropNamespaceRequest(id=self.make_namespace_id())) @raise_destination_error def dataset_namespace_exists(self) -> bool: """Returns True if child namespace for dataset exists in root namespace.""" + # the root namespace (`dataset_name` not set) always exists + if self.dataset_name is None: + return True try: - self.namespace.namespace_exists(NamespaceExistsRequest(id=[self.dataset_name])) + 
self.namespace.namespace_exists(NamespaceExistsRequest(id=self.make_namespace_id())) return True except Exception as e: if is_lance_undefined_entity_exception(e): @@ -177,7 +191,7 @@ def table_exists(self, table_name: str) -> bool: def make_table_id(self, table_name: str) -> List[str]: """Returns namespace `table_id` for given table name.""" - return [self.dataset_name, table_name] + return [*self.make_namespace_id(), table_name] def get_table_schema(self, table_name: str) -> pa.Schema: return self.open_lance_dataset(table_name, branch_name=self.config.branch_name).schema @@ -186,20 +200,36 @@ def get_table_uri(self, table_name: str) -> str: # we don't pass branch here — `uri` always returns base URI return self.open_lance_dataset(table_name).uri - def drop_tables(self, *tables: str) -> None: - """Drops tables from lance dataset namespace.""" + def drop_tables(self, *tables: str, delete_schema: bool = True) -> None: + """Drops tables from lance dataset namespace and optionally deletes the stored schema.""" for table_name in tables: - self.drop_table(table_name) + if self.table_exists(table_name): + self.drop_table(table_name) + if delete_schema: + self._delete_schema_in_storage(self.schema) + + @raise_destination_error + def _delete_schema_in_storage(self, schema: Schema) -> None: + """Deletes all stored versions with the same name as `schema`. 
No-op if table is missing.""" + if not self.table_exists(self.schema.version_table_name): + return + col = self.schema.naming.normalize_identifier("schema_name") + ds = self.open_lance_dataset( + self.schema.version_table_name, branch_name=self.config.branch_name + ) + ds.delete(f'`{col}` = "{schema.name}"') def drop_storage(self) -> None: """Drops dataset namespace and all its tables.""" if self.dataset_namespace_exists(): self.drop_dataset_namespace() + @raise_destination_error def truncate_table(self, table_name: str) -> None: """Truncates table by deleting all rows in active branch.""" self.open_lance_dataset(table_name, branch_name=self.config.branch_name).delete("true") + @raise_destination_error def create_branch_if_not_exists(self, table_name: str, branch_name: str) -> None: ds = self.open_lance_dataset(table_name) if branch_name not in ds.branches.list(): @@ -234,7 +264,7 @@ def open_lancedb_table(self, table_name: str) -> LanceTable: This provides access to LanceDB-specific features like vector search. """ db = LanceNamespaceDBConnection(self.namespace, storage_options=self.config.storage_options) - return db.open_table(table_name, namespace=[self.dataset_name]) + return db.open_table(table_name, namespace=self.make_namespace_id()) @raise_destination_error def _write_records( @@ -327,28 +357,29 @@ def update_stored_schema( self, only_tables: Iterable[str] = None, expected_update: TSchemaTables = None, + force: bool = False, ) -> Optional[TSchemaTables]: - applied_update = super().update_stored_schema(only_tables, expected_update) + super().update_stored_schema(only_tables, expected_update, force) try: schema_info = self.get_stored_schema_by_hash(self.schema.stored_version_hash) except DestinationUndefinedEntity: schema_info = None - if schema_info is None: + applied_update: TSchemaTables = {} + if schema_info is None or force: logger.info( f"Schema with hash {self.schema.stored_version_hash} " - "not found in the storage. 
upgrading" + "not found in the storage (or update enforced). upgrading" + ) + applied_update = self._execute_schema_update( + only_tables, store_schema=schema_info is None ) - # TODO: return a real updated table schema (like in SQL job client) - self._execute_schema_update(only_tables) else: logger.debug( f"Schema with hash {self.schema.stored_version_hash} " f"inserted at {schema_info.inserted_at} found " "in storage, no upgrade required" ) - # we assume that expected_update == applied_update so table schemas in dest were not - # externally changed return applied_update def prepare_load_table(self, table_name: str) -> PreparedTableSchema: @@ -370,6 +401,9 @@ def get_storage_table(self, table_name: str) -> Tuple[bool, TTableSchemaColumns] try: arrow_schema = self.get_table_schema(table_name) + except DestinationUndefinedEntity: + # `open_lance_dataset` already mapped a missing table/namespace to this exception + return False, table_schema except Exception as e: if is_lance_undefined_entity_exception(e): return False, table_schema @@ -433,13 +467,17 @@ def make_arrow_table_schema(self, table_name: str) -> pa.Schema: return arrow_schema + @raise_destination_error def add_null_columns_to_table(self, table_name: str, new_columns: List[TColumnSchema]) -> None: new_fields = [dlt_column_to_arrow_field(col, self.capabilities) for col in new_columns] self.open_lance_dataset(table_name, branch_name=self.config.branch_name).add_columns( new_fields ) - def _execute_schema_update(self, only_tables: Iterable[str]) -> None: + def _execute_schema_update( + self, only_tables: Iterable[str], store_schema: bool = True + ) -> TSchemaTables: + applied_update: TSchemaTables = {} for table_name in only_tables or self.schema.tables: table_exists = self.table_exists(table_name) @@ -447,22 +485,34 @@ def _execute_schema_update(self, only_tables: Iterable[str]) -> None: if not table_exists: self.create_table(table_name, self.make_arrow_table_schema(table_name)) - # create branch if needed + 
# create branch if needed before diffing: a new branch forks from main and inherits + # its schema, so columns must be read from the branch *after* it exists if branch_name := self.config.branch_name: self.create_branch_if_not_exists(table_name, branch_name) - # add new columns to existing table (on the branch if configured) - if table_exists: - _, existing_columns = self.get_storage_table(table_name) - new_columns = self.schema.get_new_table_columns( - table_name, - existing_columns, - self.capabilities.generates_case_sensitive_identifiers(), - ) - if new_columns: - self.add_null_columns_to_table(table_name, new_columns) + # diff against the destination (branch, if configured): for a new table all columns + # are new + existing_columns = self.get_storage_table(table_name)[1] if table_exists else {} + new_columns = self.schema.get_new_table_columns( + table_name, + existing_columns, + self.capabilities.generates_case_sensitive_identifiers(), + ) - self.update_schema_in_storage() + # add new columns to existing table (on the branch if configured) + if table_exists and new_columns: + self.add_null_columns_to_table(table_name, new_columns) + + # record the migration applied to this table (new table or added columns) + if new_columns: + partial_table = copy(self.prepare_load_table(table_name)) + partial_table["columns"] = {c["name"]: c for c in new_columns} + applied_update[table_name] = partial_table + + # skip writing the version row when the schema is already stored (enforced update) + if store_schema: + self._update_schema_in_storage(self.schema) + return applied_update def get_stored_state(self, pipeline_name: str) -> Optional[StateInfo]: """Retrieves the latest completed state for a pipeline.""" @@ -500,9 +550,13 @@ def get_stored_state(self, pipeline_name: str) -> Optional[StateInfo]: return StateInfo.from_normalized_mapping(row, self.schema.naming) def _get_latest_schema(self, filter_: Optional[str] = None) -> Optional[StorageSchemaInfo]: - ds = 
self.open_lance_dataset( - self.schema.version_table_name, branch_name=self.config.branch_name - ) + try: + ds = self.open_lance_dataset( + self.schema.version_table_name, branch_name=self.config.branch_name + ) + except DestinationUndefinedEntity: + # version table not created yet (empty storage) + return None table = ds.scanner(filter=filter_, prefilter=True).to_table() if filter_ else ds.to_table() rows = table.to_pylist() try: @@ -524,14 +578,14 @@ def get_stored_schema(self, schema_name: str = None) -> Optional[StorageSchemaIn return self._get_latest_schema(filter_=f'`{col}` = "{schema_name}"') return self._get_latest_schema() - def update_schema_in_storage(self) -> None: + def _update_schema_in_storage(self, schema: Schema) -> None: record = { - "version": self.schema.version, - "engine_version": self.schema.ENGINE_VERSION, + "version": schema.version, + "engine_version": schema.ENGINE_VERSION, "inserted_at": pendulum.now(), - "schema_name": self.schema.name, - "version_hash": self.schema.stored_version_hash, - "schema": json.dumps(self.schema.to_dict()), + "schema_name": schema.name, + "version_hash": schema.stored_version_hash, + "schema": json.dumps(schema.to_dict()), } records = [{self.schema.naming.normalize_identifier(k): v for k, v in record.items()}] write_disposition = self.schema.get_table(self.schema.version_table_name).get( diff --git a/dlt/destinations/impl/lance/sql_client.py b/dlt/destinations/impl/lance/sql_client.py index 180d969386..93d7ed2845 100644 --- a/dlt/destinations/impl/lance/sql_client.py +++ b/dlt/destinations/impl/lance/sql_client.py @@ -43,9 +43,10 @@ def _prepare_create_lance_secret_statement( class LanceSQLClient(WithTableScanners): def __init__(self, lance_client: LanceClient) -> None: self.lance_client = lance_client + # schema-less (no dataset_name): host the read views in the ephemeral duckdb `main` schema super().__init__( remote_client=lance_client, - dataset_name=lance_client.dataset_name, + 
dataset_name=lance_client.dataset_name or "main", ) def open_connection(self) -> DuckDBPyConnection: @@ -63,8 +64,7 @@ def can_create_view(self, table_schema: PreparedTableSchema) -> bool: return True def should_replace_view(self, view_name: str, table_schema: PreparedTableSchema) -> bool: - # lance datasets are versioned, always refresh to get latest data - return True + return self.lance_client.config.always_refresh_views def create_view_select( self, table_schema: PreparedTableSchema, schema: Schema = None diff --git a/dlt/destinations/impl/lancedb/configuration.py b/dlt/destinations/impl/lancedb/configuration.py index 4b8d725917..b50088d0a2 100644 --- a/dlt/destinations/impl/lancedb/configuration.py +++ b/dlt/destinations/impl/lancedb/configuration.py @@ -129,6 +129,8 @@ class LanceDBClientConfiguration(WithLocalFiles, DestinationClientDwhConfigurati dataset_name: Final[Optional[str]] = dataclasses.field( # type: ignore default=None, init=False, repr=False, compare=False ) + always_refresh_views: bool = False + """Recreates view before each query to it""" options: Optional[LanceDBClientOptions] = None """LanceDB client options.""" diff --git a/dlt/destinations/impl/lancedb/exceptions.py b/dlt/destinations/impl/lancedb/exceptions.py index cc1acc8482..5a1fb026f7 100644 --- a/dlt/destinations/impl/lancedb/exceptions.py +++ b/dlt/destinations/impl/lancedb/exceptions.py @@ -7,6 +7,7 @@ from lancedb.exceptions import MissingValueError, MissingColumnError from dlt.common.destination.exceptions import ( + DestinationException, DestinationUndefinedEntity, DestinationTransientException, ) @@ -28,6 +29,9 @@ def lancedb_error(f: TFun) -> TFun: def _wrap(self: JobClientBase, *args: Any, **kwargs: Any) -> Any: try: return f(self, *args, **kwargs) + except DestinationException: + # already converted (eg. 
raised by a nested decorated call) + raise except ValueError as e: if is_lancedb_not_found_error(str(e)): raise DestinationUndefinedEntity(e) from e diff --git a/dlt/destinations/impl/lancedb/factory.py b/dlt/destinations/impl/lancedb/factory.py index 079914d355..44989aa86a 100644 --- a/dlt/destinations/impl/lancedb/factory.py +++ b/dlt/destinations/impl/lancedb/factory.py @@ -46,6 +46,7 @@ def _raw_capabilities(self) -> DestinationCapabilitiesContext: caps.supports_ddl_transactions = False caps.decimal_precision = (38, 18) + caps.wei_precision = (38, 0) caps.timestamp_precision = 6 caps.supported_replace_strategies = ["truncate-and-insert"] diff --git a/dlt/destinations/impl/lancedb/lancedb_client.py b/dlt/destinations/impl/lancedb/lancedb_client.py index 3f728d8f6e..a7aa55be11 100644 --- a/dlt/destinations/impl/lancedb/lancedb_client.py +++ b/dlt/destinations/impl/lancedb/lancedb_client.py @@ -1,3 +1,4 @@ +from copy import copy from types import TracebackType from typing import ( List, @@ -176,18 +177,33 @@ def create_table( """ return self.db_client.create_table(table_name, schema=schema, mode=mode) + @lancedb_error def drop_tables(self, *tables: str, delete_schema: bool = True) -> None: - """Drop multiple LanceDB tables. + """Drop multiple LanceDB tables and optionally delete the stored schema. Args: table_names: The names of the tables to drop. + delete_schema: If True, also delete all versions of the current schema from storage. 
""" - if not tables: - return + if tables: + existing_tables = self.list_table_names() + for table_name in tables: + fq_table_name = self.make_qualified_table_name(table_name) + if fq_table_name in existing_tables: + self.db_client.drop_table(fq_table_name) + if delete_schema: + self._delete_schema_in_storage(self.schema) - for table_name in tables: - if table_name in self.list_table_names(): - self.db_client.drop_table(table_name) + @lancedb_error + def _delete_schema_in_storage(self, schema: Schema) -> None: + """Deletes all stored versions with the same name as `schema`. No-op if table is missing.""" + fq_version_table_name = self.make_qualified_table_name(self.schema.version_table_name) + if fq_version_table_name not in self.list_table_names(): + return + version_table: "lancedb.table.Table" = self.db_client.open_table(fq_version_table_name) + version_table.checkout_latest() + p_schema_name = self.schema.naming.normalize_identifier("schema_name") + version_table.delete(f'`{p_schema_name}` = "{schema.name}"') def delete_table(self, table_name: str) -> None: """Delete a LanceDB table. @@ -316,28 +332,29 @@ def update_stored_schema( self, only_tables: Iterable[str] = None, expected_update: TSchemaTables = None, + force: bool = False, ) -> Optional[TSchemaTables]: - applied_update = super().update_stored_schema(only_tables, expected_update) + super().update_stored_schema(only_tables, expected_update, force) try: schema_info = self.get_stored_schema_by_hash(self.schema.stored_version_hash) except DestinationUndefinedEntity: schema_info = None - if schema_info is None: + applied_update: TSchemaTables = {} + if schema_info is None or force: logger.info( f"Schema with hash {self.schema.stored_version_hash} " - "not found in the storage. upgrading" + "not found in the storage (or update enforced). 
upgrading" + ) + applied_update = self._execute_schema_update( + only_tables, store_schema=schema_info is None ) - # TODO: return a real updated table schema (like in SQL job client) - self._execute_schema_update(only_tables) else: logger.debug( f"Schema with hash {self.schema.stored_version_hash} " f"inserted at {schema_info.inserted_at} found " "in storage, no upgrade required" ) - # we assume that expected_update == applied_update so table schemas in dest were not - # externally changed return applied_update def get_storage_table(self, table_name: str) -> Tuple[bool, TTableSchemaColumns]: @@ -386,7 +403,10 @@ def extend_lancedb_table_schema(self, table_name: str, field_schemas: List[pa.Fi # TODO: Update method below doesn't work for bulk NULL assignments, raise with LanceDB developers. # table.update(values={field.name: None}) - def _execute_schema_update(self, only_tables: Iterable[str]) -> None: + def _execute_schema_update( + self, only_tables: Iterable[str], store_schema: bool = True + ) -> TSchemaTables: + applied_update: TSchemaTables = {} for table_name in only_tables or self.schema.tables: exists, existing_columns = self.get_storage_table(table_name) new_columns: List[TColumnSchema] = self.schema.get_new_table_columns( @@ -396,6 +416,10 @@ def _execute_schema_update(self, only_tables: Iterable[str]) -> None: ) logger.info(f"Found {len(new_columns)} updates for {table_name} in {self.schema.name}") if new_columns: + # record the migration applied to this table (new table or added columns) + partial_table = copy(self.prepare_load_table(table_name)) + partial_table["columns"] = {c["name"]: c for c in new_columns} + applied_update[table_name] = partial_table if exists: field_schemas: List[TArrowField] = [ make_arrow_field_schema(column["name"], column, self.type_mapper) @@ -429,24 +453,21 @@ def _execute_schema_update(self, only_tables: Iterable[str]) -> None: fq_table_name = self.make_qualified_table_name(table_name) self.create_table(fq_table_name, 
table_schema) - self.update_schema_in_storage() + # skip writing the version row when the schema is already stored (enforced update) + if store_schema: + self._update_schema_in_storage(self.schema) + return applied_update @lancedb_error - def update_schema_in_storage(self) -> None: + def _update_schema_in_storage(self, schema: Schema) -> None: records = [ { - self.schema.naming.normalize_identifier("version"): self.schema.version, - self.schema.naming.normalize_identifier( - "engine_version" - ): self.schema.ENGINE_VERSION, + self.schema.naming.normalize_identifier("version"): schema.version, + self.schema.naming.normalize_identifier("engine_version"): schema.ENGINE_VERSION, self.schema.naming.normalize_identifier("inserted_at"): pendulum.now(), - self.schema.naming.normalize_identifier("schema_name"): self.schema.name, - self.schema.naming.normalize_identifier( - "version_hash" - ): self.schema.stored_version_hash, - self.schema.naming.normalize_identifier("schema"): json.dumps( - self.schema.to_dict() - ), + self.schema.naming.normalize_identifier("schema_name"): schema.name, + self.schema.naming.normalize_identifier("version_hash"): schema.stored_version_hash, + self.schema.naming.normalize_identifier("schema"): json.dumps(schema.to_dict()), } ] fq_version_table_name = self.make_qualified_table_name(self.schema.version_table_name) @@ -518,6 +539,9 @@ def get_stored_state(self, pipeline_name: str) -> Optional[StateInfo]: @lancedb_error def get_stored_schema_by_hash(self, schema_hash: str) -> Optional[StorageSchemaInfo]: fq_version_table_name = self.make_qualified_table_name(self.schema.version_table_name) + # version table not created yet (empty storage) + if fq_version_table_name not in self.list_table_names(): + return None version_table: "lancedb.table.Table" = self.db_client.open_table(fq_version_table_name) version_table.checkout_latest() diff --git a/dlt/destinations/impl/lancedb/sql_client.py b/dlt/destinations/impl/lancedb/sql_client.py index 
85e2d5dd3f..fddff68d53 100644 --- a/dlt/destinations/impl/lancedb/sql_client.py +++ b/dlt/destinations/impl/lancedb/sql_client.py @@ -9,35 +9,22 @@ """ from __future__ import annotations -from contextlib import contextmanager from packaging import version as pkg_version -from typing import Any, Iterator, TYPE_CHECKING +from typing import Optional, Tuple, TYPE_CHECKING -import sqlglot -import sqlglot.expressions as exp import duckdb +from dlt.common.schema import Schema from dlt.destinations.exceptions import DatabaseUndefinedRelation -from dlt.common.destination.dataset import DBApiCursor -from dlt.common.destination.capabilities import DestinationCapabilitiesContext -from dlt.destinations.sql_client import raise_database_error, raise_open_connection_error -from dlt.destinations.impl.duckdb.sql_client import DuckDbSqlClient -from dlt.destinations.impl.duckdb.factory import _set_duckdb_raw_capabilities +from dlt.destinations.impl.duckdb.sql_client import WithTableScanners if TYPE_CHECKING: - from sqlglot import expressions as sge from duckdb import DuckDBPyConnection + from dlt.common.destination.typing import PreparedTableSchema from dlt.destinations.impl.lancedb.lancedb_client import LanceDBClient -def _get_lancedb_sql_capabilities() -> DestinationCapabilitiesContext: - caps = DestinationCapabilitiesContext() - caps = _set_duckdb_raw_capabilities(caps) - caps.preferred_loader_file_format = "parquet" - return caps - - def _install_and_load_lance_duckdb_extension(duckdb_con: DuckDBPyConnection) -> None: """Ensure the `lance-duckdb` extension is loaded. 
@@ -54,14 +41,6 @@ def _install_and_load_lance_duckdb_extension(duckdb_con: DuckDBPyConnection) -> duckdb_con.execute("LOAD lance;") -def _create_and_use_duckdb_dataset( - duckdb_con: DuckDBPyConnection, dataset_qualified_name: str -) -> None: - """Create a schema in the ephemeral DuckDB client that matches the `dlt` dataset name.""" - create_schema_sql = f"CREATE SCHEMA IF NOT EXISTS {dataset_qualified_name}" - duckdb_con.execute(f"{create_schema_sql}; USE {dataset_qualified_name}") - - def get_lance_table_uri(lancedb_client: LanceDBClient, table_name: str) -> str: """Create a URI for a Lance table @@ -75,79 +54,42 @@ def get_lance_table_uri(lancedb_client: LanceDBClient, table_name: str) -> str: return f"{dataset_lance_uri}/{qualified_table_name}.lance" -def _prepare_create_view_statement(lance_table_uri: str, view_name: str) -> str: - return f'CREATE OR REPLACE VIEW {view_name} AS SELECT * FROM "{lance_table_uri}"' - - -class LanceDBSQLClient(DuckDbSqlClient): +class LanceDBSQLClient(WithTableScanners): def __init__(self, lancedb_client: LanceDBClient) -> None: self.lancedb_client = lancedb_client + # schema-less (no dataset_name): host the read views in the ephemeral duckdb `main` schema super().__init__( - dataset_name=self.lancedb_client.dataset_name, - staging_dataset_name=None, - credentials=None, # duckdb doesn't need special credentials - capabilities=_get_lancedb_sql_capabilities(), + remote_client=lancedb_client, dataset_name=lancedb_client.dataset_name or "main" ) - self._conn: DuckDBPyConnection | None = None - @raise_open_connection_error def open_connection(self) -> DuckDBPyConnection: - if self._conn: - return self._conn + with self.credentials.conn_pool._conn_lock: + first_connection = self.credentials.conn_pool.never_borrowed + super().open_connection() - self._conn = duckdb.connect(":memory:") - _install_and_load_lance_duckdb_extension(self._conn) - - # by default, LanceDB has `dataset_name=None`. 
To be consistent, it uses DuckDB's - # main schema by default - if self.lancedb_client.dataset_name: - _create_and_use_duckdb_dataset(self._conn, self.fully_qualified_dataset_name()) + if first_connection: + _install_and_load_lance_duckdb_extension(self._conn) return self._conn - def close_connection(self) -> None: - if self._conn: - self._conn = None - - @contextmanager - @raise_database_error - def execute_query( # type: ignore[override] - self, query: str, *args: Any, **kwargs: Any - ) -> Iterator[DBApiCursor]: - # replace generic string placeholder by DuckDB placeholders - if args or kwargs: - query = query.replace("%s", "?") - - expression: sge.Expression = sqlglot.maybe_parse(query) - for table in expression.find_all(exp.Table): - if not table.this: - continue + def can_create_view(self, table_schema: PreparedTableSchema) -> bool: + return True - self.create_view(table.name) + def should_replace_view(self, view_name: str, table_schema: PreparedTableSchema) -> bool: + # views must be refreshed when schema evolves + return self.lancedb_client.config.always_refresh_views - with super().execute_query(query, *args, **kwargs) as cursor: - yield cursor - - @raise_database_error - def create_view(self, table_name: str) -> None: + def create_view_select( + self, table_schema: PreparedTableSchema, schema: Schema = None + ) -> Optional[Tuple[str, str]]: + table_name = table_schema["name"] lance_table_uri = get_lance_table_uri(self.lancedb_client, table_name) - - # lancedb allows omitting the dataset_name, calling `make_qualified_table_name` will - # prepend the dataset_name even if it is None - if self.dataset_name: - view_name = self.make_qualified_table_name(table_name) - else: - view_name = self.capabilities.escape_identifier( - self.capabilities.casefold_identifier(table_name) - ) - - create_view_sql = _prepare_create_view_statement( - lance_table_uri=lance_table_uri, - view_name=view_name, - ) - try: - self._conn.execute(create_view_sql) - # Creating a DuckDB view 
will fail if the table doesn't exist in lance - # potential edge case: a table only exists in the ephemeral DuckDB - except duckdb.IOException as e: - raise DatabaseUndefinedRelation(e) + # the `lance` duckdb extension reads a `.lance` directory directly + return lance_table_uri, f'SELECT * FROM "{lance_table_uri}"' + + @classmethod + def _make_database_exception(cls, ex: Exception) -> Exception: + # a missing `.lance` directory means the table was not created yet in lancedb + if isinstance(ex, duckdb.IOException): + return DatabaseUndefinedRelation(ex) + return super()._make_database_exception(ex) diff --git a/dlt/destinations/impl/qdrant/qdrant_job_client.py b/dlt/destinations/impl/qdrant/qdrant_job_client.py index 445555b2a5..7773448571 100644 --- a/dlt/destinations/impl/qdrant/qdrant_job_client.py +++ b/dlt/destinations/impl/qdrant/qdrant_job_client.py @@ -288,15 +288,16 @@ def update_stored_schema( self, only_tables: Iterable[str] = None, expected_update: TSchemaTables = None, + force: bool = False, ) -> Optional[TSchemaTables]: - applied_update = super().update_stored_schema(only_tables, expected_update) + applied_update = super().update_stored_schema(only_tables, expected_update, force) schema_info = self.get_stored_schema_by_hash(self.schema.stored_version_hash) - if schema_info is None: + if schema_info is None or force: logger.info( f"Schema with hash {self.schema.stored_version_hash} " - "not found in the storage. upgrading" + "not found in the storage (or update enforced). 
upgrading" ) - self._execute_schema_update(only_tables) + self._execute_schema_update(only_tables, store_schema=schema_info is None) else: logger.info( f"Schema with hash {self.schema.stored_version_hash} " @@ -490,7 +491,7 @@ def _update_schema_in_storage(self, schema: Schema) -> None: version_table_name = self._make_qualified_collection_name(self.schema.version_table_name) self._create_point_no_vector(properties, version_table_name) - def _execute_schema_update(self, only_tables: Iterable[str]) -> None: + def _execute_schema_update(self, only_tables: Iterable[str], store_schema: bool = True) -> None: is_local = self.config.is_local() for table_name in only_tables or self.schema.tables: exists = self._collection_exists(table_name) @@ -517,7 +518,9 @@ def _execute_schema_update(self, only_tables: Iterable[str]) -> None: field_schema="datetime", ) - self._update_schema_in_storage(self.schema) + # skip writing the version row when the schema is already stored (enforced update) + if store_schema: + self._update_schema_in_storage(self.schema) def _collection_exists(self, table_name: str, qualify_table_name: bool = True) -> bool: try: diff --git a/dlt/destinations/impl/sqlalchemy/sqlalchemy_job_client.py b/dlt/destinations/impl/sqlalchemy/sqlalchemy_job_client.py index 5fa1d78eb4..b458a8546b 100644 --- a/dlt/destinations/impl/sqlalchemy/sqlalchemy_job_client.py +++ b/dlt/destinations/impl/sqlalchemy/sqlalchemy_job_client.py @@ -195,13 +195,16 @@ def get_storage_tables( } def update_stored_schema( - self, only_tables: Iterable[str] = None, expected_update: TSchemaTables = None + self, + only_tables: Iterable[str] = None, + expected_update: TSchemaTables = None, + force: bool = False, ) -> Optional[TSchemaTables]: # super().update_stored_schema(only_tables, expected_update) - JobClientBase.update_stored_schema(self, only_tables, expected_update) + JobClientBase.update_stored_schema(self, only_tables, expected_update, force) schema_info = 
self.get_stored_schema_by_hash(self.schema.stored_version_hash) - if schema_info is not None: + if schema_info is not None and not force: logger.info( "Schema with hash %s inserted at %s found in storage, no upgrade required", self.schema.stored_version_hash, @@ -210,7 +213,7 @@ def update_stored_schema( return {} else: logger.info( - "Schema with hash %s not found in storage, upgrading", + "Schema with hash %s not found in storage (or update enforced), upgrading", self.schema.stored_version_hash, ) @@ -246,7 +249,9 @@ def update_stored_schema( for table_obj in tables_to_create: self.sql_client.create_table(table_obj) self.sql_client.alter_table_add_columns(columns_to_add) - self._update_schema_in_storage(self.schema) + # do not write a duplicate version row when the hash is already stored (enforced update) + if schema_info is None: + self._update_schema_in_storage(self.schema) return schema_update diff --git a/dlt/destinations/impl/weaviate/weaviate_client.py b/dlt/destinations/impl/weaviate/weaviate_client.py index 5829fcf5f2..223088e695 100644 --- a/dlt/destinations/impl/weaviate/weaviate_client.py +++ b/dlt/destinations/impl/weaviate/weaviate_client.py @@ -697,20 +697,21 @@ def update_stored_schema( self, only_tables: Iterable[str] = None, expected_update: TSchemaTables = None, + force: bool = False, ) -> Optional[TSchemaTables]: - applied_update = super().update_stored_schema(only_tables, expected_update) + applied_update = super().update_stored_schema(only_tables, expected_update, force) # Retrieve the schema from Weaviate try: schema_info = self.get_stored_schema_by_hash(self.schema.stored_version_hash) except DestinationUndefinedEntity: schema_info = None - if schema_info is None: + if schema_info is None or force: logger.info( f"Schema with hash {self.schema.stored_version_hash} " - "not found in the storage. upgrading" + "not found in the storage (or update enforced). 
upgrading" ) # TODO: return a real updated table schema (like in SQL job client) - self._execute_schema_update(only_tables) + self._execute_schema_update(only_tables, store_schema=schema_info is None) else: logger.info( f"Schema with hash {self.schema.stored_version_hash} " @@ -720,7 +721,7 @@ def update_stored_schema( return applied_update - def _execute_schema_update(self, only_tables: Iterable[str]) -> None: + def _execute_schema_update(self, only_tables: Iterable[str], store_schema: bool = True) -> None: for table_name in only_tables or self.schema.tables.keys(): exists, existing_columns = self.get_storage_table(table_name) # TODO: detect columns where vectorization was added or removed and modify it. currently we ignore change of hints @@ -741,7 +742,9 @@ def _execute_schema_update(self, only_tables: Iterable[str]) -> None: else: collection_config = self.make_weaviate_collection_schema(table_name) self.create_collection(collection_config) - self._update_schema_in_storage(self.schema) + # skip writing the version row when the schema is already stored (enforced update) + if store_schema: + self._update_schema_in_storage(self.schema) def get_storage_table(self, table_name: str) -> Tuple[bool, TTableSchemaColumns]: table_schema: TTableSchemaColumns = {} diff --git a/dlt/destinations/job_client_impl.py b/dlt/destinations/job_client_impl.py index 23e3d88fde..10f58e150b 100644 --- a/dlt/destinations/job_client_impl.py +++ b/dlt/destinations/job_client_impl.py @@ -308,19 +308,29 @@ def update_stored_schema( self, only_tables: Iterable[str] = None, expected_update: TSchemaTables = None, + force: bool = False, ) -> Optional[TSchemaTables]: self._set_query_tags(operation="update_stored_schema") - super().update_stored_schema(only_tables, expected_update) + super().update_stored_schema(only_tables, expected_update, force) applied_update: TSchemaTables = {} schema_info = self.get_stored_schema_by_hash(self.schema.stored_version_hash) - if schema_info is None: - 
logger.info( - f"Schema with hash {self.schema.stored_version_hash} not found in the storage." - " upgrading" - ) + if schema_info is None or force: + if schema_info is None: + logger.info( + f"Schema with hash {self.schema.stored_version_hash} not found in the storage." + " upgrading" + ) + else: + logger.info( + f"Schema with hash {self.schema.stored_version_hash} found in storage but" + " update is enforced (tables to truncate/drop), applying DDL" + ) with self.maybe_ddl_transaction(): - applied_update = self._execute_schema_update_sql(only_tables) + # do not write a duplicate version row when the hash is already stored + applied_update = self._execute_schema_update_sql( + only_tables, store_schema=schema_info is None + ) else: logger.info( f"Schema with hash {self.schema.stored_version_hash} inserted at" @@ -639,7 +649,9 @@ def _get_storage_table_query_columns(self) -> List[str]: fields += ["numeric_precision", "numeric_scale"] return fields - def _execute_schema_update_sql(self, only_tables: Iterable[str]) -> TSchemaTables: + def _execute_schema_update_sql( + self, only_tables: Iterable[str], store_schema: bool = True + ) -> TSchemaTables: # Only `only_tables` are included, or all if None. sql_scripts, schema_update = self._build_schema_update_sql( list(self.get_storage_tables(only_tables or self.schema.tables.keys())) @@ -648,7 +660,9 @@ def _execute_schema_update_sql(self, only_tables: Iterable[str]) -> TSchemaTable # Some DB backends use bytes not characters, so decrease the limit by half, # assuming most of the characters in DDL encoded into single bytes. 
self.sql_client.execute_many(sql_scripts) - self._update_schema_in_storage(self.schema) + # skip writing the version row when the schema is already stored (enforced update) + if store_schema: + self._update_schema_in_storage(self.schema) return schema_update def _build_schema_update_sql( diff --git a/dlt/extract/extract.py b/dlt/extract/extract.py index 9628e4aa93..25d5690a93 100644 --- a/dlt/extract/extract.py +++ b/dlt/extract/extract.py @@ -2,7 +2,7 @@ from collections.abc import Sequence as C_Sequence from copy import copy import itertools -from typing import Iterator, List, Dict, Any, Optional +from typing import Iterator, List, Dict, Any, Optional, Set import yaml from dlt.common import logger @@ -48,6 +48,7 @@ ) from dlt.extract.exceptions import UnknownSourceReference from dlt.extract.incremental import IncrementalResourceWrapper +from dlt.extract.items import TableNameMeta from dlt.extract.items_transform import ItemTransform from dlt.common.metrics import DataWriterAndCustomMetrics from dlt.extract.pipe_iterator import PipeIterator @@ -340,47 +341,79 @@ def _get_all_resource_custom_metrics(resource_name: str) -> Dict[str, Any]: "hints": clean_hints, } - def _write_empty_files( + def _handle_empty_tables( self, source: DltSource, extractors: Dict[TDataItemFormat, Extractor] ) -> None: + """Tables that are present in schema and in extracted resources but without data items require special handling""" schema = source.schema json_extractor = extractors["object"] - resources_with_items = set().union(*[e.resources_with_items for e in extractors.values()]) + tables_with_items = set().union(*[e.tables_with_items for e in extractors.values()]) # find REPLACE resources that did not yield any pipe items and create empty jobs for them - # NOTE: do not include tables that have never seen data + # do not include tables that have never seen data data_tables = {t["name"]: t for t in schema.data_tables(seen_data_only=True)} tables_by_resources = 
utils.group_tables_by_resource(data_tables) for resource in source.resources.selected.values(): - if resource.write_disposition != "replace" or resource.name in resources_with_items: - continue if resource.name not in tables_by_resources: continue + write_disposition = resource.write_disposition + # dynamic write dispositions can't be handled here + if callable(write_disposition): + continue + # disposition shorthand used for comparisons and the replace gate below + disposition = ( + write_disposition["disposition"] + if isinstance(write_disposition, dict) + else write_disposition + ) for table in tables_by_resources[resource.name]: - # we only need to write empty files for the root tables - if not utils.is_nested_table(table): - json_extractor.write_empty_items_file(table["name"]) + table_name = table["name"] + # we only need to handle root tables + if utils.is_nested_table(table) or table_name in tables_with_items: + continue + # best-effort write disposition refresh: the resource write disposition is static + # here (dynamic ones were skipped above), so it can be written into existing tables. + # the full config (incl. merge strategy) is applied, not just the disposition. + wd_config: Any = None + if variant_name := table.get("variant_name"): + # 1. variant table: take the write disposition declared on the variant hints + variant_wd = (resource._hints_variants.get(variant_name) or {}).get( + "write_disposition" + ) + if isinstance(variant_wd, (str, dict)): + wd_config = variant_wd + elif table.get("write_disposition") != disposition: + # variant has no explicit disposition and we can't be sure - leave it + continue + elif len(schema.naming.break_path(table_name)) > 1: + # 2. pseudo-root (nested table broken out by a primary key): can't be + # re-derived from hints, so do not update it + if table.get("write_disposition") != disposition: + continue + else: + # 3. a root table created from resource hints (incl. 
with_table_name marks and + # dynamically dispatched names) - the static write disposition is known + wd_config = write_disposition + if wd_config is not None: + if not isinstance(wd_config, dict): + wd_config = {"disposition": wd_config} + table["write_disposition"] = wd_config + resource._merge_write_disposition_dict(table) # type: ignore[arg-type] + + # write empty files so a replace root is truncated even though it received no data + if disposition != "replace": + continue + # table itself must accept replace + if table.get("write_disposition") == "replace": + json_extractor.write_empty_items_file(table_name) - # collect resources that received empty materialized lists and had no items - resources_with_empty = ( + # collect tables that received empty materialized lists and had no items + tables_with_empty = ( set() - .union(*[e.resources_with_empty for e in extractors.values()]) - .difference(resources_with_items) + .union(*[e.tables_with_empty for e in extractors.values()]) + .difference(tables_with_items) ) - # get all possible tables - data_tables = {t["name"]: t for t in schema.data_tables()} - tables_by_resources = utils.group_tables_by_resource(data_tables) - for resource_name in resources_with_empty: - if resource := source.resources.selected.get(resource_name): - if tables := tables_by_resources.get("resource_name"): - # write empty tables - for table in tables: - # we only need to write empty files for the root tables - if not utils.is_nested_table(table): - json_extractor.write_empty_items_file(table["name"]) - else: - table_name = json_extractor._get_static_table_name(resource, None) - if table_name: - json_extractor.write_empty_items_file(table_name) + for table_name in tables_with_empty: + json_extractor.write_empty_items_file(table_name) def _extract_single_source( self, @@ -430,7 +463,7 @@ def _extract_single_source( resource, pipe_item.item, pipe_item.meta ) - self._write_empty_files(source, extractors) + self._handle_empty_tables(source, 
extractors) if left_gens > 0: # go to 100% collector.update("Resources", left_gens) diff --git a/dlt/extract/extractors.py b/dlt/extract/extractors.py index 983ab9c210..3110cdd02b 100644 --- a/dlt/extract/extractors.py +++ b/dlt/extract/extractors.py @@ -124,10 +124,10 @@ def __init__( self.schema = schema self.naming = schema.naming self.collector = collector - self.resources_with_items: Set[str] = set() - """Tracks resources that received items""" - self.resources_with_empty: Set[str] = set() - """Track resources that received empty materialized list""" + self.tables_with_items: Set[str] = set() + """Tracks tables that received items""" + self.tables_with_empty: Set[str] = set() + """Tracks tables that received empty materialized list""" self.load_id = load_id self.item_storage = item_storage self._table_contracts: Dict[str, TSchemaContractDict] = {} @@ -190,10 +190,10 @@ def _write_item( self.collector.update(table_name, inc=new_rows_count) # if there were rows or item was empty arrow table if new_rows_count > 0 or self.__class__ is ArrowExtractor: - self.resources_with_items.add(resource_name) + self.tables_with_items.add(table_name) else: if isinstance(items, MaterializedEmptyList): - self.resources_with_empty.add(resource_name) + self.tables_with_empty.add(table_name) def _import_item( self, @@ -210,7 +210,7 @@ def _import_item( meta.file_format, ) self.collector.update(table_name, inc=metrics.items_count) - self.resources_with_items.add(resource_name) + self.tables_with_items.add(table_name) def _write_to_dynamic_table(self, resource: DltResource, items: TDataItems, meta: Any) -> None: if not isinstance(items, list): diff --git a/dlt/extract/hints.py b/dlt/extract/hints.py index e90346efc7..0ed0dcf5b6 100644 --- a/dlt/extract/hints.py +++ b/dlt/extract/hints.py @@ -40,6 +40,7 @@ migrate_complex_types, new_column, new_table, + normalize_table_identifiers, remove_compound_props, ) from dlt.common.typing import TAny, TDataItem, TColumnNames @@ -263,9 +264,13 @@ 
def compute_table_schema(self, item: TDataItem = None, meta: Any = None) -> TTab # look for variant default_table_name = meta.table_name root_table_template = self._hints_variants.get(default_table_name, self._hints) + variant_name = ( + default_table_name if default_table_name in self._hints_variants else None + ) else: default_table_name = self.name root_table_template = self._hints + variant_name = None if not root_table_template: return new_table(default_table_name, resource=self.name) @@ -294,6 +299,8 @@ def compute_table_schema(self, item: TDataItem = None, meta: Any = None) -> TTab table_schema = self._hints_to_table_schema(resolved_template) if not is_nested_table(table_schema): table_schema["resource"] = self.name + if variant_name: + table_schema["variant_name"] = variant_name migrate_complex_types(table_schema, warn=True) validate_dict_ignoring_xkeys( spec=TTableSchema, @@ -392,6 +399,23 @@ def compute_nested_table_schemas( return nested_table_schemas + def compute_tables( + self, naming: NamingConvention, item: TDataItem = None, meta: Any = None + ) -> List[TTableSchema]: + """Computes normalized root and nested table schemas for this resource. + + Raises `DataItemRequiredForDynamicTableHints` when the table name is dynamic and `item` is + not provided. 
+ """ + root_table_schema = self.compute_table_schema(item, meta) + nested_tables_schema = self.compute_nested_table_schemas( + root_table_schema["name"], naming, item, meta + ) + return [ + normalize_table_identifiers(table_schema, naming) + for table_schema in (root_table_schema, *nested_tables_schema) + ] + def apply_hints( self, table_name: Optional[TTableHintTemplate[str]] = None, diff --git a/dlt/extract/items.py b/dlt/extract/items.py index df90ddfe57..14e54043a8 100644 --- a/dlt/extract/items.py +++ b/dlt/extract/items.py @@ -93,6 +93,9 @@ class TableNameMeta: def __init__(self, table_name: str) -> None: self.table_name = table_name + def __repr__(self) -> str: + return f"TableNameMeta('{self.table_name}')" + class SupportsPipe(Protocol): """A protocol with the core Pipe properties and operations""" diff --git a/dlt/extract/source.py b/dlt/extract/source.py index 4b9680b028..48a1aa9f89 100644 --- a/dlt/extract/source.py +++ b/dlt/extract/source.py @@ -24,7 +24,6 @@ from dlt.common.normalizers.json.typing import RelationalNormalizerConfig from dlt.common.schema import Schema from dlt.common.schema.typing import TColumnName, TSchemaContract -from dlt.common.schema.utils import normalize_table_identifiers from dlt.common.typing import StrAny, TDataItem from dlt.common.configuration.container import Container from dlt.common.pipeline import ( @@ -496,13 +495,8 @@ def discover_schema(self, item: TDataItem = None, meta: Any = None) -> Schema: for r in self.selected_resources.values(): # names must be normalized here with contextlib.suppress(DataItemRequiredForDynamicTableHints): - root_table_schema = r.compute_table_schema(item, meta) - nested_tables_schema = r.compute_nested_table_schemas( - root_table_schema["name"], schema.naming, item, meta - ) # NOTE must ensure that `schema.update_table()` is called in an order that respect parent-child relationships - for table_schema in (root_table_schema, *nested_tables_schema): - partial_table = 
normalize_table_identifiers(table_schema, self._schema.naming) + for partial_table in r.compute_tables(schema.naming, item, meta): schema.update_table(partial_table) return schema diff --git a/dlt/helpers/ibis.py b/dlt/helpers/ibis.py index e7f6c2fa21..f228ae4184 100644 --- a/dlt/helpers/ibis.py +++ b/dlt/helpers/ibis.py @@ -226,7 +226,21 @@ def _ignore_hstore(conn: Any, name: Any) -> Any: sql_client.memory_db = None del sql_client elif issubclass(destination.spec, LanceDBClientConfiguration): - con = _create_ibis_backend_lancedb(client) + from dlt.destinations.impl.lancedb.lancedb_client import LanceDBClient + from dlt.destinations.impl.lancedb.sql_client import LanceDBSQLClient + + assert isinstance(client, LanceDBClient) + sql_client = client.sql_client + assert isinstance(sql_client, LanceDBSQLClient) + if schemas: + sql_client.set_schemas(schemas) + duckdb_conn = sql_client.open_connection() + sql_client.create_views_for_all_tables() + con = ibis.duckdb.from_connection(duckdb_conn) + # disable destructor so connection survives + client.sql_client = None + sql_client.memory_db = None + del sql_client else: # NOTE: Athena could theoretically work with trino backend, but according to # https://github.com/ibis-project/ibis/issues/7682 connecting with aws credentials @@ -239,20 +253,6 @@ def _ignore_hstore(conn: Any, name: Any) -> Any: return con -def _create_ibis_backend_lancedb(client: JobClientBase) -> BaseBackend: - from dlt.destinations.impl.lancedb.lancedb_client import LanceDBClient - - assert isinstance(client, LanceDBClient) - # open connection but do not close it, ducklake always creates a separate connection - # and will not close it in destructor - native_con = client.sql_client.open_connection() - - for table_name in client.schema.tables: - client.sql_client.create_view(table_name) - - return ibis.duckdb.from_connection(native_con) - - def create_unbound_ibis_table(schema: Schema, dataset_name: str, table_name: str) -> Table: """Create an unbound ibis 
table from a dlt schema. No additional identifiers normalization, quoting or escaping is performed. diff --git a/dlt/load/load.py b/dlt/load/load.py index bd79e7f2b1..4c61078cfd 100644 --- a/dlt/load/load.py +++ b/dlt/load/load.py @@ -642,8 +642,9 @@ def initialize_package( # get dropped and truncated tables that were added in the extract step if refresh was requested # NOTE: if naming convention was updated those names correspond to the old naming convention # and they must be like that in order to drop existing tables - dropped_tables = current_load_package()["state"].get("dropped_tables", []) - truncated_tables = current_load_package()["state"].get("truncated_tables", []) + package_state = self.load_storage.normalized_packages.get_load_package_state(load_id) + dropped_tables = package_state.get("dropped_tables", []) + truncated_tables = package_state.get("truncated_tables", []) # initialize analytical storage ie. create dataset required by passed schema with self.get_destination_client(schema) as job_client: diff --git a/dlt/load/utils.py b/dlt/load/utils.py index eb190a0b9f..62600a38e4 100644 --- a/dlt/load/utils.py +++ b/dlt/load/utils.py @@ -101,19 +101,20 @@ def init_client( # get all tables that actually have load jobs with data tables_with_jobs = set(job.table_name for job in new_jobs) - tables_no_data - # get tables to truncate by extending tables with jobs with all their nested tables + # initial tables contain child tables already initial_truncate_names = ( set(t["name"] for t in truncate_tables if drop_staging_filter(t)) if truncate_tables else set() ) + + # get tables to truncate by extending tables with jobs with all their nested tables truncate_table_names = set( _extend_tables_with_table_chain( schema, tables_with_jobs, tables_with_jobs, - lambda table_name: truncate_filter(table_name) - or (table_name in initial_truncate_names), + truncate_filter, ) ) @@ -129,7 +130,8 @@ def init_client( job_client, expected_update, tables_with_jobs | dlt_tables, 
- truncate_table_names, + truncate_tables=truncate_table_names, + initial_truncate_tables=initial_truncate_names, drop_tables=drop_table_names, ) @@ -162,7 +164,7 @@ def init_client( job_client, expected_update, all_staging_tables | {schema.version_table_name}, - staging_tables_with_jobs, # only truncate tables with jobs in this load + truncate_tables=staging_tables_with_jobs, # only truncate tables with jobs in this load staging_info=True, drop_tables=drop_table_names, # try to drop all the same tables on staging ) @@ -174,7 +176,9 @@ def _init_dataset_and_update_schema( job_client: JobClientBase, expected_update: TSchemaTables, update_tables: Iterable[str], - truncate_tables: Iterable[str] = None, + *, + truncate_tables: Set[str] = None, + initial_truncate_tables: Set[str] = None, staging_info: bool = False, drop_tables: Iterable[str] = None, ) -> TSchemaTables: @@ -203,14 +207,19 @@ def _init_dataset_and_update_schema( f" {staging_text}" ) applied_update = job_client.update_stored_schema( - only_tables=update_tables, expected_update=expected_update + only_tables=update_tables, + expected_update=expected_update, + # force schema update if tables dropped or truncated via refresh + force=bool(drop_tables or initial_truncate_tables), ) - if truncate_tables: + if truncate_tables or initial_truncate_tables: + if initial_truncate_tables: + truncate_tables = initial_truncate_tables | (truncate_tables or set()) logger.info( f"Client for {job_client.config.destination_type} will truncate tables {staging_text}" ) + job_client.initialize_storage(truncate_tables=truncate_tables) - job_client.initialize_storage(truncate_tables=truncate_tables) return applied_update diff --git a/dlt/normalize/items_normalizers/jsonl.py b/dlt/normalize/items_normalizers/jsonl.py index 1d2159bb42..775c3f2545 100644 --- a/dlt/normalize/items_normalizers/jsonl.py +++ b/dlt/normalize/items_normalizers/jsonl.py @@ -21,7 +21,7 @@ TTableSchemaColumns, TSchemaContractDict, ) -from 
dlt.common.schema.utils import has_table_seen_data, is_complete_column +from dlt.common.schema.utils import has_table_seen_data from dlt.common.schema.exceptions import CannotCoerceColumnException, CannotCoerceNullException from dlt.common.schema import TSchemaUpdate, Schema from dlt.common.storages.load_storage import LoadStorage @@ -493,8 +493,14 @@ def __call__( ) schema_updates.append(partial_update) logger.debug(f"Processed {line_no+1} lines from file {extracted_items_file}") - # empty json files are when replace write disposition is used in order to truncate table(s) - if line is None and root_table_name in self.schema.tables: + # generate empty files when (1) input data had no rows (2) rows got filtered out by contract + if ( + root_table_name in self.schema.tables + and self.item_storage.get_active_writer( # no active writer if no rows created + self.load_id, self.schema.name, root_table_name + )[1] + is None + ): root_table = self.schema.tables[root_table_name] if not has_table_seen_data(root_table): # if this is a new table, add normalizer columns diff --git a/dlt/normalize/normalize.py b/dlt/normalize/normalize.py index 0da732950d..66984a5095 100644 --- a/dlt/normalize/normalize.py +++ b/dlt/normalize/normalize.py @@ -341,8 +341,9 @@ def run(self, pool: Optional[Executor]) -> TRunMetrics: logger.info( f"Found {len(schema_files)} files in schema {schema.name} load_id {load_id}" ) - if len(schema_files) == 0: - # delete empty package + if self.normalize_storage.extracted_packages.is_empty_package(load_id): + # package has no data and no refresh commands (tables to truncate / drop): drop it + # so it does not reach the load step. 
packages with refresh commands are kept self.normalize_storage.extracted_packages.delete_package(load_id) logger.info(f"Empty package {load_id} processed") continue @@ -355,11 +356,6 @@ def run(self, pool: Optional[Executor]) -> TRunMetrics: # return info on still pending packages (if extractor saved something in the meantime) return TRunMetrics(False, len(self.normalize_storage.extracted_packages.list_packages())) - # def verify_package(self, load_id, schema: Schema, schema_files: Sequence[str]) -> None: - # """Verifies package schema and jobs against destination capabilities""" - # # get all tables in schema files - # table_names = set(ParsedLoadJobFileName.parse(job).table_name for job in schema_files) - def get_load_package_info(self, load_id: str) -> LoadPackageInfo: """Returns information on extracted/normalized/completed package with given load_id, all jobs and their statuses.""" try: diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index d9ce62e89a..567c3f7c7e 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -1643,7 +1643,7 @@ def _make_dataset_name( destination_needs_dataset = False if destination and issubclass(destination.spec, DestinationClientDwhConfiguration): destination_needs_dataset = destination.spec.needs_dataset_name() - # if destination is not specified - generate dataset + # set default dataset name if destination is specified and requires it if destination_needs_dataset: new_dataset_name = self.pipeline_name + self.DEFAULT_DATASET_SUFFIX diff --git a/docs/website/docs/dlt-ecosystem/destinations/lance.md b/docs/website/docs/dlt-ecosystem/destinations/lance.md index 5d9679c173..c891bc2ed8 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/lance.md +++ b/docs/website/docs/dlt-ecosystem/destinations/lance.md @@ -152,9 +152,13 @@ bucket_url/ ``` - **Root namespace** — a physical directory at `bucket_url/namespace_name`. 
The `namespace_name` defaults to `"dlt_lance_root"` and can be set to `""` to use `bucket_url` directly. -- **Dataset namespace** — a logical child namespace named after `dataset_name`, tracked in the `__manifest/` catalog. Created automatically when the pipeline runs. All tables for the dataset are registered inside it. +- **Dataset namespace** — when `dataset_name` is set, a logical child namespace named after it is created automatically (tracked in the `__manifest/` catalog) and all tables for the dataset are registered inside it. `dataset_name` is **optional**: when omitted, tables are created directly in the root namespace (single-level table ids) and no per-dataset child namespace is used. - **Tables** — stored as hash-prefixed directories at the root namespace level, not nested under a dataset subdirectory. +:::note +`dataset_name` is optional for `lance`. If you do not pass one, dlt does **not** auto-generate a dataset name and writes tables to the **root namespace**. Pass a `dataset_name` to isolate a pipeline's tables. +::: + ```toml [destination.lance.storage] bucket_url = "s3://my-bucket" @@ -324,6 +328,15 @@ dataset = pipeline.dataset() df = dataset["movies"].df() ``` +Reads go through an in-memory DuckDB instance that scans the Lance datasets via views. Because Lance scans always read the latest dataset version, **new rows appended after a dataset connection was opened are visible without recreating the views**. Picking up **schema changes (new columns)** on an already-open connection requires recreating the views — enable `always_refresh_views` for that: + +```toml +[destination.lance] +always_refresh_views = true +``` + +This recreates the scanner views on every read (a small overhead), so leave it disabled unless you read evolving schemas through a long-lived dataset connection. 
+ ### Low-level Lance access For operations specific to the Lance format — such as version management, tagging, or direct reads — use `open_lance_dataset` on the destination client. It returns a `lance.LanceDataset` from the [lance](https://github.com/lancedb/lance) library: diff --git a/tests/common/storages/test_load_package.py b/tests/common/storages/test_load_package.py index c8ff2049bf..968bceb2e5 100644 --- a/tests/common/storages/test_load_package.py +++ b/tests/common/storages/test_load_package.py @@ -87,6 +87,50 @@ def test_is_partially_loaded(load_storage: LoadStorage) -> None: assert PackageStorage.is_package_partially_loaded(info) is True +@pytest.mark.parametrize( + "case,expected_empty", + [ + # no jobs and no refresh commands + ("no_jobs_no_commands", True), + # package carries data jobs + ("with_jobs", False), + # package carries refresh commands even though it has no jobs + ("with_dropped_tables", False), + ("with_truncated_tables", False), + # commands and jobs together + ("with_dropped_and_jobs", False), + # a package being processed (applied schema update written) is never empty + ("with_applied_schema_update", False), + ], +) +def test_is_empty_package(load_storage: LoadStorage, case: str, expected_empty: bool) -> None: + package_storage = load_storage.new_packages + load_id = create_load_id() + package_storage.create_package(load_id, schema=Schema("mock")) + + if case in ("with_jobs", "with_dropped_and_jobs"): + package_storage.storage.save( + os.path.join(load_id, PackageStorage.NEW_JOBS_FOLDER, "mock_table.abc.0.jsonl"), "x" + ) + + if case == "with_applied_schema_update": + package_storage.storage.save( + os.path.join(load_id, PackageStorage.APPLIED_SCHEMA_UPDATES_FILE_NAME), "{}" + ) + + if case in ("with_dropped_tables", "with_dropped_and_jobs"): + state = package_storage.get_load_package_state(load_id) + state["dropped_tables"] = [{"name": "dropped_table"}] + package_storage.save_load_package_state(load_id, state) + + if case == 
"with_truncated_tables": + state = package_storage.get_load_package_state(load_id) + state["truncated_tables"] = [{"name": "truncated_table"}] + package_storage.save_load_package_state(load_id, state) + + assert package_storage.is_empty_package(load_id) is expected_empty + + def test_save_load_schema(load_storage: LoadStorage) -> None: # mock schema version to some random number so we know we load what we save schema = Schema("event") diff --git a/tests/destinations/test_lineage.py b/tests/destinations/test_lineage.py index 1c9191b4e9..aa0a2fd80f 100644 --- a/tests/destinations/test_lineage.py +++ b/tests/destinations/test_lineage.py @@ -64,6 +64,10 @@ def sqlglot_schema() -> SQLGlotSchema: JOIN table_unknown ON table_1.col_bool = table_unknown.col_unknown_2 """ +# `table_1` qualified with the catalog/db prefix that the sqlglot schema is keyed under +QUERY_DB_QUALIFIED_TABLE_STAR_SELECT = "SELECT * FROM db.table_1" +# the same table qualified with a prefix that is NOT in the sqlglot schema +QUERY_UNKNOWN_DB_QUALIFIED_TABLE_STAR_SELECT = "SELECT * FROM unknown_db.table_1" @pytest.mark.parametrize( @@ -162,6 +166,28 @@ def sqlglot_schema() -> SQLGlotSchema: "col_unknown_1": {"name": "col_unknown_1"}, }, ), + # table qualified with the known catalog/db prefix resolves exactly like the unqualified name + ( + QUERY_DB_QUALIFIED_TABLE_STAR_SELECT, + {"allow_partial": False}, + { + "col_varchar": {"name": "col_varchar", "data_type": "text"}, + "col_bool": {"name": "col_bool", "data_type": "bool"}, + }, + ), + # same query qualified with an UNKNOWN prefix: the table no longer matches `db.table_1`, so + # the `*` cannot be resolved. with `allow_partial` we silently get an empty column schema + ( + QUERY_UNKNOWN_DB_QUALIFIED_TABLE_STAR_SELECT, + {"allow_partial": True}, + {}, + ), + # ... 
and without it the unresolved `*` raises + ( + QUERY_UNKNOWN_DB_QUALIFIED_TABLE_STAR_SELECT, + {"allow_partial": False}, + LineageFailedException(), + ), ], ) def test_compute_columns_schema( diff --git a/tests/extract/test_extract.py b/tests/extract/test_extract.py index 217b466e56..ea64c10ec0 100644 --- a/tests/extract/test_extract.py +++ b/tests/extract/test_extract.py @@ -12,13 +12,14 @@ NormalizeStorageConfiguration, ) from dlt.common.storages.schema_storage import SchemaStorage +from dlt.common.schema.typing import TWriteDisposition from dlt.common.typing import TTableNames, TDataItems from dlt.common.utils import uniq_id from dlt.extract import DltResource, DltSource from dlt.extract.exceptions import DataItemRequiredForDynamicTableHints, ResourceExtractionError from dlt.extract.extract import ExtractStorage, Extract -from dlt.extract.hints import TResourceNestedHints, make_hints +from dlt.extract.hints import TResourceNestedHints, make_hints, make_nested_hints from dlt.extract.items_transform import ValidateItem, MetricsItem from dlt.extract.items import TableNameMeta, DataItemWithMeta @@ -95,6 +96,9 @@ def table_with_name_selectable(_range): schema = expect_tables(extract_step, table_with_name_selectable) assert "table_with_name_selectable" not in schema.tables + # dynamically dispatched tables are not variants - they carry no variant_name + assert "variant_name" not in schema.tables["odd_table"] + assert "variant_name" not in schema.tables["even_table"] def test_extract_select_tables_lambda(extract_step: Extract) -> None: @@ -108,6 +112,9 @@ def table_name_with_lambda(_range): schema = expect_tables(extract_step, table_name_with_lambda) assert "table_name_with_lambda" not in schema.tables + # event-dispatch via a table_name function is not a variant - no variant_name is set + assert "variant_name" not in schema.tables["odd_table"] + assert "variant_name" not in schema.tables["even_table"] def test_make_hints_default() -> None: @@ -223,6 +230,8 @@ def 
with_table_hints(): assert "id" in table["columns"] assert table["columns"]["pk"]["primary_key"] is True assert table["columns"]["id"]["data_type"] == "bigint" + # a registered table variant carries its variant name + assert table["variant_name"] == "table_a" schema = dlt.current.source_schema() # table table_a will be created @@ -240,13 +249,16 @@ def with_table_hints(): # get table table = resource.compute_table_schema(meta=TableNameMeta("table_b")) assert table["write_disposition"] == "replace" + assert table["variant_name"] == "table_b" schema_table = schema.tables["table_b"] assert table == schema_table # item to resource yield {"id": 3, "pk": "C"} - # dispatch to table a with table meta + # dispatch to table a with table meta: a known variant keeps its variant name + table = resource.compute_table_schema(meta=TableNameMeta("table_a")) + assert table["variant_name"] == "table_a" yield dlt.mark.with_table_name({"id": 4, "pk": "D"}, "table_a") source = DltSource(dlt.Schema("hintable"), "module", [with_table_hints]) @@ -379,6 +391,14 @@ def test_break_nesting_with_primary_key(extract_step: Extract) -> None: # schema after extractions must be same as discovered schema assert source.schema._schema_tables == pre_extract_schema._schema_tables + # write disposition on a table broken from nesting is set to default (append) + # on reload + pseudo_root = "with_nested_hints__outer1__innerbar" + # in-memory the broken-out table carries no write disposition + assert "write_disposition" not in source.schema.tables[pseudo_root] + reloaded_schema = dlt.Schema.from_dict(source.schema.to_dict()) # type: ignore[arg-type] + assert reloaded_schema.tables[pseudo_root]["write_disposition"] == "append" + def test_nested_hints_dynamic_table_names(extract_step: Extract) -> None: data = [ @@ -614,7 +634,6 @@ def empty_list( assert found_empty_list -@pytest.mark.skip(reason="introduced by #3901; temporarily disabled") @pytest.mark.parametrize( "yield_one,yield_two", [(True, False), (False, 
True), (False, False), (True, True)], @@ -623,12 +642,13 @@ def empty_list( def test_materialize_table_schema_multi_table(yield_one: bool, yield_two: bool) -> None: """Empty table materialization works correctly for resources that produce multiple tables.""" + # non-normalized table names so the empty-table handling is exercised with normalized identifiers @dlt.resource def multi_table(): yield dlt.mark.with_hints( dlt.mark.materialize_table_schema(), dlt.mark.make_hints( - table_name="table_one", + table_name="TableOne", write_disposition="replace", columns={"col_one": {"data_type": "text"}}, ), @@ -637,16 +657,16 @@ def multi_table(): yield dlt.mark.with_hints( dlt.mark.materialize_table_schema(), dlt.mark.make_hints( - table_name="table_two", + table_name="TableTwo", write_disposition="replace", columns={"col_two": {"data_type": "bigint"}}, ), create_table_variant=True, ) if yield_one: - yield dlt.mark.with_table_name({"col_one": "val"}, table_name="table_one") + yield dlt.mark.with_table_name({"col_one": "val"}, table_name="TableOne") if yield_two: - yield dlt.mark.with_table_name({"col_two": 5}, table_name="table_two") + yield dlt.mark.with_table_name({"col_two": 5}, table_name="TableTwo") p = dlt.pipeline( pipeline_name="materialize_multi_" + uniq_id(), @@ -658,9 +678,15 @@ def multi_table(): extracted_tables = { job.job_file_info.table_name for job in extract_info.load_packages[0].jobs["new_jobs"] } - # both tables should always have jobs — either with data or empty files + # both tables should always have jobs (data or empty files) under their normalized names assert "table_one" in extracted_tables assert "table_two" in extracted_tables + assert "TableOne" not in extracted_tables + assert "TableTwo" not in extracted_tables + # variant tables keep their RAW (non-normalized) variant name even though the table identifier + # itself is normalized + assert p.default_schema.tables["table_one"]["variant_name"] == "TableOne" + assert 
p.default_schema.tables["table_two"]["variant_name"] == "TableTwo" @pytest.mark.parametrize( @@ -1067,3 +1093,377 @@ def dynamic_resource(): assert "another_table" in table_names assert "MyTable" not in table_names assert "AnotherTable" not in table_names + # dynamically dispatched tables are not variants - they carry no variant_name + assert "variant_name" not in source.schema.tables["my_table"] + assert "variant_name" not in source.schema.tables["another_table"] + + +def _mark_seen_data(schema: dlt.Schema, *table_names: str) -> None: + # simulate a completed prior run so tables are treated as existing tables that have seen data + for table_name in table_names: + schema.tables[table_name].setdefault("x-normalizer", {})["seen-data"] = True + + +def _extract_resource( + extract_step: Extract, schema: dlt.Schema, resource: DltResource +) -> Dict[str, Any]: + """Extract a single resource into the shared `schema` (mirrors the per-run extraction) and + return its per-table writer metrics (table name -> DataWriterMetrics).""" + source = DltSource(schema, "module", [resource]) + load_id = extract_step.extract_storage.create_load_package(schema) + extract_step._extract_single_source(load_id, source, max_parallel_items=5, workers=1) + table_metrics: Dict[str, Any] = extract_step._step_info_metrics(load_id)[0]["table_metrics"] + extract_step.extract_storage.commit_new_load_package(load_id, schema) + return table_metrics + + +@pytest.mark.parametrize( + "table_name,expected", + [("items", "items"), ("MyItems", "my_items")], + ids=["table_is_resource_name", "table_differs_from_resource_name"], +) +def test_handle_empty_tables_refreshes_static_write_disposition( + extract_step: Extract, table_name: str, expected: str +) -> None: + """#3998 root cause: a static resource that yields no data still refreshes its (stale) write + disposition in the schema from the current resource hints - including a full write disposition + config (ie. an scd2 merge strategy). 
A replace table that gets no data still has an empty file + written (so it is truncated), while a non-replace one does not. Covers both a table matching the + resource name and one with a distinct (non-normalized) table name.""" + schema = dlt.Schema("empty_tables") + + # a column declared via the decorator makes the table complete, so it survives the + # `seen_data_only` filter once data has been seen + @dlt.resource( + name="items", + table_name=table_name, + write_disposition="replace", + columns=[{"name": "id", "data_type": "bigint"}], + ) + def items_replace(data: Any) -> Any: + yield from data + + # first run with data creates the table + _extract_resource(extract_step, schema, items_replace([{"Id": 1}])) + _mark_seen_data(schema, expected) + assert schema.tables[expected]["write_disposition"] == "replace" + + # a replace table that yields no data still gets an empty file (so it is truncated): it appears + # in the writer metrics with zero items + metrics = _extract_resource(extract_step, schema, items_replace([])) + assert metrics[expected].items_count == 0 + + # the resource now switches to an scd2 merge config and yields no data + @dlt.resource( + name="items", + table_name=table_name, + write_disposition={"disposition": "merge", "strategy": "scd2"}, + ) + def items_merge() -> Any: + yield from [] + + # a non-replace table that yields no data must NOT get an empty file - it is absent from metrics + metrics = _extract_resource(extract_step, schema, items_merge()) + assert expected not in metrics + + items_table = schema.tables[expected] + # write disposition refreshed from the resource, with the full scd2 merge config applied + assert items_table["write_disposition"] == "merge" + assert items_table["x-merge-strategy"] == "scd2" # type: ignore[typeddict-item] + # scd2 validity columns were added by the merge config + assert "_dlt_valid_from" in items_table["columns"] + assert "_dlt_valid_to" in items_table["columns"] + + +def 
test_handle_empty_tables_variant_pseudo_root_no_cascade(extract_step: Extract) -> None: + """Nested hints that break nesting (a primary key) create a pseudo-root table - both for the + default table and for a variant. On an empty run the default root is refreshed from the resource + hints and the variant (declared with a non-normalized name) from its own variant hints - keyed + by the raw name, so it keeps its own disposition rather than falling back to the resource root. + The pseudo-roots are left untouched - we can't re-derive a pseudo-root, and recomputing one as a + root would re-apply the nested hints and create spurious tables.""" + schema = dlt.Schema("empty_tables") + + def make_resource( + wd: TWriteDisposition, variant_wd: TWriteDisposition, data: Any + ) -> DltResource: + # a column makes the root complete (so it survives the seen_data filter); the nested hint's + # primary key breaks nesting into a pseudo-root that carries its own (replace) disposition + @dlt.resource( + name="items", + write_disposition=wd, + columns=[{"name": "id", "data_type": "bigint"}], + nested_hints={ + "SubItems": make_nested_hints( + primary_key="Id", + write_disposition="replace", + columns=[{"name": "id", "data_type": "bigint"}], + ) + }, + ) + def items() -> Any: + yield from data + + # the variant has a non-normalized name and its own write disposition; it inherits the + # (nesting-breaking) nested hints + items.apply_hints( + table_name="OtherItems", write_disposition=variant_wd, create_table_variant=True + ) + return items + + # the four tables exercised below: the default root + its pseudo-root, and the variant root + + # its pseudo-root + roots = ["items", "other_items"] + pseudo_roots = ["items__sub_items", "other_items__sub_items"] + all_tables = roots + pseudo_roots + + # run 1: replace with data for both the default table and the variant creates all four tables + # (each root plus its broken-out, primary-keyed pseudo-root) + seed = [ + {"Id": 1, "SubItems": [{"Id": 
101}]}, + dlt.mark.with_table_name({"Id": 2, "SubItems": [{"Id": 102}]}, "OtherItems"), + ] + _extract_resource(extract_step, schema, make_resource("replace", "replace", seed)) + _mark_seen_data(schema, *all_tables) + for pseudo in pseudo_roots: + assert is_nested_table(schema.tables[pseudo]) is False + for table in all_tables: + assert schema.tables[table]["write_disposition"] == "replace" + + # run 2: replace with no data - every table (roots and pseudo-roots) is replace, so each gets an + # empty file written (so it is truncated), and no spurious cascade table is created + metrics = _extract_resource(extract_step, schema, make_resource("replace", "replace", [])) + for table in all_tables: + assert metrics[table].items_count == 0 + assert "items__sub_items__sub_items" not in schema.tables + assert "other_items__sub_items__sub_items" not in schema.tables + + # run 3: default root switches to append, the variant to merge, with no data - no empty files + # are written for any table (the resource is append) + metrics = _extract_resource(extract_step, schema, make_resource("append", "merge", [])) + for table in all_tables: + assert table not in metrics + # the default root is refreshed from the resource hints + assert schema.tables["items"]["write_disposition"] == "append" + # the variant is refreshed from its own (raw-name-keyed) hints, keeping its own disposition - + # a normalized lookup would miss it and wrongly fall back to the resource root's append + assert schema.tables["other_items"]["write_disposition"] == "merge" + # the pseudo-roots can't be re-derived, so they are left untouched (not switched) + for pseudo in pseudo_roots: + assert schema.tables[pseudo]["write_disposition"] == "replace" + # recomputing a pseudo-root would have re-applied the nested hints and created these tables + assert "items__sub_items__sub_items" not in schema.tables + assert "other_items__sub_items__sub_items" not in schema.tables + + +@pytest.mark.parametrize("dispatch", ["marked", 
"dynamic"]) +def test_handle_empty_tables_updates_dispatched_tables( + extract_step: Extract, dispatch: str +) -> None: + """Event-dispatch tables - created via with_table_name marks or a dynamic table_name function - + have their (static) write disposition refreshed on an empty run, and a replace table that gets + no data has an empty file written so it is truncated. Dispatched tables are not variants: they + carry no variant_name and are refreshed via their normalized table name.""" + schema = dlt.Schema("empty_tables") + + def make_resource(wd: TWriteDisposition, data: Any) -> DltResource: + # a declared column makes the dispatched tables complete so they survive the seen_data filter + if dispatch == "marked": + + @dlt.resource( + name="events", write_disposition=wd, columns=[{"name": "id", "data_type": "bigint"}] + ) + def events() -> Any: + for d in data: + yield dlt.mark.with_table_name(d, d["kind"]) + + else: + + @dlt.resource( + name="events", + table_name=lambda e: e["kind"], + write_disposition=wd, + columns=[{"name": "id", "data_type": "bigint"}], + ) + def events() -> Any: + yield from data + + return events() + + tables = ["my_issue", "my_purchase"] + + # run 1: replace with data creates both dispatched tables + seed = [{"kind": "MyIssue", "id": 1}, {"kind": "MyPurchase", "id": 2}] + _extract_resource(extract_step, schema, make_resource("replace", seed)) + _mark_seen_data(schema, *tables) + for table in tables: + assert schema.tables[table]["write_disposition"] == "replace" + # dispatched tables are not variants - they carry no variant_name + assert "variant_name" not in schema.tables[table] + + # run 2 (totally empty): every replace table gets an empty file (so it is truncated) + metrics = _extract_resource(extract_step, schema, make_resource("replace", [])) + for table in tables: + assert metrics[table].items_count == 0 + + # run 3 (only one table gets data): the table with data is loaded normally, the other still gets + # an empty file + metrics = 
_extract_resource( + extract_step, schema, make_resource("replace", [{"kind": "MyIssue", "id": 3}]) + ) + assert metrics["my_issue"].items_count == 1 + assert metrics["my_purchase"].items_count == 0 + + # run 4: switch to append with no data - tables are refreshed to append and NOT truncated + metrics = _extract_resource(extract_step, schema, make_resource("append", [])) + for table in tables: + assert table not in metrics + assert schema.tables[table]["write_disposition"] == "append" + + +def test_handle_empty_tables_ignores_dynamic_write_disposition(extract_step: Extract) -> None: + """Resources whose write disposition is computed from data are ignored: the disposition cannot + be known without data, so the schema is left unchanged when no data is yielded.""" + schema = dlt.Schema("empty_tables") + + def make_resource(data: Any) -> DltResource: + @dlt.resource( + name="events", + table_name=lambda e: e["kind"], + write_disposition=lambda e: e["wd"], + ) + def events() -> Any: + yield from data + + return events() + + _extract_resource( + extract_step, schema, make_resource([{"kind": "MyIssue", "wd": "replace", "id": 1}]) + ) + _mark_seen_data(schema, "my_issue") + assert schema.tables["my_issue"]["write_disposition"] == "replace" + + # no data: dynamic write disposition cannot be resolved, so the table is left untouched and is + # not truncated (no empty file written) + metrics = _extract_resource(extract_step, schema, make_resource([])) + assert "my_issue" not in metrics + assert schema.tables["my_issue"]["write_disposition"] == "replace" + + +@pytest.mark.parametrize( + "resource_wd,member_kind,member_wd,root_truncated,member_truncated", + [ + # a replace resource truncates the root, and a member only if it also accepts replace + ("replace", "variant", "replace", True, True), + ("replace", "variant", "merge", True, False), + ("replace", "pseudo", "replace", True, True), + ("replace", "pseudo", "merge", True, False), + # an append resource never truncates - not 
even a replace variant or pseudo-root + ("append", "variant", "replace", False, False), + ("append", "pseudo", "replace", False, False), + ], + ids=[ + "replace_resource-replace_variant", + "replace_resource-merge_variant", + "replace_resource-replace_pseudo", + "replace_resource-merge_pseudo", + "append_resource-replace_variant", + "append_resource-replace_pseudo", + ], +) +def test_handle_empty_tables_skips_tables_not_accepting_replace( + extract_step: Extract, + resource_wd: TWriteDisposition, + member_kind: str, + member_wd: TWriteDisposition, + root_truncated: bool, + member_truncated: bool, +) -> None: + """An empty file (truncation) is written only when BOTH the resource and the table accept + replace. A replace resource does not truncate a variant or pseudo-root whose own write + disposition is not replace; an append resource never truncates, not even a replace variant or + pseudo-root.""" + schema = dlt.Schema("empty_tables") + member_table = "other_items" if member_kind == "variant" else "items__sub_items" + + def make_resource(data: Any) -> DltResource: + nested: Optional[Dict[TTableNames, TResourceNestedHints]] = ( + { + "SubItems": make_nested_hints( + primary_key="id", + write_disposition=member_wd, + columns=[{"name": "id", "data_type": "bigint"}], + ) + } + if member_kind == "pseudo" + else None + ) + + @dlt.resource( + name="items", + write_disposition=resource_wd, + columns=[{"name": "id", "data_type": "bigint"}], + nested_hints=nested, + ) + def items() -> Any: + yield from data + + if member_kind == "variant": + # a variant with its own write disposition, declared with a non-normalized name + items.apply_hints( + table_name="OtherItems", write_disposition=member_wd, create_table_variant=True + ) + return items + + if member_kind == "variant": + seed: Any = [{"id": 1}, dlt.mark.with_table_name({"id": 2}, "OtherItems")] + else: + seed = [{"id": 1, "SubItems": [{"id": 11}]}] + + # run 1: create the root and the member table + 
_extract_resource(extract_step, schema, make_resource(seed)) + _mark_seen_data(schema, "items", member_table) + + # run 2: empty - only tables where the resource and the table both accept replace are truncated + metrics = _extract_resource(extract_step, schema, make_resource([])) + assert ("items" in metrics) is root_truncated + assert (member_table in metrics) is member_truncated + + +def test_handle_empty_tables_variant_not_redeclared_left_untouched(extract_step: Extract) -> None: + """When a variant is registered in a prior run but not re-declared on the empty run, its write + disposition cannot be determined from the (now absent) variant hints. Like a pseudo-root, the + decision then falls back to the stored disposition vs the resource disposition: the table is left + untouched and not truncated when they differ - even though the resource is replace.""" + schema = dlt.Schema("empty_tables") + + def make_resource(declare_variant: bool, data: Any) -> DltResource: + @dlt.resource( + name="items", + write_disposition="replace", + columns=[{"name": "id", "data_type": "bigint"}], + ) + def items() -> Any: + if declare_variant: + # a merge variant declared inside the generator (only when there is data) + yield dlt.mark.with_hints( + {"id": 1}, + make_hints(table_name="OtherItems", write_disposition="merge"), + create_table_variant=True, + ) + yield from data + + return items + + # run 1: the merge variant is registered and gets data; the replace root gets data too + _extract_resource(extract_step, schema, make_resource(True, [{"id": 2}])) + _mark_seen_data(schema, "items", "other_items") + assert schema.tables["other_items"]["write_disposition"] == "merge" + + # run 2: empty - the variant is NOT re-declared, so it is absent from `_hints_variants` and its + # disposition can't be determined. 
Its stored disposition (merge) differs from the resource's + # (replace), so it is left untouched and not truncated; only the replace root is truncated + metrics = _extract_resource(extract_step, schema, make_resource(False, [])) + assert "other_items" not in metrics + assert schema.tables["other_items"]["write_disposition"] == "merge" + assert metrics["items"].items_count == 0 diff --git a/tests/extract/test_sources.py b/tests/extract/test_sources.py index 71f74cb173..3de084747b 100644 --- a/tests/extract/test_sources.py +++ b/tests/extract/test_sources.py @@ -1951,11 +1951,15 @@ def empty_gen(): table_a = empty.compute_table_schema(meta=TableNameMeta("table_a")) assert table_a["name"] == "table_a" assert table_a["write_disposition"] == "replace" + # a registered variant carries its variant name + assert table_a["variant_name"] == "table_a" # unknown table (without variant) - created out resource hints table_unk = empty.compute_table_schema(meta=TableNameMeta("table_unk")) assert table_unk["name"] == "table_unk" assert table_unk["write_disposition"] == "append" + # not a registered variant - dispatched table name carries no variant_name + assert "variant_name" not in table_unk # resource hints are base for table variants empty.apply_hints( @@ -1967,6 +1971,7 @@ def empty_gen(): table_b = empty.compute_table_schema(meta=TableNameMeta("table_b")) assert table_b["name"] == "table_b" assert table_b["write_disposition"] == "merge" + assert table_b["variant_name"] == "table_b" assert len(table_b["columns"]) == 1 assert table_b["columns"]["id"]["primary_key"] is True # overwrite table_b, remove column def and primary_key diff --git a/tests/load/filesystem/test_sql_client.py b/tests/load/filesystem/test_sql_client.py index dfedd4d064..07bef6421b 100644 --- a/tests/load/filesystem/test_sql_client.py +++ b/tests/load/filesystem/test_sql_client.py @@ -24,7 +24,9 @@ from dlt.destinations import filesystem from tests.utils import get_test_storage_root from tests.cases import 
arrow_table_all_data_types -from dlt.destinations.exceptions import DatabaseUndefinedRelation +import duckdb + +from dlt.destinations.exceptions import DatabaseTerminalException, DatabaseUndefinedRelation @pytest.fixture(scope="function", autouse=True) @@ -456,3 +458,53 @@ def items2(): # check df and arrow access assert len(pipeline.dataset().items.df().index) == 50 assert pipeline.dataset().items.arrow().num_rows == 50 + + +@pytest.mark.essential +@pytest.mark.parametrize( + "destination_config", + destinations_configs(local_filesystem_configs=True), + ids=lambda x: x.name, +) +def test_auto_views_not_created_for_other_dataset_qualified_table( + destination_config: DestinationTestConfiguration, +) -> None: + """The scanner only creates views for tables that belong to its own dataset. A query that + references a same-named table qualified with a different schema (i.e. another dataset) must read + that table directly and must NOT trigger creation of a scanner view for the dataset's table.""" + if destination_config.file_format not in ["parquet", "jsonl"]: + pytest.skip( + f"Test only works for jsonl and parquet, given: {destination_config.file_format}" + ) + + pipeline = destination_config.setup_pipeline( + "read_pipeline", + dataset_name="test_other_dataset_no_view", + dev_mode=True, + ) + + # the dataset has its own `items` table - a view would be created for it on a matching query + @dlt.resource(name="items") + def items(): + yield [{"id": 1, "value": "hello"}] + + pipeline.run(items(), **destination_config.run_kwargs) + + dataset = pipeline.dataset() + with dataset: + conn: duckdb.DuckDBPyConnection = dataset.sql_client.native_connection + # a real table named `items` living in another schema, i.e. 
a different dataset + conn.execute("CREATE SCHEMA manual_schema") + conn.execute("CREATE TABLE manual_schema.items(id INTEGER)") + conn.execute("INSERT INTO manual_schema.items VALUES (42)") + + # querying the table qualified with the other schema reads it directly + rows = dataset.query( + "SELECT * FROM manual_schema.items ORDER BY id", _execute_raw_query=True + ).fetchall() + assert rows == [(42,)] + + # the scanner must not have created a view for `items`: the reference was qualified with a + # schema that is not the dataset name, so the dataset's own `items` table is left untouched + views = conn.execute("SELECT view_name FROM duckdb_views()").fetchall() + assert "items" not in [v[0] for v in views] diff --git a/tests/load/lancedb/test_pipeline.py b/tests/load/lancedb/test_pipeline.py index da8d0a7504..30b80e2ce3 100644 --- a/tests/load/lancedb/test_pipeline.py +++ b/tests/load/lancedb/test_pipeline.py @@ -666,15 +666,18 @@ def identity_resource(data: pa.Table) -> Generator[pa.Table, None, None]: ids=lambda x: x.name, ) def test_empty_dataset_allowed(destination_config: DestinationTestConfiguration) -> None: - if destination_config.destination_type == "lance": - pytest.skip("lance destination does not allow empty datasets") - # dataset_name is optional so dataset name won't be autogenerated when not explicitly passed. - pipe = dlt.pipeline(destination=destination_config.destination_type, dev_mode=True) + # both lance (root namespace) and lancedb (no table prefix) allow this. embeddings are tested + # elsewhere, so this stays plain text. 
+ destination: Any = ( + destination_config.destination_factory() + if destination_config.destination_type == "lance" # lance needs catalog/bucket config + else destination_config.destination_type + ) + pipe = dlt.pipeline(destination=destination, dev_mode=True) assert pipe.dataset_name is None - adapter = get_adapter(destination_config) - info = pipe.run(adapter(["context", "created", "not a stop word"], embed=["value"])) + info = pipe.run(["context", "created", "not a stop word"], table_name="content") # Dataset in load info is empty. assert info.dataset_name is None # lancedb is a DWH destination with optional dataset_name — verify metrics reflect None @@ -691,6 +694,63 @@ def test_empty_dataset_allowed(destination_config: DestinationTestConfiguration) assert len(rows) == 3 +@pytest.mark.parametrize( + "destination_config", + LANCE_DEST_CONFS, + ids=lambda x: x.name, +) +def test_dataset_sees_new_rows_through_open_connection( + destination_config: DestinationTestConfiguration, +) -> None: + """With the default (no view refresh), a dataset opened with a persisted sql client sees rows + written after it was opened: lance reads the latest dataset version on each scan.""" + pipeline = destination_config.setup_pipeline("lance_open_conn_" + uniq_id(), dev_mode=True) + info = pipeline.run([{"id": 1, "value": "a"}], table_name="items") + assert_load_info(info) + + # open the dataset and keep the sql client connection open for the whole block + with pipeline.dataset() as ds_: + # initial read (also creates the duckdb scanner view for `items`) + assert len(ds_.items.fetchall()) == 1 + + # append more rows (same schema) while the connection stays open + info = pipeline.run([{"id": 2, "value": "b"}], table_name="items") + assert_load_info(info) + + # new rows are visible without recreating the view + assert ds_("SELECT * FROM items").arrow().num_rows == 2 + + +@pytest.mark.parametrize( + "destination_config", + LANCE_DEST_CONFS, + ids=lambda x: x.name, +) +def 
test_dataset_sees_schema_evolution_with_always_refresh( + destination_config: DestinationTestConfiguration, +) -> None: + """With `always_refresh_views` enabled, a dataset opened with a persisted sql client also sees + schema changes (new columns) written after it was opened.""" + pipeline = destination_config.setup_pipeline( + "lance_open_conn_refresh_" + uniq_id(), dev_mode=True + ) + pipeline.destination.config_params["always_refresh_views"] = True + info = pipeline.run([{"id": 1, "value": "a"}], table_name="items") + assert_load_info(info) + + with pipeline.dataset() as ds_: + assert len(ds_.items.fetchall()) == 1 + + # append a row AND evolve the schema (new `extra` column) while the connection stays open + info = pipeline.run([{"id": 2, "value": "b", "extra": 99}], table_name="items") + assert_load_info(info) + + # both the new row and the new column are visible through the refreshed view + result = ds_("SELECT * FROM items").arrow() + assert result.num_rows == 2 + assert "extra" in result.column_names + + @pytest.mark.parametrize( "destination_config", LANCE_DEST_CONFS, diff --git a/tests/load/pipeline/test_drop.py b/tests/load/pipeline/test_drop.py index 9a204c8dfb..c18b36d858 100644 --- a/tests/load/pipeline/test_drop.py +++ b/tests/load/pipeline/test_drop.py @@ -24,6 +24,21 @@ from dlt.destinations.exceptions import DatabaseUndefinedRelation +# destinations the drop command runs on: sql + local filesystem (incl. 
table formats) + lance/lancedb +# (weaviate/qdrant have no sql client to verify dropped tables) +drop_command_configs = pytest.mark.parametrize( + "destination_config", + destinations_configs( + default_sql_configs=True, + local_filesystem_configs=True, + table_format_local_configs=True, + default_vector_configs=True, + exclude=["weaviate", "qdrant"], + ), + ids=lambda x: x.name, +) + + def _attach(pipeline: Pipeline) -> Pipeline: return dlt.attach(pipeline.pipeline_name, pipelines_dir=pipeline.pipelines_dir) @@ -149,6 +164,8 @@ def assert_destination_state_loaded(pipeline: Pipeline) -> None: local_filesystem_configs=True, all_buckets_filesystem_configs=True, table_format_filesystem_configs=True, + default_vector_configs=True, + exclude=["weaviate", "qdrant"], ), ids=lambda x: x.name, ) @@ -245,15 +262,7 @@ def test_drop_command_resources_and_state( assert "name" in droppable_c_l_schema["columns"] -@pytest.mark.parametrize( - "destination_config", - destinations_configs( - default_sql_configs=True, - local_filesystem_configs=True, - table_format_local_configs=True, - ), - ids=lambda x: x.name, -) +@drop_command_configs def test_drop_command_only_state(destination_config: DestinationTestConfiguration) -> None: """Test drop command that deletes part of the state and syncs with destination""" source = droppable_source() @@ -276,15 +285,7 @@ def test_drop_command_only_state(destination_config: DestinationTestConfiguratio assert_destination_state_loaded(pipeline) -@pytest.mark.parametrize( - "destination_config", - destinations_configs( - default_sql_configs=True, - local_filesystem_configs=True, - table_format_local_configs=True, - ), - ids=lambda x: x.name, -) +@drop_command_configs def test_drop_command_only_tables(destination_config: DestinationTestConfiguration) -> None: """Test drop only tables and makes sure that schema and state are synced""" source = droppable_source() @@ -304,15 +305,7 @@ def test_drop_command_only_tables(destination_config: 
DestinationTestConfigurati assert_destination_state_loaded(pipeline) -@pytest.mark.parametrize( - "destination_config", - destinations_configs( - default_sql_configs=True, - local_filesystem_configs=True, - table_format_local_configs=True, - ), - ids=lambda x: x.name, -) +@drop_command_configs def test_drop_destination_tables_fails(destination_config: DestinationTestConfiguration) -> None: """Fail on DROP TABLES in destination init. Command runs again.""" source = droppable_source() @@ -338,15 +331,7 @@ def test_drop_destination_tables_fails(destination_config: DestinationTestConfig assert_destination_state_loaded(attached) -@pytest.mark.parametrize( - "destination_config", - destinations_configs( - default_sql_configs=True, - local_filesystem_configs=True, - table_format_local_configs=True, - ), - ids=lambda x: x.name, -) +@drop_command_configs def test_fail_after_drop_tables(destination_config: DestinationTestConfiguration) -> None: """Fail directly after drop tables. Command runs again ignoring destination tables missing.""" source = droppable_source() @@ -375,15 +360,7 @@ def test_fail_after_drop_tables(destination_config: DestinationTestConfiguration assert_destination_state_loaded(attached) -@pytest.mark.parametrize( - "destination_config", - destinations_configs( - default_sql_configs=True, - local_filesystem_configs=True, - table_format_local_configs=True, - ), - ids=lambda x: x.name, -) +@drop_command_configs def test_load_step_fails(destination_config: DestinationTestConfiguration) -> None: """Test idempotence. pipeline.load() fails. 
Command can be run again successfully""" source = droppable_source() @@ -406,7 +383,12 @@ def test_load_step_fails(destination_config: DestinationTestConfiguration) -> No @pytest.mark.parametrize( "destination_config", - destinations_configs(default_sql_configs=True, local_filesystem_configs=True), + destinations_configs( + default_sql_configs=True, + local_filesystem_configs=True, + default_vector_configs=True, + exclude=["weaviate", "qdrant"], + ), ids=lambda x: x.name, ) def test_resource_regex(destination_config: DestinationTestConfiguration) -> None: @@ -424,15 +406,7 @@ def test_resource_regex(destination_config: DestinationTestConfiguration) -> Non assert_destination_state_loaded(attached) -@pytest.mark.parametrize( - "destination_config", - destinations_configs( - default_sql_configs=True, - local_filesystem_configs=True, - table_format_local_configs=True, - ), - ids=lambda x: x.name, -) +@drop_command_configs def test_drop_nothing(destination_config: DestinationTestConfiguration) -> None: """No resources, no state keys. 
Nothing is changed.""" source = droppable_source() @@ -450,7 +424,12 @@ def test_drop_nothing(destination_config: DestinationTestConfiguration) -> None: @pytest.mark.parametrize( "destination_config", - destinations_configs(default_sql_configs=True, table_format_local_configs=True), + destinations_configs( + default_sql_configs=True, + table_format_local_configs=True, + default_vector_configs=True, + exclude=["weaviate", "qdrant"], + ), ids=lambda x: x.name, ) def test_drop_all_flag(destination_config: DestinationTestConfiguration) -> None: @@ -476,15 +455,7 @@ def test_drop_all_flag(destination_config: DestinationTestConfiguration) -> None assert all(len(table[1]) > 0 for table in storage_tables) -@pytest.mark.parametrize( - "destination_config", - destinations_configs( - default_sql_configs=True, - local_filesystem_configs=True, - table_format_local_configs=True, - ), - ids=lambda x: x.name, -) +@drop_command_configs def test_run_pipeline_after_partial_drop(destination_config: DestinationTestConfiguration) -> None: """Pipeline can be run again after dropping some resources""" pipeline = destination_config.setup_pipeline("drop_test_" + uniq_id(), dev_mode=True) @@ -501,15 +472,7 @@ def test_run_pipeline_after_partial_drop(destination_config: DestinationTestConf attached.load() -@pytest.mark.parametrize( - "destination_config", - destinations_configs( - default_sql_configs=True, - local_filesystem_configs=True, - table_format_local_configs=True, - ), - ids=lambda x: x.name, -) +@drop_command_configs def test_drop_state_only(destination_config: DestinationTestConfiguration) -> None: """Pipeline can be run again after dropping some resources""" pipeline = destination_config.setup_pipeline("drop_test_" + uniq_id(), dev_mode=True) diff --git a/tests/load/pipeline/test_lance.py b/tests/load/pipeline/test_lance.py index 3f634c7b13..f41767aba7 100644 --- a/tests/load/pipeline/test_lance.py +++ b/tests/load/pipeline/test_lance.py @@ -3,6 +3,7 @@ from typing import cast 
+from dlt.common.utils import uniq_id from dlt.destinations.impl.lance.exceptions import LanceEmbeddingsConfigurationMissing from dlt.destinations.impl.lance.lance_adapter import lance_adapter from dlt.pipeline.exceptions import PipelineStepFailed @@ -165,3 +166,53 @@ def test_lance_pipeline_replace_in_branch( dev_ds = client.open_lance_dataset("items", branch_name="dev") assert dev_ds.count_rows() == 1 assert dev_ds.to_table().column("text").to_pylist() == ["dev-replaced"] + + +@pytest.mark.parametrize( + "destination_config", + destinations_configs(default_vector_configs=True, subset=("lance",)), + ids=lambda x: x.name, +) +def test_lance_pipeline_branching_root_namespace( + destination_config: DestinationTestConfiguration, +) -> None: + """Branching and schema evolution work for tables in the ROOT namespace (no `dataset_name`, + single-level table ids).""" + from dlt.destinations.impl.lance.lance_client import LanceClient + + # build the configured destination but do NOT pass a dataset_name -> root namespace + pipe = dlt.pipeline( + pipeline_name="test_lance_root_branch_" + uniq_id(), + destination=destination_config.destination_factory(), + dev_mode=True, + ) + assert pipe.dataset_name is None + + # first run: write to main (no branch) + pipe.run([{"id": 1, "text": "main-record"}], table_name="items") + # second run: write to "staging" branch (forks from main) + pipe.destination.config_params["branch_name"] = "staging" + pipe.run([{"id": 2, "text": "b1"}, {"id": 3, "text": "b2"}], table_name="items") + # third run: write to "dev" branch with schema evolution (extra column) + pipe.destination.config_params["branch_name"] = "dev" + pipe.run([{"id": 4, "text": "d1", "a_new_column": 1}], table_name="items") + + with pipe.destination_client() as client: + client = cast(LanceClient, client) + assert client.dataset_name is None + # tables are addressed at the root namespace with single-level ids + assert client.make_table_id("items") == ["items"] + + main_ds = 
client.open_lance_dataset("items") + staging_ds = client.open_lance_dataset("items", branch_name="staging") + dev_ds = client.open_lance_dataset("items", branch_name="dev") + + # branches fork from main, not from each other + assert main_ds.count_rows() == 1 + assert staging_ds.count_rows() == 3 # 1 from main + 2 new + assert dev_ds.count_rows() == 2 # 1 from main + 1 new + + # schema evolution is branch-isolated + assert "a_new_column" in dev_ds.schema.names + assert "a_new_column" not in main_ds.schema.names + assert "a_new_column" not in staging_ds.schema.names diff --git a/tests/load/pipeline/test_refresh_modes.py b/tests/load/pipeline/test_refresh_modes.py index df14507c3c..e1ec630f78 100644 --- a/tests/load/pipeline/test_refresh_modes.py +++ b/tests/load/pipeline/test_refresh_modes.py @@ -5,22 +5,36 @@ from dlt.common.destination.exceptions import DestinationUndefinedEntity from dlt.common.utils import uniq_id from dlt.common.typing import DictStrAny -from dlt.common.pipeline import pipeline_state as current_pipeline_state +from dlt.common.schema.utils import is_nested_table +from dlt.common.pipeline import pipeline_state as current_pipeline_state, TRefreshMode from dlt.destinations.sql_client import DBApiCursor from dlt.extract.source import DltSource from dlt.extract.state import resource_state from dlt.pipeline.state_sync import load_pipeline_state_from_destination -from tests.utils import clean_test_storage, get_test_storage_root from tests.pipeline.utils import ( assert_load_info, + assert_empty_tables, load_table_counts, load_tables_to_dicts, assert_only_table_columns, table_exists, ) -from tests.load.utils import FILE_BUCKET, destinations_configs, DestinationTestConfiguration +from tests.load.utils import destinations_configs, DestinationTestConfiguration + + +# destinations that exercise the full truncate/drop table-chain logic (sql + filesystem + athena) +refresh_chain_destinations = pytest.mark.parametrize( + "destination_config", + 
destinations_configs( + default_sql_configs=True, + local_filesystem_configs=True, + table_format_local_configs=True, + subset=["duckdb", "filesystem", "athena"], + ), + ids=lambda x: x.name, +) def assert_source_state_is_wiped(state: DictStrAny) -> None: @@ -38,73 +52,64 @@ def column_values(cursor: DBApiCursor, column_name: str) -> List[Any]: @dlt.source def refresh_source(first_run: bool = True, drop_sources: bool = False): - @dlt.resource - def some_data_1(): + @dlt.resource(table_name="SomeDataOne") + def some_data_one(): if first_run: # Set some source and resource state dlt.current.source_state()["source_key_1"] = "source_value_1" - resource_state("some_data_1")["run1_1"] = "value1_1" - resource_state("some_data_1")["run1_2"] = "value1_2" - yield {"id": 1, "name": "John"} - yield {"id": 2, "name": "Jane"} + resource_state("some_data_one")["run1_1"] = "value1_1" + resource_state("some_data_one")["run1_2"] = "value1_2" + yield {"ItemId": 1, "FullName": "John"} + yield {"ItemId": 2, "FullName": "Jane"} else: # Check state is cleared for this resource - assert not resource_state("some_data_1") + assert not resource_state("some_data_one") if drop_sources: assert_source_state_is_wiped(dlt.current.source_state()) # Second dataset without name column to test tables are re-created - yield {"id": 3} - yield {"id": 4} + yield {"ItemId": 3} + yield {"ItemId": 4} - @dlt.resource - def some_data_2(): + @dlt.resource(table_name="SomeDataTwo") + def some_data_two(): if first_run: dlt.current.source_state()["source_key_2"] = "source_value_2" - resource_state("some_data_2")["run1_1"] = "value1_1" - resource_state("some_data_2")["run1_2"] = "value1_2" - yield {"id": 5, "name": "Joe"} - yield {"id": 6, "name": "Jill"} + resource_state("some_data_two")["run1_1"] = "value1_1" + resource_state("some_data_two")["run1_2"] = "value1_2" + yield {"ItemId": 5, "FullName": "Joe"} + yield {"ItemId": 6, "FullName": "Jill"} else: - assert not resource_state("some_data_2") + assert not 
resource_state("some_data_two") if drop_sources: assert_source_state_is_wiped(dlt.current.source_state()) - yield {"id": 7} - yield {"id": 8} + yield {"ItemId": 7} + yield {"ItemId": 8} - @dlt.resource(primary_key="id", write_disposition="merge") - def some_data_3(): + @dlt.resource(table_name="SomeDataThree", primary_key="ItemId", write_disposition="merge") + def some_data_three(): if first_run: dlt.current.source_state()["source_key_3"] = "source_value_3" - resource_state("some_data_3")["run1_1"] = "value1_1" - yield {"id": 9, "name": "Jack"} - yield {"id": 10, "name": "Jill"} + resource_state("some_data_three")["run1_1"] = "value1_1" + yield {"ItemId": 9, "FullName": "Jack"} + yield {"ItemId": 10, "FullName": "Jill"} else: - assert not resource_state("some_data_3") + assert not resource_state("some_data_three") if drop_sources: assert_source_state_is_wiped(dlt.current.source_state()) - yield {"id": 11} - yield {"id": 12} + yield {"ItemId": 11} + yield {"ItemId": 12} - @dlt.resource - def some_data_4(): + @dlt.resource(table_name="SomeDataFour") + def some_data_four(): yield [] - yield some_data_1 - yield some_data_2 - yield some_data_3 - yield some_data_4 + yield some_data_one + yield some_data_two + yield some_data_three + yield some_data_four -@pytest.mark.parametrize( - "destination_config", - destinations_configs( - default_sql_configs=True, - subset=["duckdb", "filesystem", "iceberg", "athena"], - local_filesystem_configs=True, - table_format_local_configs=True, - ), - ids=lambda x: x.name, -) +@refresh_chain_destinations @pytest.mark.parametrize("in_source", (True, False)) @pytest.mark.parametrize("with_wipe", (True, False)) def test_refresh_drop_sources( @@ -121,7 +126,7 @@ def test_refresh_drop_sources( # first run pipeline so destination so tables are created info = pipeline.run(data, refresh="drop_sources", **destination_config.run_kwargs) assert_load_info(info) - assert table_exists(pipeline, "some_data_3") + assert table_exists(pipeline, 
"some_data_three") # second run of pipeline with only selected resources if with_wipe: @@ -129,7 +134,7 @@ def test_refresh_drop_sources( pipeline = destination_config.setup_pipeline(pipeline_name, dataset_name=dataset_name) data = refresh_source(first_run=False, drop_sources=True).with_resources( - "some_data_1", "some_data_2" + "some_data_one", "some_data_two" ) if not in_source: data = list(data.selected_resources.values()) @@ -141,19 +146,19 @@ def test_refresh_drop_sources( ) assert set(t["name"] for t in pipeline.default_schema.data_tables(include_incomplete=True)) == { - "some_data_1", - "some_data_2", + "some_data_one", + "some_data_two", } # no "name" column should exist as table was dropped and re-created without it - assert_only_table_columns(pipeline, "some_data_1", ["id"]) - data = load_tables_to_dicts(pipeline, "some_data_1")["some_data_1"] - result = sorted([row["id"] for row in data]) + assert_only_table_columns(pipeline, "some_data_one", ["item_id"]) + data = load_tables_to_dicts(pipeline, "some_data_one")["some_data_one"] + result = sorted([row["item_id"] for row in data]) # only rows from second run should exist assert result == [3, 4] # confirm resource tables not selected on second run got dropped - assert not table_exists(pipeline, "some_data_3") + assert not table_exists(pipeline, "some_data_three") # loaded state is wiped with pipeline.destination_client() as dest_client: destination_state = load_pipeline_state_from_destination( @@ -167,7 +172,7 @@ def test_refresh_drop_sources( destinations_configs( default_sql_configs=True, local_filesystem_configs=True, - subset=["duckdb", "filesystem", "iceberg"], + subset=["duckdb", "filesystem"], table_format_local_configs=True, ), ids=lambda x: x.name, @@ -189,7 +194,7 @@ def test_existing_schema_hash(destination_config: DestinationTestConfiguration): # Second run with all tables dropped and only some tables re-created info = pipeline.run( refresh_source(first_run=False, 
drop_sources=True).with_resources( - "some_data_1", "some_data_2" + "some_data_one", "some_data_two" ), **destination_config.run_kwargs, ) @@ -198,7 +203,7 @@ def test_existing_schema_hash(destination_config: DestinationTestConfiguration): new_table_names = set( t["name"] for t in pipeline.default_schema.data_tables(include_incomplete=True) ) - assert new_table_names == {"some_data_1", "some_data_2"} + assert new_table_names == {"some_data_one", "some_data_two"} # Run again with all tables to ensure they are re-created # The new schema in this case should match the schema of the first run exactly @@ -206,8 +211,8 @@ def test_existing_schema_hash(destination_config: DestinationTestConfiguration): refresh_source(first_run=True, drop_sources=True), **destination_config.run_kwargs ) # Check table 3 was re-created - data = load_tables_to_dicts(pipeline, "some_data_3")["some_data_3"] - result = sorted([(row["id"], row["name"]) for row in data]) + data = load_tables_to_dicts(pipeline, "some_data_three")["some_data_three"] + result = sorted([(row["item_id"], row["full_name"]) for row in data]) assert result == [(9, "Jack"), (10, "Jill")] # Schema is identical to first schema @@ -251,7 +256,7 @@ def test_refresh_drop_resources( pipeline._wipe_working_folder() pipeline = destination_config.setup_pipeline(pipeline_name, dataset_name=dataset_name) - data = refresh_source(first_run=False).with_resources("some_data_1", "some_data_2") + data = refresh_source(first_run=False).with_resources("some_data_one", "some_data_two") if not in_source: data = list(data.selected_resources.values()) @@ -262,15 +267,15 @@ def test_refresh_drop_resources( ) # Confirm resource tables not selected on second run are untouched - data = load_tables_to_dicts(pipeline, "some_data_3")["some_data_3"] - result = sorted([(row["id"], row["name"]) for row in data]) + data = load_tables_to_dicts(pipeline, "some_data_three")["some_data_three"] + result = sorted([(row["item_id"], row["full_name"]) for row in 
data]) assert result == [(9, "Jack"), (10, "Jill")] # Check the columns to ensure the name column was dropped - assert_only_table_columns(pipeline, "some_data_1", ["id"]) - data = load_tables_to_dicts(pipeline, "some_data_1")["some_data_1"] + assert_only_table_columns(pipeline, "some_data_one", ["item_id"]) + data = load_tables_to_dicts(pipeline, "some_data_one")["some_data_one"] # Only second run data - result = sorted([row["id"] for row in data]) + result = sorted([row["item_id"] for row in data]) assert result == [3, 4] # Loaded state contains only keys created in second run @@ -285,21 +290,12 @@ def test_refresh_drop_resources( assert source_state["source_key_2"] == "source_value_2" assert source_state["source_key_3"] == "source_value_3" # Only resource excluded in second run remains - assert source_state["resources"]["some_data_3"] == {"run1_1": "value1_1"} - assert not source_state["resources"]["some_data_2"] - assert not source_state["resources"]["some_data_1"] + assert source_state["resources"]["some_data_three"] == {"run1_1": "value1_1"} + assert not source_state["resources"]["some_data_two"] + assert not source_state["resources"]["some_data_one"] -@pytest.mark.parametrize( - "destination_config", - destinations_configs( - default_sql_configs=True, - local_filesystem_configs=True, - subset=["duckdb", "filesystem", "iceberg", "athena"], - table_format_local_configs=True, - ), - ids=lambda x: x.name, -) +@refresh_chain_destinations def test_refresh_drop_data_only(destination_config: DestinationTestConfiguration): """Refresh drop_data should truncate all selected tables before load""" # First run pipeline with load to destination so tables are created @@ -316,7 +312,7 @@ def test_refresh_drop_data_only(destination_config: DestinationTestConfiguration # Second run of pipeline with only selected resources info = pipeline.run( - refresh_source(first_run=False).with_resources("some_data_1", "some_data_2"), + 
refresh_source(first_run=False).with_resources("some_data_one", "some_data_two"), write_disposition="append", **destination_config.run_kwargs, ) @@ -326,24 +322,24 @@ def test_refresh_drop_data_only(destination_config: DestinationTestConfiguration assert pipeline.default_schema.version_hash == first_schema_hash # Tables selected in second run are truncated and should only have data from second run - data = load_tables_to_dicts(pipeline, "some_data_1", "some_data_2", "some_data_3") + data = load_tables_to_dicts(pipeline, "some_data_one", "some_data_two", "some_data_three") # name column still remains when table was truncated instead of dropped # (except on filesystem where truncate and drop are the same) if destination_config.destination_type == "filesystem": - result = sorted([row["id"] for row in data["some_data_1"]]) + result = sorted([row["item_id"] for row in data["some_data_one"]]) assert result == [3, 4] - result = sorted([row["id"] for row in data["some_data_2"]]) + result = sorted([row["item_id"] for row in data["some_data_two"]]) assert result == [7, 8] else: - result = sorted([(row["id"], row["name"]) for row in data["some_data_1"]]) + result = sorted([(row["item_id"], row["full_name"]) for row in data["some_data_one"]]) assert result == [(3, None), (4, None)] - result = sorted([(row["id"], row["name"]) for row in data["some_data_2"]]) + result = sorted([(row["item_id"], row["full_name"]) for row in data["some_data_two"]]) assert result == [(7, None), (8, None)] # Other tables still have data from first run - result = sorted([(row["id"], row["name"]) for row in data["some_data_3"]]) + result = sorted([(row["item_id"], row["full_name"]) for row in data["some_data_three"]]) assert result == [(9, "Jack"), (10, "Jill")] # State of selected resources is wiped, source level state is kept @@ -356,21 +352,12 @@ def test_refresh_drop_data_only(destination_config: DestinationTestConfiguration assert source_state["source_key_1"] == "source_value_1" assert 
source_state["source_key_2"] == "source_value_2" assert source_state["source_key_3"] == "source_value_3" - assert not source_state["resources"]["some_data_1"] - assert not source_state["resources"]["some_data_2"] - assert source_state["resources"]["some_data_3"] == {"run1_1": "value1_1"} + assert not source_state["resources"]["some_data_one"] + assert not source_state["resources"]["some_data_two"] + assert source_state["resources"]["some_data_three"] == {"run1_1": "value1_1"} -@pytest.mark.parametrize( - "destination_config", - destinations_configs( - default_sql_configs=True, - local_filesystem_configs=True, - subset=["duckdb", "filesystem", "iceberg", "athena"], - table_format_local_configs=True, - ), - ids=lambda x: x.name, -) +@refresh_chain_destinations def test_refresh_drop_sources_multiple_sources(destination_config: DestinationTestConfiguration): """ Ensure only state and tables for currently selected source is dropped @@ -391,7 +378,9 @@ def source_2_data_1(): assert ( pipeline_state["sources"]["refresh_source"]["source_key_1"] == "source_value_1" ) - assert pipeline_state["sources"]["refresh_source"]["resources"]["some_data_1"] == { + assert pipeline_state["sources"]["refresh_source"]["resources"][ + "some_data_one" + ] == { "run1_1": "value1_1", "run1_2": "value1_2", } @@ -435,7 +424,7 @@ def source_2_data_2(): table_names = set( t["name"] for t in pipeline.schemas["refresh_source"].data_tables(include_incomplete=True) ) - assert table_names == {"some_data_1", "some_data_2", "some_data_3", "some_data_4"} + assert table_names == {"some_data_one", "some_data_two", "some_data_three", "some_data_four"} # Source 2 has only the selected tables table_names = set( @@ -444,8 +433,8 @@ def source_2_data_2(): assert table_names == {"source_2_data_1"} # Destination still has tables from source 1 - data = load_tables_to_dicts(pipeline, "some_data_1") - result = sorted([(row["id"], row["name"]) for row in data["some_data_1"]]) + data = load_tables_to_dicts(pipeline, 
"some_data_one") + result = sorted([(row["item_id"], row["full_name"]) for row in data["some_data_one"]]) assert result == [(1, "John"), (2, "Jane")] # First table from source2 exists, with only first column @@ -463,7 +452,10 @@ def source_2_data_2(): @pytest.mark.parametrize( "destination_config", destinations_configs( - default_sql_configs=True, local_filesystem_configs=True, subset=["duckdb", "filesystem"] + default_sql_configs=True, + local_filesystem_configs=True, + table_format_local_configs=True, + subset=["duckdb", "filesystem"], ), ids=lambda x: x.name, ) @@ -474,7 +466,7 @@ def test_refresh_argument_to_run(destination_config: DestinationTestConfiguratio assert_load_info(info) info = pipeline.run( - refresh_source(first_run=False).with_resources("some_data_3"), + refresh_source(first_run=False).with_resources("some_data_three"), **destination_config.run_kwargs, refresh="drop_sources", ) @@ -482,18 +474,18 @@ def test_refresh_argument_to_run(destination_config: DestinationTestConfiguratio # Check local schema to confirm refresh was at all applied tables = set(t["name"] for t in pipeline.default_schema.data_tables()) - assert tables == {"some_data_3"} + assert tables == {"some_data_three"} # Run again without refresh to confirm refresh option doesn't persist on pipeline info = pipeline.run( - refresh_source(first_run=False).with_resources("some_data_2"), + refresh_source(first_run=False).with_resources("some_data_two"), **destination_config.run_kwargs, ) assert_load_info(info) # Nothing is dropped tables = set(t["name"] for t in pipeline.default_schema.data_tables()) - assert tables == {"some_data_2", "some_data_3"} + assert tables == {"some_data_two", "some_data_three"} @pytest.mark.parametrize( @@ -510,23 +502,23 @@ def test_refresh_argument_to_extract(destination_config: DestinationTestConfigur assert_load_info(info) pipeline.extract( - refresh_source(first_run=False).with_resources("some_data_3"), + 
refresh_source(first_run=False).with_resources("some_data_three"), table_format=destination_config.table_format, refresh="drop_sources", ) tables = set(t["name"] for t in pipeline.default_schema.data_tables(include_incomplete=True)) # All other data tables removed - assert tables == {"some_data_3"} + assert tables == {"some_data_three"} # Run again without refresh to confirm refresh option doesn't persist on pipeline pipeline.extract( - refresh_source(first_run=False).with_resources("some_data_2"), + refresh_source(first_run=False).with_resources("some_data_two"), table_format=destination_config.table_format, ) tables = set(t["name"] for t in pipeline.default_schema.data_tables(include_incomplete=True)) - assert tables == {"some_data_2", "some_data_3"} + assert tables == {"some_data_two", "some_data_three"} @pytest.mark.parametrize( @@ -597,7 +589,11 @@ def test_refresh_staging_dataset(destination_config: DestinationTestConfiguratio @pytest.mark.parametrize( "destination_config", - destinations_configs(default_sql_configs=True, all_buckets_filesystem_configs=True), + destinations_configs( + default_sql_configs=True, + all_buckets_filesystem_configs=True, + table_format_local_configs=True, + ), ids=lambda x: x.name, ) @pytest.mark.parametrize("refresh", ["drop_source", "drop_resource", "drop_data"]) @@ -610,6 +606,196 @@ def test_changing_write_disposition_with_refresh( pipeline.run( [1, 2, 3], table_name="items", write_disposition="append", **destination_config.run_kwargs ) + # `primary_key` is required by the `upsert` merge strategy pipeline.run( - [1, 2, 3], table_name="items", write_disposition="merge", **destination_config.run_kwargs + [1, 2, 3], + table_name="items", + write_disposition="merge", + primary_key="value", + **destination_config.run_kwargs, + ) + + +@dlt.source +def refresh_additional_cases(first_run: bool = True): + """Resources spanning append/replace/merge dispositions that produce nested, dynamic + (event-dispatch) and `with_table_name`-marked 
tables. On the refresh (second) run each resource + yields data for only some of its tables, so the others receive no data.""" + + @dlt.resource( + name="parent", + write_disposition="append", + nested_hints={ + # a pseudo-root child (broken out by a primary key) whose write disposition differs + # from the append root + "tags": dlt.mark.make_nested_hints(primary_key="tid", write_disposition="replace") + }, ) + def parent(): + if first_run: + dlt.current.source_state()["src_key"] = "src_value" + resource_state("parent")["run1"] = "value1" + yield { + "id": 1, + "name": "p1", + "children": [{"cid": 11}, {"cid": 12}], + "tags": [{"tid": 1}], + } + else: + # root receives data, the child and pseudo-root tables receive none + yield {"id": 2, "name": "p2", "children": [], "tags": []} + + @dlt.resource( + name="events", + table_name=lambda e: "event_" + e["kind"], + write_disposition="replace", + primary_key="id", + ) + def events(): + if first_run: + resource_state("events")["run1"] = "value1" + yield from [{"id": 1, "kind": "a"}, {"id": 2, "kind": "b"}] + else: + # only event_a is dispatched, event_b receives no data + yield {"id": 3, "kind": "a"} + + @dlt.resource(name="marked", write_disposition="merge", primary_key="id") + def marked(): + if first_run: + resource_state("marked")["run1"] = "value1" + # non-normalized dispatch names exercise identifier normalization end to end through the + # refresh path (they land in the schema as mark_x / mark_y / mark_variant) + yield dlt.mark.with_table_name({"id": 1}, "MarkX") + yield dlt.mark.with_table_name({"id": 2}, "MarkY") + # a table variant whose write disposition differs from the merge root + yield dlt.mark.with_hints( + {"id": 4}, + dlt.mark.make_hints(table_name="MarkVariant", write_disposition="replace"), + create_table_variant=True, + ) + else: + # only mark_x receives data; mark_y and mark_variant receive none + yield dlt.mark.with_table_name({"id": 3}, "MarkX") + + yield parent + yield events + yield marked + + 
+@refresh_chain_destinations +@pytest.mark.parametrize("refresh", ["drop_data", "drop_resources"]) +@pytest.mark.parametrize("pre_drop", [False, True], ids=["no_pre_drop", "pre_drop"]) +def test_refresh_truncates_or_drops_additional_cases( + destination_config: DestinationTestConfiguration, refresh: TRefreshMode, pre_drop: bool +): + """Makes sure that all cases in `refresh_additional_cases` are fully dropped or truncated. + `pre_drop` removes one of the table to check if refresh survives ie. tables removed by the user + """ + os.environ["DATA_WRITER__DISABLE_COMPRESSION"] = "TRUE" + + all_tables = [ + "parent", + "parent__children", + "parent__tags", + "event_a", + "event_b", + "mark_x", + "mark_y", + "mark_variant", + ] + pipeline = destination_config.setup_pipeline("refresh_chain" + uniq_id(), dev_mode=True) + + info = pipeline.run(refresh_additional_cases(first_run=True), **destination_config.run_kwargs) + assert_load_info(info) + assert load_table_counts(pipeline, *all_tables) == { + "parent": 1, + "parent__children": 2, + "parent__tags": 1, + "event_a": 1, + "event_b": 1, + "mark_x": 1, + "mark_y": 1, + "mark_variant": 1, + } + # a pseudo-root and a variant whose write disposition differs from their root + tables = pipeline.default_schema.tables + assert is_nested_table(tables["parent__tags"]) is False + assert tables["parent"]["write_disposition"] == "append" + assert tables["parent__tags"]["write_disposition"] == "replace" + assert tables["mark_variant"]["write_disposition"] == "replace" + + if pre_drop: + # drop a table at the destination out of band; the local schema still lists it, so the + # refresh must survive dropping/truncating a table that no longer exists at the destination + with pipeline.destination_client() as client: + client.drop_tables("parent", delete_schema=False) # type: ignore[attr-defined] + assert not table_exists(pipeline, "parent") + + info = pipeline.run( + refresh_additional_cases(first_run=False), refresh=refresh, 
**destination_config.run_kwargs + ) + assert_load_info(info) + + # tables that received data on the refresh run hold ONLY the refresh-run rows + assert load_table_counts(pipeline, "parent", "event_a", "mark_x") == { + "parent": 1, + "event_a": 1, + "mark_x": 1, + } + assert sorted(row["id"] for row in load_tables_to_dicts(pipeline, "parent")["parent"]) == [2] + + # tables that received NO data are emptied (drop_data) or dropped (drop_resources) - including + # the pseudo-root and the variant whose write disposition differs from their root + no_data_tables = ["parent__children", "parent__tags", "event_b", "mark_y", "mark_variant"] + assert_empty_tables(pipeline, *no_data_tables) + # drop_data truncates (table kept), drop_resources drops it (filesystem cannot distinguish) + if destination_config.destination_type != "filesystem": + for table in no_data_tables: + assert table_exists(pipeline, table) is (refresh == "drop_data") + + # resource state is wiped, source-level state is kept (same for both modes) + with pipeline.destination_client() as dest_client: + destination_state = load_pipeline_state_from_destination( + pipeline.pipeline_name, dest_client # type: ignore[arg-type] + ) + source_state = destination_state["sources"]["refresh_additional_cases"] + assert source_state["src_key"] == "src_value" + # resource state is wiped (the key may be removed entirely or reset to empty) + assert not source_state["resources"].get("parent") + assert not source_state["resources"].get("events") + assert not source_state["resources"].get("marked") + + +@refresh_chain_destinations +@pytest.mark.parametrize("refresh", ["drop_data", "drop_resources"]) +@pytest.mark.parametrize("restore_state", [True, False], ids=["sync_state", "no_sync_state"]) +def test_refresh_truncates_or_drops_when_no_data( + destination_config: DestinationTestConfiguration, refresh: TRefreshMode, restore_state: bool +): + """When a refreshed resource yields no data at all, its tables (root and nested) must still 
be + emptied (drop_data) or dropped (drop_resources) + """ + os.environ["DATA_WRITER__DISABLE_COMPRESSION"] = "TRUE" + + @dlt.resource(name="items", write_disposition="append") + def items(emit: bool): + if emit: + yield {"id": 1, "children": [{"cid": 1}]} + + pipeline = destination_config.setup_pipeline("refresh_no_data" + uniq_id(), dev_mode=True) + pipeline.config.restore_from_destination = restore_state + pipeline.run(items(True), **destination_config.run_kwargs) + assert load_table_counts(pipeline, "items", "items__children") == { + "items": 1, + "items__children": 1, + } + + info = pipeline.run(items(False), refresh=refresh, **destination_config.run_kwargs) + # a load package must be generated even though the resource yields no data + assert_load_info(info) + assert_empty_tables(pipeline, "items", "items__children") + # drop_data truncates (table kept, emptied), drop_resources drops it (filesystem cannot + # distinguish the two - both remove the data files) + if destination_config.destination_type != "filesystem": + for table in ("items", "items__children"): + assert table_exists(pipeline, table) is (refresh == "drop_data") diff --git a/tests/load/pipeline/test_write_disposition_changes.py b/tests/load/pipeline/test_write_disposition_changes.py index fee20d38bd..66cc006352 100644 --- a/tests/load/pipeline/test_write_disposition_changes.py +++ b/tests/load/pipeline/test_write_disposition_changes.py @@ -28,7 +28,6 @@ def data_with_subtables(offset: int) -> Any: @pytest.mark.parametrize( "destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name ) -@pytest.mark.essential def test_switch_from_merge(destination_config: DestinationTestConfiguration): pipeline = destination_config.setup_pipeline( pipeline_name="test_switch_from_merge", dev_mode=True @@ -105,7 +104,6 @@ def test_switch_from_merge(destination_config: DestinationTestConfiguration): ids=lambda x: x.name, ) @pytest.mark.parametrize("with_root_key", [True, False, None]) 
-@pytest.mark.essential def test_switch_to_merge( destination_config: DestinationTestConfiguration, with_root_key: bool, mocker: MockerFixture ): @@ -210,29 +208,73 @@ def _assert_root_key_warn(spy: MockType) -> None: "destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name ) @pytest.mark.essential -def test_incremental_merge_after_replace_keeps_rows_on_no_new_data( +def test_incremental_merge_full_refresh_with_replace( destination_config: DestinationTestConfiguration, ) -> None: - """Regression for #3998: an incremental `merge` resource must not truncate the destination - on a run with no new data after a prior `replace` run. - """ + # non-normalized table and column names so identifiers are exercised through normalization + @dlt.source(name="items_source") + def items_source(records: Any) -> Any: + @dlt.resource( + name="items", table_name="MyItems", write_disposition="merge", primary_key="Id" + ) + def items(updated_at: Any = dlt.sources.incremental("UpdatedAt")) -> Any: + yield from records - @dlt.resource(name="items", write_disposition="merge", primary_key="id") - def items(updated_at: Any = dlt.sources.incremental("updated_at")) -> Any: - yield from [ - {"id": 1, "updated_at": "2026-05-28"}, - {"id": 2, "updated_at": "2020-05-29"}, - ] + return items + + def source(records: Any) -> Any: + s = items_source(records) + # propagations not needed up to 2 nesting levels + s.root_key = False + return s + + seed = [ + {"Id": 1, "UpdatedAt": "2026-05-28", "SubItems": [{"Id": 101}]}, + {"Id": 2, "UpdatedAt": "2020-05-29", "SubItems": [{"Id": 201}]}, + ] pipeline = destination_config.setup_pipeline( pipeline_name="test_incremental_merge_after_replace", dev_mode=True ) - # full refresh seeds the table and advances the incremental cursor - info = pipeline.run(items(), write_disposition="replace", **destination_config.run_kwargs) + # full refresh seeds the root and nested tables and advances the incremental cursor + info = 
pipeline.run(source(seed), write_disposition="replace", **destination_config.run_kwargs) + assert_load_info(info) + assert_table_counts(pipeline, {"my_items": 2, "my_items__sub_items": 2}) + + # incremental merge run with no new rows (all filtered by the cursor) produces no load package + # and must keep root and nested data (the #3998 regression truncated them here) + info = pipeline.run(source(seed), **destination_config.run_kwargs) + assert_load_info(info, expected_load_packages=0) + assert_table_counts(pipeline, {"my_items": 2, "my_items__sub_items": 2}) + + # a later merge run with new data still loads to root and nested tables + info = pipeline.run( + source([{"Id": 3, "UpdatedAt": "2026-05-30", "SubItems": [{"Id": 301}]}]), + **destination_config.run_kwargs, + ) + assert_load_info(info) + assert_table_counts(pipeline, {"my_items": 3, "my_items__sub_items": 3}) + + # an explicit replace still resets the destination: prior rows are dropped, only new data remains + info = pipeline.run( + source([{"Id": 4, "UpdatedAt": "2026-06-01", "SubItems": [{"Id": 401}]}]), + write_disposition="replace", + **destination_config.run_kwargs, + ) + assert_load_info(info) + assert_table_counts(pipeline, {"my_items": 1, "my_items__sub_items": 1}) + + # and merging continues to load on top of the reset data + info = pipeline.run( + source([{"Id": 5, "UpdatedAt": "2026-06-02", "SubItems": [{"Id": 501}]}]), + **destination_config.run_kwargs, + ) assert_load_info(info) - assert_table_counts(pipeline, {"items": 2}) + assert_table_counts(pipeline, {"my_items": 2, "my_items__sub_items": 2}) - # incremental merge run with no new rows (all filtered by the cursor) must keep the data - pipeline.run(items(), **destination_config.run_kwargs) - assert_table_counts(pipeline, {"items": 2}) + # a replace run with no data truncates the (non-normalized) root and nested tables to empty: the + # empty-table handling matches the normalized table name, so a replace resource truncates + # correctly when 
it has none (and the runs above show it does not drop data when data is present) + pipeline.run(source([]), write_disposition="replace", **destination_config.run_kwargs) + assert_table_counts(pipeline, {"my_items": 0, "my_items__sub_items": 0}) diff --git a/tests/load/test_dummy_client.py b/tests/load/test_dummy_client.py index 01b6d6a4cb..cfb0f9e2dc 100644 --- a/tests/load/test_dummy_client.py +++ b/tests/load/test_dummy_client.py @@ -1166,10 +1166,10 @@ def test_init_client_truncate_tables() -> None: "_dlt_loads", "_dlt_version", } - assert initialize_storage.call_count == 2 - # initialize storage is called twice, we deselected all tables to truncate + # nothing to truncate: only the bare init call is made (no truncate init call) + assert initialize_storage.call_count == 1 assert initialize_storage.call_args_list[0].args == () - assert initialize_storage.call_args_list[1].kwargs["truncate_tables"] == set() + assert initialize_storage.call_args_list[0].kwargs == {} # tables not dropped drop_tables.assert_not_called() @@ -1194,7 +1194,7 @@ def test_init_client_truncate_tables() -> None: assert update_stored_schema.call_count == 2 assert "event_user" in update_stored_schema.call_args_list[0][1]["only_tables"] assert "_dlt_version" in update_stored_schema.call_args_list[1][1]["only_tables"] - assert initialize_storage.call_count == 4 + assert initialize_storage.call_count == 3 assert initialize_storage.call_args_list[0].args == () assert initialize_storage.call_args_list[1].kwargs["truncate_tables"] == {"event_user"} # dropped on staging dataset and final @@ -1216,12 +1216,11 @@ def test_init_client_truncate_tables() -> None: assert {"event_user", "event_bot"} <= set( update_stored_schema.call_args_list[1].kwargs["only_tables"] ) - assert initialize_storage.call_count == 4 + assert initialize_storage.call_count == 3 assert initialize_storage.call_args_list[0].args == () - assert initialize_storage.call_args_list[1].kwargs["truncate_tables"] == set() - assert 
initialize_storage.call_args_list[2].args == () + assert initialize_storage.call_args_list[1].args == () # all tables that will be used on staging must be truncated - assert initialize_storage.call_args_list[3].kwargs["truncate_tables"] == { + assert initialize_storage.call_args_list[2].kwargs["truncate_tables"] == { "event_user", "event_bot", } @@ -1259,9 +1258,9 @@ def test_init_client_truncate_tables() -> None: "event_user" not in update_stored_schema.call_args_list[1].kwargs["only_tables"] ) - assert initialize_storage.call_count == 4 - assert initialize_storage.call_args_list[1].kwargs["truncate_tables"] == set() - assert initialize_storage.call_args_list[3].kwargs[ + assert initialize_storage.call_count == 3 + assert initialize_storage.call_args_list[0].kwargs == {} + assert initialize_storage.call_args_list[2].kwargs[ "truncate_tables" ] == update_stored_schema.call_args_list[1].kwargs["only_tables"] - {"_dlt_version"} @@ -1278,6 +1277,41 @@ def test_init_client_truncate_tables() -> None: # print(update_stored_schema.call_args_list) +def test_init_client_initial_truncate_tables_from_package_state() -> None: + """truncated_tables stored in the load package state (refresh drop_data) reach init_client and + force a schema migration even when the schema hash is unchanged and the table has no jobs.""" + load = setup_loader() + load_id, schema = prepare_load_package( + load.load_storage, ["event_user.b1d32c6660b242aaabbf3fc27245b7e6.0.insert_values"] + ) + # a refresh requested truncation of event_bot which has no data jobs in this package + packages = load.load_storage.normalized_packages + state = packages.get_load_package_state(load_id) + state["truncated_tables"] = [schema.get_table("event_bot")] + packages.save_load_package_state(load_id, state) + + # synthetic jsonl job (the on-disk case file is insert_values which the dummy client rejects) + new_jobs = [ParsedLoadJobFileName("event_user", "event_user_id", 0, "jsonl")] + with ( + 
patch.object(dummy_impl.DummyClient, "initialize_storage") as initialize_storage, + patch.object( + dummy_impl.DummyClient, "update_stored_schema", return_value={} + ) as update_stored_schema, + ): + load.initialize_package(load_id, schema, new_jobs) + + # schema migration is forced because there are tables to truncate via refresh + assert update_stored_schema.call_args_list[0].kwargs["force"] is True + # event_user is append (not truncated); only the initial truncate table reaches the truncate + # init call, even though it has no jobs in this package + truncate_calls = [ + c.kwargs["truncate_tables"] + for c in initialize_storage.call_args_list + if "truncate_tables" in c.kwargs + ] + assert truncate_calls == [{"event_bot"}] + + def test_init_client_staging_ddl_includes_jobless_tables() -> None: """Issue #2862: staging DDL includes ALL staging-eligible data tables, not just those with jobs.""" load = setup_loader() @@ -1310,7 +1344,9 @@ def test_init_client_staging_ddl_includes_jobless_tables() -> None: staging_ddl = update_stored_schema.call_args_list[1].kwargs["only_tables"] assert staging_ddl == all_data | {"_dlt_version"} # staging truncation: only the table with a job - staging_truncate = initialize_storage.call_args_list[3].kwargs["truncate_tables"] + # main dataset has nothing to truncate so it makes only the bare init call (index 0), + # staging makes a bare init (index 1) + a truncate init (index 2) + staging_truncate = initialize_storage.call_args_list[2].kwargs["truncate_tables"] assert staging_truncate == {"event_user"} @@ -1348,7 +1384,8 @@ def test_init_client_staging_selective_filter() -> None: assert staging_ddl == bot_chain | {"_dlt_version"} assert "event_user" not in staging_ddl # staging truncation: only event_bot (has job + passes filter, append skips children) - staging_truncate = initialize_storage.call_args_list[3].kwargs["truncate_tables"] + # main has no truncation (bare init at index 0); staging bare init (1) + truncate (2) + 
staging_truncate = initialize_storage.call_args_list[2].kwargs["truncate_tables"] assert staging_truncate == {"event_bot"} @@ -1381,7 +1418,7 @@ def test_init_client_staging_dlt_tables_with_jobs() -> None: staging_ddl = update_stored_schema.call_args_list[1].kwargs["only_tables"] assert staging_ddl == all_data | {"_dlt_version", "_dlt_loads"} # staging truncation includes both tables with jobs - staging_truncate = initialize_storage.call_args_list[3].kwargs["truncate_tables"] + staging_truncate = initialize_storage.call_args_list[2].kwargs["truncate_tables"] assert staging_truncate == {"event_user", "_dlt_loads"} @@ -1420,7 +1457,7 @@ def test_init_client_unseen_data_tables_excluded() -> None: assert staging_ddl == (all_data - bot_chain) | {"_dlt_version"} assert not (bot_chain & staging_ddl) # staging truncation: only event_user - staging_truncate = initialize_storage.call_args_list[3].kwargs["truncate_tables"] + staging_truncate = initialize_storage.call_args_list[2].kwargs["truncate_tables"] assert staging_truncate == {"event_user"} @@ -1457,7 +1494,8 @@ def test_init_client_staging_destination_vs_final_destination() -> None: staging_ddl_1 = update_schema_1.call_args_list[1].kwargs["only_tables"] assert staging_ddl_1 == bot_chain | {"_dlt_version"} assert not (user_chain & staging_ddl_1) - staging_trunc_1 = init_storage_1.call_args_list[3].kwargs["truncate_tables"] + # main has no truncation (bare init at index 0); staging bare init (1) + truncate (2) + staging_trunc_1 = init_storage_1.call_args_list[2].kwargs["truncate_tables"] assert staging_trunc_1 == bot_chain # merge expands full chain # call 2: staging destination with staging_dest_filter @@ -1473,7 +1511,7 @@ def test_init_client_staging_destination_vs_final_destination() -> None: staging_ddl_2 = update_schema_2.call_args_list[1].kwargs["only_tables"] assert staging_ddl_2 == user_chain | {"_dlt_version"} assert not (bot_chain & staging_ddl_2) - staging_trunc_2 = 
init_storage_2.call_args_list[3].kwargs["truncate_tables"] + staging_trunc_2 = init_storage_2.call_args_list[2].kwargs["truncate_tables"] assert staging_trunc_2 == user_chain # merge expands full chain @@ -1514,9 +1552,9 @@ def test_init_client_staging_drop_tables_without_staging_eligible() -> None: # staging DDL: only _dlt_version (no staging-eligible data tables) staging_ddl = update_stored_schema.call_args_list[1].kwargs["only_tables"] assert staging_ddl == {"_dlt_version"} - # staging truncation: empty (no tables with jobs) - staging_truncate = initialize_storage.call_args_list[3].kwargs["truncate_tables"] - assert staging_truncate == set() + assert initialize_storage.call_count == 2 + assert initialize_storage.call_args_list[0].kwargs == {} + assert initialize_storage.call_args_list[1].kwargs == {} # drop_tables called on both main and staging datasets assert drop_tables.call_count == 2 assert drop_tables.call_args_list[0].args == ("event_bot",) diff --git a/tests/load/test_job_client.py b/tests/load/test_job_client.py index b009ac7751..188b0f673c 100644 --- a/tests/load/test_job_client.py +++ b/tests/load/test_job_client.py @@ -61,18 +61,35 @@ "tests.common.cases.normalizers.title_case", ) +# parametrize for tests that run on sql destinations only +sql_client_configs = pytest.mark.parametrize( + "client", + destinations_configs(default_sql_configs=True), + indirect=True, + ids=lambda x: x.name, +) + +# parametrize for tests that also run on destinations read through duckdb table scanners: +# filesystem table formats and lance/lancedb vector dbs (weaviate/qdrant have no such sql client) +fs_vector_client_configs = pytest.mark.parametrize( + "client", + destinations_configs( + default_sql_configs=True, + table_format_filesystem_configs=True, + default_vector_configs=True, + exclude=["weaviate", "qdrant"], + ), + indirect=True, + ids=lambda x: x.name, +) + @pytest.fixture def file_storage() -> FileStorage: return FileStorage(get_test_storage_root(), 
file_type="b", makedirs=True) -@pytest.mark.parametrize( - "client", - destinations_configs(default_sql_configs=True, table_format_filesystem_configs=True), - indirect=True, - ids=lambda x: x.name, -) +@fs_vector_client_configs def test_initialize_storage(client: SqlJobClientBaseWithDestinationTestConfiguration) -> None: assert client.is_storage_initialized() @@ -82,6 +99,8 @@ def test_initialize_storage(client: SqlJobClientBaseWithDestinationTestConfigura destinations_configs_with_naming_convention( default_sql_configs=True, table_format_filesystem_configs=True, + default_vector_configs=True, + exclude=["weaviate", "qdrant"], naming_conventions=TEST_NAMING_CONVENTIONS, ), indirect=True, @@ -105,12 +124,7 @@ def test_get_schema_on_empty_storage( assert [("no_table_1", {}), ("no_table_2", {})] == storage_tables -@pytest.mark.parametrize( - "client", - destinations_configs(default_sql_configs=True, table_format_filesystem_configs=True), - indirect=True, - ids=lambda x: x.name, -) +@fs_vector_client_configs def test_get_update_basic_schema(client: SqlJobClientBaseWithDestinationTestConfiguration) -> None: schema = client.schema schema_update = client.update_stored_schema() @@ -147,6 +161,32 @@ def test_get_update_basic_schema(client: SqlJobClientBaseWithDestinationTestConf assert this_schema.schema == json.dumps(schema.to_dict()) first_version_schema = this_schema.schema + # enforcing an update on an unchanged schema (as a truncate refresh does) re-applies DDL but + # must NOT write a duplicate version row + if client.config.destination_type not in ["filesystem"]: + versions_table = client.sql_client.make_qualified_table_name(version_table_name) + version_hash_column = client.sql_client.escape_column_name( + schema.naming.normalize_identifier("version_hash") + ) + + def _version_row_count() -> int: + return len( + list( + client.sql_client.execute_sql( + f"SELECT * FROM {versions_table} WHERE {version_hash_column} = %s", + schema.stored_version_hash, + ) + ) + ) + + 
rows_before = _version_row_count() + client.update_stored_schema(force=True) + assert _version_row_count() == rows_before + else: + client.update_stored_schema(force=True) + # the stored schema is still resolvable by hash + assert client.get_stored_schema_by_hash(schema.version_hash) is not None + # modify schema schema.tables["event_slot"]["write_disposition"] = "replace" schema._bump_version() @@ -201,6 +241,8 @@ def test_get_update_basic_schema(client: SqlJobClientBaseWithDestinationTestConf naming_conventions=TEST_NAMING_CONVENTIONS, default_sql_configs=True, table_format_filesystem_configs=True, + default_vector_configs=True, + exclude=["weaviate", "qdrant"], ), indirect=True, ids=lambda x: x.name, @@ -276,6 +318,36 @@ def test_schema_update_create_table( _, storage_columns = list(client.get_storage_tables([table_name]))[0] assert len(storage_columns) > 0 + # a refresh that truncates/drops tables enforces the schema update so a table that is missing + # in the destination gets re-created even when the schema hash is already stored + versions_table = client.sql_client.make_qualified_table_name(schema.version_table_name) + version_hash_column = client.sql_client.escape_column_name( + schema.naming.normalize_identifier("version_hash") + ) + + def _version_row_count() -> int: + return len( + list( + client.sql_client.execute_sql( + f"SELECT * FROM {versions_table} WHERE {version_hash_column} = %s", + schema.stored_version_hash, + ) + ) + ) + + rows_before = _version_row_count() + # drop the table directly in the destination, schema (and its hash) stays stored + client.sql_client.drop_tables(table_name) + assert len(list(client.get_storage_tables([table_name]))[0][1]) == 0 + # a plain update skips on a hash match: the table is NOT re-created + client.update_stored_schema() + assert len(list(client.get_storage_tables([table_name]))[0][1]) == 0 + # a forced update re-runs the DDL and re-creates the table + client.update_stored_schema(force=True) + assert 
len(list(client.get_storage_tables([table_name]))[0][1]) > 0 + # ... without writing a duplicate version row + assert _version_row_count() == rows_before + @pytest.mark.parametrize( "client", @@ -317,9 +389,7 @@ def test_schema_update_create_table_bigquery_hidden_dataset( assert storage_columns.keys() == client.schema.tables["_dlt_version"]["columns"].keys() -@pytest.mark.parametrize( - "client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name -) +@sql_client_configs def test_schema_update_alter_table( client: SqlJobClientBaseWithDestinationTestConfiguration, ) -> None: @@ -361,12 +431,7 @@ def test_schema_update_alter_table( assert storage_table_cols["col4"]["data_type"] == "timestamp" -@pytest.mark.parametrize( - "client", - destinations_configs(default_sql_configs=True, table_format_filesystem_configs=True), - indirect=True, - ids=lambda x: x.name, -) +@fs_vector_client_configs def test_drop_tables(client: SqlJobClientBaseWithDestinationTestConfiguration) -> None: schema = client.schema # Add columns in all tables @@ -432,9 +497,7 @@ def test_drop_tables(client: SqlJobClientBaseWithDestinationTestConfiguration) - assert len(rows) == 2 -@pytest.mark.parametrize( - "client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name -) +@sql_client_configs def test_get_storage_table_with_all_types( client: SqlJobClientBaseWithDestinationTestConfiguration, ) -> None: @@ -535,6 +598,8 @@ def _assert_columns_order(sql_: str) -> None: naming_conventions=TEST_NAMING_CONVENTIONS, default_sql_configs=True, table_format_local_configs=True, + default_vector_configs=True, + exclude=["weaviate", "qdrant"], ), indirect=True, ids=lambda x: x.name, @@ -581,9 +646,7 @@ def test_data_writer_load( assert db_row[5] is None -@pytest.mark.parametrize( - "client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name -) +@sql_client_configs def test_data_writer_string_escape( client: 
SqlJobClientBaseWithDestinationTestConfiguration, file_storage: FileStorage ) -> None: @@ -611,12 +674,7 @@ def test_data_writer_string_escape( assert list(db_row) == list(row.values()) -@pytest.mark.parametrize( - "client", - destinations_configs(default_sql_configs=True, table_format_filesystem_configs=True), - indirect=True, - ids=lambda x: x.name, -) +@fs_vector_client_configs def test_data_writer_string_escape_edge( client: SqlJobClientBaseWithDestinationTestConfiguration, file_storage: FileStorage ) -> None: @@ -647,6 +705,8 @@ def test_data_writer_string_escape_edge( naming_conventions=TEST_NAMING_CONVENTIONS, default_sql_configs=True, table_format_filesystem_configs=True, + default_vector_configs=True, + exclude=["weaviate", "qdrant"], ), indirect=True, ids=lambda x: x.name, @@ -734,12 +794,7 @@ def test_load_with_all_types( ("replace", "staging-optimized"), ], ) -@pytest.mark.parametrize( - "client", - destinations_configs(default_sql_configs=True, table_format_filesystem_configs=True), - indirect=True, - ids=lambda x: x.name, -) +@fs_vector_client_configs def test_write_dispositions( client: SqlJobClientBaseWithDestinationTestConfiguration, write_disposition: TWriteDisposition, @@ -753,6 +808,9 @@ def test_write_dispositions( table_name = "event_test_table" + uniq_id() column_schemas, data_row = get_columns_and_row_all_types(client.config) + # add _dlt_load_id that some destinations (ie. 
lancedb) require for merge + column_schemas["_dlt_load_id"] = new_column("_dlt_load_id", "text", nullable=False) + data_row["_dlt_load_id"] = uniq_id() root_table = new_table( table_name, write_disposition=write_disposition, columns=column_schemas.values() ) @@ -862,9 +920,7 @@ def test_write_dispositions( assert db_rows[-1][0] == pk_value -@pytest.mark.parametrize( - "client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name -) +@sql_client_configs def test_get_resumed_job( client: SqlJobClientBaseWithDestinationTestConfiguration, file_storage: FileStorage ) -> None: @@ -905,10 +961,64 @@ def test_get_resumed_job( assert r_job.state() == "ready" +@fs_vector_client_configs +def test_initialize_storage_truncate_tables( + client: SqlJobClientBaseWithDestinationTestConfiguration, file_storage: FileStorage +) -> None: + if not client.capabilities.preferred_loader_file_format: + pytest.skip("preferred loader file format not set, destination will only work with staging") + # this mirrors what a `drop_data` refresh does: truncate the table but keep it (and its + # stored schema) in the destination + user_table_name = prepare_table(client) + load_json = { + "_dlt_id": uniq_id(), + "_dlt_root_id": uniq_id(), + "sender_id": "90238094809sajlkjxoiewjhduuiuehd", + "timestamp": pendulum.now(), + } + with io.BytesIO() as f: + write_dataset( + client, + f, + [load_json], + client.schema.get_table(user_table_name), + file_format=client.destination_config.file_format, + ) + dataset = f.getvalue() + expect_load_file( + client, + file_storage, + dataset, + user_table_name, + file_format=client.destination_config.file_format, + ) + qualified_table = client.sql_client.make_qualified_table_name(user_table_name) + assert len(list(client.sql_client.execute_sql(f"SELECT * FROM {qualified_table}"))) == 1 + stored_before = client.get_stored_schema_by_hash(client.schema.stored_version_hash) + assert stored_before is not None + + # truncate the table - the 
data is removed but the stored schema is untouched + client.initialize_storage(truncate_tables=[user_table_name]) + if client.config.destination_type == "filesystem": + # filesystem table formats cannot distinguish truncate from drop - the whole table is + # removed (no columns reported), so we cannot query it + assert len(list(client.get_storage_tables([user_table_name]))[0][1]) == 0 + else: + # sql destinations keep the (now empty) table + assert len(list(client.sql_client.execute_sql(f"SELECT * FROM {qualified_table}"))) == 0 + assert len(list(client.get_storage_tables([user_table_name]))[0][1]) > 0 + stored_after = client.get_stored_schema_by_hash(client.schema.stored_version_hash) + assert stored_after is not None + assert stored_after.version_hash == stored_before.version_hash + + @pytest.mark.parametrize( "destination_config", destinations_configs( - default_sql_configs=True, table_format_filesystem_configs=True, exclude=["dremio"] + default_sql_configs=True, + table_format_filesystem_configs=True, + default_vector_configs=True, + exclude=["dremio", "weaviate", "qdrant"], ), ids=lambda x: x.name, ) @@ -1127,6 +1237,7 @@ def _load_something(_client: SqlJobClientBase, expected_rows: int) -> None: or "NOT NULL" in str(py_ex.value) or "Adding columns with constraints not yet supported" in str(py_ex.value) or "Only nullable columns can be added" in str(py_ex.value) # Fabric Warehouse + or "must be nullable" in str(py_ex.value).lower() # lance / lancedb ) diff --git a/tests/load/test_read_interfaces.py b/tests/load/test_read_interfaces.py index c79ee39e68..987366f234 100644 --- a/tests/load/test_read_interfaces.py +++ b/tests/load/test_read_interfaces.py @@ -1331,14 +1331,8 @@ def test_ibis_dataset_access(populated_pipeline: Pipeline) -> None: table_name_prefix = dataset_name + "___" dataset_name = None additional_tables += ["dlt_sentinel_table"] - - # filesystem uses duckdb and views to map know tables. 
for other ibis will list - # all available tables so both schemas tables are visible - if populated_pipeline.destination.destination_type not in [ - "dlt.destinations.lancedb", - ]: - # from aleph schema - additional_tables += ["digits"] + # from aleph schema + additional_tables += ["digits"] add_table_prefix = lambda x: table_name_prefix + x diff --git a/tests/normalize/test_normalize.py b/tests/normalize/test_normalize.py index e3714f4d98..df5e221345 100644 --- a/tests/normalize/test_normalize.py +++ b/tests/normalize/test_normalize.py @@ -398,6 +398,103 @@ def test_normalize_many_packages( assert set(schemas) == set(["ethereum", "event"]) +def test_normalize_drops_empty_package_keeps_refresh(raw_normalize: Normalize) -> None: + extract_storage = ExtractStorage(raw_normalize.normalize_storage.config) + + # truly empty package: no items, no refresh commands + empty_schema = Schema("empty_pkg") + empty_load_id = extract_storage.create_load_package(empty_schema) + extract_storage.close_writers(empty_load_id) + extract_storage.commit_new_load_package(empty_load_id, empty_schema) + + # refresh package: carries a truncate command but no data items + refresh_schema = Schema("refresh_pkg") + refresh_load_id = extract_storage.create_load_package(refresh_schema) + state = extract_storage.new_packages.get_load_package_state(refresh_load_id) + state["truncated_tables"] = [new_table("items")] + extract_storage.new_packages.save_load_package_state(refresh_load_id, state) + extract_storage.close_writers(refresh_load_id) + extract_storage.commit_new_load_package(refresh_load_id, refresh_schema) + + extracted = raw_normalize.normalize_storage.extracted_packages + assert extracted.is_empty_package(empty_load_id) is True + assert extracted.is_empty_package(refresh_load_id) is False + + raw_normalize.run(None) + + normalized = raw_normalize.load_storage.list_normalized_packages() + # empty package was dropped at normalize and never reached the load step + assert empty_load_id not in 
normalized + assert not extracted.storage.has_folder(empty_load_id) + # refresh package was normalized and its truncate command survived into the load package + assert refresh_load_id in normalized + refresh_state = raw_normalize.load_storage.normalized_packages.get_load_package_state( + refresh_load_id + ) + assert len(refresh_state["truncated_tables"]) == 1 + + +@pytest.mark.parametrize("caps", INSERT_CAPS + JSONL_CAPS, indirect=True) +@pytest.mark.parametrize("write_disposition", ["append", "replace", "merge"]) +def test_normalize_contract_discard_all_rows_writes_empty_job( + caps: DestinationCapabilitiesContext, write_disposition: str, raw_normalize: Normalize +) -> None: + """When a `columns: discard_row` contract eliminates ALL rows of a root table, an empty job is + still written (so a `replace` table is truncated) with a post-filter row count of 0 - for every + write disposition and output writer.""" + schema = Schema("contracts") + schema.update_table( + new_table( + "items", + write_disposition=write_disposition, # type: ignore[arg-type] + schema_contract={"columns": "discard_row"}, + columns=[{"name": "id", "data_type": "bigint", "nullable": True}], + ) + ) + # every row carries a NEW column `name` so `discard_row` drops all of them + items = [{"id": i, "name": f"n{i}"} for i in range(5)] + load_id = extract_items(raw_normalize.normalize_storage, items, schema, "items") + normalize_pending(raw_normalize) + + # an empty root-table job is written, physically present, and the package is retained + files = raw_normalize.load_storage.list_new_jobs(load_id) + assert [ParsedLoadJobFileName.parse(f).table_name for f in files] == ["items"] + assert raw_normalize.load_storage.normalized_packages.storage.has_file(files[0]) + assert load_id in raw_normalize.load_storage.list_normalized_packages() + # per-job row count is the post-filter value: 0 + step_info = raw_normalize.get_step_info(MockPipeline("contracts_pipeline", True)) # type: ignore[abstract] + assert 
step_info.row_counts["items"] == 0 + assert step_info.metrics[load_id][0]["table_metrics"]["items"].items_count == 0 + + +@pytest.mark.parametrize("caps", INSERT_CAPS + JSONL_CAPS, indirect=True) +def test_normalize_empty_extracted_file_writes_empty_job( + caps: DestinationCapabilitiesContext, raw_normalize: Normalize +) -> None: + schema = Schema("empties") + schema.update_table( + new_table( + "items", + write_disposition="replace", + columns=[{"name": "id", "data_type": "bigint", "nullable": True}], + ) + ) + extractor = ExtractStorage(raw_normalize.normalize_storage.config) + load_id = extractor.create_load_package(schema) + extractor.item_storages["object"].write_empty_items_file( + load_id, schema.name, "items", schema.get_table_columns("items") + ) + extractor.close_writers(load_id) + extractor.commit_new_load_package(load_id, schema) + + normalize_pending(raw_normalize) + files = raw_normalize.load_storage.list_new_jobs(load_id) + assert [ParsedLoadJobFileName.parse(f).table_name for f in files] == ["items"] + assert raw_normalize.load_storage.normalized_packages.storage.has_file(files[0]) + step_info = raw_normalize.get_step_info(MockPipeline("empties_pipeline", True)) # type: ignore[abstract] + assert step_info.metrics[load_id][0]["table_metrics"]["items"].items_count == 0 + + @pytest.mark.parametrize("caps", ALL_CAPABILITIES, indirect=True) def test_normalize_typed_json( caps: DestinationCapabilitiesContext, raw_normalize: Normalize diff --git a/tests/normalize/test_normalize_arrow.py b/tests/normalize/test_normalize_arrow.py new file mode 100644 index 0000000000..fe592a7172 --- /dev/null +++ b/tests/normalize/test_normalize_arrow.py @@ -0,0 +1,114 @@ +"""Arrow-input normalize tests. Kept in a separate module because pyarrow is an optional +dependency: the minimal `test-common-core` CI run (which executes tests/normalize) has no pyarrow, +so this module is ignored there and run by `test-pipeline-arrow` instead. 
Test names contain +`arrow` so the `-k arrow` filter selects them. +""" +from typing import Any, Iterator + +import pytest + +from dlt.common.configuration.container import Container +from dlt.common.destination import DestinationCapabilitiesContext +from dlt.common.schema.schema import Schema +from dlt.common.schema.utils import new_table +from dlt.common.storages import NormalizeStorage, ParsedLoadJobFileName + +from dlt.extract.extract import ExtractStorage +from dlt.normalize import Normalize + +from dlt.destinations import duckdb + +from tests.utils import MockPipeline +from tests.normalize.utils import INSERT_CAPS, JSONL_CAPS + +# reuse fixtures and helpers from the main normalize test module +from tests.normalize.test_normalize import ( # noqa: F401 + raw_normalize, + caps, + default_caps, + logger_autouse, + normalize_pending, +) + +pyarrow = pytest.importorskip("pyarrow") + +pytestmark = pytest.mark.serial + + +def extract_arrow_items( + normalize_storage: NormalizeStorage, arrow_table: Any, schema: Schema, table_name: str +) -> str: + extractor = ExtractStorage(normalize_storage.config) + load_id = extractor.create_load_package(schema) + extractor.item_storages["arrow"].write_data_item( + load_id, + schema.name, + table_name, + arrow_table, + schema.get_table_columns(table_name, include_incomplete=True), + ) + extractor.close_writers(load_id) + extractor.commit_new_load_package(load_id, schema) + return load_id + + +def _items_schema(write_disposition: str = "replace") -> Schema: + schema = Schema("arrow_empties") + schema.update_table( + new_table( + "items", + write_disposition=write_disposition, # type: ignore[arg-type] + columns=[{"name": "id", "data_type": "bigint", "nullable": True}], + ) + ) + return schema + + +@pytest.fixture +def parquet_caps() -> Iterator[DestinationCapabilitiesContext]: + # parquet output: bypass the `caps` fixture override that rewrites parquet->jsonl + _caps = duckdb().capabilities() + _caps.preferred_loader_file_format = 
"parquet" + with Container().injectable_context(_caps): + yield _caps + + +@pytest.mark.parametrize("caps", INSERT_CAPS + JSONL_CAPS, indirect=True) +def test_normalize_empty_arrow_input_writes_empty_job( + caps: DestinationCapabilitiesContext, raw_normalize: Normalize +) -> None: + """An empty arrow input file yields an empty root-table job (row count 0) for jsonl/insert + output writers.""" + schema = _items_schema() + empty = pyarrow.table({"id": pyarrow.array([], type=pyarrow.int64())}) + load_id = extract_arrow_items(raw_normalize.normalize_storage, empty, schema, "items") + normalize_pending(raw_normalize) + + # one or more empty `items` jobs (some writers emit both a streamed-empty and an explicit + # empty file); all are physically present and the aggregate row count is 0 + files = raw_normalize.load_storage.list_new_jobs(load_id) + assert {ParsedLoadJobFileName.parse(f).table_name for f in files} == {"items"} + storage = raw_normalize.load_storage.normalized_packages.storage + assert all(storage.has_file(f) for f in files) + step_info = raw_normalize.get_step_info(MockPipeline("arrow_empty_pipeline", True)) # type: ignore[abstract] + assert step_info.metrics[load_id][0]["table_metrics"]["items"].items_count == 0 + + +def test_normalize_empty_arrow_input_parquet_output( + parquet_caps: DestinationCapabilitiesContext, raw_normalize: Normalize +) -> None: + """An empty arrow input file yields an empty root-table job with a physically present empty + parquet file (num_rows == 0).""" + schema = _items_schema() + empty = pyarrow.table({"id": pyarrow.array([], type=pyarrow.int64())}) + load_id = extract_arrow_items(raw_normalize.normalize_storage, empty, schema, "items") + normalize_pending(raw_normalize) + + files = raw_normalize.load_storage.list_new_jobs(load_id) + assert {ParsedLoadJobFileName.parse(f).table_name for f in files} == {"items"} + storage = raw_normalize.load_storage.normalized_packages.storage + # every emitted parquet job is physically present 
with a header and zero rows + assert all(storage.has_file(f) for f in files) + assert all(pyarrow.parquet.read_table(storage.make_full_path(f)).num_rows == 0 for f in files) + step_info = raw_normalize.get_step_info(MockPipeline("arrow_parquet_pipeline", True)) # type: ignore[abstract] + assert step_info.metrics[load_id][0]["table_metrics"]["items"].items_count == 0 diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py index 863fe61ddc..c6b589f61b 100644 --- a/tests/pipeline/test_pipeline.py +++ b/tests/pipeline/test_pipeline.py @@ -46,7 +46,12 @@ from dlt.common.runtime.collector import DictCollector, LogCollector from dlt.common.schema.exceptions import TableIdentifiersFrozen from dlt.common.schema.typing import TColumnSchema -from dlt.common.schema.utils import get_first_column_name_with_prop, new_column, new_table +from dlt.common.schema.utils import ( + get_first_column_name_with_prop, + is_nested_table, + new_column, + new_table, +) from dlt.common.typing import DictStrAny, TDataItems from dlt.common.utils import uniq_id from dlt.common.warnings import DltDeprecationWarning @@ -3608,7 +3613,6 @@ def test_yielding_empty_list_creates_table() -> None: assert rows[0] == (1, None) -@pytest.mark.skip(reason="introduced by #3901; temporarily disabled") @pytest.mark.parametrize( "yield_one,yield_two", [(True, False), (False, True), (False, False), (True, True)], @@ -3620,12 +3624,13 @@ def test_materialize_table_schema_multi_table_duckdb(yield_one: bool, yield_two: variants are pre-declared with `materialize_table_schema()`. 
""" + # non-normalized table names so the empty-table handling is exercised with normalized identifiers @dlt.resource def multi_table(): yield dlt.mark.with_hints( dlt.mark.materialize_table_schema(), dlt.mark.make_hints( - table_name="table_one", + table_name="TableOne", write_disposition="replace", columns={"col_one": {"data_type": "text"}}, ), @@ -3634,16 +3639,16 @@ def multi_table(): yield dlt.mark.with_hints( dlt.mark.materialize_table_schema(), dlt.mark.make_hints( - table_name="table_two", + table_name="TableTwo", write_disposition="replace", columns={"col_two": {"data_type": "bigint"}}, ), create_table_variant=True, ) if yield_one: - yield dlt.mark.with_table_name({"col_one": "val"}, table_name="table_one") + yield dlt.mark.with_table_name({"col_one": "val"}, table_name="TableOne") if yield_two: - yield dlt.mark.with_table_name({"col_two": 5}, table_name="table_two") + yield dlt.mark.with_table_name({"col_two": 5}, table_name="TableTwo") pipeline = dlt.pipeline( pipeline_name="materialize_multi_e2e_" + uniq_id(), @@ -3653,6 +3658,7 @@ def multi_table(): load_info = pipeline.run(multi_table()) assert_load_info(load_info) + # tables are materialized under their normalized names expected = { "table_one": 1 if yield_one else 0, "table_two": 1 if yield_two else 0, @@ -3664,24 +3670,20 @@ def multi_table(): assert "col_two" in schema_tables["table_two"]["columns"] -@pytest.mark.skip(reason="introduced by #3901; temporarily disabled") def test_materialize_table_schema_with_nested_hints_duckdb() -> None: - """Pre-declared nested table via `nested_hints` is added to the schema but does NOT - materialize at the destination when only the root yields `materialize_table_schema()`. - The normalizer attaches parent/child linking columns only when real nested data flows - through it, so an empty nested table cannot be meaningfully created up front. 
- """ + """Nested tables cannot be materialized in advance""" + # non-normalized table, column and nested-table names exercised through normalization @dlt.resource( - name="users", + name="Users", write_disposition="replace", columns=[ - {"name": "id", "data_type": "bigint", "nullable": False}, - {"name": "name", "data_type": "text"}, + {"name": "Id", "data_type": "bigint", "nullable": False}, + {"name": "Name", "data_type": "text"}, ], nested_hints={ - "purchases": dlt.mark.make_nested_hints( - columns=[{"name": "price", "data_type": "decimal"}], + "Purchases": dlt.mark.make_nested_hints( + columns=[{"name": "Price", "data_type": "decimal"}], ), }, ) @@ -3713,6 +3715,167 @@ def users_with_nested(): assert not table_exists(pipeline, "users__purchases") +def test_replace_empty_resource_truncates_variant_tables() -> None: + """When a replace resource yields no data, everything belonging to it is truncated - the root + table and its table variants alike (the tables are emptied, not dropped).""" + os.environ["DATA_WRITER__DISABLE_COMPRESSION"] = "TRUE" + + def items_resource(emit: bool) -> DltResource: + @dlt.resource(name="items", write_disposition="replace", primary_key="id") + def items() -> Any: + if emit: + yield {"id": 1, "name": "root"} + # a table variant with its own data + yield dlt.mark.with_hints( + {"id": 2, "name": "variant"}, + dlt.mark.make_hints(table_name="other_items"), + create_table_variant=True, + ) + + return items + + pipeline = dlt.pipeline( + pipeline_name="replace_truncate_variant_" + uniq_id(), destination="duckdb", dev_mode=True + ) + pipeline.run(items_resource(True)) + assert load_table_counts(pipeline, "items", "other_items") == {"items": 1, "other_items": 1} + + # the resource yields no data: the root and the variant are both truncated + pipeline.run(items_resource(False)) + assert load_table_counts(pipeline, "items", "other_items") == {"items": 0, "other_items": 0} + + +def test_replace_empty_resource_keeps_append_pseudo_root() -> 
None: + """When a replace resource yields no data, its root (and variant root) are truncated, but a + pseudo-root (nested table broken out by a primary key) whose own write disposition differs from + the root is NOT truncated""" + os.environ["DATA_WRITER__DISABLE_COMPRESSION"] = "TRUE" + + def items_resource(emit: bool) -> DltResource: + @dlt.resource( + name="items", + write_disposition="replace", + primary_key="id", + nested_hints={ + "sub_items": dlt.mark.make_nested_hints( + primary_key="id", write_disposition="append" + ) + }, + ) + def items() -> Any: + variant_data: Any + if emit: + yield {"id": 1, "sub_items": [{"id": 101}]} + variant_data = {"id": 2, "sub_items": [{"id": 201}]} + else: + variant_data = [] + # a variant inherits the nesting-breaking nested hints + yield dlt.mark.with_hints( + variant_data, + dlt.mark.make_hints(table_name="other_items"), + create_table_variant=True, + ) + + return items + + pipeline = dlt.pipeline( + pipeline_name="replace_truncate_pseudo_" + uniq_id(), destination="duckdb", dev_mode=True + ) + pipeline.run(items_resource(True)) + # the nested tables are broken out into pseudo-roots + assert is_nested_table(pipeline.default_schema.tables["items__sub_items"]) is False + assert is_nested_table(pipeline.default_schema.tables["other_items__sub_items"]) is False + assert load_table_counts( + pipeline, "items", "items__sub_items", "other_items", "other_items__sub_items" + ) == {"items": 1, "items__sub_items": 1, "other_items": 1, "other_items__sub_items": 1} + + # empty run: the replace roots (default and variant) are emptied, but their append pseudo-roots + # are kept - we cannot re-derive a pseudo-root's effective disposition, so it is not truncated + pipeline.run(items_resource(False)) + assert load_table_counts( + pipeline, "items", "items__sub_items", "other_items", "other_items__sub_items" + ) == {"items": 0, "items__sub_items": 1, "other_items": 0, "other_items__sub_items": 1} + + +def 
test_replace_keeps_pseudo_root_when_root_has_data() -> None: + """A pseudo-root must not be force-truncated when its real root received data this run.""" + os.environ["DATA_WRITER__DISABLE_COMPRESSION"] = "TRUE" + + def items_resource(with_sub: bool) -> DltResource: + @dlt.resource( + name="items", + write_disposition="replace", + primary_key="id", + nested_hints={ + "sub_items": dlt.mark.make_nested_hints( + primary_key="id", write_disposition="append" + ) + }, + ) + def items() -> Any: + if with_sub: + yield {"id": 1, "sub_items": [{"id": 101}, {"id": 102}]} + else: + # the root receives data but the pseudo-root receives none + yield {"id": 1, "sub_items": []} + + return items + + pipeline = dlt.pipeline( + pipeline_name="replace_pseudo_root_kept_" + uniq_id(), destination="duckdb", dev_mode=True + ) + pipeline.run(items_resource(True)) + assert load_table_counts(pipeline, "items", "items__sub_items") == { + "items": 1, + "items__sub_items": 2, + } + + # the root received data, so the pseudo-root must not be force-truncated via an empty file + pipeline.run(items_resource(False)) + assert load_table_counts(pipeline, "items", "items__sub_items") == { + "items": 1, + "items__sub_items": 2, + } + + +@pytest.mark.parametrize("dispatch", ["dynamic", "marked"]) +def test_replace_event_dispatch_truncates_missing_table(dispatch: str) -> None: + """All dispatched tables will be truncated on write disposition replace + including those that didn't get data on replace + """ + os.environ["DATA_WRITER__DISABLE_COMPRESSION"] = "TRUE" + + if dispatch == "dynamic": + + @dlt.resource(name="events", table_name=lambda e: e["type"], primary_key="id") + def events(types: Any) -> Any: + for idx, type_ in enumerate(types): + yield {"id": idx, "type": type_} + + else: + + @dlt.resource(name="events", primary_key="id") + def events(types: Any) -> Any: + for idx, type_ in enumerate(types): + yield dlt.mark.with_table_name({"id": idx, "type": type_}, type_) + + pipeline = dlt.pipeline( + 
pipeline_name="event_dispatch_replace_" + uniq_id(), destination="duckdb", dev_mode=True + ) + # 1. start with merge, creating two tables + pipeline.run(events(["a", "b"]), write_disposition="merge") + assert load_table_counts(pipeline, "a", "b") == {"a": 1, "b": 1} + assert pipeline.default_schema.tables["b"]["write_disposition"] == "merge" + + # 2. switch to replace, with data only for table "a" + pipeline.run(events(["a"]), write_disposition="replace") + # "a" received data and is replaced with the new row + assert load_tables_to_dicts(pipeline, "a")["a"][0]["id"] == 0 + # missing "b" is refreshed to replace and truncated even though it received no data + assert pipeline.default_schema.tables["b"]["write_disposition"] == "replace" + assert load_table_counts(pipeline, "a", "b") == {"a": 1, "b": 0} + + @pytest.mark.parametrize( "local_path_kind", ( diff --git a/tests/pipeline/test_schema_contracts.py b/tests/pipeline/test_schema_contracts.py index 3a24a7cce3..e7c5079078 100644 --- a/tests/pipeline/test_schema_contracts.py +++ b/tests/pipeline/test_schema_contracts.py @@ -1,4 +1,4 @@ -import dlt, os, pytest +import dlt, pytest import contextlib from typing import Any, Callable, ClassVar, Dict, Iterator, Literal, Union, Optional, Type @@ -1119,6 +1119,36 @@ def get_items_v2(): assert "email" not in table["columns"] +def test_replace_contract_discard_all_rows_truncates() -> None: + pipeline = get_pipeline() + + @dlt.resource( + name="items", + write_disposition="replace", + columns={"id": {"data_type": "bigint"}}, + schema_contract={"columns": "discard_row"}, + ) + def items(with_new_column: bool) -> Any: + # with_new_column adds a NEW column `extra` to every row so discard_row drops them all + if with_new_column: + yield from [{"id": i, "extra": i} for i in range(5)] + else: + yield from [{"id": i} for i in range(5)] + + # run 1: rows survive (no new column) - table is populated + info = pipeline.run(items(False)) + assert_load_info(info) + assert 
load_table_counts(pipeline)["items"] == 5 + + # run 2: every row is dropped by the contract -> empty job -> replace truncates the table + info = pipeline.run(items(True)) + assert_load_info(info) + assert pipeline.last_trace.last_normalize_info.row_counts["items"] == 0 + assert load_table_counts(pipeline).get("items", 0) == 0 + # the new column was blocked by the contract + assert "extra" not in pipeline.default_schema.get_table("items")["columns"] + + @pytest.mark.parametrize("contract_setting", ["freeze", "discard_value", "discard_row"]) def test_pydantic_model_forbid_extra_evolve_on_existing_table( contract_setting: TSchemaEvolutionMode,