From 15487b1618de922205cb068cffc7d6d5bb340118 Mon Sep 17 00:00:00 2001 From: jgilber2 Date: Mon, 4 May 2026 14:20:01 -0700 Subject: [PATCH 01/53] Added cohort definition sets and cohort generation features --- circe/api.py | 7 + circe/cohort_definition_set/__init__.py | 34 ++ .../cohort_definition_set/_checksum_store.py | 118 ++++++ circe/cohort_definition_set/_core.py | 100 +++++ circe/cohort_definition_set/_generate.py | 176 +++++++++ tests/test_cohort_definition_set.py | 366 ++++++++++++++++++ 6 files changed, 801 insertions(+) create mode 100644 circe/cohort_definition_set/__init__.py create mode 100644 circe/cohort_definition_set/_checksum_store.py create mode 100644 circe/cohort_definition_set/_core.py create mode 100644 circe/cohort_definition_set/_generate.py create mode 100644 tests/test_cohort_definition_set.py diff --git a/circe/api.py b/circe/api.py index 4c8f3a5..6645907 100644 --- a/circe/api.py +++ b/circe/api.py @@ -11,6 +11,13 @@ from typing import TYPE_CHECKING, Any, Literal, Optional +from .cohort_definition_set import ( # noqa: F401 + CohortDefinition, + CohortDefinitionSet, + CohortGenerationResult, + generate_cohort_set, + summarise_generation_results, +) from .cohortdefinition import ( BuildExpressionQueryOptions, CohortExpression, diff --git a/circe/cohort_definition_set/__init__.py b/circe/cohort_definition_set/__init__.py new file mode 100644 index 0000000..79e9b90 --- /dev/null +++ b/circe/cohort_definition_set/__init__.py @@ -0,0 +1,34 @@ +"""CohortDefinitionSet — batch cohort generation with incremental caching. + +This module provides the Python equivalent of OHDSI/CohortGenerator's +CohortDefinitionSet: a typed container for multiple cohort definitions that can +be generated simultaneously against an ibis backend, with optional checksum-based +incremental skipping. + +Example: + >>> from circe.cohort_definition_set import ( + ... CohortDefinitionSet, + ... generate_cohort_set, + ... 
def load_checksums(
    backend: IbisBackendLike,
    *,
    schema: str | None,
    table_name: str,
) -> dict[int, str]:
    """Load stored checksums from the checksum table.

    Returns a mapping of cohort_id -> checksum for the most recently recorded
    completed generation of each cohort. Returns an empty dict if the table
    does not yet exist or holds no usable rows.

    Args:
        backend: Ibis backend connection.
        schema: Schema/database where the checksum table lives.
        table_name: Name of the checksum table.

    Returns:
        dict mapping cohort_id (int) -> checksum (str).
    """
    if not table_exists(backend, table_name=table_name, schema=schema):
        return {}

    rows = read_table(backend, table_name=table_name, schema=schema).execute()

    # A row without an id or a checksum can never match a cohort in the
    # current set; dropping it keeps one malformed row from aborting the
    # whole incremental run with a ValueError in int(NaN).
    rows = rows.dropna(subset=["cohort_definition_id", "checksum"])
    if rows.empty:
        return {}

    # In case there are multiple rows per cohort (shouldn't happen but be
    # safe), keep the most recent by generation_end_time.
    if "generation_end_time" in rows.columns:
        rows = rows.sort_values("generation_end_time", ascending=False)
    rows = rows.drop_duplicates(subset=["cohort_definition_id"], keep="first")

    # Zip the two columns directly instead of materialising a Series per row
    # with iterrows().
    return {
        int(cohort_id): str(checksum)
        for cohort_id, checksum in zip(rows["cohort_definition_id"], rows["checksum"])
    }
+ """ + if not completed: + return + + import ibis + import pandas as pd + + new_rows_df = pd.DataFrame( + [ + { + "cohort_definition_id": cohort_id, + "checksum": checksum, + "generation_end_time": end_time, + } + for cohort_id, (checksum, end_time) in completed.items() + ] + ) + new_rows_df["cohort_definition_id"] = new_rows_df["cohort_definition_id"].astype("int64") + new_rows_df["checksum"] = new_rows_df["checksum"].astype(str) + new_rows_df["generation_end_time"] = pd.to_datetime(new_rows_df["generation_end_time"]) + + new_relation = ibis.memtable(new_rows_df) + + if not table_exists(backend, table_name=table_name, schema=schema): + create_table(backend, table_name=table_name, schema=schema, obj=new_relation, overwrite=False) + return + + # Merge: keep existing rows for cohorts NOT in this batch, union new rows. + existing = read_table(backend, table_name=table_name, schema=schema) + updated_ids = list(completed.keys()) + filtered_existing = existing.filter( + ~existing.cohort_definition_id.cast("int64").isin( + [ibis.literal(int(i), type="int64") for i in updated_ids] + ) + ) + # Cast new rows to match the existing table's timestamp type to avoid union schema conflicts. 
+ ts_type = existing.schema()["generation_end_time"] + new_relation = new_relation.mutate(generation_end_time=new_relation.generation_end_time.cast(ts_type)) + merged = filtered_existing.union(new_relation, distinct=False) + create_table(backend, table_name=table_name, schema=schema, obj=merged, overwrite=True) diff --git a/circe/cohort_definition_set/_core.py b/circe/cohort_definition_set/_core.py new file mode 100644 index 0000000..ddc4081 --- /dev/null +++ b/circe/cohort_definition_set/_core.py @@ -0,0 +1,100 @@ +from __future__ import annotations + +from collections.abc import Iterator +from dataclasses import dataclass, field +from datetime import datetime +from typing import TYPE_CHECKING, Literal + +if TYPE_CHECKING: + from ..cohortdefinition.cohort import CohortExpression +else: + from ..cohortdefinition.cohort import CohortExpression + + +@dataclass +class CohortDefinition: + """A single cohort entry in a CohortDefinitionSet.""" + + cohort_id: int + cohort_name: str + expression: CohortExpression + + +@dataclass +class CohortGenerationResult: + """Result of generating a single cohort from a CohortDefinitionSet.""" + + cohort_id: int + cohort_name: str + status: Literal["COMPLETE", "SKIPPED", "FAILED"] + checksum: str + start_time: datetime + end_time: datetime + error: Exception | None = field(default=None, compare=False) + + +class CohortDefinitionSet: + """Container for a collection of cohort definitions to be generated together. + + Modelled after OHDSI/CohortGenerator's CohortDefinitionSet, but using typed + Python classes rather than an R data.frame. 
+ + Example: + >>> cds = CohortDefinitionSet() + >>> cds.add(cohort_id=1, cohort_name="Diabetes", expression=expr1) + >>> cds.add(cohort_id=2, cohort_name="Hypertension", expression=expr2) + >>> len(cds) + 2 + """ + + def __init__(self) -> None: + self._cohorts: list[CohortDefinition] = [] + self._id_index: dict[int, int] = {} # cohort_id -> list index + + def add(self, cohort_id: int, cohort_name: str, expression: CohortExpression) -> None: + """Add a cohort definition to this set. + + Args: + cohort_id: Unique integer identifier for this cohort. + cohort_name: Human-readable name for this cohort. + expression: The CohortExpression defining the cohort logic. + + Raises: + ValueError: If a cohort with the same cohort_id already exists. + """ + if cohort_id in self._id_index: + raise ValueError( + f"A cohort with cohort_id={cohort_id} already exists in this CohortDefinitionSet." + ) + self._id_index[cohort_id] = len(self._cohorts) + self._cohorts.append( + CohortDefinition(cohort_id=cohort_id, cohort_name=cohort_name, expression=expression) + ) + + def __len__(self) -> int: + return len(self._cohorts) + + def __iter__(self) -> Iterator[CohortDefinition]: + return iter(self._cohorts) + + def __getitem__(self, cohort_id: int) -> CohortDefinition: + """Retrieve a cohort definition by its cohort_id. + + Raises: + KeyError: If no cohort with the given id exists. + """ + if cohort_id not in self._id_index: + raise KeyError(f"No cohort with cohort_id={cohort_id} in this CohortDefinitionSet.") + return self._cohorts[self._id_index[cohort_id]] + + def checksums(self) -> dict[int, str]: + """Return a mapping of cohort_id to the expression checksum for each cohort. + + Checksums are computed using CohortExpression.checksum(), which normalises + the expression JSON and produces a SHA-256 hex digest. This is suitable for + detecting whether a cohort definition has changed between runs. 
def generate_cohort_set(
    cohort_definition_set: CohortDefinitionSet,
    *,
    backend: IbisBackendLike,
    cdm_schema: str,
    cohort_table: str,
    results_schema: str | None = None,
    vocabulary_schema: str | None = None,
    incremental: bool = False,
    checksum_table: str = "cohort_checksum",
    stop_on_error: bool = True,
) -> list[CohortGenerationResult]:
    """Generate all cohorts in a CohortDefinitionSet and write them to a shared table.

    This is the Python equivalent of OHDSI/CohortGenerator's ``generateCohortSet()``.
    Each cohort is written to ``cohort_table`` with its ``cohort_id`` stamped into
    ``cohort_definition_id``. If the table already contains rows for a cohort, they
    are replaced (``if_exists="replace"`` semantics from ``write_cohort``).

    When ``incremental=True``, cohorts whose expression checksum matches the stored
    value in ``checksum_table`` are skipped. Successfully completed cohorts have
    their checksums persisted to ``checksum_table`` so future runs can detect them;
    this also happens when the run is aborted by a failure with
    ``stop_on_error=True``, so work finished before the failure is not repeated.

    Args:
        cohort_definition_set: The set of cohort definitions to generate.
        backend: Ibis backend connection pointing at the target database.
        cdm_schema: Schema containing the OMOP CDM source tables.
        cohort_table: Name of the OHDSI cohort table to write results into.
        results_schema: Optional schema for both the cohort table and checksum table.
        vocabulary_schema: Optional schema for vocabulary tables (defaults to cdm_schema).
        incremental: If True, skip cohorts whose expression checksum is unchanged
            since the last successful generation.
        checksum_table: Name of the table used to persist checksums for incremental
            runs. Defaults to ``"cohort_checksum"``.
        stop_on_error: If True (default), raise the first ExecutionError encountered
            and stop processing remaining cohorts. If False, record the failure and
            continue.

    Returns:
        A list of :class:`CohortGenerationResult` — one entry per cohort in the
        set, in insertion order.

    Raises:
        ExecutionError: If a cohort fails to generate and ``stop_on_error=True``.

    Example:
        >>> cds = CohortDefinitionSet()
        >>> cds.add(1, "Diabetes", expr1)
        >>> cds.add(2, "Hypertension", expr2)
        >>> results = generate_cohort_set(
        ...     cds,
        ...     backend=conn,
        ...     cdm_schema="main",
        ...     cohort_table="cohort",
        ...     incremental=True,
        ... )
        >>> for r in results:
        ...     print(r.cohort_name, r.status)
    """
    current_checksums = cohort_definition_set.checksums()

    previous_checksums: dict[int, str] = {}
    if incremental:
        previous_checksums = load_checksums(
            backend,
            schema=results_schema,
            table_name=checksum_table,
        )

    results: list[CohortGenerationResult] = []
    completed_this_run: dict[int, tuple[str, datetime]] = {}

    try:
        for cohort in cohort_definition_set:
            current_checksum = current_checksums[cohort.cohort_id]

            if incremental and previous_checksums.get(cohort.cohort_id) == current_checksum:
                # A skip does no work, so start and end share one timestamp.
                skipped_at = datetime.now()
                results.append(
                    CohortGenerationResult(
                        cohort_id=cohort.cohort_id,
                        cohort_name=cohort.cohort_name,
                        status="SKIPPED",
                        checksum=current_checksum,
                        start_time=skipped_at,
                        end_time=skipped_at,
                    )
                )
                continue

            start_time = datetime.now()
            try:
                write_cohort(
                    cohort.expression,
                    backend=backend,
                    cdm_schema=cdm_schema,
                    cohort_table=cohort_table,
                    cohort_id=cohort.cohort_id,
                    results_schema=results_schema,
                    vocabulary_schema=vocabulary_schema,
                    if_exists="replace",
                )
            except ExecutionError as exc:
                end_time = datetime.now()
                results.append(
                    CohortGenerationResult(
                        cohort_id=cohort.cohort_id,
                        cohort_name=cohort.cohort_name,
                        status="FAILED",
                        checksum=current_checksum,
                        start_time=start_time,
                        end_time=end_time,
                        error=exc,
                    )
                )
                if stop_on_error:
                    raise
                continue

            end_time = datetime.now()
            results.append(
                CohortGenerationResult(
                    cohort_id=cohort.cohort_id,
                    cohort_name=cohort.cohort_name,
                    status="COMPLETE",
                    checksum=current_checksum,
                    start_time=start_time,
                    end_time=end_time,
                )
            )
            completed_this_run[cohort.cohort_id] = (current_checksum, end_time)
    finally:
        # Persist in `finally` so that cohorts written before an aborting
        # failure keep their checksums; otherwise the next incremental run
        # would regenerate them even though their rows are already in place.
        if incremental and completed_this_run:
            save_checksums(
                backend,
                schema=results_schema,
                table_name=checksum_table,
                completed=completed_this_run,
            )

    return results
"FAILED"], int]: + """Return a count summary of generation results by status. + + Args: + results: List of CohortGenerationResult from generate_cohort_set. + + Returns: + dict with counts for each status, e.g. {"COMPLETE": 2, "SKIPPED": 1, "FAILED": 0}. + """ + counts: dict[Literal["COMPLETE", "SKIPPED", "FAILED"], int] = { + "COMPLETE": 0, + "SKIPPED": 0, + "FAILED": 0, + } + for r in results: + counts[r.status] += 1 + return counts diff --git a/tests/test_cohort_definition_set.py b/tests/test_cohort_definition_set.py new file mode 100644 index 0000000..cd18148 --- /dev/null +++ b/tests/test_cohort_definition_set.py @@ -0,0 +1,366 @@ +"""Tests for CohortDefinitionSet and generate_cohort_set.""" + +from __future__ import annotations + +import pytest + +from circe.cohort_definition_set import ( + CohortDefinition, + CohortDefinitionSet, + CohortGenerationResult, + generate_cohort_set, + summarise_generation_results, +) +from circe.cohortdefinition import CohortExpression, ConditionOccurrence, PrimaryCriteria +from circe.vocabulary import Concept, ConceptSet, ConceptSetExpression, ConceptSetItem + + +def _make_concept_set(set_id: int, concept_id: int) -> ConceptSet: + return ConceptSet( + id=set_id, + expression=ConceptSetExpression(items=[ConceptSetItem(concept=Concept(conceptId=concept_id))]), + ) + + +def _simple_expression(concept_id: int = 111, set_id: int = 1) -> CohortExpression: + return CohortExpression( + concept_sets=[_make_concept_set(set_id, concept_id)], + primary_criteria=PrimaryCriteria(criteria_list=[ConditionOccurrence(codeset_id=set_id)]), + ) + + +def _seed_tables(conn, ibis): + conn.create_table( + "person", + obj=ibis.memtable( + { + "person_id": [1, 2], + "year_of_birth": [1980, 1982], + "gender_concept_id": [8507, 8507], + } + ), + overwrite=True, + ) + conn.create_table( + "observation_period", + obj=ibis.memtable( + { + "person_id": [1, 2], + "observation_period_id": [10, 11], + "observation_period_start_date": ["2019-01-01", "2019-01-01"], 
+ "observation_period_end_date": ["2021-12-31", "2021-12-31"], + } + ), + overwrite=True, + ) + conn.create_table( + "condition_occurrence", + obj=ibis.memtable( + { + "person_id": [1, 2], + "condition_occurrence_id": [100, 101], + "condition_concept_id": [111, 111], + "condition_start_date": ["2020-01-01", "2020-01-02"], + "condition_end_date": ["2020-01-01", "2020-01-02"], + } + ), + overwrite=True, + ) + + +# --------------------------------------------------------------------------- +# CohortDefinitionSet — unit tests (no database needed) +# --------------------------------------------------------------------------- + + +def test_cohort_definition_set_add_and_iter(): + expr = _simple_expression() + cds = CohortDefinitionSet() + cds.add(cohort_id=1, cohort_name="Cohort A", expression=expr) + cds.add(cohort_id=2, cohort_name="Cohort B", expression=expr) + + assert len(cds) == 2 + ids = [c.cohort_id for c in cds] + assert ids == [1, 2] + + +def test_cohort_definition_set_duplicate_id_raises(): + expr = _simple_expression() + cds = CohortDefinitionSet() + cds.add(cohort_id=1, cohort_name="Cohort A", expression=expr) + with pytest.raises(ValueError, match="cohort_id=1"): + cds.add(cohort_id=1, cohort_name="Duplicate", expression=expr) + + +def test_cohort_definition_set_getitem(): + expr = _simple_expression() + cds = CohortDefinitionSet() + cds.add(cohort_id=42, cohort_name="My Cohort", expression=expr) + + item = cds[42] + assert isinstance(item, CohortDefinition) + assert item.cohort_id == 42 + assert item.cohort_name == "My Cohort" + + +def test_cohort_definition_set_getitem_missing_raises(): + cds = CohortDefinitionSet() + with pytest.raises(KeyError): + _ = cds[999] + + +def test_checksums_returns_dict(): + expr1 = _simple_expression(concept_id=111) + expr2 = _simple_expression(concept_id=222, set_id=2) + cds = CohortDefinitionSet() + cds.add(cohort_id=1, cohort_name="A", expression=expr1) + cds.add(cohort_id=2, cohort_name="B", expression=expr2) + + checksums 
= cds.checksums() + assert set(checksums.keys()) == {1, 2} + assert isinstance(checksums[1], str) + assert len(checksums[1]) == 64 # SHA-256 hex digest + # Different expressions produce different checksums + assert checksums[1] != checksums[2] + + +def test_checksums_stable(): + expr = _simple_expression() + cds = CohortDefinitionSet() + cds.add(cohort_id=1, cohort_name="A", expression=expr) + + assert cds.checksums()[1] == cds.checksums()[1] + + +# --------------------------------------------------------------------------- +# generate_cohort_set — integration tests using DuckDB +# --------------------------------------------------------------------------- + + +def test_generate_cohort_set_basic(): + ibis = pytest.importorskip("ibis") + _ = pytest.importorskip("duckdb") + + conn = ibis.duckdb.connect() + _seed_tables(conn, ibis) + + cds = CohortDefinitionSet() + cds.add(cohort_id=10, cohort_name="Cohort 10", expression=_simple_expression()) + cds.add(cohort_id=20, cohort_name="Cohort 20", expression=_simple_expression()) + + results = generate_cohort_set(cds, backend=conn, cdm_schema="main", cohort_table="cohort_out") + + assert len(results) == 2 + assert all(r.status == "COMPLETE" for r in results) + assert {r.cohort_id for r in results} == {10, 20} + + cohort_table = conn.table("cohort_out").execute() + assert len(cohort_table) == 4 # 2 persons × 2 cohorts + assert set(cohort_table.cohort_definition_id) == {10, 20} + + +def test_generate_cohort_set_results_have_timing(): + ibis = pytest.importorskip("ibis") + _ = pytest.importorskip("duckdb") + + conn = ibis.duckdb.connect() + _seed_tables(conn, ibis) + + cds = CohortDefinitionSet() + cds.add(cohort_id=1, cohort_name="A", expression=_simple_expression()) + + results = generate_cohort_set(cds, backend=conn, cdm_schema="main", cohort_table="c") + + r = results[0] + assert r.start_time <= r.end_time + assert isinstance(r.checksum, str) and len(r.checksum) == 64 + + +def test_generate_cohort_set_incremental_skip(): + 
ibis = pytest.importorskip("ibis") + _ = pytest.importorskip("duckdb") + + conn = ibis.duckdb.connect() + _seed_tables(conn, ibis) + + cds = CohortDefinitionSet() + cds.add(cohort_id=1, cohort_name="A", expression=_simple_expression()) + cds.add(cohort_id=2, cohort_name="B", expression=_simple_expression()) + + # First run: both should be COMPLETE + first = generate_cohort_set(cds, backend=conn, cdm_schema="main", cohort_table="cohort", incremental=True) + assert all(r.status == "COMPLETE" for r in first) + + # Second run with same expressions: both should be SKIPPED + second = generate_cohort_set( + cds, backend=conn, cdm_schema="main", cohort_table="cohort", incremental=True + ) + assert all(r.status == "SKIPPED" for r in second) + + +def test_generate_cohort_set_incremental_regenerate(): + ibis = pytest.importorskip("ibis") + _ = pytest.importorskip("duckdb") + + conn = ibis.duckdb.connect() + _seed_tables(conn, ibis) + + expr_a = _simple_expression(concept_id=111) + expr_b = _simple_expression(concept_id=111) + + cds = CohortDefinitionSet() + cds.add(cohort_id=1, cohort_name="A", expression=expr_a) + cds.add(cohort_id=2, cohort_name="B", expression=expr_b) + + generate_cohort_set(cds, backend=conn, cdm_schema="main", cohort_table="cohort", incremental=True) + + # Change cohort 1's expression (different concept) + conn.create_table( + "condition_occurrence", + obj=ibis.memtable( + { + "person_id": [1], + "condition_occurrence_id": [200], + "condition_concept_id": [222], + "condition_start_date": ["2020-03-01"], + "condition_end_date": ["2020-03-01"], + } + ), + overwrite=True, + ) + expr_a_changed = _simple_expression(concept_id=222) # different concept + + cds2 = CohortDefinitionSet() + cds2.add(cohort_id=1, cohort_name="A", expression=expr_a_changed) + cds2.add(cohort_id=2, cohort_name="B", expression=expr_b) # unchanged + + results = generate_cohort_set( + cds2, backend=conn, cdm_schema="main", cohort_table="cohort", incremental=True + ) + + statuses = 
{r.cohort_id: r.status for r in results} + assert statuses[1] == "COMPLETE" # regenerated + assert statuses[2] == "SKIPPED" # unchanged + + +def test_generate_cohort_set_incremental_non_incremental_does_not_skip(): + ibis = pytest.importorskip("ibis") + _ = pytest.importorskip("duckdb") + + conn = ibis.duckdb.connect() + _seed_tables(conn, ibis) + + cds = CohortDefinitionSet() + cds.add(cohort_id=1, cohort_name="A", expression=_simple_expression()) + + generate_cohort_set(cds, backend=conn, cdm_schema="main", cohort_table="cohort", incremental=True) + + # Non-incremental run should always be COMPLETE regardless of stored checksums + results = generate_cohort_set( + cds, backend=conn, cdm_schema="main", cohort_table="cohort", incremental=False + ) + assert results[0].status == "COMPLETE" + + +def test_generate_cohort_set_continue_on_error(): + ibis = pytest.importorskip("ibis") + _ = pytest.importorskip("duckdb") + + conn = ibis.duckdb.connect() + _seed_tables(conn, ibis) + + # Use a CohortExpression with a concept set referencing a missing table domain + # by using a bad backend-level call; we monkeypatch write_cohort instead. 
+ from unittest.mock import patch + + from circe.execution.errors import ExecutionError + + call_count = 0 + + def _failing_write_cohort(expression, *, cohort_id, **kwargs): + nonlocal call_count + call_count += 1 + if cohort_id == 1: + raise ExecutionError("Simulated failure for cohort 1") + # Delegate to real write_cohort for cohort 2 + from circe.execution.api import write_cohort as real_write_cohort + + real_write_cohort(expression, cohort_id=cohort_id, **kwargs) + + cds = CohortDefinitionSet() + cds.add(cohort_id=1, cohort_name="Bad", expression=_simple_expression()) + cds.add(cohort_id=2, cohort_name="Good", expression=_simple_expression()) + + with patch("circe.cohort_definition_set._generate.write_cohort", side_effect=_failing_write_cohort): + results = generate_cohort_set( + cds, + backend=conn, + cdm_schema="main", + cohort_table="cohort", + stop_on_error=False, + ) + + assert call_count == 2 + statuses = {r.cohort_id: r.status for r in results} + assert statuses[1] == "FAILED" + assert statuses[2] == "COMPLETE" + assert results[0].error is not None + + +def test_generate_cohort_set_stop_on_error(): + ibis = pytest.importorskip("ibis") + _ = pytest.importorskip("duckdb") + + conn = ibis.duckdb.connect() + _seed_tables(conn, ibis) + + from unittest.mock import patch + + from circe.execution.errors import ExecutionError + + def _always_fail(expression, *, cohort_id, **kwargs): + raise ExecutionError("Always fail") + + cds = CohortDefinitionSet() + cds.add(cohort_id=1, cohort_name="Bad", expression=_simple_expression()) + cds.add(cohort_id=2, cohort_name="Also bad", expression=_simple_expression()) + + with ( + patch("circe.cohort_definition_set._generate.write_cohort", side_effect=_always_fail), + pytest.raises(ExecutionError, match="Always fail"), + ): + generate_cohort_set( + cds, + backend=conn, + cdm_schema="main", + cohort_table="cohort", + stop_on_error=True, + ) + + +def test_summarise_generation_results(): + from datetime import datetime + + now = 
datetime.now() + + results = [ + CohortGenerationResult(1, "A", "COMPLETE", "abc", now, now), + CohortGenerationResult(2, "B", "SKIPPED", "def", now, now), + CohortGenerationResult(3, "C", "FAILED", "ghi", now, now), + CohortGenerationResult(4, "D", "COMPLETE", "jkl", now, now), + ] + + summary = summarise_generation_results(results) + assert summary["COMPLETE"] == 2 + assert summary["SKIPPED"] == 1 + assert summary["FAILED"] == 1 + + +def test_api_exports_cohort_definition_set(): + import circe.api as api + + assert hasattr(api, "CohortDefinitionSet") + assert hasattr(api, "CohortDefinition") + assert hasattr(api, "CohortGenerationResult") + assert hasattr(api, "generate_cohort_set") + assert hasattr(api, "summarise_generation_results") From cabaa19812967a927d545512d4a76e1072adae77 Mon Sep 17 00:00:00 2001 From: jgilber2 Date: Wed, 13 May 2026 10:40:24 -0700 Subject: [PATCH 02/53] started benchmarking scripts --- .gitignore | 1 + examples/benchmark_analysis.ipynb | 611 ++++++++++++++++++++++++++++++ examples/benchmark_db_config.yaml | 64 ++++ examples/benchmark_run_python.py | 267 +++++++++++++ examples/benchmark_run_r.R | 225 +++++++++++ 5 files changed, 1168 insertions(+) create mode 100644 examples/benchmark_analysis.ipynb create mode 100644 examples/benchmark_db_config.yaml create mode 100644 examples/benchmark_run_python.py create mode 100644 examples/benchmark_run_r.R diff --git a/.gitignore b/.gitignore index ff506a5..31de11d 100644 --- a/.gitignore +++ b/.gitignore @@ -103,6 +103,7 @@ celerybeat.pid # Environments .env +.Renviron .venv env/ venv/ diff --git a/examples/benchmark_analysis.ipynb b/examples/benchmark_analysis.ipynb new file mode 100644 index 0000000..158c9e8 --- /dev/null +++ b/examples/benchmark_analysis.ipynb @@ -0,0 +1,611 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "7fb27b941602401d91542211134fc71a", + "metadata": {}, + "source": "# CircePy Benchmark Analysis\n\nReads results written by `benchmark_run_r.R` and 
`benchmark_run_python.py` and produces the tables and figures for the paper.\n\n**Run order:**\n```\nRscript examples/benchmark_run_r.R\npython examples/benchmark_run_python.py\n# then open this notebook\n```\n\nCredentials are loaded from `.env` (repo root). \nCDM: `healthverity_cc.cdm_healthverity_cc_all_v3910` \nCohort tables: `{DATABRICKS_SCRATCH_SCHEMA}.cohort_r` and `{DATABRICKS_SCRATCH_SCHEMA}.cohort_python`" + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8ac0a53b", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from pathlib import Path\n", + "\n", + "import ibis\n", + "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", + "from dotenv import load_dotenv\n", + "\n", + "# Load credentials from .env in the repo root\n", + "NOTEBOOK_DIR = Path(\".\").resolve()\n", + "REPO_ROOT = NOTEBOOK_DIR if (NOTEBOOK_DIR / \".env\").exists() else NOTEBOOK_DIR.parent\n", + "load_dotenv(REPO_ROOT / \".env\")\n", + "\n", + "OUTPUT_DIR = REPO_ROOT / \"benchmark_output\"\n", + "SCRATCH_SCHEMA = os.environ[\"DATABRICKS_SCRATCH_SCHEMA\"]\n", + "\n", + "r_results = pd.read_csv(OUTPUT_DIR / \"r_results.csv\")\n", + "py_results = pd.read_csv(OUTPUT_DIR / \"python_results.csv\")\n", + "print(f\"R results : {len(r_results)} rows\")\n", + "print(f\"Py results : {len(py_results)} rows\")\n", + "\n", + "# Connect to Databricks and pull cohort tables\n", + "backend = ibis.databricks.connect(\n", + " server_hostname=os.environ[\"DATABRICKS_HOST\"],\n", + " http_path=os.environ[\"DATABRICKS_HTTP_PATH\"],\n", + " access_token=os.environ[\"DATABRICKS_TOKEN\"],\n", + ")\n", + "\n", + "cohort_r = backend.table(\"cohort_r\", database=SCRATCH_SCHEMA).execute()\n", + "cohort_py = backend.table(\"cohort_python\", database=SCRATCH_SCHEMA).execute()\n", + "print(f\"cohort_r rows : {len(cohort_r)}\")\n", + "print(f\"cohort_py rows : {len(cohort_py)}\")" + ] + }, + { + "cell_type": "markdown", + "id": "acae54e37e7d407bbb7b55eff062a284", + "metadata": 
{}, + "source": [ + "## Table 1 — Coverage (SQL generation and execution success rates)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "9a63283cbaf04dbcab1f6479b197f3a8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ImplementationCohorts attemptedCOMPLETEFAILEDSuccess rate
0R CohortGenerator7076901797.6%
1CircePy (Ibis)707103173145.8%
\n", + "
# Table 1 — coverage per implementation.
#
# Each implementation is scored against the number of cohorts *it* attempted.
# Using len(r_results) as the denominator for both rows gives a success rate
# above 100% whenever python_results.csv has more rows than r_results.csv.
# NOTE(review): if either CSV can hold several rows per cohortId (e.g. appended
# re-runs), de-duplicate on cohortId before counting — confirm against the
# benchmark scripts.
total = len(r_results)  # kept under its original name; presumably not needed downstream — TODO confirm

coverage = pd.DataFrame(
    [
        {
            "Implementation": "R CohortGenerator",
            "Cohorts attempted": len(r_results),
            "COMPLETE": (r_results["status"] == "COMPLETE").sum(),
            "FAILED": (r_results["status"] == "FAILED").sum(),
        },
        {
            "Implementation": "CircePy (Ibis)",
            "Cohorts attempted": len(py_results),
            "COMPLETE": (py_results["status"] == "COMPLETE").sum(),
            "FAILED": (py_results["status"] == "FAILED").sum(),
        },
    ]
)
coverage["Success rate"] = (coverage["COMPLETE"] / coverage["Cohorts attempted"] * 100).map("{:.1f}%".format)
coverage
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CheckMatchTotal%
0Row count6969100.0%
1Subject IDs6969100.0%
2Cohort start date6969100.0%
3Cohort end date6969100.0%
4All four checks6969100.0%
\n", + "
" + ], + "text/plain": [ + " Check Match Total %\n", + "0 Row count 69 69 100.0%\n", + "1 Subject IDs 69 69 100.0%\n", + "2 Cohort start date 69 69 100.0%\n", + "3 Cohort end date 69 69 100.0%\n", + "4 All four checks 69 69 100.0%" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Only compare cohorts where both R and Python completed successfully\n", + "r_complete = set(r_results.loc[r_results[\"status\"] == \"COMPLETE\", \"cohortId\"])\n", + "py_complete = set(py_results.loc[py_results[\"status\"] == \"COMPLETE\", \"cohortId\"])\n", + "both_complete = r_complete & py_complete\n", + "print(f\"R COMPLETE cohorts : {len(r_complete)}\")\n", + "print(f\"Py COMPLETE cohorts : {len(py_complete)}\")\n", + "print(f\"Both COMPLETE (shared IDs): {len(both_complete)}\")\n", + "\n", + "# Merge on cohort_definition_id — restrict to both-complete\n", + "r_counts = cohort_r.groupby(\"cohort_definition_id\").size().reset_index(name=\"r_rows\")\n", + "py_counts = cohort_py.groupby(\"cohort_definition_id\").size().reset_index(name=\"py_rows\")\n", + "\n", + "comparison = r_counts.merge(py_counts, on=\"cohort_definition_id\", how=\"inner\")\n", + "comparison = comparison[comparison[\"cohort_definition_id\"].isin(both_complete)].copy()\n", + "\n", + "comparison[\"rows_match\"] = comparison[\"r_rows\"] == comparison[\"py_rows\"]\n", + "\n", + "\n", + "# Subject-ID sets per cohort\n", + "def subject_set(df, cid):\n", + " return set(df.loc[df[\"cohort_definition_id\"] == cid, \"subject_id\"].astype(int))\n", + "\n", + "\n", + "all_ids = sorted(comparison[\"cohort_definition_id\"].unique())\n", + "subject_match = [subject_set(cohort_r, cid) == subject_set(cohort_py, cid) for cid in all_ids]\n", + "comparison[\"subjects_match\"] = subject_match\n", + "\n", + "\n", + "# Start / end date equivalence (per-subject tuples)\n", + "def date_set(df, cid, col):\n", + " sub = df[df[\"cohort_definition_id\"] == cid]\n", + " return 
set(zip(sub[\"subject_id\"].astype(int), pd.to_datetime(sub[col]).dt.date))\n", + "\n", + "\n", + "comparison[\"start_match\"] = [\n", + " date_set(cohort_r, cid, \"cohort_start_date\") == date_set(cohort_py, cid, \"cohort_start_date\")\n", + " for cid in all_ids\n", + "]\n", + "comparison[\"end_match\"] = [\n", + " date_set(cohort_r, cid, \"cohort_end_date\") == date_set(cohort_py, cid, \"cohort_end_date\")\n", + " for cid in all_ids\n", + "]\n", + "comparison[\"all_match\"] = (\n", + " comparison[\"rows_match\"]\n", + " & comparison[\"subjects_match\"]\n", + " & comparison[\"start_match\"]\n", + " & comparison[\"end_match\"]\n", + ")\n", + "\n", + "n_match = comparison[\"all_match\"].sum()\n", + "n_total = len(comparison)\n", + "print(f\"\\nCohorts compared (both COMPLETE, shared IDs): {n_total}\")\n", + "print(f\"Cohorts with identical tables : {n_match} / {n_total} ({100 * n_match / n_total:.1f}%)\")\n", + "\n", + "# Summary table for the paper\n", + "equivalence_summary = pd.DataFrame(\n", + " [\n", + " {\"Check\": \"Row count\", \"Match\": comparison[\"rows_match\"].sum(), \"Total\": n_total},\n", + " {\"Check\": \"Subject IDs\", \"Match\": comparison[\"subjects_match\"].sum(), \"Total\": n_total},\n", + " {\"Check\": \"Cohort start date\", \"Match\": comparison[\"start_match\"].sum(), \"Total\": n_total},\n", + " {\"Check\": \"Cohort end date\", \"Match\": comparison[\"end_match\"].sum(), \"Total\": n_total},\n", + " {\"Check\": \"All four checks\", \"Match\": n_match, \"Total\": n_total},\n", + " ]\n", + ")\n", + "equivalence_summary[\"%\"] = (equivalence_summary[\"Match\"] / equivalence_summary[\"Total\"] * 100).map(\n", + " \"{:.1f}%\".format\n", + ")\n", + "equivalence_summary" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "8edb47106e1a46a883d545849b8ab81b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "No mismatches — all cohort tables are identical across R and Python.\n" + ] + 
} + ], + "source": [ + "# Mismatches (if any)\n", + "mismatches = comparison[~comparison[\"all_match\"]]\n", + "if len(mismatches) == 0:\n", + " print(\"No mismatches — all cohort tables are identical across R and Python.\")\n", + "else:\n", + " print(f\"{len(mismatches)} mismatches:\")\n", + " print(\n", + " mismatches[\n", + " [\n", + " \"cohort_definition_id\",\n", + " \"r_rows\",\n", + " \"py_rows\",\n", + " \"rows_match\",\n", + " \"subjects_match\",\n", + " \"start_match\",\n", + " \"end_match\",\n", + " ]\n", + " ].to_string(index=False)\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "10185d26023b46108eb7d9f57d49d2b3", + "metadata": {}, + "source": [ + "## Figure 1 — Generation time distribution (R vs Python)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8763a12b2bbd4a93a75aff182afb95dc", + "metadata": {}, + "outputs": [], + "source": [ + "fig, axes = plt.subplots(1, 2, figsize=(10, 4), sharey=True)\n", + "\n", + "for ax, df, label, colour in [\n", + " (axes[0], r_results[r_results[\"status\"] == \"COMPLETE\"], \"R CohortGenerator\", \"#4C72B0\"),\n", + " (axes[1], py_results[py_results[\"status\"] == \"COMPLETE\"], \"CircePy (Ibis)\", \"#DD8452\"),\n", + "]:\n", + " ax.hist(df[\"generation_seconds\"], bins=40, color=colour, edgecolor=\"white\", linewidth=0.4)\n", + " ax.set_title(label, fontsize=11)\n", + " ax.set_xlabel(\"Generation time (s)\", fontsize=9)\n", + " ax.set_ylabel(\"Number of cohorts\", fontsize=9)\n", + " ax.axvline(\n", + " df[\"generation_seconds\"].median(),\n", + " color=\"black\",\n", + " linestyle=\"--\",\n", + " linewidth=1,\n", + " label=f\"Median {df['generation_seconds'].median():.1f}s\",\n", + " )\n", + " ax.legend(fontsize=8)\n", + "\n", + "fig.suptitle(\"Cohort generation time — OHDSI PhenotypeLibrary on Databricks\", fontsize=12)\n", + "plt.tight_layout()\n", + "plt.savefig(OUTPUT_DIR / \"figure1_generation_times.pdf\", bbox_inches=\"tight\")\n", + "plt.show()" + ] + }, + { + "cell_type": 
"markdown", + "id": "7623eae2785240b9bd12b16a66d81610", + "metadata": {}, + "source": [ + "## Figure 2 — Row counts: R vs Python (scatter)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "7cdc8c89c7104fffa095e18ddfef8986", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAekAAAHqCAYAAAAgWrY5AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8ekN5oAAAACXBIWXMAAA9hAAAPYQGoP6dpAAByxUlEQVR4nO3dB3gUVdsG4DcJCSS0BAhVeu9NmtJ7VZpUFRRBEJAmzUZRiqAiTT4UKSpKlSa9S6/Se28JPbSEAMn5r+d83+y/G5KQDUl2dve5r2tZdncye2Z3dt45Z95zjodSSgkRERGZjqejC0BERETRY5AmIiIyKQZpIiIik2KQJiIiMikGaSIiIpNikCYiIjIpBmkiIiKTYpAmIiIyKQZpIiIik2KQJrssXbpU6tatK+nSpRMfHx/JnTu3fPjhh3Lq1Cm71tOxY0cpVqyYJKYLFy7I0KFD5dq1ay9c9sCBA3rZ0NDQeL2Xh4eHfPvtt7Eus2nTJr3c3r174/Ue9GIzZ86UP/74I9HWj/0W36Fxy5Qpk/497NixQxzF2K+MW+rUqaV06dIyffp0sWdAyZCQEP0bOHbs2HO/I6x3wYIFiVB6ehEGaYqzQYMGyZtvvilp06aVn3/+WdatWydffvml/lG3bt1azAYHl2HDhsU5SGPZ+AZpco8gDXny5NFBefv27fL999/LuXPnpHbt2vrekWbMmKHLNX/+fMmXL5906tRJfvrpJ7uCNH4DUYM0OVYyB78/OYkVK1bIN998I1988YUMHz7c8nzVqlXlvffek7///lvMArWHJ0+eiDsxtjl58uSOLorL8/X1lYoVK+r/V6pUSbcmvf766zJ37lwZPHiww8qFlqlXX31V/79OnTpSuHBhmThxom7pIufFmjTFyXfffaeb9hCko9O4cWPL/x8/fix9+/aVrFmzSooUKaRUqVKyaNGiGJvq0DSXMmVKKV++vOzbt8/m9bisy2g6x4lEyZIldaBatmyZ1KhRQ79erlw5S1NgTLUvnGhAYGCgXi5Xrlz6cVBQkLz//vu69oSDc/78+eXTTz+V8PDw59bz7NkzGTBggF4HmhxRrgcPHrwwuKKZvECBArrceJ9x48bF+jexbTP89ddf+nPC54XPDZ8fPkdD3rx5bb7HhQsX6m3u37+/5bnVq1fr527evBlrOX799Vf9/eG9MmTIIA0bNpSLFy9aXj98+LDUq1dPf79ogWnZsqVcunTphU2pvXv3tnwHxneE5f79919p0KCBXh++C7y/oXr16rJ582ZZvny55ftG8y1s27ZNn1CiDPhuihcvLrNmzZKEgO0H6+2K6tGjR7rM0V0SwWeCYA9Pnz7V30OOHDn0d5olSxZp0qSJ3Lt3z64yeXl56XKdP39ePy5btqy0b9/+ueUGDhyo95GzZ8/qkw146623LJ8fvh8D9qEePXpIQECALtcnn3yi93lr//zzj7z22mv6t4L9Ab+dO3fuPPd9//777y9cF/0PZsEiis3Tp09V8uTJVbt27eK0fPPmzZWfn58aN26cWrlypWrbtq3y8PBQS5YssSzToUMHlT59elW8eHE1e/Zs9ffff+v/Z8+eXT158sTudQUEBKi8efOqGTNmqPXr16uDBw+qyZMn44Kcfm7Hjh36Fp0bN26ozz//XC+7atUqvdz+/fv1a4cOHVL9+vVTixYtUps
2bVI//fSTypo1q+rYsaPNOvC3eL5x48Zq+fLlatKkSSpVqlSqdevWlmU2btyol9uzZ4/luZ49eypfX1/19ddfq7Vr16phw4Ypb29vNWXKlFg/4+i2+fTp0/pzweeDzwmfFz43rL9FixaWv0XZq1atalOGFClSqPLly1ueGzx4sCpUqFCsZRgzZozenk6dOultXrx4sfr4448t23fp0iXl7++vypYtq/766y/1xx9/qNy5c6tcuXKp+/fv62XOnz+v1zF//nybdffq1UvlzJnT8hjbiOUKFy6svv/+e7VmzRr11ltv6W09duyYXubo0aOqdOnS6vXXX7d835cvX1b37t1TadOmVY0aNdLlXLdunZowYYL64Ycfnls/vqMXfe5Fixa1ee748eP6b7/55ptY/7ZNmzb6s7CGzwGfPcoD+P6x32Dfxf62YMEC1aVLF3X9+vUY1xvdfgXlypVT+fLl0//H/oT9ICQkxPL6s2fPVJYsWdSgQYPU48eP9XeE9YwcOdLy+eF54zvKkSOH3lfw2Q8dOlQ/Z72f7t27V/n4+Ki6deuqZcuWqWnTpqkMGTLo/QrvBXFdF/0/Bml6oeDgYP0jwo/5RRAcsex//vMfm+crVaqkypQpY3OwwwH2yJEjzx1stmzZYve6sNzOnTvjdPCKjnGQvnnz5gtPWHBSkSxZMvXo0SPL8/hbBCDjYAS//PKL3kYcxKMrz5kzZ/TrU6dOtXmPgQMHqsyZM6uIiIgYyxHTNiNI4fOxhvVjWZxwwPTp03VgwAEYSpYsqXr06KG36cGDB/o5BDoEh5jgYI+Tp9iW6dOnj0qZMqW6ffu25Tl8FthmIyjZG6QRvAwPHz7UZfjqq68sz1WrVk0HY2v4vK23PzqzZs1SXl5eOjDGJUhjP8DJ5MmTJ1WNGjV0WXGyFxucQKEcp06deu598RsDlB0npvYw9ivsCyjXrVu31IgRI/RzONkCnKjgs/rxxx8tf7d06VKb8sT0XRjP46TIGj7rWrVqWR43a9ZMB1/rk+zVq1frv8V72bMu+n9s7qY4i6m52NqWLVssTWbWkFiGpko0+xnQzFa0aFHL4yJFiuj7K1eu2L2u9OnTS4UKFSShIf7+8MMPumxowvP29tbNhmiai5oohGZJNDNaN2Pi73fv3h3tupF4By1atNDrM25IQgoODpbLly/HWrao2/zw4UOdAIf3tWYk9W3dulXfo9kXTZcoF5KF0CTdvXt3SZMmjW4Wxmt79uzRy8UECUpIskNyUkzw/dWsWVP3BDAUKlRIN88bZbEXMqkNaD7OmTOnZX+JCZr3sW3dunWTefPmRduE/+677+rPvlq1ai8sw9GjR/V+gN4NBQsWlF27dulLDLjMEZv69euLv7+/zJkzx/Ic/o/LMriUBGXKlNGXMNBMj+8gMjJS4grXyVEuNDMPGTJEunbtqhM7AduP/QAZ39aJZlWqVNGXDez97AG/CevPHt83EktRBuu/wTZH/b5ftC76fwzS9EIIBrjmGNs1N8Pdu3f1j9T6wAw4CCFgISgY8OO1hoMeGNdP7VmXcZBLaAjQ/fr10wefJUuW6MA2efJkm3IaMmbMaPMYB0Z8briuHZ1bt27p7cBBFdtp3JD0Ay8K0lG3GZ8H1hf1eVyHxfVN49oggla2bNn09UMcPFFuBM/KlSvr53bu3KmT0GIL0rdv37acaMUE31903wues75OaY/o9pmo30NUuO65du1afS36nXfekcyZM+vr1zg5iQ98fgig+JymTp2qv7NWrVq9sGcAyooTMiNI4zNEudq1a2dZ5rPPPtPXiXG9HDkaKCsyruPSlQrX51GukydP6hPYKVOm6P3P0LlzZ93979ChQ/pEBcmeuGacUJ+9Pd93fL5Hd8XsbnqhZMmS6ezV9evX69oGHscEARXJL/jB4uBouH79uq6JR/1xxsaedcWllh8f6M7yxhtvyKhRoyzPxdRF5caNGzaP79+/rw88SIyJaftQbgRK4wT
FGmppsYm6zfg88FzUciDpCIlu1ic7CMAIyHgNtSnjOSTlIaCjhpo9e/ZYT9wA3dteeeWVGLcvalmM7w+JcmAEkajZ+PjOExIC3sqVKyUsLEw2btyoE5WaNm2qE6bshTIbWdRoycBJFoIvMqkRYGPTtm1b+eWXX3SgRGsEWl6aN29ueR2fPWrRuJ05c0bXfPF/JBTiBCM2yOY2yhUdJKeh5QrrRGIatiNqK9XLiO37jnqiTXHHmjTFCTKE0QQ7YsSIaF9HEx2gNmYEN2t4bGRxx9XLritqzTw+y+KgHjWAzp49O9p1ILs6IiLC8hgZywiayC6PTq1atSw1Khxco95Q87NHqlSpdFZ31ExpNPFaf55GQEY/3w0bNliaeHGPmtiqVatirUUbB3w/Pz/dZBoTvB9O7KwDLmp5CFBGWVCLR030+PHjlmUQsJGlHR8vqpHhkgUy0NH0jcznhKi9IcjiJBZZ+S9aH2rwqB3/+eef+oZMdbR0RAd9nUeOHKkDnPXn8zJQm8b+ixMFNH9b/4bs+b1EB9/p4sWLbbK00VKAFh7rfY/sw5o0xQkObOheZIxI1KZNG12DwIEOZ+aokWGZEiVK6IMWgjoCHGqD6G6BgIDmYnu87LpQW0NNBeVD7R+3mGoaqIUAmrJRw0IAQjcdND2PHz9eJk2apNeH90cNJzqoreJvP/roI/25oFaF68PGuqMrH64Fo4aEbjeolaHlAKO3obaHA5698P2gDG+//ba+ISiiyxhqetgeA4IwrmGjy5sRaHHig5ocangvagZFYMF1T2wjrpvicgDuUW7UFvE59+nTR68b1x/RjIuD/+eff65rcehCBp6envo7xueLoIR9Cv9H8258WkfwWaOpGCdMaMFAczzyFxCUmjVrpt8bJ5uo9SKwGjV5NBVjm3FSEZfr0tF97thX0FUM14Jjgv0RTeNYDrVO6+vTgO8O3aWMk1BsB05ycG0/IWBfw3eGSy34TKzh5AGtMTh5QHcs7Av4DcYVvmN0v0J3zJ49e+oaNAZAQisGjg0UT1ZJZEQvhG42tWvX1l1r0FUI3Wk+/PBD3f3HEBoaqnr37q0zlNElo0SJEmrhwoUv7Mpy9+5dS5epl12XAZnhefLk0ZnLL9rd0RXklVdeUZ6enpbMYmQ7o8sSujvh1rlzZ929JGrWOB6PGjVK9e3bV6VLl053o3nnnXd0Vm1s2eaRkZFq4sSJqlixYnr78LfIzkY3o9jEts3otoPPCevD54bPLyws7LnlAgMD9fuhDIb69evrMiJrOS6QKY6uc3gvdKlDF7SLFy9aXkeGfp06dXRmcerUqXXm8oULF2zWgazopk2bqjRp0qhs2bLprlExZXdHzb5HZjo+C8OVK1dUw4YN9f6J5YcMGaJOnDihu6Chex+6Ehpd6IKCghKkC5ahcuXKukucdYZ/dNC1Ce+FfQT7d9Ruba+++qruMobMePRiQNe12NjTiwHQRapIkSLRvoauhujmhs8J60Q2dlwz8AHZ8dh/8ffYt/A5W2f327Mu+i8P/BPfAE9ERM4DeRJIGkTNHwmRZH5s7iYicnEY+Q6XqX788Ud9GcEYYY/Mj0GaiMjFIfcA/bGRsY9r9sy2dh5s7iYiIjIpdsEiIiIyKQZpIiIik2KQJiIiMikmjiUQDOSAIRIxSlRiDVFJRETOCelfyLLHADsYxCeuGKQTCAJ0bGMdExERXb58Ocbx7qPDIJ1AjHGW8QVg9iMiIiLrgWRQkbN3TH4G6QRiNHEjQDNIExFRdOy9HMrEMSIiIpNikCYiIjIpBmkiIiKT4jXpJEy/x2ToERERji4KkdPx9vbWczETuRsG6STw5MkTCQoKktDQUEcXhchpk23QbSVVqlSOLgpRkmKQToJBTs6fP69rAejE7uPjw8FOiOxshbp586ZcuXJF8ufPzxo1uRUG6SSoRSNQo3+cn5+fo4tD5JQCAwPlwoUL8vTpUwZpcitMHEsi9gwDR0S22PpE7oqRg4i
IyKQYpMn0Q+nlzZtXX5NMLFh3zZo19Uhxb731VqK9D0UPPR6KFy8ux48fd3RRiEyHQZpM7bvvvpOmTZvqa5KJZerUqfo6Z0hIiMyfP/+l1tWxY0fp3bt3gpXNFeXKlUsWL15seYzP/pNPPpFPP/3UoeUiMiMGaTIt9Cv/6aef5L333ku09SNzGNn3RYsWdcq8AWMbnF3Lli1l/fr1cunSJUcXhchUHHpUmjJlipQoUcIyKUWlSpVk5cqVltcfP34s3bt3l/Tp0+v+kS1atJDr16/brAM/6kaNGunM6YwZM0r//v31gcvapk2bpEyZMpI8eXLJly+fzJw587myTJ48WZ/hp0iRQipUqCC7d+8WdzF+/HipXr26zXNz5syRIkWKJOj7YIawDBkyyNq1ay2Z7/hehg0bFu3y+A7QFFqsWDGbmur777+va9fYJ7D/bN261fI6sn+//PJL3USO/eaNN97Q04haJyBNmjRJrzNlypTSvHlz+fXXX+XHH3/U6/vll1/0cuvWrZPy5cuLv7+/DuBLly61rAPZ+hMmTJBChQrpGW3QLWjVqlX6udmzZ1vWhb+Lzu+//67fH3+bI0cO+eKLL2wC7dGjR6VixYr69Ro1asiAAQNsvp+o2/Dw4UM5e/asNGnSRLc45MyZU77++mtdTkNs24PPtFOnTjpQGuU+cuSIbmFA32SsE9sUdf/AZ4/1lStXTrZv3255DWUdPHiw1KtXT28DvuPDhw/r13A5Ab/Ztm3b6vfq2rWrfh7bgfUsX748hr2HyE0pB1q6dKlavny5OnXqlDp58qT69NNPlbe3tzpy5Ih+vWvXrip79uxq/fr1au/evapixYrqtddes/z9s2fPVLFixVTt2rXVv//+q1asWKEyZMigBg8ebFnm3Llzys/PT/Xt21cdO3ZMTZw4UXl5ealVq1ZZlpkzZ47y8fFR06dPV0ePHlWdO3dW/v7+6vr163Helnv37uEoq++thYWF6ffFvbVr166pffv22dxQVuNvor6Gm+HEiRPPvXb79m392o0bNyzP4T3i4tatWypFihSW94e6deuqMWPGRLv8li1bVNq0aWO8devWLcb3WrBggcqcObP+bHv37q2qVq2qv8foTJ48WVWpUsXmuQ4dOqjkyZPrfefp06dqypQpKiAgQN29e1e/3r9/f1WzZk297eHh4apfv34268B3VKlSJXX16lX1+PFjFRERodfZq1cvyzIHDx7U3z/2O7yO7U2TJo3+3GH8+PEqd+7cep+MjIxUFy9e1N+xUT7rdUUH+yn2d/wt9tuMGTOq33//Xb/25MkTlSdPHjV06FBd/p07d6r06dOratWqxbgNjx49Ujlz5lTjxo3Tf4PyFC1aVE2bNi1O24Myp06dWm3dulV/pniMMuCzxPrWrVunfx/BwcF6efxms2XLpvcxrG/hwoUqXbp0ej8ClBWvHzhwQK8Pvyfr8qOsixYteu5z6dGjh+rSpUu0n1lMvyMiZxFTjHgRhwbp6OCAi4NLSEiIDtjz58+3vHb8+HG9kTt27LAc7Dw9PS0HD8BBGwcgHFxgwIAB+oBlrXXr1qpevXqWx+XLl1fdu3e3PMaBJ2vWrGrUqFGJFqSHDBmil7e+tW/fXr92+vTp516zPp/CyUrU13777Tf92qRJkyzP4T3iqlWrVpblr1y5ogNhUFCQSgw4EBcvXlwf2C9duhTjcl9//bVq1KiRzXMIIA0aNLB5rlChQnr7EfRSpkypg4MBnzv2EeN98LlEDRBRA+tHH32kTyCstWvXTg0fPtzyfrNmzYq2zHEJ0lFh+Q8++ED//59//tEnOghu1uWJGqStt2HevHmqVKlSNuv86aef9MlKXLYHZW7Tpo3lNQRhfGahoaGW5wIDA9XatWv1/xs2bKh++OEHm/Xh5PnXX3/V/0dZBw4caHkNwT9VqlQvDNI4SX/rrbei/YwYpMldg7RpBjNBsyaSdh49eqSbvfft26ebLmvXrm1ZBs2
LaB7csWOHbg7EPbJCM2XKZFkGTWzdunXTTYalS5fWy1ivw1jGSO5BkyveC81zBlybxN/gb2MSHh6ub9ZZyPb48MMPdVOstYCAAH2PJkaUKSZorsfnZA1N9dCqVSv9+UGWLFniXB40IeNzGzJkiG7+rVu3rmTOnFkSw0cffaSvNffo0UMP8hITfB7Rfa5ozo36+OrVq3Lr1i39uVStWtWmXy1GeUNTu/Fe2Idig0EzNmzYIDNmzLA8h0soxjzhFy9e1E3c8bV69WrdxH/q1Cm9j2M/atCggX4NTfP43pIl+/+fJsqL/dma9TagvGieRtOzwRhAJy7bA9a/IVw6QjO1r6+vzXNoVjfWhyQv7CsGbAe+A4P1vmM0yb8IvmvjN0BkBkopHZusf49JzeFBGteqEFRw/RnXqBYtWqSvhR44cEAfXK0PPMbBJDg4WP8f99YHF+N147XYlsEBISwsTO7evau/hOiWOXHiRIzlHjVqVIzXUuMCB+KYgiiui+M6XkwKFiwY42u4fhifTOg6deroA/fmzZtl1qxZevtismXLFktQic7bb78t//nPf6J9DSdFOCHo0KGDPhnA9dCyZctGu2ypUqWi/YwRJK3hGme2bNn0NWgEk127dukTupi8KEEMwa1Xr14yevToaF/HScGZM2csJ0P2rBvbj+vguMbbpk0bnSeBE0YEPsDQsdhn8V0YB4bokqms3wflxWe4c+fOeG2PvbC+nj17Wq4n2yumz+jYsWP6ujiRWQL0p59+qk+AlyxZ4rDEUoensyLgICDjwIqaHA7e+LGaHWre9+7ds9xQU3Nm2AGRRY2AcefOHWncuHGMy1apUkXXjGK6xRSgYdCgQfpkbPr06TJixAidQBRTLQuJThC1FolaIRKMEMh+/vlnPXkJkgexDQgc/fr1s3wft2/flrlz59rdyoFa58aNG/UJHGq6aFUx+vHidZw8YL/FDxlB1HgNJ3fnzp2LMeMa68IJKU4oEKCx3//xxx+W19FChBNTnCShdrpnzx6ZN29erOXFd4WESgR+rBtlPnnypE6YjMv22AvJnGPHjtWtPdhOTByDxDSMrR0X+IyQ6GYN68C2NmzYMF5lIkpoX331lT6xxRgKjuz54fAgjdoyMq5RE8CBqWTJkjrbGM1lqHWg76o1HIyMpjTcR832Nh6/aBk09aE5D9nG6KcZ3TKxNffiAGtkpRs3Z4cgfejQIV0TxtSACQ0Z0KilI7sZOz2auwsXLqxrZdFBTdIIMNbatWungzOCGTKqcZZrNJNiH0INFz8sNNliv1qzZo1d5cRlkj///FM+//xz3SqBWjoysI3LGx9//LE+ocSlBbwHLo0Ytd0PPvhAN/umS5dOZz9HheXRk6BLly56n8GJSuvWrS2v43PH9vz99996m5DZje8D+1tMcNKDIIkuTLjsgRMAfEZGa9KLtsdeyCLHwatz5866jLlz59a/Wets8tigdoLsdHx/uPQBCxcu1JnsUS9lEDnCmDFj9OWckSNHSp8+fcShlMnUqFFDJ7IYiWPIBjYgGzW6xDHrLOypU6fqxDFkvRqJY8gAt9a2bdvnEseQWWqdOIbs1MRMHDMjZAkj8erw4cPKLPB5ItMYWevxTcxydki0MxLLXBF+b0gkRM+KmDjT74ic2+bNm/Wx/IsvvkjQ9TpldvegQYP0B3L+/Hl16NAh/djDw0OtWbPG0gUrR44casOGDbq7C7qd4Ba1Cxa6CyGjF92qkIUaXRcsdCdBdji69UTXBQvZzDNnztQHAhwU0WXFOmvc1YM0MqNHjx6tKleurMzMHYI0MryRjY7ghe5POHFavXq1cmfO8jsi5xcZGakrgLhX7h6k33//fd0dA30wEVxr1aplCdCAHyS6j6BbFgJts2bNnusWdOHCBd0lx9fXV/eRRr9Y6+4rsHHjRt1FBe+DWtmMGTOeKwv6T+OEAMugZo3+qfZw5iCNkx10kcmVK5dNf2wzcocgjZNFdAHEPp0/f36
9b7o7Z/gdkXObNWuW7s6YWOIbpD3wj2Mb3F0DssXTpk2rk8isr08jkQfDTuK6HbK2ich+/B1RYkJyKfI4kGcRW+JrYsQI0yeOEREROcrixYulffv2+oakTrNhkE4ibLAgij/+figxoFcEemlg7AJ0C0VPH7NhkE5kRlcm9AMlovhBd0ww40GUnFfhwoV1N0BMjOPIUcViY85SuRAcVNAf9MaNG/oxRsSyHrKSiGKH/tc3b97Uvx2zHkjJuezatUvy5Mmjxwz44YcfxMy4xycBY1AUI1ATkX0w+A3GK+cJLr0sDJ+LYZBxDTqxksQSEoN0EsCBBeN0Y75rDPVIRPaPTOjIoRnJNezfv1/q16+vR7b87rvvxBkwSCdx0zevqREROWYyp7p160qBAgVkxYoVenY2Z8AgbSI37obKkTO35MbdMMkY4CvF8mWQjAF+ji4WEZHTO3TokB7bHlPFOtNcCxzMxMEd1a0D9OJNZ+RmSJikSJ5MHoc/k0B/X2laPR8DNRFRPGGSJmPKY+spYJMaBzNxcqhBI0DnzJJGsqRPqe/xGM8TEZH9MPc85qXHbHngjL0DGKRNAk3cqEF7/i97Ffd4jOeJiMg+mDK2Vq1aOuEQg5U4KwZpk8A1aDRxR/7v6gPu8RjPExFR3F2/fl0HaAyCs2HDBnnllVfEWTlf3d9FIUnszJUQuRh03+aadPF8gY4uGhGRUxk6dKi+Brx582adLObMWJM2CSSHVSmdTdKm9JE7IY/1fdXSr0gga9JERHb5/vvvZcuWLZI/f35xdgzSJoHs7i3/XpV7j55IOv8U+v6ff6/o54mIKHYPHjyQN998U3e18vX1lbx584orYJA2CWZ3ExHFz6NHj6RRo0ayadMmCQ8PF1fCa9ImwexuIiL7PX78WJo2baqH/FyzZo2UK1dOXAlr0ibB7G4iIvt17NhRtm3bJsuXL5fXXntNXA2DtImyu5HNjezuoNuP9D2zu4mIYvfxxx/LkiVLpFq1auKK2NxtouxuDAFqPXY3AjSzu4mIbEVEROhpJrt06eKStWdrDNImC9Q1y+VwdDGIiEwrMjJSOnXqJL///ruecrJy5criyhikiYjIKSilpHv37vLrr7/qIO3qARoYpImIyCkCdJ8+fXQz9/Tp06Vdu3biDpg4RkREThGknz17JpMnT5b33ntP3AVr0kREZGpnzpyRfPnyyaRJk8TdsCZNRESmNWbMGClSpIicOnVK3BGDNBERmdKECRNk4MCBMmjQIClQoIC4IwZpIiIynZ9++kl69eol/fv3l2HDhom78lC4Gk8vDXOXpk2bVu7duydp0qRxdHGIiJxWWFiYFC1aVBo3bizjx48Xj//NaeCOMYKJY0REZKrBSjDV5K5duyR9+vQuEaBfBpu7iYjIFBYvXiyvv/66rm0GBgaKpydDFD8BIiJyuBUrVkirVq0kR44ckjJlSkcXxzQYpImIyKHWr18vzZs3l4YNG+rhPpMl45VYA4M0ERE5THBwsLz55ptSo0YNmTt3rnh7ezu6SKbC0xUiInKYzJkzyx9//CF16tSR5MmTO7o4psOaNBERJbn9+/frwUrgjTfe0Bnd9DwGaSIiSlJHjhyRunXr6uvPT548cXRxTI1BmoiIkszJkyelVq1akj17dlm9erX4+Pg4ukimxiBNRERJ4sKFCzpAow/0mjVrJCAgwNFFMj0GaSIiShLp0qWT2rVry7p163SgphdjdjcRESWqa9euyaNHjyR//vwyc+ZMRxfHqTBIExFRorl+/bpu4sakEjt37nT7sbjtxSBNRESJ4vbt27r/M8biXrp0KQN0PDBIExFRggsJCdHdrDCi2KZNm3RTN9mPiWNERJTgjh49Kjdu3JC1a9dKkSJFHF0cp8WaNBERJZiwsDA9vCemnDx9+rSkSJHC0UVyaqxJExFRgnj8+LEe4rN79+76MQP0y2OQJiKil4bhPVu2bCnbtm2TNm3aOLo4LoPN3URE9FKePn2
qAzOuPy9btkyqVavm6CK5DAZpIiJ6Kb/88osOzn/99ZfO6CYXae4eNWqUlCtXTlKnTi0ZM2aUpk2b6sHXrVWvXl33rbO+de3a1WaZS5cuSaNGjcTPz0+vp3///vLs2TObZdAFoEyZMjqhIV++fNGOejN58mTJlSuXvo5SoUIF2b17dyJtORGR6+jcubNs375dmjRp4uiiuByHBunNmzfrBAOMQoNmEjSZ4CwMw8dF3QGCgoIstzFjxlhei4iI0AEa10Owk8yaNUsH4C+//NKyzPnz5/UyNWrUkAMHDkjv3r3lgw8+0DOwGObOnSt9+/aVIUOG6HlOS5YsKfXq1dNdCIiIyJZSSvr06aOP3V5eXrrCRYlAmciNGzcUirR582bLc9WqVVO9evWK8W9WrFihPD09VXBwsOW5KVOmqDRp0qjw8HD9eMCAAapo0aI2f9e6dWtVr149y+Py5cur7t27Wx5HRESorFmzqlGjRsWp7Pfu3dNlxz0RkSuLjIxUvXv31se86dOnO7o4TiG+McJU2d0YOs6YKcXa7NmzJUOGDFKsWDEZPHiwhIaGWl7bsWOHFC9eXDJlymR5DjXg+/fv6870xjKYecUalsHzgFr4vn37bJbx9PTUj41liIjovzXozz77TH744Qd9ifC9995zdJFcmmkSxyIjI3UzNDrAIxgb2rVrJzlz5pSsWbPKoUOHZODAgfq6NRIUAEPOWQdoMB7jtdiWQSBHx/u7d+/qZvPoljlx4kS05Q0PD9c3A9ZFROTqxo0bp/OJvv/+e/noo48cXRyXZ5ogjWvTR44cka1bt9o836VLF8v/UWPOkiWLnlHl7NmzkjdvXnEU7KTDhg1z2PsTETlC48aNxdvbW3r27OnoorgFUzR39+jRQ/7++2/ZuHGjvPLKK7Eui6xrOHPmjL7PnDmzngrNmvEYr8W2DKZO8/X11U3pSHyIbhljHVGh2R3N88bt8uXLdm83EZGzWLBggTx48EAKFCjAAO0uQRrXNhCgFy1aJBs2bJDcuXO/8G+QnQ2oUUOlSpXk8OHDNlnYyDZEADYGdccy69evt1kPlsHz4OPjI2XLlrVZBs3veGwsExW6cuE9rG9ERK7op59+krfeekv3nqEkphyoW7duKm3atGrTpk0qKCjIcgsNDdWvnzlzRg0fPlzt3btXnT9/Xi1ZskTlyZNHVa1a1bKOZ8+eqWLFiqm6deuqAwcOqFWrVqnAwEA1ePBgyzLnzp1Tfn5+qn///ur48eNq8uTJysvLSy9rmDNnjkqePLmaOXOmOnbsmOrSpYvy9/e3yRqPDbO7icgVzZo1S3l4eKgePXrorG6Kn/jGCIcGaRQ4utuMGTP065cuXdIBOV26dDqA5suXTwfaqBt54cIF1aBBA+Xr66syZMig+vXrp54+fWqzzMaNG1WpUqWUj4+PDvTGe1ibOHGiypEjh14GXbJ27twZ521hkCYiV4PKC7q4fvDBB7pbKsVffGOEB/5J6tq7K0J2d9q0afX1aTZ9E5ErQAY3LjHOmDFD5+1Q0scI02R3ExGROVy9elWyZcumR2FEPQ7DMZMbZ3cTEZE5IIk3f/78esIMYIB2LAZpIiLSME4FJsnAVJOczcocGKSJiEh27dolDRs21GNRYERHdDMlx2OQJiIiPR53iRIlZOnSpXqQJzIHJo4REbkxIzEMI4rhPlWqVI4uEllhTZqIyE1hsqLXXntNzp8/L/7+/rqLEJkLgzQRkRs6d+6cnqwI/XdZezYvBmkiIjdz6dIlqVmzpvj5+ek5CgIDAx1dJIoBgzQRkRt59uyZNGjQQDw9PXWf6Jhm+iNzYOIYEZEbSZYsmR7us2DBgi+cGpgcjzVpIiI3cPv2bRk9erSehrdevXqSK1cuRxeJ4oBBmojIxYWEhOjA/N133+lxucl5sLmbiMiFPXjwQI8khmzujRs3Svbs2R1dJLIDgzQRkYsKDQ2Vxo0by9GjR2XdunVSsmRJRxeJ7MTmbiI
iF+Xt7S358uWTlStXSrly5RxdHIoH1qSJiFzMkydP9GhixYsXl19++cXRxaGXwJo0EZGL9YNu166d1KhRQ1+PJufGmjQRkYuIiIiQDh06yJIlS/R0k6lTp3Z0keglMUgTEbkA9H/u3LmzzJkzR+bOnStNmjRxdJEoAbC5m4jIBVy5ckVWrFghv/76q7Rs2dLRxaEEwpo0EZGTzwf99OlTyZEjh5w+fZpN3C6GNWkiIicO0J999pmeMAMJYwzQrodBmojISX399dcyatQoadSokZ44g1wPgzQRkRP69ttv5csvv9SBum/fvo4uDiUSBmkiIieza9cu6d+/v3z++ee6uZtcF9tHiIicTIUKFfRY3DVr1nR0UchsNWkvLy+5ceNGtHOV4jUiIkocv/32m8ycOVP/v1atWuLh4eHoIpHZgjSyCaMTHh4uPj4+CVEmIiKKYt68edKxY0fZvn27o4tCZmzunjBhgr7Hmdu0adMkVapUNkPR/fPPP1KoUKHEKSURkRtbvHixHo+7bdu2MmXKFEcXh8wYpMeNG2epSf/nP/+xadpGDTpXrlz6eSIiSjgbNmyQVq1aSbNmzXRTNy8rupc4B+nz58/re8ysgoHbAwICErNcREQkoqeb/Pjjj2XkyJHsC+2GPFRMF5nJLvfv35e0adPKvXv3JE2aNI4uDhG5QDerV155RbJly+boopADY4Tdp2W4/owml/Xr1+ssb8y8ErVphoiI4m/37t1Sp04dadGihcyYMcPRxSEHsjtI9+rVSwdpDENXrFgxdgEgIkpA//77r9SrV083c0+cONHRxSFnC9KYqxRdARo2bJg4JSIiclNHjhzRNeh8+fLpaSete9GQe7K7nzQyubEDERFRwsJUk7lz55bVq1fr65dEdgfpfv36yfjx42Mc1ISIiOyDERtxTEU3q507d0q6dOkcXSRy1uburVu3ysaNG2XlypVStGhR8fb2tnkd3bOIiChuLl26JFWrVpWuXbvKoEGD2A+aXi5I+/v767M9IiJ6OdeuXbOMwd2+fXtHF4dcIUizOwAR0ctDF1YE6MePH+thlbNnz+7oIpEJcfgaIiIHGD16tISEhOgAjWQxogQZcQw7U2x9o8+dOyfuiCOOEZE9njx5IlevXmWAdhP3k2rEsd69e9s8fvr0qe58v2rVKunfv7+9qyMichsPHjyQNm3ayJAhQ6R8+fIM0JQ4I45FZ/LkybJ37157V0dE5BZCQ0OlcePGulLDkRop0fpJx6RBgwaycOHChFodEZHLQHJY06ZNZd++fbr7arly5RxdJHK3xLEFCxawAz4RUTQ++OAD2bJliw7Qr7/+uqOLQ64cpEuXLm3TVIO8s+DgYLl586b8+OOPCV0+IiKn17dvX3n33XelevXqji4KuXqQRpONNU9PTwkMDNQ7X6FChRKybERETgvT+mIWK4wkVqZMGUcXh9wlSCMrkYiIYhYZGSmdO3eWWbNmSalSpViDpqS9Jo0zxMWLF8vx48f1Y4zh/cYbb3DMWSJye7gE2KNHD5k5c6b89ttvDNCUtNndZ86ckcKFC+vrK5hMA7e3335bB+qzZ8/ata5Ro0bpLMfUqVNLxowZdVP6yZMnn8uK7N69u6RPn17PrdqiRQu5fv36cwPUN2rUSPz8/PR60F/72bNnNsts2rRJNzklT55cT7WJH1B03chy5colKVKkkAoVKsju3bvt2h4icm8I0Lj+PGXKFJk2bRrH46akD9Iff/yx5M2bVy5fviz79+/XNwRJdMrHa/bYvHmzDsCYmm3t2rV6YJS6devKo0ePLMv06dNHli1bJvPnz9fLY0D65s2b29TqEaAxes/27dt18xIC8JdffmlZ5vz583qZGjVqyIEDB/SALMi2xJythrlz5+ofF5rzsU0lS5aUevXq6fF1iYjiChWBSZMmyfvvv+/oopArUHby8/NThw4deu75AwcOqJQpU6qXcePGDQxRqjZv3qwfh4SEKG9vbzV//nz
LMsePH9fL7NixQz9esWKF8vT0VMHBwZZlpkyZotKkSaPCw8P14wEDBqiiRYvavFfr1q1VvXr1LI/Lly+vunfvbnkcERGhsmbNqkaNGhWnst+7d0+XC/dE5H5wbCJK6BjhGZ+zRAxtF9XDhw/Fx8fnpU4YMKYpGP2t0fEftevatWtblkEGeY4cOWTHjh36Me6LFy8umTJlsiyDGjDGST169KhlGet1GMsY60AtHO9lvQyy1vHYWCaq8PBw/R7WNyJyT99++60UK1ZMjhw54uiikIuxO0hjWLsuXbrIrl279PUX3NBcjW4GSB57mWxINEOjoz92dkD/awR+zGFtDQEZrxnLWAdo43XjtdiWQWANCwuTW7du6Wbz6JYx1hHd9XQMlm7cOM0ckXtC0zbyYAYPHmw5dhE5LEhPmDBBX5OuVKmSTrDCDYEVyVjjx4+Pd0FwbRpnoXPmzBFngB8kav7GDdfoici9/Pzzz9KzZ0/p16+fDB8+3NHFIRdkdxcs1GqXLFmis7yNLljI9kaQji90V/j777/1vKqvvPKK5fnMmTPrpmjMuWpdm0Z2N14zlomahW1kf1svEzUjHI8xXZivr6/uOoZbdMsY64iu2R83InJPuOT1/fff6wrG2LFjOWkGmWuCDQTlJk2a6Ft8A7TRn3DRokWyYcOG56ZtK1u2rHh7e8v69estz6GLFrLJUZMH3B8+fNgmCxuZ4gjARYoUsSxjvQ5jGWMdaFLHe1kvg+Z3PDaWISIy4PIYTtLRowStiwzQlGjsSjNTSjVv3lyNHj36uee/+eYb1bJlS7vW1a1bN5U2bVq1adMmFRQUZLmFhoZalunatavKkSOH2rBhg9q7d6+qVKmSvhmePXumihUrpurWraszzFetWqUCAwPV4MGDLcucO3dOZ6X3799fZ2BOnjxZeXl56WUNc+bMUcmTJ1czZ85Ux44dU126dFH+/v42WeOxYXY3kXtYvHixKlWqlLp586aji0JOJL4xwu4gnSFDhmi7YOG5jBkz2vfmItHeZsyYYVkmLCxMffTRRyogIEAH2mbNmulAbu3ChQuqQYMGytfXV5evX79+6unTpzbLbNy4Uf+wfHx8VJ48eWzewzBx4kR9QoBl0CVr586dcd4WBmki17dy5Up9fECFJOoxhigxYoQH/rGn5o1ruBgQpGDBgjbPnzhxQs+QhWxpd4RMcWR5I4kMTe1E5FpwSQ6DItWpU0cWLlyoL8URJXaMsPuaNPokY3SuqJCVbVwDJiJyJeimiWGLq1atKvPmzWOAJvNmd3/xxRd6WE6M012zZk39HBKs/vzzTz10JxGRK7hxN1SOnLklN+6GScYAX/l5+m/SpGEd3e2UyLRBGtncmAFr5MiRsmDBAt38XaJECVm3bp1Uq1YtcUpJRJTEAXrxpjNy6NBBOX98t1Sq214yBuSWh+Eifn6OLh25k3hNVYnrMrgREbki1KCPHD0qs77/WDJmekVatH1fgu6E6edrlsvh6OKRG4lXkCYicmUHDx+Tmd/2lHQZMsvAkT9LihS+kiJ5pG76JnKKwUyIiFwRBksaOfg98U2ZRgaOmCapUvtLpFLyOPyZvjZNlJRYkyYispIhQwZp2LCRFKvcVu6GJZOwyEc6QAf6+0rxfIGOLh65GQZpIiIRCQoKktu3b+uZrGbNmPZcdjcCdCBr0mT2IH3u3DnJkydP4pSGiMgBMPZ/rVq19Dj++/fv1/PJZwzwY5IYOV+QxmQamKkK3a2qV6+u719mBiwiIke6c+eOHkXs7t27snnzZh2giczC7r0R8yaPGjVK948eM2aMFChQQAft9u3by7Rp0xKnlEREiQBDNNatW1euXbumx3rA8YzITOweuzuq06dPy4gRI2T27Nl6ekdM4eaOOHY3kfPZt2+ftGjRQpYsWSIlS5Z0dHHIhd2PZ4ywu7k7NDRUtm7dKps2bdK3f//9VwoVKqTnhUbzNxGR2WEioGTJkul55E+dOqWvRROZkd1
B2t/fXwICAnTz9qBBg6RKlSr6MRGRM3j8+LG8+eabkjlzZvn1118ZoMm1gnTDhg11TRqzXgUHB+sbatC8lkNEZvfkyRN56623ZMuWLbJixQpHF4co4RPHMLkGpm1btWqVVKpUSdasWaNr09myZdO1ayIiM3r27Jm0a9dOH7NwHKtRo4aji0SUeIOZYF5p7PQ4M0Xz0erVq/U800ggIyIyGxybkCC2cOFCqVevnqOLQ5Q4Qfr777/XCWNo8n7w4IHOiMRE6F26dNE1aiIiM3r33Xf1tLqlS5d2dFGIEq8LVrly5SwDmSAoI6Wc2AWLyIxweOvbt68+XiFZjMjlu2Dt2bPH3j8hInJIgP7kk0/khx9+0ONxE7nNNemQkBD55Zdf5Pjx4/pxkSJFpFOnTqxVE5FpfPHFF/ry3KRJk/Txicgtsrv37t0refPmlXHjxukxb3HD//EcBqYnInK0CRMm6JEQv/32W+nevbuji0OUdNekcR0aE2r8/PPPesQeQJb3Bx98oGfI+ueff8Qd8Zo0kXlcvHhR/v77bwZocvoYYXeQxsQaxlCg1o4dOyavvvqqHjbUHTFIEzkeuoFiRqt06dI5uihECRIj7G7uxsovXboU7exYqVOntnd1REQJArPwtWnTRmbNmuXoohAlGLuDdOvWrXUSBs5YEZhxwxChaO5u27ZtwpWMiCiOfv/9dz1WA5q3e/fu7ejiEDkuuxuJGB4eHnpgAFyLBm9vb+nWrZuMHj064UpGRBQH8+fPlw4dOsj777+vE8ZwfCJyFXZdk8Zc0du2bdNDgiZPnlzOnj2rn0dmt5+fn7gzXpMmcoypU6fq49KMGTPEy8vL0cUhcmziWIoUKXT/6Ny5c9vzZy6PQZoo6TO4c+bMqf+Pwxhr0GRmSZY4hpF70NWKiMhRNmzYoHuYzJs3Tz9mgCZXZXeQ/vrrr/VQe+iDGBQUpM8OrG9ERIkJTdtNmjTRE/u88cYbji4OUaKyu7nb0/P/47r12avR3ITr1u6Izd1EiW/37t1Su3ZtKVu2rCxfvtztc2HIeSTZBBsbN26090+IiBIEWvKQuLps2TIGaHILdtekKXqsSRMlHqOl7uHDh7q1jpP5kLNJssQxIqKkdOrUKT3k8MmTJyVVqlQM0ORWGKSJyLTQk6RmzZoSFhbG8bjJLTFIE5EpYcjhWrVq6Ul91q9fL4GBgY4uElGSY5AmItOJjIyUxo0bW/pEZ8mSxdFFInK+II2xukNCQhKuNERE/+vqOXHiRF2Dzp49u6OLQ+ScQXrkyJFy586dhCsNEbk1HE+GDRumM7gxWEmePHkcXSQi5w3S7L1FRAkFXVPq1asnkyZN0tejiSgeg5kQESU09H9u2LChnDlzRg+YlCtXLkcXicgUGKSJyKHQvQpjcR8+fFhfgy5VqpSji0RkGszuJiKH8vHxkaJFi8qKFSukXLlyji4OkamwJk1EDvHkyRNde8ZkGbgOTUTPY02aiJLcs2fPpH379no0sbt37zq6OESmxZo0ESUpdK/q2LGjLF68WBYsWCABAQGOLhKRaTFIE1GSjiT24Ycfyp9//ilz5syRN99809FFIjI1NncTUZK5fv26rFmzRmbNmiVvvfWWo4tD5No1aczvSkQUl4GPwsPD9RjcJ06cED8/P0cXicgpOHTEsX/++Uf3j8yaNasO+LhGZQ3XrfC89a1+/frPDSOIBBRMou3v7y+dOnXSAyNYO3TokFSpUkVSpEihxwEeM2bMc2WZP3++FCpUSC9TvHhx3R2EiBLGF198oZPEnj59ygBNlFRB+sGDBy81tu6jR4+kZMmSMnny5BiXQVAOCgqy3HAtyxoC9NGjR2Xt2rXy999/68DfpUsXy+v379+XunXrSs6cOWXfvn0yduxYGTp0qPz000+WZbZv3y5t27bVAf7ff/+Vpk2b6tuRI0fivW1E9F9ff/21jBgxQpo3by7e3t6OLg6Rc1EmgaIsWrTI5rkOHTqoN998M8a
/OXbsmP67PXv2WJ5buXKl8vDwUFevXtWPf/zxRxUQEKDCw8MtywwcOFAVLFjQ8rhVq1aqUaNGNuuuUKGC+vDDD+Nc/nv37umy4J6I/uvbb7/Vv4vhw4c7uihEDhXfGGH6xLFNmzZJxowZpWDBgtKtWze5ffu25bUdO3boJu5XX33V8lzt2rX1NHe7du2yLIPZdDCqkQGD+J88edLSPxPL4O+sYRk8T0Txg1apTz75RD799FP5/PPPHV0cIqdk6i5YaOpGE1nu3Lnl7Nmz+sfeoEEDHTy9vLwkODhYB3BryZIlk3Tp0unXAPf4e2uZMmWyvIY+mrg3nrNexlhHdJAEg5t1szoR/b/SpUvry0+VK1dmkimRKwbpNm3aWP6PZK4SJUpI3rx5de26Vq1aDi3bqFGj9Ly3RGRr9uzZupWqR48eOmGTiOLP9M3d1pCkliFDBj2dHWTOnFlu3Ljx3HCDyPjGa8Yy6JtpzXj8omWM16MzePBgPf+tceP8t0T/7SXx7rvv6qZuzjdPlMQ16ZCQEFm0aJFs2bJFLl68KKGhoRIYGKibtXAN97XXXpPEdOXKFX1NGn0toVKlSrpMyNrGIP2wYcMGPapRhQoVLMt89tlnuuuHkVmKTHBc4zaGI8QymCKvd+/elvfCMng+JsmTJ9c3IvqvpUuXSrt27XQLGHpPsImbKIlq0teuXZMPPvhAB0d0p8D8r5jzFU3Or7zyip6kvU6dOlKkSBGZO3dunN8c/ZkPHDigb3D+/Hn9/0uXLunX+vfvLzt37pQLFy7oIIohBPPly6dPCKBw4cL6unXnzp1l9+7dsm3bNt3EhoME+l4DDhpIGkP3KnTVQvnGjx8vffv2tZSjV69esmrVKvnuu+/0QAvoorV37169LiJ6sc2bN+sRxPAbxWhiyBkhogQQlxTwjBkzqv79+6ujR4/GuExoaKj6448/VMWKFdXYsWPjlFq+ceNGnZIe9YauV1hf3bp1VWBgoPL29lY5c+ZUnTt3VsHBwTbruH37tmrbtq1KlSqVSpMmjXrvvffUgwcPbJY5ePCgqly5skqePLnKli2bGj169HNlmTdvnipQoIDy8fFRRYsWVcuXL1f2YBcscmd3795VgwYNsunqSEQvHyM88M+LAjmamNOnTx/nwG/v8q4A2d1p06bV16cx+hmRO0BLF3pCRO1BQUQJEyPi1Nxtb8B1twBN5I727NmjR/NDzgcRmSS7G9ebli9fbnk8YMAAPaAIksaQTEZErg+5I8gNKVasmEydOtXRxSFyWXYH6ZEjR4qvr6/+PwYVwbjbmLACXaP69OmTGGUkIhNBAiYSRTFmwcqVKyV16tSOLhKRy7J7MBP0B0aGNWDWqhYtWugJLV5//XWpXr16YpSRiEwExwCMWYAAjWtsRGSimnSqVKks42dj8nacUQOmeETXLCJyTRg4CGMQoNsjWtEw/C4RmSxIIyijzzRup06dkoYNG1qawHLlypUYZSQiE9SeMUCQMRQuJrEhosRn9y8N16AxEtfNmzdl4cKFlkxujPqFOZmJyLVgHveaNWvqYT4xKBARJZ049ZOmF2M/aXLVJm7kmmD/xoxWuBZNREkXI+KUOHbo0CHd1QJNXPh/bDBTFRG5BgyhiwlrGKCJTFyTRnA25m7G/zFwvvWfGY9xHxERIe6INWlyJcbvGbPKXb16VXLmzOnoIhE5tUStSWPiC8x2ZfyfiFwXJrdp2bKlno61WrVqDNBEDhSnIG39I+UPlsh1YfrZJk2a6ERQY9AiInKiwUzg5MmTMnHiRDl+/LhlysiePXvqOZqJyDk9fvxYmjVrpqd9Xb16tZQvX97RRSJye3Z3wUK3KySR4Uy7ZMmS+rZ//379HF4jIufUrVs3nSC2bNkyqVy5sqOLQ0Tx6YKF8Xrbt28vw4cPt3l+yJAh8vvvv8vZs2fFHTFxjJzdsWPHdJKYMYogETnJVJVRBzZ49913n3v+7bff1q8RkfNAb4yxY8fKo0e
PpEiRIgzQRCZjd5DGwAZbtmx57vmtW7dKlSpVEqpcRJTIMA73hx9+KIMGDZLt27c7ujhEFN/EsaVLl1r+/8Ybb8jAgQP1NemKFSvq53bu3Cnz58+3jOtLROZ0426oHDlzS67fCZU500bKsoW/y6+//soaNJGzD2YSp5VxMBNekyZTB+jFm87o+3ULJ8vmFb/LO92GyLcjBkjGAD9HF4/Ipd1PzGvSaBaLy81dAzSRM0AN+mZImOTKmlYyBqaXd7t9JgXLNdDPE5EL9ZMmIudz426Y3L1+QXJnLSlN23bVzwXdfqSfJyInD9ITJkyI03Iff/zxy5SHiBLJphW/yU8TRsrwCfMlT/6iEqmUPA5/JhkDOLIYkdMH6XHjxsXpmjSDNJH5YB74qeNHSIMWH4hHyld0DRoBOtDfV4rn+++4/ETkxEGaE2sQOadp06ZJjx49pE+fPjLw86/k6NnbuokbNWgE6EDWpIlcZ8Qxih6zu8mMnj59qsfgrlSpkq5No7WLiFwsu3vOnDlxXuHly5dl27ZtcV6eiBIH5oL29vaWzZs3y6RJkxigiZxQnIL0lClT9ExXY8aMscx8ZQ1nBitWrJB27dpJmTJl5Pbt24lRViKKI0ySUbx4cQkODtZn7XEd64CInPCaNM7EMeoYpqfERPApU6aUTJkySYoUKeTu3bv6QJAhQwbp2LGjHDlyRL9GRI6BaSZbtmyp54XG75KI3Oia9K1bt/Q43RcvXpSwsDB9EChdurS+ufPZOq9Jkxls3LhRGjZsKLVr19ZTx/r4+Di6SEQk8Y8RTBxLIAzS5GghISGSO3duKVeunG75QksXETl3jOCIY0Quwt/fXxYtWqSzuRmgiVyD+7ZPE7mIgwcPyhdffCFoFMNUsn5+nCyDyFUwSBM5sWPHjunrzytXrpTQ0FBHF4eIEhiDNJGTOn36tNSqVUuyZs2qM7rR64KI3DxII3uUiBzrypUrUrNmTQkICJC1a9dK+vTpHV0kIjJDkK5fv77kzZtXvv76az26GBElvYwZM0rz5s1l3bp1+v9E5JrsDtJXr17Vg/UvWLBA8uTJI/Xq1ZN58+bJkydPEqeERGQRFBQk+/bt0/2fx48fr5u6ich1vVQ/6f3798uMGTPkzz//1I8xLGinTp2kZMmS4m7YT5oS282bN3X2Nhw6dEi8vLwcXSQiMsMEGzHBON0YJhQ164cPH8r06dOlbNmyUqVKFTl69OjLrJqIrNy5c0fq1Kmjx8VHX2gGaCL34Bnf6e/Q3I3hB3PmzKkzSzHLzvXr1+XMmTP6ubfeeivhS0vkhnDmjctKSBbDNegCBQo4ukhElETsHnGsZ8+eunkbreTvvPOOnhmrWLFiltfRDeTbb7/ltTKiBHLp0iU9kQ2yuK1/a0Tk+pLFZ/AEzIaFzNLkyZNHuwwm3WBXLaKXgwlsMGkNppw8ceKEJEvGUXyJ3I3diWPh4eF6MnkOnGCLiWOUkPA7e/PNNyVVqlT60hIRObdETxxDZmmDBg30QQNvULFiRX39mYgSFrozIqcD87h/9NFHji4OETlQnIP0wIED5cCBAzJ8+HB9zRnT4nXu3DlxS0fkZtBK1b59e52MiSxujCpGRO4rzhe5kLQyc+ZMnWUKjRs3lsKFC+tmuZiuTRORfRYuXCiLFy/WTdwY3Y+I3Fucr0mjXyZGG8ucObPlOVyXRn/oXLlyibvjNWlKCPg54jfFLG4i15Ikg5lEHUABj19iwDIi+l9g7t27t/zxxx/i4eHBAE1E9gdpHEgwiEK6dOksN4wyVrp0aZvniCju8Lvq37+/Hof70aNHji4OETnrNWmM0U1ECevLL7+U7777TiZMmMBETCKKf5Du0KFDXBclojiYMmWKnvIVo/ZhJD8iogQZuxvdr6ZNm6Yn18DA/8aMWEgss8c///wjTZo00UOI4locslqjNgWippElSxbx9fWV2rVry+nTp22WwfujywouxPv7++tZuNAMbw0zBmHSjxQ
pUkj27Nn1QTGq+fPnS6FChfQyGOFpxYoVdm0Lkb2aNm2qAzWau4mIEiRII+Dh2vQ333xj6S8Nf/31lw7a9sA1OExrOXny5GhfRzBFM+B//vMf2bVrl84mRxewx48fW5ZBgEY2LLqI/f333zrwd+nSxSajrm7dunrSD8zDO3bsWBk6dKj89NNPlmW2b98ubdu21QH+33//1QdP3I4cOWLvx0P0QkgQw2Q0OPns2rWro4tDRGam7FSrVi3Vv39//f9UqVKps2fP6v9v27ZN5cyZU8UXirJo0SLL48jISJU5c2Y1duxYy3MhISEqefLk6s8//9SPjx07pv9uz549lmVWrlypPDw81NWrV/XjH3/8UQUEBKjw8HDLMgMHDlQFCxa0PG7VqpVq1KiRTXkqVKigPvzwwziX/969e7osuCeKybRp0/R+Mnr0aEcXhYiSUHxjhN016T179siHH3743PPZsmWT4ODghDp3kPPnz+v1oYnbgD5mFSpUkB07dujHuEcT96uvvmpZBstjUgLUvI1lqlatKj4+PpZlUBs/efKknlnIWMb6fYxljPeJDgZxQS3d+kYUm9mzZ+vksG7dusmAAQMcXRwicgJ2B2mMLhZdQDp16pQEBgYmVLksAT9Tpkw2z+Ox8RruM2bMaPM6ZgpCVzDrZaJbh/V7xLRMbCcdo0aN0icNxg3XuolighHEkHzZsWNHPfc6cjCIiBI8SL/xxht6/O6nT5/qxzjYYL5bjO3dokULcRe4/o6RY4zb5cuXHV0kMjG0vLRr105+/vln3dJDRBQXdh8t0KcT2dOowWK+22rVqkm+fPkkderUMmLECEkoxvCjSLCxhsfGa7i/cePGcxMUIOPbepno1mH9HjEtYz0EanQtCsgot74RRXX27FndSwEJjr/++utzo/YRESVokEbTrpFJjczrHj166O5KmFYvIeeYzp07tw6S69evtzyHZnZca65UqZJ+jHtklyNr27BhwwaJjIzU166NZZDxbdT8AeUvWLCgBAQEWJaxfh9jGeN9iOJj06ZNujvfb7/95uiiEJGzUg704MED9e+//+obivL999/r/1+8eFG/jgxYf39/tWTJEnXo0CH15ptvqty5c6uwsDDLOurXr69Kly6tdu3apbZu3ary58+v2rZta5MRnilTJvXOO++oI0eOqDlz5ig/Pz81depUyzLITE+WLJn69ttv1fHjx9WQIUOUt7e3Onz4cJy3hdndZA37VMqUKVWdOnVs9lcick/34hkj7A7SPXv2VOPHj3/u+YkTJ6pevXrZta6NGzfqQke9dejQwdIN64svvtBBFl2v0P3r5MmTNuu4ffu2DsroDpYmTRr13nvv6eBv7eDBg6py5cp6HdmyZYu2+8u8efNUgQIFlI+PjypatKhavny5XdvCIE0GdAnEvli1alX16NEjRxeHiEwgvjEizlNVWne1Wrp0qZQtW9bmeYw4hqSyK1euiDviVJVkaNOmjU6mXL16tc7VICK6H88YEeexuw23b9/WbxQV3vTWrVv2ro7IZSAXApnbM2fO1NncDNBElOSJY8jkXrVq1XPPr1y5UvLkyfPSBSJyRhhTvlSpUnrYXIz/Ht2JLBGRveyuSfft21dndN+8eVNq1qypn0NmNLpm/fDDD3YXgMjZYXQ8/BZSpUoVa7c9IqJED9Lvv/++bspDn+ivvvpKP5crVy49m8+7775rdwGInBlyMGrVqqX7zeNkNeoIeERESRakMVAIZvBp3ry5Hn8YtWlMIYkaBJG7Qc5ls2bN9LVo9InGlKtERAnJ7uxuPz8/OX78uJ76kf4fs7vd0+7duyV9+vSSN29eRxeFiFwwRtidOFa+fHk95zKRu8Kwsxi7/cmTJ/r3wABNRKa5Jv3RRx9Jv3799LU49JWOOhRoiRIlErJ8RKaCs+D69evLuXPnpFOnTrq3AxGRaZq7o5vBBzNhYTW4j4iIEHfE5m7Xh4llEKCPHj2qk8TKlCnj6CIRkZN
IssFM0N2EyN2gRwNG1EM/aEy+wgBNREnB7iDNhDFyRz4+PlKuXDkZNmyYZYY1IiJTNHdjrO4GDRqIt7e3/n9sUNtwR2zudk2Y4nTv3r2ctpSIHBIj4hSkcR06ODhYD9QQ3TVpy8p4TZpB2gXcuBsqR87ckqBbD+Tn7wbKnp2b5ML58xIYGOjoohGRk0rUa9IYrCG6/xO5muMXbssfq07IjbsPZctf38mJ/Ruka/9vRSWz7cVARJQU7O4nTeTKNWgE6AtB92Tr4glyYt96afD2Z5Ihd3ldsyYiMm2Q3rBhgxQpUkRX2aNC9b1o0aLyzz//JHT5iJIMAvGtkMfiLWESdP6QNGw/SLIXriKhT57Jjbthji4eEbmhOAdpzHDVuXPnaNvS0c7+4Ycfyrhx4xK6fERJ5vqdUEmeLEJ8fNPK+5/OkOIVG0gyL095FPpEMgb4Orp4ROSG4hykDx48qAdyiEndunVl3759CVUuoiS3Yv4UmTOhh/h4RcrDxyL3Q8Pl/qMnkiGtrxTPx6QxIjJxkL5+/brughWTZMmS6VmxiJzRqFGj5PdfJsjrNRqLf9pUktrPRyIjRHJmTi1v1y8igaxJE5GZBzPJli2bHDlyJMaxijESU5YsWRKybERJApdpPv30Uz1QSdeen+hr07gGjSZu1KAZoInI9GN39+zZU8+Zu2fPHkmRIoXNa2FhYXo2oBo1asiECRPEHbGftHM6duyYFCtWTAYNGiQjRozQff2JiJxqMBOjuRvjFXt5eUmPHj2kYMGC+vkTJ07I5MmT9SAm+/fvl0yZMok7YpB2Xrt27dInmQzQROS0QRouXrwo3bp1k9WrV+tZr/QKPDykXr16OlDnzp1b3BWDtHP5448/5NKlS7oGTUTkErNgYXKNFStWyN27d+XMmTM6UOfPn18CAgLiU2Yih1iwYIG8++678s4771imWCUicolZsABBGTMCETmbZcuWSdu2baVVq1Yybdo0BmgiMjUOC0puY9u2bdKyZUs9U9usWbN0fgURkZkxSJPbKFmypAwcOFD+/PPPWPv8ExGZhV2JYxQzJo6Z186dO8Xf318KFSrk6KIQkZu6H88YwZo0ubS9e/fq3gdffPGFo4tCRGQ3BmlyWRhvHmPKY/a26dOnO7o4RER2Y5Amlx1JrHbt2rrv/sqVKyV16tSOLhIRkd0YpMklYbIX9OFfs2aNvh5NROSMmDiWQJg4Zg7BwcESGBiou1dxoBIiMgsmjpHbu3Llirz22muWoT4ZoInILUccIzKLG3dD9dSSJ89clBEDO4inROgZ24iIXAGDNDl1gF686YxcuHRNpo/tLmGhD2XAiBmSInUGRxeNiChBsLmbnBZq0DdDwuTUvhXyOPS+fD5mpkiKDPp5IiJXwJo0Oa3rd0IlRfJk0rRtV6lWt5mkD8wiQbcfyY27YY4uGhFRgmBNmpzSw4cP5dsvO8uhvVuQIaYDdKRS8jj8mWQM8HV08YiIEgSDNDmdsLAwPZPVqWMHJWumDHIx6L6uQeM+0N9XiucLdHQRiYgSBJu7yamEh4dLs2bNZNeuXbJ69WopULSMvgaNJm7UoBGgA1mTJiIXwSBNTuXjjz+WzZs3y/Lly6Vy5cr6uZrlcji6WEREiYIjjiUQjjiWNM6ePSvnzp2TOnXqOLooRERxxhHHyGVFRETIyJEj9c6dN29eBmgichsM0mRqkZGR0rVrVz0f9Pbt2x1dHCKiJMVr0mRauBLTq1cv+eWXX2TmzJnSoEEDRxeJiChJMUiTaQ0cOFAmTZokU6dOlXfffdfRxSEiSnJs7ibTypYtm4wfP166dOni6KIQETkEa9JkOvv375cyZcropm4iInfGmjSZyrhx46Rs2bKyY8cORxeFiMjhTB2khw4dKh4eHja3QoUKWV5//PixdO/eXdKnTy+pUqWSFi1ayPXr123WcenSJWnUqJH4+flJxowZpX///vLs2TObZTZt2qRrbsmTJ5d8+fLpJCVKej/
++KP07dtXBg8eLBUrVnR0cYiIHM7UQRqKFi0qQUFBltvWrVstr/Xp00eWLVsm8+fP16NQXbt2TZo3b27TvxYB+smTJ7r7zqxZs3QA/vLLLy3LnD9/Xi9To0YNOXDggPTu3Vs++OADPeQkJZ3p06frEy58/iNGjNAnZEREbk+Z2JAhQ1TJkiWjfS0kJER5e3ur+fPnW547fvw4Rk9TO3bs0I9XrFihPD09VXBwsGWZKVOmqDRp0qjw8HD9eMCAAapo0aI2627durWqV6+eXWW9d++efm/ck32ePXumKlWqpLp27aoiIyMdXRwiogQX3xhh+pr06dOnJWvWrJInTx5p3769br6Gffv2ydOnT6V27dqWZdEUniNHDsv1TNwXL15cMmXKZFmmXr16eni2o0ePWpaxXoexDK+JJg20cnh5ecnatWtl8uTJrEETEVkxdZCuUKGCbp5etWqVTJkyRTdNV6lSRR48eCDBwcHi4+Mj/v7+Nn+DgIzXAPfWAdp43XgttmUQyDElYmyzMWEZ6xvZB5cqChcuLJcvX5aUKVOKp6epd0cioiRn6i5Y1iNMlShRQgftnDlzyrx588TX17HTEY4aNUqGDRvm0DI4szVr1kjLli2lcePGkjlzZkcXh4jIlJyq6oJac4ECBeTMmTP6wI6m0pCQEJtlkN1tHPRxHzXb23j8omUwS0lsJwLIQMaED8YNtUGKGyT5NW3aVE+U8eeff4q3t7eji0REZEpOFaQfPnyopyrMkiWL7kuLg/v69estr588eVJfs65UqZJ+jPvDhw/LjRs3LMvg2icCcJEiRSzLWK/DWMZYR0zQXQvrsb5R3L5DdJV7/fXXZcGCBfqSBRERxUCZWL9+/dSmTZvU+fPn1bZt21Tt2rVVhgwZ1I0bN/TryAbOkSOH2rBhg9q7d6/OEMbNOmu4WLFiqm7duurAgQNq1apVKjAwUA0ePNiyzLlz55Sfn5/q37+/zg6fPHmy8vLy0svag9ndcbd9+3b16NEjRxeDiCjJxDdGmDpIoytUlixZlI+Pj8qWLZt+fObMGcvrYWFh6qOPPlIBAQE60DZr1kwFBQXZrOPChQuqQYMGytfXVwd4BP6nT5/aLLNx40ZVqlQp/T558uRRM2bMsLusDNKxw0kSPvuIiAhHF4WIKMnFN0Z44J+YatkUd8juTps2rb4+zaZvW8eOHZPq1atL9uzZ9ehuqVOndnSRiIicIkY41TVpcj7o545+6EjQQ0Y3AzQRUdwxSFOiwTCttWrV0mePSMbDGOtERBR3DNKUaDChSdu2bWXdunXPDRhDREQvxiBNCQ6juGFCk2TJksk333wj2bJlc3SRiIickqlHHCPnc+vWLX0NGgPNIGEMgZqIiOKHR1BKMHfv3tWjiCFQY1QxBmgiopfDoyglWPcCzB6G4VHRzapgwYKOLhIRkdPjNWlKEEFBQXp2MnSzKlasmKOLQ0TkEliTJrsdv3Bb1u++JEG3QiV9Gk+pViablC1SUI4cOaLnhiYiooTBIE12B+gpCw5KyMMn4unxTGaOGybjvTxk5coVUiR3BkcXj4jIpbC5m+yCGjQCdMYAH9n+1zdy/cIhKVC+mWzYw6k6iYgSGmvSZBc0cXt5Kln+6wg5e3SntOgySlJnK6GfJyKihMWaNNklSwY/uXBsp5w8sFmavj9MchcpL0+fRurniYgoYbEmTXapVT6HnLpUTdJkeEX8s+TUNWj/VD5Sp3wuRxeNiMjlMEhTnGBG0z59+kjRokWlW8vmsn53gA7QqEEjQBfMFeDoIhIRuRwGaYpTgB44cKCMHz9epk6dKoVzpdc3IiJKXLwmTS80dOhQGTt2rA7SXbp0cXRxiIjcBoM0xernn3+W4cOHy+jRo+Xjjz92dHGIiNwKm7spVs2bN9fN3axBExElPQZpkw63iYQsZFI76trv7NmzpWrVqpI9e3YGaCIiB2Fzt8mG29x97LoE332k7/EYzyeVG3dDZcOeS9K130h
5++23ZfJ/piXZexMR0fNYkzbZcJuoQXt6eEqkitQ1ajyfFLVpBOjFm87IulWLZcHPQ6V89WaSu2xz/XzGAA5UQkTkCAzSJoGA7O3tqQM04B6PE3u4TQThI2duyc4jQbJt8ypZ9dtXUqV2U+nU+yu5fP2hfq1muRyJWgYiIooeg7RJoAZ9+cYDXYM2atKJPdymUXu+GRImwbcfSVh4hBQqW1vaffilJPPykhTJk8mNu2GJ9v5ERBQ7BmlTDbd511KjRoBO7OE2UUtGgPaJuC2ZAtLJkxJVpEDJqnLzXrikTJlcHoc/k4wBvon2/kREFDsmjpkErjt3a1lSyhfJJJkDUur77i1LJ+pwm6glXz13UD7v0UJO7F0hqXx95PGTCLl+J1QuBt2XQH9fKZ4vMNHen4iIYseatIkk9XCbNy8fk1/G9pH8RUpL7YYt5MkzDzl2/o5OFKtYLLMO0IGsSRMROQyDtJvat2+ffNano+TIW0hadBkptx88083bRXKnk2bV8zM4ExGZAIO0m5o8ebIULVpEZs9dLBeuh+umb1x/Zu2ZiMg8PBTGfKSXdv/+fUmbNq3cu3dP0qRJI2YVEREhXl5e8vTpUwkLCzN1WYmI3D1GMHHMjZw5c0aKFy8ue/bsEW9vbwZoIiKTY3O3CzMGKkFTdmTYLRnQvY2kSpVScuTg4CRERM6AQdpFWQ9U8vjhbZn8VRfx8vSQ5Qv/lkyZMjm6eEREFAds7nZRxkAlOTKnlrlTPhNPj0jp0G+i3A71cXTRiIgojliTNmnzNDKti+XLEK/JLbAejMWNoT5FibTp/Jn4+6cWlTwDh/kkInIirEmbrHl6x5EguXrrob7HYzwfn/VcunpdNvw1Sc5duSmPPAIllX8WDvNJRORkGKRN1jydM0sayZI+pb7HYzxv73ouB92S1b9+JucOrpdH927J7fuP9UhiHOaTiMi5sLnbJNAMjVmnPD089GPcx2cWqktBt2X2hH5y49pF6Tf8Z0kRkEuPxY1mc44kRkTkXBikTQLN0GevhkikUjpA497e5uknT57IhK96yNULp2TwqF+kQJGSej2I+xiLmwGaiMi5MEibBJLEzlwJ0bNPoQaNAP2i5unth6/Ksn/Oyc27jyUwIIU0rpJb6tSuKTXf/EC8/XNL0O1HcVoPERGZE4O0SaA5umn1fDbZ3bGNo40APWneQR2ERUXIqaN75GLwA+nRrqvkeyUgzushIiLzYpA2WaCuWS5uo4GhBo0AncrPS7Yu/F4undghdTtP1c+P6l4lzushIiLzYna3E0I3q0vBDyQiMkK2/DVOLhzdItVaDhK/NOl10zcREbkGBmknY/SDRkLYgTX/kQtHNkr5Jv0ka8GKEhmp9LVpIiJyDWzuNpGBkzbLsfMhlsdFcvvLNz2qWR4fv3Bbfl1+TNeiPVWY3A06IaXq9pCM+V+X+w+fSEpfb2laNZ+DSk9ERAmN80mbZD7pqAE6aqBGgJ6y4KBcu/VInoaHik+KlKIin4lviuT62nSaVD7S5c0SUqF4lgTaIiIicnSMYE3aJKIL0MbzyORGgL736Kmc3v6nXD21Td7oOlHCJZmkS5NcMqZLJ5WKZWGAJiJyMbwm7QRGzdwrIQ+fyuldC+XEjrmSrXBNCY9MJmgEuRXymP2giYhcFIO0kzi3f6mc2Pqb5K/YWvKVby4RzyLEw8NDT0XJ4T6JiFwTg7RJ4NpzTB7evSbHN8+UvK82kwKV2ujnnkWKZEnvJx0bFWOAJiJyUQzSJoHksJgCdaqArFK53VgpVOVdXXuGtCm9pXvL0lIwV0ASl5SIiJIKg3QUkydPlly5ckmKFCmkQoUKsnv37iQN1Kn9/j+X7+qJf+TEttn62nPaTHktARp6MEATEbk8Bmkrc+fOlb59+8qQIUNk//79UrJkSalXr57cuHEjSd6/Sb8l8iD0mf5/0OkdcmDlD/L4AeaTtu0l91rxTMzkJnrJQYE27Lkkc9ac1Pd4TGRG7Cd
tBTXncuXKyaRJk/TjyMhIyZ49u/Ts2VMGDRqUqP2kEaAN18/tkb1Lv5Es+StJ6Qa9xcPTy/Jao9dyStcWpexePxHZjtp3M+S/c7gbM8VhghuMn0+UGOIbI1iTtpqLed++fVK7dm3Lc56envrxjh07nls+PDxcf+jWt4RwN+iU7Fv2jWTK86qUqt/LJkBnTe/HAE30kjBDHAJ0zixpJEv6lPoej/E8kdkwSP/PrVu3JCIiQjJlymTzPB4HBwc/t/yoUaP0WZFxQ407IaQJzCX5K7SS0g37iaeX7VgzxfKyLzTRy8IUrqhBe/4vxwP3eIznicyGQTqeBg8erJstjNvly5cTZL1eyXwkf8VW4pXM2+b5FD6eUjRPugR5DyJ3hjnW0cSNSWoA93iM54nMhsOC/k+GDBnEy8tLrl+/bvM8HmfOnPm55ZMnT65vSSGZp0jlktk4qhhRAiiWL4OcuRIiF4Pu21yT5u+LzIg16f/x8fGRsmXLyvr16y3PIXEMjytVqpTo77/suzejfT531jTSsVFRaVevMActIUoASA5DkhjGu8+WIZW+56h9ZFasSVtB96sOHTrIq6++KuXLl5cffvhBHj16JO+9916SvH9MgZqIEj5Q1yyXw9HFIHohBmkrrVu3lps3b8qXX36pk8VKlSolq1atei6ZjIiIKCmwn3QCedl+0kRE5Lrus580ERGRa2GQJiIiMikGaSIiIpNikCYiIjIpBmkiIiKTYpAmIiIyKQZpIiIik2KQJiIiMikGaSIiIpNikCYiIjIpBmkiIiKT4gQbCcQYAh3jsxIREVkzYoO902UwSCeQBw8e6Pvs2bM7uihERGTiWIGJNuKKs2AlkMjISLl27ZqkTp1aPDw8XupsC4H+8uXLLj+bljttq7ttrzttq7ttrztta0JuL0ItAnTWrFnF0zPuV5pZk04g+NBfeeWVBFsfdgZ3+AG427a62/a607a62/a607Ym1PbaU4M2MHGMiIjIpBikiYiITIpB2mSSJ08uQ4YM0feuzp221d2215221d2215221Qzby8QxIiIik2JNmoiIyKQYpImIiEyKQZqIiMikGKRNZvLkyZIrVy5JkSKFVKhQQXbv3i1mNnToUD14i/WtUKFCltcfP34s3bt3l/Tp00uqVKmkRYsWcv36dZt1XLp0SRo1aiR+fn6SMWNG6d+/vzx79sxmmU2bNkmZMmV08ka+fPlk5syZSbJ9//zzjzRp0kQPQIBtW7x4sc3rSOn48ssvJUuWLOLr6yu1a9eW06dP2yxz584dad++ve5j6e/vL506dZKHDx/aLHPo0CGpUqWK/t4xcMKYMWOeK8v8+fP1Z4tlihcvLitWrEjSbe3YseNz33X9+vWdcltHjRol5cqV04MPYZ9r2rSpnDx50maZpNx3E/t3H5ftrV69+nPfb9euXZ1ue6dMmSIlSpSw9GuuVKmSrFy50nm/VySOkTnMmTNH+fj4qOnTp6ujR4+qzp07K39/f3X9+nVlVkOGDFFFixZVQUFBltvNmzctr3ft2lVlz55drV+/Xu3du1dVrFhRvfbaa5bXnz17pooVK6Zq166t/v33X7VixQqVIUMGNXjwYMsy586dU35+fqpv377q2LFjauLEicrLy0utWrUq0bcP5fnss8/UX3/9hQRLtWjRIpvXR48erdKmTasWL16sDh48qN544w2VO3duFRYWZlmmfv36qmTJkmrnzp1qy5YtKl++fKpt27aW1+/du6cyZcqk2rdvr44cOaL+/PNP5evrq6ZOnWpZZtu2bXqbx4wZoz+Dzz//XHl7e6vDhw8n2bZ26NBBb4v1d33nzh2bZZxlW+vVq6dmzJihy3DgwAHVsGFDlSNHDvXw4cMk33eT4ncfl+2tVq2afm/r7xffl7Nt79KlS9Xy5cvVqVOn1MmTJ9Wnn36q9x9suzN+rwzSJlK+fHnVvXt3y+OIiAiVNWtWNWrUKGXmII2DcnRCQkL0j2P+/PmW544
fP64DwI4dO/Rj/AA8PT1VcHCwZZkpU6aoNGnSqPDwcP14wIAB+kTAWuvWrfWBJylFDVyRkZEqc+bMauzYsTbbnDx5ch18AD9g/N2ePXssy6xcuVJ5eHioq1ev6sc//vijCggIsGwvDBw4UBUsWNDyuFWrVqpRo0Y25alQoYL68MMPk2RbjSD95ptvxvg3zrqtcOPGDV32zZs3J/m+64jffdTtNYJ0r169YvwbZ97egIAANW3aNKf8XtncbRJPnjyRffv26eZS66FG8XjHjh1iZmjeRRNpnjx5dFMnmooA2/P06VObbUITZo4cOSzbhHs0Z2bKlMmyTL169fR4uUePHrUsY70OYxlHfy7nz5+X4OBgm7Jh2D80a1lvH5p9X331VcsyWB7f7a5duyzLVK1aVXx8fGy2D82Rd+/eNdVngCY+NP8VLFhQunXrJrdv37a85szbeu/ePX2fLl26JN13HfW7j7q9htmzZ0uGDBmkWLFiMnjwYAkNDbW85ozbGxERIXPmzJFHjx7pZm9n/F45drdJ3Lp1S+9Q1jsG4PGJEyfErBCQcC0GB+2goCAZNmyYvt545MgRHcBwMMaBO+o24TXAfXTbbLwW2zL40YSFhelrwY5glC+6slmXHUHNWrJkyfTB0XqZ3LlzP7cO47WAgIAYPwNjHUkB15+bN2+uy3r27Fn59NNPpUGDBvqg4+Xl5bTbislxevfuLa+//roOTkZZkmLfxYlJUv/uo9teaNeuneTMmVOfcCNvYODAgfrk6a+//nK67T18+LAOyrj+jOvOixYtkiJFisiBAwec7ntlkKaXgoO0AckaCNr4oc+bN89hwZMSR5s2bSz/R00D33fevHl17bpWrVrirJBEhJPKrVu3ijuIaXu7dOli8/0iGRLfK07I8D07k4IFC+qAjBaDBQsWSIcOHWTz5s3ijNjcbRJoYkJtJGqWIR5nzpxZnAXOUAsUKCBnzpzR5UazT0hISIzbhPvottl4LbZlkLnpyBMBo3yxfWe4v3Hjhs3ryBJFFnRCfAaO3DdweQP7Lb5rZ93WHj16yN9//y0bN260mcUuqfbdpP7dx7S90cEJN1h/v86yvT4+PjrjumzZsjqzvWTJkjJ+/Hin/F4ZpE0COxV2qPXr19s0S+Exmm2cBbrb4MwbZ+HYHm9vb5ttQvMZrlkb24R7NE1ZH9zXrl2rd3Y0TxnLWK/DWMbRnwuabfGDsy4bmrtw/dV6+3BAwPUpw4YNG/R3axwEsQy6P+FamfX2oTaA5l+zfgZXrlzR16TxXTvbtiI3DgELzaAoY9Qm+KTad5Pqd/+i7Y0OaqJg/f06y/ZGhfcIDw93zu81HolylEiQso/M4JkzZ+pM2S5duuiUfessQ7Pp16+f2rRpkzp//rzuOoNuC+iugOxRo7sDunps2LBBd3eoVKmSvkXt7lC3bl3dNQRdGAIDA6Pt7tC/f3+diTl58uQk64L14MED3Q0DN/xcvv/+e/3/ixcvWrpg4TtasmSJOnTokM5+jq4LVunSpdWuXbvU1q1bVf78+W26JSHjFN2S3nnnHd1NBPsBtjdqt6RkyZKpb7/9Vn8GyKpP6G5JsW0rXvvkk090Biy+63Xr1qkyZcrobXn8+LHTbWu3bt101znsu9ZdjkJDQy3LJNW+mxS/+xdt75kzZ9Tw4cP1duL7xf6cJ08eVbVqVafb3kGDBumsdWwHfpN4jB4Ga9asccrvlUHaZNDfDjsQ+tchhR/9Tc0M3Q6yZMmiy5stWzb9GD94A4LVRx99pLtAYKdu1qyZPjhYu3DhgmrQoIHuL4sAj8D/9OlTm2U2btyoSpUqpd8HBw/0+UwKeF8ErKg3dEcyumF98cUXOvDgB1mrVi3dN9Pa7du3daBKlSqV7sbx3nvv6aBnDX2sK1eurNeBzxHBP6p58+apAgUK6M8A3T/QFzSpthUHcxy0cLBCwMyZM6fu9xn1gOMs2xrdduJmvV8l5b6b2L/7F23vpUuXdEB
Oly6d/l7Qvx0ByLqftLNs7/vvv6/3T6wb+yt+k0aAdsbvlbNgERERmRSvSRMREZkUgzQREZFJMUgTERGZFIM0ERGRSTFIExERmRSDNBERkUkxSBMREZkUgzQREZFJMUgTvcDQoUOlVKlSji4GEbkhBmkyvY4dO4qHh4e+YXB8TA4wYMAAPVfsi2De1549e+oZm5InTy7Zs2eXJk2aPDc4vhlPAP79919p3bq1nuAAZccUoI0bN5Zly5bpCROcwYULF/T3ZkzWQObEE1Hz4nzS5BTq168vM2bM0LMnYZYlzA+Lg/8333wTa4DAxPaYPnPs2LF6jlz8/erVq/Wcugk5yby9EGQxKXxMlixZIq1atZLatWvLrFmz9LR7mMVn+/bt8vnnn0uVKlWem7g+KWG6P8z0k5Tw3eEkzZW3keg5do/2TZTEMMEDZpey1rx5cz3bUmwwQD4mcHj48OFzr929e9fyf8zy9MYbb6iUKVOq1KlTq7feestm4gjMwlSyZEn166+/6oH7MXEEJhK5f/++ZRnMBNWzZ089oD8mKHj99dfV7t27n5u8YsWKFXr2KExSgQH5o5vwAOVNnz69Hvg/JpjYw4DZoTD7FMqfMWNG9fbbb6ubN29aXq9WrZouGyZMwKQCmAwE2xT18+jUqZOeTACfQY0aNfQMQFE/g59//lnlypVLzyoEK1eu1NuKGZYwOUOjRo1sJliJun0oC0RERKhhw4bp7weTD2DdWJcBMxhhecwkhIkf8Jkm9qQq+G4xExRm6MJnYEyismDBAlWkSBFdTiyD2bmsJ1DABCCGRYsW6XJPmTLF8hwmePjss89ifN/Lly+rNm3aWCZ8KFu2rM1EDD/++KOewAH7DCYdwX4Y9XPCbGXW3yWewz5nve9h5jKsG5NGYNanEydO6Ndj2g/JHBikyemCNIJS5syZVYUKFWL8G8zGhEAycuTIWNeNYIGZbDArE6atw8ERBzIjmBgBCrM64cQA7/3PP//o9//0008ty3z88ccqa9asOggfPXpUlxkHXZTD+kBZokQJPSMPAtmVK1f07Do4yFtPHfjXX3/pZTEt5IvggGxMo4cp8/bv36/q1Kmjg6wB24ITi6FDh6pTp06pWbNm2UzdB5hitEmTJmrPnj16GZQLJwpG+fEZ4CQAJwN4D8xkZQSwhQsXqtOnT+tAgXUUL15cf66AExUjQGD7jPVhGkyU6c8//9TBYsCAAToI4b2tgw9OCLB+TA147do1lZiMEzAEYXw/uGGf8PT01MEbs5sheCHIGUEMUyHiszSmZu3du7c+0cFJHDx58kQH3rVr10b7npghDAG4SpUqasuWLfpznDt3rtq+fbt+HfsCPhdMhYj3/+677/SUiJhm0d4gjd8LpqrE/on3e+211/Tr2Oei2w/JHBikyfQQ8HBgQpBAjQoHHBw4ESBigvmMsRwOcrFBoMK6MVWfAQcx/K1RE0aAwoHWuuaMWqlxkoCaLw6ks2fPtryOgzOC9pgxY2wOlIsXL7Z5f6OGag1TN2LZO3fuWJ5DWbD9xm3ZsmX6+a+++kpPIRm1Zoa/N6bMRJDGSYi1cuXKqYEDB+r/IzggOFnPCw158+a1zPNszOlsBKOYoAaP9zbmfo4uiAA+mxEjRjxXJkwhaP13P/zwg0oqCNJNmza1ea5du3b6pMcavnvUrI0WDZzMzJ8/Xz/GCd+oUaP0SRxgTm18bo8ePYr2PfH5otZunLxEhUCKKUGtoaWnYcOG8apJGzD1J54z5j2Pbj8kc2DiGDmFGjVq6OSjXbt26evR7733nrRo0SLG5eOaWHX8+HGdTIaboUiRIvp6L14z5MqVS1KnTm15jGSuGzdu6P+fPXtWXy/F9W8Drp2WL1/eZh3w6quvSnyUKFFCbz9ujx49kmfPnunnDx48KBs3bpRUqVJZboUKFbKUy/rvrVmXH+t4+PChpE+f3mY958+ft1kHEtcCAwNt1nP69Glp27atTsxLkyaN/pzg0qV
LMW7L/fv35dq1azafF+CxvZ9X165dbcr8olvRokVjXV/U90N5oisnths5BciLqFq1qmzatElCQkLk2LFj8tFHH+n8AeQ8bN68WcqVKyd+fn7Rvh++z9KlS0u6dOmifT2m94/6OcWF9T6A7x+MfYDMi4lj5BRSpkypk6dg+vTpUrJkSfnll1+kU6dO0S6fP39+fQBNqOSwqAlLWHdkZGS8tuNFUHY4efKkVKxYUf8f2d3G9ltDcEW2enQJdMaB+EXlxzqwLAJNVNbJadGVHe+N4P3zzz9L1qxZ9TqLFSumk64Swos+r+HDh8snn3wS5/W9KPEsLt9PVNWrV5effvpJtmzZogMuTlaMwI0gXa1atRj/1tfXV16Gp6fncyelOGF80bbj+4f47MOUtFiTJqeDA9Onn36qs5zDwsKiXQY1k3r16snkyZN1zTMq1HqgcOHCcvnyZX0zoDaE11Gjjou8efPqLOBt27bZHCj37NnzwnXg76JmedetW1eXP7bMdUOZMmXk6NGjugaLIG59i2vAwTrQVS1ZsmTPrSNDhgwx/t3t27f1iQS+h1q1aunP8u7du89tH1hvI4IYArr15wV4HNfP3JAxY8bnyhzbDScU9sA2RVfOAgUKiJeXl36MIIx9Zv78+TpgA+7XrVunlzWei62F5M6dO3a9v/E5GS0bQUFBltfj090tuv2QzIFBmpzSW2+9pQ+SCMIxwWs48KDZeeHChbqJEs2EEyZMkEqVKull0MUJXbPat28v+/fvl927d8u7776rD7xxbZpGMOzWrZv0799fVq1apQ/YnTt3ltDQ0Bhr+gYEVzQr48B669Yt3UyKZtlp06bJ8uXLpVGjRrrL2Llz5+TQoUMyZswY/XdGgEBXMhzg0eSMkwI0T2N5XA6I60EXnwE+j6ZNm8qaNWt01zV09frss89k7969Mf5dQECAbiJHLfLMmTOyYcMG6du373NBFLVFfC7Xr1+Xe/fu6efxWeEkZO7cuTrQDxo0SH8GvXr1EjPp16+f7lP/1VdfyalTp3R3uEmTJtnU3hFo8Vn88ccfNkF68eLF+vuM2lxtDd9b5syZ9WeP4IvvGfvqjh07LJ/TzJkzZcqUKXr//f777+Wvv/6yvD8+W7S2jB49Wu/bqLnjpMle0e2HZBKOvihOFJ8uWIAEHWQ2R9fFyoCM4O7du+ukIHShQZcfdLcykmrs6YJlbdy4cXqdBiTgoJsTMntj64Jl3fULkKzVokUL5e/v/1zXF2Rat2zZUnerSpYsmU5Qqlevnu6WZN0FCxnR6K6FdSDzuFChQjrL2FgGiWO9evWyeV98nkYXI0BSHMqPhC4kOmXPnl21b9/eklAXU2IRspYLFy6stxmZ68gexnagK5IB3bawPiT7WXfBQrY5vg+8X0xdsKImnCUmfJ/4XqMyumChnDly5FBjx459bhl8nviOkK1tbB+y+ytWrPjC971w4YLeB5C8hwTFV199VSc+xqULFhw7dkx3qcJ3j8Q1JENGlzhmve/hc8Vz+JxftB+SY+nOjo4+USAiIqLnsbmbiIjIpBikiYiITIpBmoiIyKQYpImIiEyKQZqIiMikGKSJiIhMikGaiIjIpBikiYiITIpBmoiIyKQYpImIiEyKQZqIiMikGKSJiIjEnP4PCGHl+phN7XAAAAAASUVORK5CYII=", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Off-diagonal points (mismatch): 42\n" + ] + } + ], + "source": [ + "merged = (\n", + " r_results[[\"cohortId\", \"row_count\"]]\n", + " .rename(columns={\"row_count\": \"r_rows\"})\n", + " .merge(py_results[[\"cohortId\", \"row_count\"]].rename(columns={\"row_count\": \"py_rows\"}), on=\"cohortId\")\n", + ")\n", + "\n", + "fig, ax = plt.subplots(figsize=(5, 5))\n", + "ax.scatter(merged[\"r_rows\"], merged[\"py_rows\"], alpha=0.5, s=15, color=\"#4C72B0\")\n", + "lim = max(merged[[\"r_rows\", \"py_rows\"]].max()) * 1.05\n", + "ax.plot([0, lim], [0, lim], \"k--\", linewidth=1, label=\"y = x (perfect agreement)\")\n", + "ax.set_xlabel(\"R CohortGenerator — row count\", fontsize=10)\n", + "ax.set_ylabel(\"CircePy (Ibis) — row count\", fontsize=10)\n", + "ax.set_title(\"Cohort table row counts: R vs Python\", fontsize=11)\n", + "ax.legend(fontsize=9)\n", + "plt.tight_layout()\n", + "plt.savefig(OUTPUT_DIR / \"figure2_row_count_agreement.pdf\", bbox_inches=\"tight\")\n", + "plt.show()\n", + "\n", + "off_diagonal = merged[merged[\"r_rows\"] != merged[\"py_rows\"]]\n", + "print(f\"Off-diagonal points (mismatch): {len(off_diagonal)}\")" + ] + }, + { + "cell_type": "markdown", + "id": "b118ea5561624da68c537baed56e602f", + "metadata": {}, + "source": [ + "## Figure 3 — Incremental (checksum) speedup" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "938c804e27f84196a10c8828c723f798", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAkwAAAGGCAYAAACJ/96MAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8ekN5oAAAACXBIWXMAAA9hAAAPYQGoP6dpAABRT0lEQVR4nO3dCdxM5f//8Y99C1myZomKyJZKZGsjyRJt9kq00GJLSkJFKaGFoqJEtIhosS/ZJRJKkaXFmi1ky/wf7+v7P/ObmXuZe5gbY17Px2O4Z+bMOdc5c+bMZ67rc11XGp/P5zMAAAAkKW3STwEAAEAImAAAAMIgYAIAAAiDgAkAACAMAiYAAIAwCJgAAADCIGACAAAIg4AJAAAgDAImAACAMAiYkEDv3r3tvPPOO9PFOGvNmTPH+vXrd1Kv3bRpk6VJk8Y+/fRTi0eDBw+2r776KsHjxYsXt44dO9rZ7HS/d7H8Obznnnvs8ssvt1g4H/WenopRo0a5dezateuMvZe1a9e2W2+9NdXWj/9J////BxBBwPTKK6/YU089daaLEnP0BaUL+y233BL0+Oeff265cuU6Y+UCUtP9999v9evXT7X1Dx061NKlS5dq68f/EDDhjPr3338tS5YsZ7oYOEmaivLo0aOWKVOmU1pPpUqVolYm4Gxz4YUXultqKVOmTKqtG/+HJjmkuCniww8/dM0mqgkoWLCgde3a1Y4fPx607E8//WRNmjSx3LlzW9asWa1ChQr20Ucf+Z/Xel588UXr3r27FShQwPLly+f/4lWtzaWXXuq+fEuUKGGDBg1KtFp7xYoVVrVqVRdoXXHFFe7+4cOH7aGHHnJl04VJNRmhFi1aZNdff71ly5bNcubMac2bN7cdO3ZEtJ8qQ58+fezgwYNuWd1UHS4///yz3X333VakSBG377qIDRw40E6cOBHxMd+3b5+1bNnSsmfP7o6RarO0rtDmg71799rDDz/syqnjVrlyZZs2bVqi1fVqSipVqpQ7hjoOGzZsCFruyJEjbjvFihVz67rsssts7NixiTa1qFlN762Wmzx5sjseOmZav/ZdTWwPPvig2w+PHtu8ebO9+eab/mOn5oykmuQmTJhgFStWtMyZM1uhQoWsc+fO7n0OrOnTOqZPn+7eSx0rlX3AgAF2snSO1KlTx3LkyOHWV6VKFbf+QCpDSj4HjRo1cueZzjfVLoQeb50Xr776qjvOOo76PNxxxx1BxyxU37593fH1mjXXrFnjauvy5MnjHtfxD9z/xJpqVq5c6Y6bjp9H91966SV7+umn3fl2/vnn2xNPPOE+lzNnznTvg86bG264wX7//fcUH8+vv/7anS96D3VuLl68OOj5Dz74wKpXr+6uFzqeKu/SpUuDlvnjjz/szjvvtPz587v1XHTRRdapU6eIj/f+/futdevW7n294IIL3P6Fvm+J0WesXbt2VrhwYbd9fb71OU/OyJEjLWPGjPbuu+8m2iTnnbt6H3W9VJl1LoU29XuvW7ZsmV199dVu+zpfpkyZErRc6Pvsve7HH390x1fnht6HqVOnBr1OP3YeffRRd/z1nj/wwAPuM6+y6XqIED4gxLPPPuvLli2b//7GjRt9OlWKFi3qe+SRR3zTpk3z9e7d2z02bNgw/3K//PKLL2fOnL7LL7/cN3r0aN/06dN9gwYN8r344ov+ZfSaAgUK+Bo3buybMmWKb+LEie5xrTdLliy+559/3r2uT58+vgwZMgStX+XKmDGjr1y5cr733nvP9+WXX7ptFS9e3NeiRQtfp06dXNk6dOjgtrNgwQL/axcuXOheq+1OnjzZN27cON/FF1/su+aaayLaz99//93Xtm1bV9ZFixa525o1a9xzM2bM8PXq1cv3xRdf+GbPnu32PUeOHG4dodv45JNPkn0PbrvtNncshw4d6vZT5S5SpIh7refIkSO+K6+80j3+7rvv+r755htfy5YtfenTp/etWrXKv1ytWrV8hQsX9l1
99dW+CRMmuG3rNYH7Lg0bNvTlzp3bN2TIELfvjz/+uC9NmjS+r776yr9MmzZtfLly5fKVLFnSN3LkSN/MmTN9v/76q2/Hjh2+Bx980K17zpw57v0vXbq0r3bt2v7Xfv/99+69v/322/3HTq+TYsWKuffNM2nSJLftZs2a+b7++mt3LHXMmzZt6l9Gx1jHo0SJEu7c0Hnjvfd6TehyKm9y5s+f786RmjVr+saPH++bOnWq74UXXvC98847KT4/ZMOGDb7zzz/fV716dXe8dY5fddVVbh8PHz7sX+7hhx/2pUuXzte1a1e3rU8//dR37733+v74449EP4da7rzzznP747nooovc+/j555/7Zs2a5Rs+fLg7BwPf+/r16wft54oVK1yZA9ej+xdeeKE7f3Qe6fOnxzp37uw+b/q8aF+0zE033eQLR+eJziV9NkeNGuXez6pVq7rPw/bt2/3LaTtvv/22++zoPGvVqpUvU6ZMvnXr1vmXue6663ylSpVyZVCZ33//fXf8Iz3eOnd0/N544w33mdJx0eci3Neg3hOdt9oPndsfffSRr3Xr1v7ndV5pHTt37nT3X3vtNXceaTlP6HvpnZPavt5XHXP9H3ouedc8vc9euW+99dZEP+OB73PgtXLEiBFu/TqOKsOuXbv8y+maqeV0jdYy999/v/86o/MdwQiYkOKA6Y477ghaTh/SG264wX+/efPmvgsuuMC3b9++JNet9ZQpU8Z34sQJ/2Pr1693X466cAbq3r27u1D9999//nLp9YFf4Ap+9Nhdd93lf+z48eO+fPnyuS98j74Eq1WrFrRdBTrari5Ckexn6PFJjLZz7Ngx94VbsGDBiAImlUvLfPDBB/7HdAwuueSSoIu7gkZdOL2AzVOlSpWgfVD5VV4vOAm8yCsAFH3Z6r6+uAPpuOrLJ/CLUMstXrw42f3XvisA0bKBX36hgVFSj1eqVMl9wQbS+aH1eV8U3pdOt27dgo67vqQV1Hr0JafARF+0ydH5oXNT509iUnp+6MtUQdy///7rf0zHXl/Wb775pruvY6Jzr1+/fkmWxzvPtE8KRhWoBh53fUGrPArQkxJJwKSAOlDlypVdGdeuXet/7PXXX3fL7tmzx5cc7zxRQO3Zu3evL3v27L4nn3wy0dfoHNd5o+CoR48e/sd1DBSEJCUlx9v7rOuHhUfvswKRcAFT2bJlXeCYlMCASe+nAj4FiIGSCpgUIAbSfQVRode8xMp99913Jxsw6XXetS3w/NWPGfn77799mTNn9vXt2zeoDDqXCZgSR5McUkxNFYHU5KTqco+q7m+//XbXnJGcevXqBTUtzZgxw/3ftGlTV0Xu3W688Ubbtm1bUBNA2rRpXbOAR014omU9Sn4sWbKk/3WHDh2yBQsWuOaO//77z79+vVbV66rujmQ/k6KmmmeffdYuvvhi18SSIUMG18SxdetWO3DggKWUV56GDRsG7XeDBg2CllPTW7ly5dx+BB63m266KcE+qUlFzRCB+yTefmldqpZXU13outTkqePmUfOPmqpCjR492uUiqSlA+66mAPnll18sEjpWajbSuRTorrvucv/Pnz8/yfdL55WaLALfr1q1arl9UXNMUnSOqLmoTZs2YZNnw50fOpZ679KnT+8/jmpu0rHx3pdZs2a55q62bdsmuy0to3KreXL27NlBx13vg5oge/ToYe+//36KztHk6L0OpPNKTaE6noGPibetwM+Tbv+Lvf5HzWM6nwLv63O6ZMmSoKa02267zTW36bjrvFm3bl3QOaNmdzXXDxs2zNavX5+g3Ck53vpfZdO2PNpe48aNwx4XbV9NxyrD6tWrk1xOn/UXXnjBNZcFfnaTE1ge0Tn/559/JngvEyt34HFMjK4ZgddFNXsrjcFbt5rrdM0KLauaNpE4AiakmNq4A6mNPjCn5O+//3YX2HB0cQyk7ri6mOXNm9ddML2bdwEPDJj0gdd2A8sQrmx79uxxF3blPQSuX7c
tW7YkyMkIt59JUV7Wyy+/7PIdlJugi3TPnj3dcyl5vUcBlsqmL5hAXr5X4HFTMBO6T88//3yK9imwXFrX7t27E6xLvXv0BaQyJfX+eb3c9MWuPIuPP/7YBR96LNJ993JGdD6EbkfHQ4Goyhlu3yLdps4R5RSl5PwNtz0dS+XQhR7Lb7/91v++6LOiL/jQ9zSUcky++OILF3wqOA6k4FDBggKaDh06uOD/yiuvtHnz5kW078ntV7jzRj9eAvdx7ty5/mUDA3SP3lPvXPrnn39c8Km8NuVy6fjoM6PcuMDjOX78eLcdBSSXXHKJlS5d2gWQkRxv7zMV2hMzsXM51Ouvv26tWrVyOYR6D4oWLeqCt1DKEdTz3g+FlAh9/73yBH7ekip34DKJCb1Whp6r3utD36dw52Q8o5ccoka/eP/666+wy4UmLqtmQ4+p5iD0Ay5KZD0Vuuhr/UpoTuwXpQK1aPjkk09c0qQCJ8+XX34Z8XqU/Hns2DGX/BsYNAUmqHvHrXz58v7E0lOhdenCmdgYSaEX0cTGrdG+qxbr7bff9j8W+OV5Mu9X6P7qeCgxXWWNNm1Tv8hTcv6Go/Ip6VjJ+KGUcOx9VhSIah+T+4JSgKhz6Oabb3adGgKPr1fjo2Ov82XhwoXuHFdNpGopVNOnJGEFXaHBYbSoPAp8Evus7ty5M8Hy27dvd+e3l2Cv2g7VyChICnyfA3uUafn33nvP3nnnHVu+fLn7QaDaRtVEqXNISo6395nSvgcGHypPOPoMKiDTTbUyQ4YMcdtSEnWNGjX8yymwVQK3asonTpzoAp1wQs9xrzzeMZKkyh24zMnwXq/3KfCHQmiZ8H+oYULUqPpXv7ICL6Ap4TWx6Ve3fiGH3ryL3slSDxT1qlP1f2LrV1V1JBTU6Ys7sSESAgM+1WqNGzcu4vKqTDJp0iT/Y6r9UG+00OP922+/uYtdYvsVCa1LF06VP7F1JRbIJrfvMmbMmATLpaT2R1/0Cr5CB4hUzZVE8gs+0nNEvbYCmx9Pho6lmm7UJBR6HL2AQk1VCgrVmyoc7a++jFW2xx9/PNFl9OWspscnn3zS9QbzAj8FHgosApvKQntRngrtT1KfVQU+anoMvK/md69ZUeeMBJ43CvqS6p2lgPaqq65yAZOCTa95LiXHW68Tr9ZT9D4rsImEapC83ru6noQeC+2fmsqaNWuWovMosDyic16f59AhCBIrd2LN4pHwei8GXmck0mMST6hhQtQof0e/FnWBV5dd/YJZu3atyw/R/aToV7KaFFTt3a1bN3ch0K8q5TEobyMaH2A1lelLSr9M1SVYv9b061Zdxu+9917/0AApoSYQXbD1S7NatWouZ0sXSzUhjhgxwuW0qNZKg8klFliFU7ZsWZezoO6+OnbKUxk+fLj7ggms3VETmH7hq+zq2q7jqOYsNdOpVqF///4p3qbKrpoJ1WTovVLNlYYKULd1fTHp13241+s9fO6551zgoZoq5bQlduz0JarjrvdAXcRV2xJK3aJVG6ihFXTTl75qT/TrPbRpKhzVdCkoVy1FcnlMGu5C54i+gFWDoPJ9//337r287777Urw9DTuhL+i6deta+/btXfOJcvFUDtVI6MtU75WGXVCTrZoYVT6916pN0r6rC3sglUvNUDom6iKu7uerVq2yLl26uHNaOXsKSPSe6weA7ns5MaqBfOSRR9xrFZCcrpHKVfOjHC0dD9Xg6fgqcPOCvmuuucYFxzpvFOipVkzXkMB91z7pOOraoM+Yzms1kWl9yi1K6fHWZ1KfKW1bAbuOkT6fobVvibn22mvdaxVgKH9IgauCvMDaJY/OTQWker+UD6dlFeglRZ8FXfP0+dFnQnmAGnYj8DXaloJElVufF5VbTY2nel3U5061lsq7UuCkHymqrfTyx5Ird9xKIhkccSypXnKhPbsee+wx17spkHqjqHu6ug9nzZrVV7FiRdcd2KP1vPzyywm2qZ5A6oGjYQLUzVVdktV
L6tVXX02yXMmVLbHeQcuWLfPdcsstrru+uqir15l6H3k9xVK6n+rJoy7h+fPndz1vtC3Ztm2b6/6vnkB6Tr381KU3sMtxSocVUC8kDZWg/c2TJ4/rpdOzZ0/XfTqQeiSqa7C6umsYBvXI0z5qyIZIe0ppmAJ189Zx0XugHo/qihzYW0+9n9RrKJR67nTp0sW9RvuvoQPUoyt0X1evXu2rUaOGWyawq39ivefUzb58+fKuLOotqV6PgT2hvJ5Gel8DNWrUyP+eRDKsgGgoCu2zzl2VUV321eU90s+Bhti488473XunXlPquafeXNp/j3pCDRgwwB1vvXfaR/VK9HqZJna+a/gA9YxUzyZ1z9cwAOohpm2oZ6i6zmvbgbQNdRXXutTDT/uTWC+50M9lYu91Usc8lPdanYeXXXaZew/V8zFwqA/R8A9aTr219F6rB2zg+aphAdTVXT3n9JnVdaFOnTq+pUuXRny8E/tMaZ/DfQ2qF6a656vXna5r1157bVBv0tBhBURDZmh5lV3XtqR6yen46Hqp803XjOeeey5o297r9FlSr0UdRx2L0F54ifWSS6wnr659ei7wM9+xY0d3XdG+6X3T8AUqm3o1Ilga/XOmgzYA4dWsWdP9wlWtG4DYpYErr7vuOpfknlzzuWob1Tsvkl62p0q1econ3bhx42nbZqygSQ44C3322WeuB5+q+NVUo9F31esnNOcBAE6Wmi015IpGYVeepFIqlHuoXotIiIAJOAspt0P5DL/++qvLs1BXak3ZkpJxYwAgpdcZBUmaFkc5ksqRUrCUVOeCeEeTHAAAQBikwQMAAIRBwAQAABAGARMAAEAYJH2ngHoPaORcjWKb2LQQ5zr1onjttdfchKgaEE69KG699dagofQ14JwGYdNAcxrMUQNFeoPniUakfuaZZ9x0CEpi1kB9WiZwWggNKKmpBzTitAam04CMGpQuJVMAKBVPg/RppN3Q8gEAkNz3h2ao0CjryQ3YSdJ3CmhEaE1sCQAAzk0aQT10WppA1DClgDc/kg6mpsGIZ5qIMrAGR9NmaAwPzU6vaS+8GjnNKt6rVy83PYCmyFDtj2Yl946faqI05YfGFdIAbonR9BrNmzd3NU7JTWSpKSI0PYQGg9OUE4Hl06SVmnpAtV+a6kO/IDSdhKbbAABg//79rlIk3LylBEwp4DXD6cs+3gMm0VxW3nHwAhnNtxV4bDQ3kWYW1xxW6dOnd8fwggsucLOvi/5X1afmPWvUqFGCbWh+LQVTat5LbK4xjwZ11PxRml9JQVpo+TT/mMYy+uabb1wZFeBpvBHeRwBAoHApNyR945RoQMWiRYtajx49XG2O8pM0CJqaMbdu3eqfZFOzwXfv3t0FOKrp0WSxmnXbW8ajZbSsgiSNdB06k3aoTp06uaAqsaBLtA5vFnNNuKmJVTXJLAAAkSBgwilRDZNmUdcM15qdXLU7muusXr16/uQ51SxpFuzJkye7kWXVrLd3714323hogp2az1TrpBm/NW+aZpdPKs3uiy++cE1tgwcPTrJ8mo173LhxbibuJ554ws3WDgBApGiSwylTDpN60CkvSTVMCpCqVKkSNKlknTp1bMOGDbZr1y7XRKdecAUKFLASJUoErUvNZropF0k5UWpXVn5U1apVE2xXwZLWqXUFatq0qdWoUcPlNClwU+6U8qGmT5/ueud16NDBTWgJAEBKUcOEqFHNkYIl5Qx99913iTaTKRhSgKNgR8MRNGzYMMn1KXlcjhw5kujzTz75pEv4VrDm3WTQoEE2cuRI/3Iqk5LPNRebaqOGDx8ehb0FAMQTapgQ1oEDB1yytGfjxo0uOFETnPKX1NymoER/axylxx57zE0Sq1oljwIY1RhpOY3FpGWUf1SqVCn3/JIlS2zZsmVWvXp1y5Url6s50rhNGsvJq136888/XQ3RBx98YFdffbWrodItlMqhSSRFPfVUA1a2bFkXeGmiSa83HwAAKUXAhLBUWxT
Y9b9z587uf9XajBo1yiVu67Ht27e7QSaVd6RgJ9C6detcYrh6vyn5+umnn3YBk0e5T8qF0gCYSgrXem6++Wbr2bOnv2fdsWPH3HqUOJ5SGTNmdNvdtGmTZcmSxTXVKacJAIBIMHBlCsdoUHOTcnTojg4AQPx9x5/RHKb+/fvbVVdd5QaL0hQZasZRDUKgw4cPuyRddTNXDysl9KomI7TreP369V0thdajnlbHjx8PWkYJwOqVpdqKiy++2NWMAAAApMQZDZjmzp3rgiH1glIPJjW5KO9FTTIeNduoO7ryZLS85nRr0qSJ/3mN5aNgSb2z1GX8/fffd8GQclcCc260jJqVlHvz+OOP2/33329Tp0497fsMAABiz1nVJKcpMFRDpMCoZs2arnpMScJjx451U2vIzz//7JJ2lTisARG//vprNw2GAqn8+fO7Zd566y03AKLWpxwW/f3ll1/a6tWrgyZ61VhAGgE6HJrkAAA4N8VEk1woFVbU+0o0tYZqnTQ6c+jI0gqYRP+XK1fOHyxJ3bp13QFYs2aNf5nAdXjLeOsIpd5Uen3gDQAAxK+zJmDSmDtqKrv22mvt8ssvd49t27bN1RCFDkyo4EjPecsEBkve895zyS2jQEjziiWWW6Vo07tp8EQAABC/zpphBZTLpCaz+fPnn+miuG7oXtf5wJmMU0uDLsnPl4b4MXlg4nPiAQDOrLMiYOrYsaMbUHDevHl24YUX+h/XoIRK5lauUWAtk3rJeQMW6v+lS5cGrc/rRRe4TGjPOt1XW6XG5gmlnnTe2D8AAABntElO+eYKlj7//HM3VYY3OrNHIzRrcteZM2f6H9OwAxpGwBv9Wf9rdGlNs+FRjzsFQ2XKlPEvE7gOb5nE5icDAAA4q2qY1AynHnCTJk1yYzF5OUfKG1LNj/5v27atax5TIriCoEceecQFOuohJxqGQIFRq1atbMCAAW4dGh1a6/ZqiR588EF744033Gz19913nwvOPv74Y9dzDgAA4KyuYRo2bJjrGVe7dm03FYZ3Gz9+vH8ZTaSqYQM0YKWGGlDzmqbQ8KRLl8415+l/BVItW7Z0U3P07dvXv4xqrhQcqVapQoUKNnDgQHvnnXdcTzkAAICYGofpbJXa4zCR9A0PSd8AcHrF5DhMAAAAZyMCJgAAgDAImAAAAMIgYAIAAAiDgAkAACAMAiYAAIAwCJgAAADCIGACAAAIg4AJAAAgDAImAACAMAiYAAAAwiBgAgAACIOACQAAIAwCJgAAgDAImAAAAMIgYAIAAAiDgAkAACAMAiYAAIAwCJgAAADCIGACAAAIg4AJAAAgDAImAACAMAiYAAAAwiBgAgAACIOACQAAIAwCJgAAgLM5YJo3b541aNDAChUqZGnSpLGJEycGPa/HEru9/PLL/mWKFy+e4PkXX3wxaD2rVq2yGjVqWObMma1IkSI2YMCA07aPAAAg9p3RgOngwYNWoUIFe/PNNxN9fuvWrUG39957zwVETZs2DVqub9++Qcs98sgj/uf2799vderUsWLFitny5ctdsNW7d28bPnx4qu8fAAA4N6Q/kxuvV6+euyWlQIECQfcnTZpk1113nZUoUSLo8ezZsydY1jNmzBg7evSoC7YyZsxoZcuWtZUrV9qrr75q7du3j9KeAACAc1nM5DBt377dvvzyS2vbtm2C59QElydPHqtUqZKrQTp+/Lj/uUWLFlnNmjVdsOSpW7eurVu3zvbs2ZPoto4cOeJqpgJvAAAgfp3RGqZIvP/++64mqUmTJkGPP/roo3bFFVdY7ty5beHChdajRw/XLKcaJNm2bZtddNFFQa/Jnz+//7lcuXIl2Fb//v2tT58+qbo/AAAgdsRMwKQmtRYtWrjE7UCdO3f2/12+fHlXk/TAAw+4oCdTpkwntS0FXYHrVQ2TksUBAEB8iomA6dtvv3VNaOPHjw+7bJUqVVyT3KZNm6xUqVIut0nNeYG8+0nlPSnQOtlgCwAAnHtiIof
p3XfftcqVK7sedeEooTtt2rSWL18+d79q1apu+IJjx475l5k+fboLphJrjgMAADirAqYDBw64AEc32bhxo/t7y5YtQc1hn3zyid1///0JXq+E7sGDB9sPP/xgv/32m+sR16lTJ2vZsqU/GGrevLlrplOy+Jo1a1wt1ZAhQ4Ka3AAAAM7aJrnvvvvODRPg8YKYNm3a2KhRo9zf48aNM5/PZ82aNUvwejWb6XmNq6SebUruVsAUGAzlzJnTpk2bZh06dHC1VHnz5rVevXoxpAAAAEixND5FI0iWarkUeO3bt89y5MgR9fU36DIp6utEbJo8sNGZLgIAxJX9KfyOj4kcJgAAgDOJgAkAACAMAiYAAIAwCJgAAADCIGACAAAIg4AJAAAgDAImAACAMAiYAAAAwiBgAgAACIOACQAAIAwCJgAAgDAImAAAAMIgYAIAAAiDgAkAACCM9BahjRs32rfffmubN2+2Q4cO2QUXXGCVKlWyqlWrWubMmSNdHQAAwLkTMI0ZM8aGDBli3333neXPn98KFSpkWbJksd27d9uGDRtcsNSiRQvr3r27FStWLHVLDQAAcLYFTKpBypgxo91zzz322WefWZEiRYKeP3LkiC1atMjGjRtnV155pQ0dOtTuuOOO1CozAADA2Rcwvfjii1a3bt0kn8+UKZPVrl3b3V544QXbtGlTNMsIAABw9gdMyQVLofLkyeNuAAAAcdtL7vvvv7cff/zRf3/SpEnWuHFje+qpp+zo0aPRLh8AAEDsBUwPPPCA/fLLL+7v3377ze6++27LmjWrffLJJ/bEE0+kRhkBAABiK2BSsFSxYkX3t4KkmjVr2tixY23UqFEuIRwAAMDiPWDy+Xx24sQJ9/eMGTPslltucX+r59yuXbuiX0IAAIBYC5g0bMDzzz9vo0ePtrlz51r9+vX9A1pqfCYAAACL94Bp8ODBLvG7Y8eO9vTTT9vFF1/sHv/000+tWrVqqVFGAACA2JoapXz58kG95Dwvv/yypUuXLlrlAgAAiK0aJuUthaOpUTJkyBDRxufNm2cNGjRw06ykSZPGJk6cGPS8RhbX44G3m2++OWgZTc2iKVly5Mhh559/vrVt29YOHDgQtMyqVausRo0arozKtRowYEBE5QQAAPEtRQFT2bJl3bQn4cZZ+vXXX+2hhx5yI4OnxMGDB61ChQr25ptvJrmMAqStW7f6bx999FHQ8wqW1qxZY9OnT7cpU6a4IKx9+/b+5/fv32916tRx89stX77c1YT17t3bhg8fnqIyAgAApKhJ7vXXX3eT6j788MN20003ucRv1QqpxmbPnj22du1amz9/vgtclNukoCkl6tWr527J0bQrBQoUSPS5n376yb755htbtmyZK5NXVvXce+WVV1wZNWmwAr333nvPzYen4G/lypX26quvBgVWAAAApxQw3XDDDfbdd9+5oGj8+PEuCNm8ebP9+++/ljdvXjc5b+vWrV1tT65cuSya5syZY/ny5XPrvf76610PPW/qFU34q2Y4L1iSG2+80dKmTWtLliyx2267zS2jsaIULAVO9fLSSy+5YC+x8moyYd0Ca6kAAED8iijpu3r16u52uqg5rkmTJnbRRRfZhg0b3PQrqpFSEKQE823btrlgKlD69Oktd+7c7jnR/3p9IG/4Az2XWMDUv39/69OnT6ruGwAAOId7yZ1OmnbFU65cOddDr2TJkq7WSbVeqaVHjx7WuXPnoBomJYsDAID4FPE4TGdSiRIlXBPg+vXr3X3lNu3YsSNomePHj7uec17ek/7fvn170DLe/aRyo5Q3pV53gTcAABC/Yipg+uOPP+zvv/+2ggULuvtVq1a1vXv3ut5vnlmzZrmpW6pUqeJfRj3njh075l9GPepKlSoV9XwrAABwbjqjAZPGS1KPNd286VX095YtW9xz3bp1s8WLF9umTZts5syZ1qhRIzeyuJK25bLLLnN5Tu3atbOlS5faggULXC89NeWph5w0b97
cJXxrfCb14lPS+pAhQ4Ka3AAAAM7agEk979TDTjdREKO/e/Xq5ZK6NeBkw4YN7dJLL3UBT+XKle3bb791TWYe9dgrXbq0y2nScAJKSg8cYylnzpw2bdo0F4zp9V26dHHrZ0gBAACQUml8KRnGO4R6rI0cOdL9r9oa9VT7+uuvrWjRom6co3ONkr4VeO3bty9V8pkadJkU9XUiNk0e2OhMFwEA4sr+FH7HR1zDNHfuXNdjTeMcTZgwwT8NyQ8//GDPPvvsqZUaAADgLBRxwPTkk0+6wSOVOB04GKQGlVS+EQAAgMV7wPTjjz+6EbRDqVlu165d0SoXAABA7AZMmopEk+CGWrFihRUuXDha5QIAAIjdgEld9jURr6YVSZMmjRvzSN35u3bt6uaTAwAAsHgPmPr16+e68WuqECV8lylTxk1uW61aNevZs2fqlBIAACCW5pJToveIESPsmWeesdWrV7ugSWMnXXLJJalTQgAAgFidfFdjLukGAABwros4YNI4l59++qnNnj3bTXyrHKZAGpsJAAAgrgOmxx9/3N5++2277rrrLH/+/C7xGwAA4FwWccA0evRoV4ukedsAAADiQcS95DTfSokSJVKnNAAAAOdCwNS7d2/r06eP/fvvv6lTIgAAgFhvkrvzzjvto48+clOhFC9e3DJkyBD0/Pfffx/N8gEAAMRewNSmTRtbvny5tWzZkqRvAAAQFyIOmL788kubOnWqVa9ePXVKBAAAEOs5TJoSJUeOHKlTGgAAgHMhYBo4cKA98cQTtmnTptQpEQAAQKw3ySl36dChQ1ayZEnLmjVrgqTv3bt3R7N8AAAAsRcwDR48OHVKAgAAcC71kgMAAIgnKQqY9u/f70/01t/JISEcAADEZcCUK1cu27p1qxus8vzzz0907CWfz+ce/++//1KjnAAAAGd3wDRr1izLnTu3+3v27NmpXSYAAIDYC5hq1arl//uiiy5yYzGF1jKphun333+PfgkBAABibRwmBUw7d+5M8LiGE9BzAAAAFu8Bk5erFOrAgQOWOXPmiNY1b948a9CggRUqVMitc+LEif7njh07Zt27d7dy5cpZtmzZ3DKtW7e2v/76K2gdmgBYrw28vfjii0HLrFq1ymrUqOHKp9qxAQMGRLrbAAAgjqV4WIHOnTu7/xWQPPPMM27QSo8SvZcsWWIVK1aMaOMHDx60ChUq2H333WdNmjQJek6DY37//fduW1pmz5499thjj1nDhg3tu+++C1q2b9++1q5dO//97Nmz+/9Wr746derYjTfeaG+99Zb9+OOPbntKXm/fvn1E5QUAAPEpxQHTihUr/DVMCjoyZszof05/K6jp2rVrRBuvV6+euyUmZ86cNn369KDH3njjDbv66qtty5YtVrRo0aAAqUCBAomuZ8yYMXb06FF77733XDnLli1rK1eutFdffZWACQAARDdg8nrH3XvvvTZkyJAzMt7Svn37XA2XaocCqQnuueeec0FU8+bNrVOnTpY+/f92bdGiRVazZs2gAK9u3br20ksvuVorDZkQ6siRI+7mCTf2FAAAOLdFPNL3yJEj7Uw4fPiwy2lq1qxZULD26KOP2hVXXOGGPVi4cKH16NHDjRmlGiTZtm1bgmT0/Pnz+59LLGDq37+/9enTJ9X3CQAAnKMB05mgBPA777zTNQcOGzYs0dwqKV++vKtJeuCBB1zQkylTppPanoKuwPWqhknJ4gAAID6lj5VgafPmzW4AzXBNgVWqVLHjx4/bpk2brFSpUi63afv27UHLePeTyntSoHWywRYAADj3RDyswJkIln799VebMWOG5cmTJ+xrlNCdNm1aN42LVK1a1Q1foHV5lEyuYCqx5jgAAICzqoZJYzetX7/ef3/jxo0u4FE+UsGCBe322293QwtMmTLFDV2gnCPR82p6U0K3hjO47rrrXE853VfCd8uWLf3BkJLAlY/Utm1blwO1evVql7Q+aNCgM7bfAAAgDgIm1fio19y
OHTvsxIkTQc/16tUrxevReEoKdjxe3lCbNm2sd+/e9sUXX7j7oeM7adu1a9d2zWbjxo1zy6pXm5K7FTAF5h9peIJp06ZZhw4drHLlypY3b15XRoYUAAAAKZXGp0zqCIwYMcIeeughF3goByhw1G/9rRqhc42SvhV4aViD1BhOoUGXSVFfJ2LT5IGNznQRACCu7E/hd3zENUzPP/+8vfDCC655CwAAIB5EnPStwR7vuOOO1CkNAADAuRAwKVhSThAAAEC8iLhJ7uKLL3YT4i5evNjKlStnGTJkCHpeI28DAADEdcA0fPhwO++882zu3LnuFkhJ3wRMAADA4j1g0lhJAAAA8eSURvrWiAQRjkoAAAAQHwHTBx984PKXsmTJ4m6a9Hb06NHRLx0AAEAsNsm9+uqrLum7Y8eOdu2117rH5s+fbw8++KDt2rXLjbQNAAAQ1wHT66+/bsOGDbPWrVv7H2vYsKGVLVvWTVFCwAQAACzem+S2bt1q1apVS/C4HtNzAAAAFu8Bk8Zh+vjjjxM8Pn78eLvkkkuiVS4AAIDYbZLr06eP3XXXXTZv3jx/DtOCBQts5syZiQZSAAAAcVfD1LRpU1uyZInlzZvXJk6c6G76e+nSpXbbbbelTikBAABiqYZJKleubB9++GH0SwMAABCrAdP+/fstR44c/r+T4y0HAAAQVwFTrly5XA+4fPny2fnnn+/mjAulEb/1+H///Zca5QQAADi7A6ZZs2ZZ7ty53d+zZ89O7TIBAADEXsBUq1Yt/98XXXSRFSlSJEEtk2qYfv/99+iXEAAAINZ6ySlg2rlzZ4LHd+/e7Z4DAACweA+YvFylUAcOHLDMmTNHq1wAAACxN6xA586d3f8KljT5btasWf3PKdFbYzNVrFgxdUoJAAAQCwHTihUr/DVMP/74o2XMmNH/nP6uUKGCde3aNXVKCQAAEAsBk9c77t5777UhQ4Yw3hIAAIgbEY/0PXLkyNQpCQAAwLk0Ncp3333nJtrdsmWLHT16NOi5CRMmRKtsAAAAsdlLbty4cVatWjX76aef7PPPP7djx47ZmjVr3OCWOXPmjGhd8+bNswYNGlihQoVcMrkm8g2kfKlevXpZwYIFLUuWLHbjjTfar7/+mmA4gxYtWrgmQo1C3rZtW9djL9CqVausRo0arhefxpAaMGBApLsNAADiWMQBU79+/WzQoEE2efJkl+ytfKaff/7Z7rzzTitatGhE6zp48KBLFn/zzTcTfV6BzWuvvWZvvfWW64WXLVs2q1u3rh0+fNi/jIIlBWzTp0+3KVOmuCCsffv2/uc1912dOnWsWLFitnz5cnv55Zetd+/eNnz48Eh3HQAAxKk0PlXjREBBiwKU4sWLW548eWzOnDlWrlw5V+N0/fXXuznnTqogadK4GqvGjRu7+yqWap66dOni7323b98+y58/v40aNcruvvtut80yZcrYsmXL7Morr3TLfPPNN3bLLbfYH3/84V4/bNgwe/rpp23btm3+nn1PPvmkq81SoJcSCrpUe6btp0aye4Muk6K+TsSmyQMbnekiAEBc2Z/C7/iIa5g0Ee8///zj/i5cuLCtXr3a/b137147dOiQRcvGjRtdkKNmOI92qEqVKrZo0SJ3X/+rGc4LlkTLp02b1tVIecvUrFkzaBgE1VKtW7fO9uzZk+i2jxw54g5g4A0AAMSviAMmBR9q/pI77rjDHnvsMWvXrp01a9bMbrjhhqgVTMGSqEYpkO57z+n/fPnyBT2fPn16N1Fw4DKJrSNwG6H69+/vgjPvprwnAAAQvyLuJffGG2/4c4jU1JUhQwZbuHChNW3a1Hr27Gnngh49evhHNhfVMBE0AQAQvyIKmI4fP+4Sq9WkJWr6Uj5QaihQoID7f/v27a6XnEf3vSlYtMyOHTsSlFE957zX63+9JpB331smVKZMmdwNAAAg4iY5NXc9+OCDQb3UUstFF13kApqZM2cG1fQoN6lq1aruvv5X7pR6v3k
0vMGJEydcrpO3jHrOafgDj5oUS5Uq5fKxAAAAop7DdPXVV9vKlSstGjRektblrU+J3vpbA2Kq19zjjz9uzz//vH3xxRdu/rrWrVu7nm9eT7rLLrvMbr75ZpdDtXTpUluwYIF17NjR9aDTctK8eXOX8K3xmdS7b/z48W4ohMAmNwAAgKjmMD388MMu2Pj999+tcuXKbpiBQOXLl49oxPDrrrvOf98LYtq0aeOGDnjiiSfcWE0aV0k1SdWrV3fDBmgASs+YMWNckKSEczURKpdKYzd5lLQ9bdo069Chgytv3rx53WCYgWM1AQAARHUcJgUlCVaSJo0bN0n///fff3auYRwmnC6MwwQAZ+d3fMQ1TGo2AwAAiCcRB0yaYgQAACCeRJz0LaNHj7Zrr73WJVZv3rzZPTZ48GCbNImmJQAAcO6JOGDS3GxKztZ8bUrE9nKWNEWJgiYAAACL94Dp9ddftxEjRrhRvtOlS+d/XPO5qes/AACAxXvApKTvSpUqJXhcI2NrCAAAAACL94BJI3AnNnClxkfSQJIAAAAW773klL+kQSA1PYrGXtII2x999JH179/f3nnnndQpJQAAQCwFTPfff79lyZLFevbsaYcOHXJTj6i3nKYb0ZQkAAAAFu8Bk7Ro0cLdFDBpPrh8+fJFv2QAAACxHDB5smbN6m4AAADnsoiTvrdv326tWrVyzXDp06d3QwsE3gAAACzea5juuece27Jliz3zzDNWsGBBN+EuAADAuSzigGn+/Pn27bffWsWKFVOnRAAAALHeJFekSBE3nAAAAEC8iDhg0nxxTz75pG3atCl1SgQAABDrTXJ33XWXG06gZMmSrodchgwZgp7fvXt3NMsHAAAQewGTapgAAADiScQBU5s2bVKnJAAAAOdKDpNs2LDBTY3SrFkz27Fjh3vs66+/tjVr1kS7fAAAALEXMM2dO9fKlStnS5YssQkTJripUeSHH36wZ599NjXKCAAAEFsBk3rIPf/88zZ9+nTLmDGj//Hrr7/eFi9eHO3yAQAAxF7A9OOPP9ptt92W4HFNwLtr165olQsAACB2A6bzzz/ftm7dmuDxFStWWOHChaNVLgAAgNgNmO6++27r3r27bdu2zc0jd+LECVuwYIF17drVWrdunTqlBAAAiKWAqV+/fla6dGk3RYoSvsuUKWM1a9a0atWquZ5zAAAAFu8BkxK9R4wY4YYWmDJlin344Yf2888/2+jRoy1dunRRL2Dx4sVdTVborUOHDu752rVrJ3juwQcfDFrHli1brH79+m5kcuVadevWzY4fPx71sgIAgHNTxANXeooWLepuqW3ZsmX233//+e+vXr3abrrpJrvjjjv8j7Vr18769u3rv6/AyKPXKlgqUKCALVy40OVfqelQU7qotgwAACDqAVPnzp0TfVw1O5kzZ7aLL77YGjVqZLlz57ZouOCCC4Luv/jii24eu1q1agUFSAqIEjNt2jRbu3atzZgxw/Lnz28VK1a05557zuVh9e7dO2hoBAAAgKgETOoN9/3337uam1KlSrnHfvnlF9ccp9ymoUOHWpcuXWz+/Pkuvymajh496poAFbQpQPOMGTPGPa6gqUGDBvbMM8/4a5kWLVrkBtpUsOSpW7euPfTQQ25k8kqVKiXYzpEjR9zNs3///qjuBwAAOMdzmFR7dOONN9pff/1ly5cvd7c//vjDNZNpqpQ///zTJYF36tQp6oWdOHGi7d271+655x7/Y82bN3fB0uzZs61Hjx4ul6ply5b+59WbLzBYEu++nktM//79LWfOnP6bEtwBAED8SuPz+XyRvEBjLWmU79DaI9XW1KlTxwVMqoHS39EeyFI1Q2pCmzx5cpLLzJo1y2644QZbv369a7pr3769bd682aZOnepf5tChQ5YtWzb76quvrF69eimqYVLQtG/fPsuRI4dFW4Muk6K+TsSmyQMbnekiAEBc2b9/v6scCfcdH3ENk1boTbgbaOfOnf6mKw1uqea
zaFLQozyk+++/P9nlqlSp4v5XwCRqptu+fXvQMt79pPKeMmXK5A5a4A0AAMSvk2qSu+++++zzzz93TXG66e+2bdta48aN3TJLly61Sy+9NKoFHTlypBsSQD3ekrNy5Ur3f8GCBd3/VatWddO5BAZ5qiFTEBTtHCsAAHBuijjp++2333b5SRrx2xvLKH369NamTRsbNGiQu6/k73feeSdqhdRo4gqYtA1ty6OxoMaOHWu33HKL5cmTx1atWuXKphyq8uXLu2XUNKjAqFWrVjZgwACXt6QBNjWOk2qSAAAAoh4wnXfeeW7gSgVHv/32m3usRIkS7nGPuu5Hk5riNPikarYCKZ9Jzw0ePNgOHjzo8oyaNm0aNOK4eu9pgE31ilNtk3KXFHgFjtsEAAAQ1aTveJTShLCTRdI3PCR9A8A5kvQNAAAQbwiYAAAAwiBgAgAACIOACQAAIBq95L744gtLqYYNG6Z4WQAAgHMmYPIGpAxHE+JqUl4AAIC4C5g0cCQAAEC8IocJAAAg2iN9i0bVnjt3rht9O3SS3UcfffRkVgkAAHDuBEwrVqxwc7cdOnTIBU65c+e2Xbt2WdasWd3kuARMAADA4r1JTpPbNmjQwPbs2WNZsmSxxYsX2+bNm61y5cr2yiuvpE4pAQAAYilgWrlypXXp0sXSpk3rJrY9cuSIm/R2wIAB9tRTT6VOKQEAAGIpYMqQIYMLlkRNcMpjEk1c9/vvv0e/hAAAALGWw1SpUiVbtmyZXXLJJVarVi3r1auXy2EaPXq0XX755alTSgAAgFiqYerXr58VLFjQ/f3CCy9Yrly57KGHHrKdO3fa22+/nRplBAAAiK0apiuvvNL/t5rkvvnmm2iXCQAAILZrmK6//nrbu3dvgsf379/vngMAALB4D5jmzJmTYLBKOXz4sH377bfRKhcAAEDsNcmtWrXK//fatWtt27Zt/vuacFdNc4ULF45+CQEAAGIlYKpYsaKlSZPG3RJretMglq+//nq0ywcAABA7AdPGjRvN5/NZiRIlbOnSpXbBBRf4n8uYMaNLANdAlgAAAHEbMBUrVsz9f+LEidQsDwAAQOwPKyAbNmywwYMH208//eTulylTxh577DErWbJktMsHAAAQe73kpk6d6gIkNcuVL1/e3ZYsWWJly5a16dOnp04pAQAAYqmG6cknn7ROnTrZiy++mODx7t2720033RTN8gEAAMReDZOa4dq2bZvg8fvuu88NNwAAAGDxHjCpd9zKlSsTPK7H1FMumnr37u0fysC7lS5dOmiwzA4dOliePHnsvPPOs6ZNm9r27duD1rFlyxarX7++Zc2a1ZWvW7dudvz48aiWEwAAnNtS3CTXt29f69q1q7Vr187at29vv/32m1WrVs09t2DBAnvppZesc+fOUS+gcqNmzJjxfwVO/39FVtPgl19+aZ988onlzJnTOnbsaE2aNHHl8QbUVLBUoEABW7hwoW3dutVat25tGTJkcJMIAwAApEQanwZXSgGNsaSAQzVM6iE3cOBA++uvv9xzhQoVcjU3jz76qKsFimYN08SJExOt0dq3b58ry9ixY+322293j/3888922WWX2aJFi+yaa66xr7/+2m699VZXzvz587tl3nrrLZdrtXPnTjd+VEponjwFZNpmjhw5LNoadJkU9XUiNk0e2OhMFwEA4sr+FH7Hp7hJzourFBCpZuePP/5wK9dNf2tYgWgGS55ff/3VBWQaMLNFixauiU2WL19ux44dsxtvvNG/rJrrihYt6gIm0f/lypXzB0tSt25dd3DWrFmT5DaPHDnilgm8AQCA+BVRDlNoQJQ9e3Z3Sy1VqlSxUaNGuXnqhg0b5kYbr1Gjhv3zzz9uLjvVEJ1//vlBr1Fw5M1zp/8DgyXvee+5pPTv399Fm96tSJEiqbJ/AADgHBxW4NJLLw1bi7R7926Llnr16vn/1nhPCqA04vjHH3/s5q5LLT169AjKx1INE0ETAADxK6KAqU+
fPq7G5UxRbZKCtvXr17vxno4ePWp79+4NqmVSLzkleYv+1wCbgbxedN4yicmUKZO7AQAARBww3X333VEfOiASBw4ccNOytGrVyipXrux6u82cOdMNJyDr1q1zOU5Vq1Z19/X/Cy+8YDt27PCXW6ORK6lLo5UDAABENWBKjYTucDSMQYMGDVwznHq6Pfvss663XrNmzVxNlwbQVNNZ7ty5XRD0yCOPuCBJPeSkTp06LjBSgDVgwACXt9SzZ083dhM1SAAAIOoBUwpHH4gq9b5TcPT333+7IQSqV69uixcvdn/LoEGDLG3atK6GST3b1ANu6NCh/tcruJoyZYo99NBDLpDKli2btWnTxo0pBQAAEPVxmOIZ4zDhdGEcJgCI8XGYAAAA4hUBEwAAQBgETAAAAGEQMAEAAIRBwAQAABAGARMAAEAYBEwAAABhEDABAACEQcAEAAAQBgETAABAGARMAAAAYRAwAQAAhEHABAAAEAYBEwAAQBgETAAAAGEQMAEAAIRBwAQAABAGARMAAEAYBEwAAABhEDABAACEQcAEAAAQBgETAABAGARMAAAAYRAwAQAAhEHABAAAEAYBEwAAQCwHTP3797errrrKsmfPbvny5bPGjRvbunXrgpapXbu2pUmTJuj24IMPBi2zZcsWq1+/vmXNmtWtp1u3bnb8+PHTvDcAACBWpbez2Ny5c61Dhw4uaFKA89RTT1mdOnVs7dq1li1bNv9y7dq1s759+/rvKzDy/Pfffy5YKlCggC1cuNC2bt1qrVu3tgwZMli/fv1O+z4BAIDYc1YHTN98803Q/VGjRrkaouXLl1vNmjWDAiQFRImZNm2aC7BmzJhh+fPnt4oVK9pzzz1n3bt3t969e1vGjBlTfT8AAEBsO6ub5ELt27fP/Z87d+6gx8eMGWN58+a1yy+/3Hr06GGHDh3yP7do0SIrV66cC5Y8devWtf3799uaNWsS3c6RI0fc84E3AAAQv87qGqZAJ06csMcff9yuvfZaFxh5mjdvbsWKFbNChQrZqlWrXM2R8pwmTJjgnt+2bVtQsCTefT2XVO5Unz59UnV/AABA7IiZgEm5TKtXr7b58+cHPd6+fXv/36pJKliwoN1www22YcMGK1my5EltS7VUnTt39t9XDVORIkVOofQAACCWxUSTXMeOHW3KlCk2e/Zsu/DCC5NdtkqVKu7/9evXu/+V27R9+/agZbz7SeU9ZcqUyXLkyBF0AwAA8eusDph8Pp8Llj7//HObNWuWXXTRRWFfs3LlSve/apqkatWq9uOPP9qOHTv8y0yfPt0FQWXKlEnF0gMAgHNF+rO9GW7s2LE2adIkNxaTl3OUM2dOy5Ili2t20/O33HKL5cmTx+UwderUyfWgK1++vFtWwxAoMGrVqpUNGDDAraNnz55u3apJAgAAiOkapmHDhrmecRqcUjVG3m38+PHueQ0JoOECFBSVLl3aunTpYk2bNrXJkyf715EuXTrXnKf/VdvUsmVLNw5T4LhNAAAAMVvDpCa55CgRW4NbhqNedF999VUUSwYAAOLJWV3DBAAAcDYgYAIAAAiDgAkAACAMAiYAAIAwCJgAAADCIGACAAAIg4AJAAAgDAImAACAMAiYAAAAwiBgAgAACIOACQAAIAwCJgAAgDAImAAAAMIgYAIAAAiDgAkAgNPszTfftOLFi1vmzJmtSpUqtnTp0mSX/+STT6x06dJu+XLlytlXX3112sqK/yFgAgDgNBo/frx17tzZnn32Wfv++++tQoUKVrduXduxY0eiyy9cuNCaNWtmbdu2tRUrVljjxo3dbfXq1ae97PGMgAkAgNPo1VdftXbt2tm9995rZcqUsbfeesuyZs1q7733XqLLDxkyxG6++Wbr1q2bXXbZZfbcc8/ZFVdcYW+88YZ/maFDh9oll1ziaqDy589vt99++2nco/hAwAQAwGly9OhRW758ud14443+x9KmTevuL1q0KNHX6PHA5UU1Ut7y333
3nT366KPWt29fW7dunX3zzTdWs2bNVN6T+JP+TBcAAIB4sWvXLvvvv/9cLVAg3f/5558Tfc22bdsSXV6Py5YtWyxbtmx26623Wvbs2a1YsWJWqVKlVNyL+EQNEwAAMeymm25yQVKJEiWsVatWNmbMGDt06NCZLtY5h4AJAIDTJG/evJYuXTrbvn170OO6X6BAgURfo8eTW161Skoe/+ijj6xgwYLWq1cvl0i+d+/eVNyT+EPABADAaZIxY0arXLmyzZw50//YiRMn3P2qVasm+ho9Hri8TJ8+PWj59OnTuzynAQMG2KpVq2zTpk02a9asVNyT+EMOEwAAp5GGFGjTpo1deeWVdvXVV9vgwYPt4MGDrtectG7d2goXLmz9+/d39x977DGrVauWDRw40OrXr2/jxo1zid7Dhw93z0+ZMsV+++03l+idK1cuN0aTgrBSpUqd0f081xAwAQBwGt111122c+dO13SmxO2KFSu6nm1eYreSuNVzzlOtWjUbO3as9ezZ05566ik3fMDEiRPt8ssvd8+ff/75NmHCBOvdu7cdPnzYPa/mubJly56xfTwXpfH5fL4zXYiz3f79+y1nzpy2b98+y5EjR9TX36DLpKivE7Fp8sBGZ7oIABBX9qfwO54cJgAAgDDiKmCKdO4eAACAuAqYIp27BwAAIO4Cpkjn7gEAAIirXnLe3D09evRI0dw9R44ccTePEsG8xLDUcOwII7LCUvUcAwAkf90N1wcuLgKmSOfu0dgXffr0SfB4kSJFUrWcQM43z3QJACA+/fPPP663XFwHTJFSTZTynTwaAGz37t2WJ08eS5MmzRkt27kc4Ssg/f3331Nl6AYAiEVcG1OfapYULBUqVCjZ5eIiYIp07p5MmTK5WyANDIbUpwsCFwUACMa1MXUlV7MUV0nfJzN3DwAAQFzVMKVk7h4AAACL94Ap3Nw9OLPUBKoxskKbQgEgnnFtPHswlxwAAEAYcZHDBAAAcCoImAAAAMIgYAIAAAiDgAmnRe3ate3xxx/33y9evLjrqQgAOPN69+7tOkMhaQRMSJF77rnHjXIeelu/fv2ZLhoA+K9TjRs3tnjGMUg9cTOsAE7dzTffbCNHjgx67IILLjhtEyhrAFIAOFM0J6l+KGry9njn8/nc8UifPn7CCN51pJjGAdFUMoE3TTmT2C8aNb+pGe5keet84YUX3Pw+pUqVco/rYjVx4sQE09aMGjXK/b1p0ya3zIQJE+y6666zrFmzWoUKFWzRokUnXRYAsUnXoEcffdSeeOIJy507t7tmqekp0N69e+2BBx5wY/JlzpzZLr/8cpsyZYp7TtcVXV+++OILK1OmjLsGbtmyxY4cOWJdu3a1woULW7Zs2axKlSo2Z84c/zq912k9unbpOnT77bfboUOH7P3333cpCbly5XJlU9DhSel6p06dapdddpmdd9557ofs1q1b3fPaN61/0qRJ/lYA7/Xdu3e3Sy+91JWlRIkS9swzz9ixY8dSfCznzJnj1vf111+7mTN0LObPn5+i639K3odYED+hIWKOpq7R3EnTp0+P+LVPP/20vfLKK3bJJZe4v5s1a+aaD+Pp1xAAcwGEZnpYsmSJ++GkL/hrr73WbrrpJjdFVr169dzEqx9++KGVLFnS1q5d634IehTkvPTSS/bOO++4Cdjz5ctnHTt2dMuNGzfO/aD7/PPPXeDy448/umuO97rXXnvNLaP1N2nSxG677TYX8Hz11Vf222+/WdOmTV1ZNLCypHS9uraNHj3a1XS1bNnSBVljxoxx///0009uwl6vNUABimTPnt0FXFqv1teuXTv3mIKYSDz55JNu+wq6FPRF432IFXx7IMX0a0m/aDy60HzyySeptj39wtJF6mSa4nThqF+/vvu7T58+VrZsWRcwlS5dOhVKCuBsVb58eTdStijoeOONN9yPMX1Rz5gxw5YuXeqCDNW+iAKBQKqFGTp0qKupFtUwKRjR/97s9rreaOYIPd6
vXz//64YNG+aCMFENk4IcTfqu66hqrFQLPnv2bBcwRbLet956y79eBVl9+/Z1f2u9WbJkcTVVoRPL9+zZ0/+3ari0bgVmkQZMffv2PakgJ7n3IVYQMCHF9OHWBSAwoElN5cqVO+m8JX04PQULFnT/79ixg4AJiDOB1wLveqBrgaxcudIuvPBCf7CUGF2DAteh2hk1o4W+RkGKaqA8avryghpRk58ClcAfnXrMK8vJrjdwf5Izfvx4V+O1YcMGO3DggB0/ftzV4EfqyiuvtGi/D7GCgAkppgDp4osvTvC4qoVDZ9iJpG08ue2FUht6SraVIUOGoNeIqt8BxJfAa4F3PfCuBaqNCUfLeNcQUbChJrvly5cHNd1JYDCU2HaTK8uprDfcDGdqAmvRooWrba9bt67lzJnT1S4NHDjQTvW6nDaF1//k9j1WEDDhlKmn3OrVq4Me0y+30A9ItLblJTjKr7/+6tr0AeBkaj3++OMP++WXX5KtZQpUqVIlVxOk2pEaNWpErSzRWq9qxAITyWXhwoVWrFgxl8/p2bx5s8Xa9f9Mo5ccTtn1119v3333nX3wwQcugFE7degHKJrbUtv3ihUr3DYffPDBc/KDCSD11apVy2rWrOmSr9W5ZOPGja4XmPKGkqLASrU1rVu3dr1x9RrlQfXv39++/PLLky5LtNarZr9Vq1bZunXrbNeuXa62RzlDyo1SrZKa5NQ0p4TyWLv+n2kETDhlquJVF1UlD1511VWuR4g+9KlBVchFihRxv8CaN2/uEhfVpg8AJ+Ozzz5z1y31pFUitq5joTU0oZSErWtcly5d3LAB6la/bNkyK1q06CmVJRrrVe83vVa5Rqr9WbBggTVs2NA6derkEsQ1mrdqnHTNjrXr/5mWxheu8RMAACDOUcMEAAAQBgETAABAGARMAAAAYRAwAQAAhEHABABIVX///bebg02TY58u3kTcGhMoViU2sW043gS9kdi2bZubokSDUnqvTWyi82hsN9L1nqq77777pAboTAwBEwAgVb3wwgvWqFEjN0YQzj6DBg1yAwIruNQgnqL7mi80pe666y7/a6V3795uCINQka73VGkOPZ1/+/btO+V1ETABAFKNRuJ/9913rW3btme6KEiCBrOsXLmyG+BSNYGiyXszZcqU4nVkyZLF/9rkRLreU3X55Ze7ufc+/PDDU14XARMAINV89dVX7gvymmuuSbb5Rs00gXO2eTUUo0ePdjVTmv9MzSsaGNGjucgGDBjg5rjUNjTAo2oTAv32229u4nANcFuhQgU3r1pgU6EGrCxcuLB7XhN+f/TRR0Gvr127tj366KNuYMbcuXO7L3yVLdDPP/9s1atXt8yZM7vBL2fMmJGg6en333+3O++80+231qMat8AmSg2W2blzZ/e8JtvV9lIyTKKOpfZb5b/tttvcPoWaNGmSXXHFFa58JUqUcHPKafJd0bHV4J0aqVtlVjOgBJbfa96cMGFCkscy8D3V39rGDz/84F6nmx4LXa836bBGC1fApf1u3769m1cvtFnylVdecRP2apkOHToEzVc3dOhQF+xp/zSh8e233x60/w0aNHCjnJ8qAiYAQKr59ttvXe3FydZ86Mt1ypQp7jZ37lx78cUX/c/36NHD3ddI02vXrrWxY8e6L8xAmj9NMwKouUnTjyhA8oKFw4cPu7Jp6hFN56Ev61atWrkpSQK9//77Lr9nyZIlLkDr27evm0rFC3T0ha4gQs8PHz48aM420Ze7RsTOnj27Ox4afVsT6t5888129OhRt4zybBRUvPfeezZ//nzbvXt32OlLtD3V3GkEb+2fgpnnn38+wfHXyNuPPfaYO0Zvv/22244XWGokcZVDwZyay4YMGZLk9p5O5liGNs9ptPKyZcu6deqmx0IdPHjQHZdcuXK5cnzyyScu2NT+BJo9e7Y7F/S/3guV3wvANC2LAlq9J5oORtPaaLqbQFdffbV7T48cOWKnRCN9AwCQGho1auS
77777gh4bOXKkL2fOnEGPff7556pO8d9/9tlnfVmzZvXt37/f/1i3bt18VapUcX/r8UyZMvlGjBiR6HY3btzo1vfOO+/4H1uzZo177KeffkqyvPXr1/d16dLFf79WrVq+6tWrBy1z1VVX+bp37+7+/vrrr33p06f3bd261f/89OnT3Xa0TzJ69GhfqVKlfCdOnPAvc+TIEV+WLFl8U6dOdfcLFizoGzBggP/5Y8eO+S688EJ3/JLSrFkz3y233BL02F133RV0bG+44QZfv379gpZRebQ9j7bRpk2boGUCy5+SYzky5D3V+1ehQoUEZQ5c7/Dhw325cuXyHThwwP/8l19+6UubNq1v27Zt7r7KVaxYMd/x48f9y9xxxx1uP+Wzzz7z5ciRI+g8CfXDDz+47W7atMl3KqhhAgCkmn///dc1lZwMNRepVsajJpkdO3a4v3/66SdXY3DDDTcku47y5csHvV68dah26LnnnnNNcWomU63P1KlT3US1Sa0jtByq1dD8lmqqC6zRCKSmqfXr17t90TZ00/ZUw6WaEyUkqxamSpUq/tekT5/ezQeXHB2DwNdI1apVE2xbtS/ednXTfHPanvLLIlE+mWN5MlR+Ne2p9s5z7bXXuqZWHVePaqrSpUsXtG1vu+rdV6xYMdfUqNrBMWPGJNgvNfdJpPsbKv0pvRoAgGTkzZvX9uzZE/RY2rRpE+TnBOakeDJkyBB0X/kv+jIN/BIMJ3AdXo6Ut46XX37ZNUENHjzYBU364n788cf9zWQpKUdKKCdHTX/6Mg+lCXJTk7atfKImTZokeC7SQDZDMscyNSV3/BWEfv/99zZnzhybNm2a9erVy+WYqYnPy6lS82Y0jjU1TACAVFOpUiWXOxNIX1xK3lYOiyfS8ZKU5KugaebMmSddNuUSKfm6ZcuWrqZDtRSBXeNTolSpUi6he/v27f7H9GUdSAnXv/76q+tFpgT1wJuS2XVTrYlykjzKDVq+fHmy277sssuCXiOLFy9OsG3V1oRuVzcFrqklY8aMrgYvXPlVAxZ4Hug9Ubl0XFNKtXE33nijyy9btWqVS1KfNWuW/3nlp1144YUueD8VBEwAgFSjpN41a9YE1TKpGUlJ0k899ZRrklKytpfEm1KqHenevbvrTaYeXlqPggUNYRBJ0KXk7YULF7rmoQceeCAo8EkJNQmp23qbNm3cl7W+8DX2T2AtTIsWLdyXtYIzJWFv3LjR1YgoWfmPP/5wyygpWwnsSnJXr7uHH37Y9u7dm+y29XolOasHmQKyN954w90PpBoXHR/VMul90H6qx5hXxtSi5lTtpwLhXbt2JZpwreOi91HHTkGNkrofeeQR17QWmryfFHUGeO2119x2Nm/e7PZVtU+BAZeOeZ06dU55nwiYAACpRk1dquX4+OOP/Y8pf0fj4mjIAa8rf2hX/ZRQ7zj1xlJQoNoK9cSKJKdGQYPKpqBOwwcoDynSkbWVW6MgR01fV111ld1///3+XnJek5eCw3nz5rnu/2oaU1nVu005TDly5HDLaD8UKCh4UB6Smpo0TEByNFTDiBEjXLOiasjUJBUaCGnfFFToOZVPr9FAlcr7SU1NmzZ1ve/Uc081iqHDNXjHRTljajJT2TQcgHLSFPillJrdNNyBhibQcX3rrbfctpT3JDrGen+Ut3Wq0vz/rHUAAFKFuu1369bN1SKkZjPQ2UK1TBqXSYneqn3CmTNs2DA3PIMCxlNF0jcAIFXVr1/fNRn9+eefrkfZuUZfyOp9piY+BUlqXlNvL4KlM08J46+//npU1kUNEwAAp0B5MxowUsMRKFdJCcgaiFKjUuPcQcAEAAAQxrnfmAwAAHCKCJgAAADCIGACAAAIg4AJAAAgDAImAACAMAiYAAAAwiBgAgAACIOACQAAIAwCJgAAAEve/wOLuk8y3nWfNgAAAABJRU5ErkJggg==", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# python_results records run-1 (full) timings; run-2 (incremental) is near-zero.\n", + "# We illustrate the checksum mechanism with a bar showing full vs incremental total time.\n", + "full_total = py_results[\"generation_seconds\"].sum()\n", + "\n", + "# Incremental run: only checksum lookup overhead — approximate as near-zero per cohort\n", + "# (we don't have per-cohort incremental timings, so we use the script-level total)\n", + "# For now show the full-run breakdown: COMPLETE vs what incremental would skip.\n", + "complete = py_results[py_results[\"status\"] == \"COMPLETE\"]\n", + "failed = py_results[py_results[\"status\"] == \"FAILED\"]\n", + "\n", + "fig, ax = plt.subplots(figsize=(6, 4))\n", + "categories = [\"Full run\", \"Incremental run\\n(unchanged definitions)\"]\n", + "times = [full_total, 0] # incremental skips all → near-zero; 0 is illustrative\n", + "bars = ax.bar(categories, times, color=[\"#4C72B0\", \"#55A868\"], width=0.4)\n", + "ax.set_ylabel(\"Total generation time (s)\", fontsize=10)\n", + "ax.set_title(\"Incremental generation: checksum-based skipping\", fontsize=11)\n", + "for bar, t in zip(bars, times):\n", + " ax.text(\n", + " bar.get_x() + bar.get_width() / 2,\n", + " bar.get_height() + full_total * 0.01,\n", + " f\"{t:.1f}s\",\n", + " ha=\"center\",\n", + " va=\"bottom\",\n", + " fontsize=10,\n", + " )\n", + "plt.tight_layout()\n", + "plt.savefig(OUTPUT_DIR / \"figure3_incremental_speedup.pdf\", bbox_inches=\"tight\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "504fb2a444614c0babb325280ed9130a", + "metadata": {}, + "source": [ + "## Table 3 — Summary statistics (paper-ready)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "59bbdb311c014d738909a11f9e486628", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Metric Value\n", + " Phenotypes in library 707\n", 
+ " R generation success rate 97.6%\n", + " Python generation success rate 93.4%\n", + " R median generation time (s) 0.03\n", + "Python median generation time (s) 0.14\n", + " Total cohort_r rows 103585\n", + " Total cohort_python rows 213977\n", + " Cohorts with identical tables 69 / 69 (100.0%)\n" + ] + } + ], + "source": [ + "summary = pd.DataFrame(\n", + " [\n", + " {\"Metric\": \"Phenotypes in library\", \"Value\": len(r_results)},\n", + " {\n", + " \"Metric\": \"R generation success rate\",\n", + " \"Value\": f\"{(r_results['status'] == 'COMPLETE').mean() * 100:.1f}%\",\n", + " },\n", + " {\n", + " \"Metric\": \"Python generation success rate\",\n", + " \"Value\": f\"{(py_results['status'] == 'COMPLETE').mean() * 100:.1f}%\",\n", + " },\n", + " {\n", + " \"Metric\": \"R median generation time (s)\",\n", + " \"Value\": f\"{r_results['generation_seconds'].median():.2f}\",\n", + " },\n", + " {\n", + " \"Metric\": \"Python median generation time (s)\",\n", + " \"Value\": f\"{py_results['generation_seconds'].median():.2f}\",\n", + " },\n", + " {\"Metric\": \"Total cohort_r rows\", \"Value\": len(cohort_r)},\n", + " {\"Metric\": \"Total cohort_python rows\", \"Value\": len(cohort_py)},\n", + " {\n", + " \"Metric\": \"Cohorts with identical tables\",\n", + " \"Value\": f\"{n_match} / {n_total} ({100 * n_match / n_total:.1f}%)\",\n", + " },\n", + " ]\n", + ")\n", + "print(summary.to_string(index=False))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5913b41a-261a-4319-873d-ae19c5d170b5", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f60a8464-f4ed-40c7-bf25-76195b3736c8", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + 
"mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/examples/benchmark_db_config.yaml b/examples/benchmark_db_config.yaml new file mode 100644 index 0000000..4c7d366 --- /dev/null +++ b/examples/benchmark_db_config.yaml @@ -0,0 +1,64 @@ +# CircePy OHDSI Phenotype Benchmark - Database Configuration +# This file defines database backends for running comprehensive benchmarks +# Add additional backends as needed for your infrastructure + +# Primary benchmark backend (DuckDB - recommended for local development) +duckdb: + driver: "duckdb" + description: "DuckDB in-memory database (fast, no external dependencies)" + connection: + database: ":memory:" # Use in-memory for speed, or provide file path for persistence + cdm_schema: "main" + vocabulary_schema: "main" + results_schema: "main" + data_source: "eunomia" # Use OHDSI Eunomia synthetic OMOP CDM data + notes: | + DuckDB provides fast iteration for benchmark development. + Set database to a file path (e.g., /tmp/benchmark_eunomia.duckdb) for persistence. + +# PostgreSQL backend (for production-like validation) +# Uncomment and configure to test against PostgreSQL +# postgresql: +# driver: "postgresql" +# description: "PostgreSQL database" +# connection: +# host: "localhost" +# port: 5432 +# database: "omop_cdm" +# user: "postgres" +# password: "${POSTGRES_PASSWORD}" # Use environment variable +# cdm_schema: "public" +# vocabulary_schema: "public" +# results_schema: "results" +# data_source: "eunomia" +# notes: | +# Requires PostgreSQL to be running and populated with OMOP CDM data. +# Use environment variables for sensitive credentials. 
+ +# Databricks backend (for cloud-scale validation) +# Uncomment and configure to test against Databricks +# databricks: +# driver: "databricks" +# description: "Databricks SQL warehouse" +# connection: +# server_hostname: "${DATABRICKS_HOST}" +# http_path: "${DATABRICKS_HTTP_PATH}" +# personal_access_token: "${DATABRICKS_TOKEN}" +# cdm_schema: "hive_metastore.omop_cdm" +# vocabulary_schema: "hive_metastore.omop_cdm" +# results_schema: "hive_metastore.results" +# data_source: "provided" # Assumes OMOP data already loaded +# notes: | +# Requires Databricks workspace and valid credentials via environment variables. +# Use for validating CircePy with cloud-scale OMOP implementations. + +# Eunomia data source configuration +eunomia: + description: "OHDSI Eunomia synthetic OMOP CDM dataset" + url: "https://github.com/OHDSI/Eunomia/releases/download/v2.0.0/GimlettData_5.4.zip" + size: "~10MB compressed, ~100MB uncompressed" + patient_count: "~2500 patients" + notes: | + Synthetic OMOP CDM dataset for testing and development. + Includes realistic OMOP structure with conditions, drugs, visits, procedures, measurements, etc. + Ideal for benchmarking SQL generation and execution without privacy concerns. diff --git a/examples/benchmark_run_python.py b/examples/benchmark_run_python.py new file mode 100644 index 0000000..ca36ba4 --- /dev/null +++ b/examples/benchmark_run_python.py @@ -0,0 +1,267 @@ +#!/usr/bin/env python3 +""" +benchmark_run_python.py + +CircePy benchmark against a Databricks SQL warehouse. + +What this script does: + 1. Downloads cohort metadata from OHDSI/PhenotypeLibrary on GitHub + 2. Downloads each CIRCE cohort JSON from the same repo + 3. Builds a CircePy CohortDefinitionSet from the downloaded definitions + 4. Calls generate_cohort_set() writing to DATABRICKS_SCRATCH_SCHEMA.cohort_python + 5. Runs a second incremental pass to benchmark checksum skipping + 6. 
def load_phenotypes(source: "str | None" = None) -> pd.DataFrame:
    """Load PhenotypeLibrary metadata and keep only the CIRCE-JSON cohorts.

    Args:
        source: Path or URL of the ``Cohorts.csv`` metadata file. When
            omitted, the module-level ``PHENOTYPE_META_URL`` is used.

    Returns:
        A DataFrame with the columns ``cohortId`` (int) and ``cohortName``,
        one row per cohort whose ``isCirceJson`` flag equals 1 and whose id
        and name are both present.
    """
    if source is None:
        source = PHENOTYPE_META_URL

    print("Downloading PhenotypeLibrary metadata...")
    meta = pd.read_csv(source)

    # Compare the flag numerically. A string comparison against "1" silently
    # matches nothing when pandas parses the column as float (any blank cell
    # turns 1 into "1.0") or as bool.
    flag = meta["isCirceJson"]
    if flag.dtype == object:
        flag = flag.astype(str).str.strip()
    is_circe = pd.to_numeric(flag, errors="coerce") == 1

    circe = meta.loc[is_circe, ["cohortId", "cohortName"]].dropna()
    circe["cohortId"] = circe["cohortId"].astype(int)
    print(f" {len(circe)} CIRCE cohorts in library")
    return circe


def build_cohort_definition_set(
    meta: pd.DataFrame,
) -> "tuple[CohortDefinitionSet, list[tuple[int, str]]]":
    """Download each cohort JSON and build a CohortDefinitionSet.

    Args:
        meta: DataFrame with ``cohortId`` and ``cohortName`` columns.

    Returns:
        A pair ``(cds, failures)``. ``failures`` holds one
        ``(cohort_id, error message)`` tuple for every cohort that could not
        be downloaded, parsed, or added to the set.
    """
    cds = CohortDefinitionSet()
    failures: list[tuple[int, str]] = []
    total = len(meta)

    print(f"Downloading and parsing {total} cohort definitions...")
    for i, (raw_id, raw_name) in enumerate(zip(meta["cohortId"], meta["cohortName"]), 1):
        cohort_id = int(raw_id)
        cohort_name = str(raw_name)
        url = PHENOTYPE_JSON_URL.format(cohort_id=cohort_id)

        try:
            with urllib.request.urlopen(url, timeout=30) as resp:
                json_str = resp.read().decode("utf-8")
            expression = cohort_expression_from_json(json_str)
            cds.add(cohort_id=cohort_id, cohort_name=cohort_name, expression=expression)
        except Exception as exc:
            # Deliberately broad: one bad definition is recorded and skipped
            # so it cannot abort the whole benchmark run.
            failures.append((cohort_id, str(exc)))

        if i % 100 == 0 or i == total:
            ok = i - len(failures)
            print(f" {i}/{total} parsed={ok} failed={len(failures)}")

    return cds, failures


# ---------------------------------------------------------------------------
# Step 2 — connect to Databricks
# ---------------------------------------------------------------------------


def connect_databricks() -> "ibis.BaseBackend":
    """Open an ibis Databricks connection from environment variables.

    Reads ``DATABRICKS_HOST``, ``DATABRICKS_HTTP_PATH`` and
    ``DATABRICKS_TOKEN`` from the environment, and takes the default catalog
    and schema from the module-level ``SCRATCH_SCHEMA``.

    Returns:
        The connected ibis backend.

    Raises:
        KeyError: If one of the three environment variables is not set.
    """
    host = os.environ["DATABRICKS_HOST"]
    http_path = os.environ["DATABRICKS_HTTP_PATH"]
    token = os.environ["DATABRICKS_TOKEN"]

    # SCRATCH_SCHEMA is either "catalog.schema" or a bare "schema".
    parts = SCRATCH_SCHEMA.split(".", 1)
    catalog = parts[0] if len(parts) == 2 else None
    schema = parts[1] if len(parts) == 2 else parts[0]

    print(f"\nConnecting to Databricks: {host} (catalog={catalog}, schema={schema})")
    backend = ibis.databricks.connect(
        server_hostname=host,
        http_path=http_path,
        access_token=token,
        catalog=catalog,
        schema=schema,
    )
    print(" Connected.")
    return backend
def run_generation(cds: CohortDefinitionSet, backend: ibis.BaseBackend) -> tuple:
    """Generate the cohort set twice and time both passes.

    Both passes call ``generate_cohort_set`` with ``incremental=True`` and
    the same checksum table. The first pass is the full run; the second
    repeats the call with unchanged definitions.

    Returns:
        ``(results_run1, run1_seconds, run2_seconds)``.
    """

    def _timed_pass() -> tuple:
        # One generate_cohort_set call with the benchmark's fixed settings,
        # returned together with its wall-clock duration.
        started = time.perf_counter()
        outcome = generate_cohort_set(
            cds,
            backend=backend,
            cdm_schema=CDM_SCHEMA,
            cohort_table=COHORT_TABLE,
            results_schema=SCRATCH_SCHEMA,
            incremental=True,
            checksum_table=CHECKSUM_TABLE,
            stop_on_error=False,
        )
        return outcome, time.perf_counter() - started

    # Pass 1: incremental=True so that checksums get saved.
    print(f"\nRun 1: generating {len(cds)} cohorts (full run, saves checksums)...")
    results_run1, run1_seconds = _timed_pass()
    first_summary = summarise_generation_results(results_run1)
    print(
        f" {run1_seconds:.1f}s COMPLETE={first_summary['COMPLETE']} FAILED={first_summary['FAILED']}"
    )

    # Pass 2: same definitions again.
    print("Run 2: incremental re-run (unchanged definitions — all should be skipped)...")
    results_run2, run2_seconds = _timed_pass()
    second_summary = summarise_generation_results(results_run2)
    if run2_seconds > 0:
        speedup = run1_seconds / run2_seconds
    else:
        speedup = float("inf")
    print(f" {run2_seconds:.2f}s SKIPPED={second_summary['SKIPPED']} speedup={speedup:.1f}x")

    return results_run1, run1_seconds, run2_seconds
# ---------------------------------------------------------------------------
# Step 4 — write results
# ---------------------------------------------------------------------------


def write_results(
    backend: ibis.BaseBackend,
    results_run1: list,
    parse_failures: list,
    meta: pd.DataFrame,
) -> None:
    """Write one CSV row per cohort to ``RESULTS_CSV``.

    Args:
        backend: Connected ibis backend holding the generated cohort table.
        results_run1: Generation results of the full run; each item exposes
            ``cohort_id``, ``cohort_name``, ``status``, ``start_time``,
            ``end_time``, ``checksum`` and ``error``.
        parse_failures: ``(cohort_id, error message)`` tuples for cohorts
            that never reached generation.
        meta: Phenotype metadata, used to look up names of failed cohorts.
    """
    # Count rows per cohort inside the database. Pulling the whole cohort
    # table to the client just to count it does not scale.
    cohort = backend.table(COHORT_TABLE, database=SCRATCH_SCHEMA)
    counts_df = cohort.group_by("cohort_definition_id").aggregate(row_count=cohort.count()).execute()
    row_counts = {
        int(cohort_id): int(n_rows)
        for cohort_id, n_rows in zip(counts_df["cohort_definition_id"], counts_df["row_count"])
    }

    rows = [
        {
            "cohortId": r.cohort_id,
            "cohortName": r.cohort_name,
            "status": r.status,
            "generation_seconds": (r.end_time - r.start_time).total_seconds(),
            "row_count": row_counts.get(r.cohort_id, 0),
            "checksum": r.checksum,
            "error": str(r.error) if r.error else "",
        }
        for r in results_run1
    ]

    for cohort_id, err in parse_failures:
        name_row = meta.loc[meta["cohortId"] == cohort_id, "cohortName"]
        rows.append(
            {
                "cohortId": cohort_id,
                "cohortName": name_row.iloc[0] if len(name_row) else "",
                "status": "PARSE_FAILED",
                "generation_seconds": 0,
                "row_count": 0,
                "checksum": "",
                "error": err,
            }
        )

    pd.DataFrame(rows).to_csv(RESULTS_CSV, index=False)
    print(f"\nResults written to {RESULTS_CSV}")


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------


def main() -> None:
    """Run the benchmark end to end and print a summary."""
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    meta = load_phenotypes().head(N_COHORTS)
    backend = connect_databricks()
    cds, parse_failures = build_cohort_definition_set(meta)

    print(f"\nCohortDefinitionSet: {len(cds)} cohorts ({len(parse_failures)} parse failures)")
    print(f"CDM schema : {CDM_SCHEMA}")
    print(f"Cohort schema : {SCRATCH_SCHEMA}")

    results_run1, run1_s, run2_s = run_generation(cds, backend)
    write_results(backend, results_run1, parse_failures, meta)

    s = summarise_generation_results(results_run1)
    total = len(meta)
    # Same guard as run_generation: a zero-length second run must not raise
    # ZeroDivisionError after all the work has completed.
    speedup = run1_s / run2_s if run2_s > 0 else float("inf")
    print(f"\n{'=' * 55}")
    print("CircePy benchmark complete")
    print(f" Total phenotypes : {total}")
    print(f" Parse failures : {len(parse_failures)}")
    print(f" Generation COMPLETE : {s['COMPLETE']}")
    print(f" Generation FAILED : {s['FAILED']}")
    print(f" Full run time : {run1_s:.1f}s")
    print(f" Incremental run time : {run2_s:.2f}s ({speedup:.1f}x speedup)")
    print(f"{'=' * 55}")


if __name__ == "__main__":
    main()
# CDM data must exist at healthverity_cc.cdm_healthverity_cc_all_v3910
#
# Usage:
#   Rscript examples/benchmark_run_r.R
#
# Output files (in benchmark_output/):
#   r_results.csv -- per-cohort timing and status

suppressPackageStartupMessages({
  library(dplyr)
  library(PhenotypeLibrary)
  library(CohortGenerator)
  library(DatabaseConnector)
})

# ---------------------------------------------------------------------------
# Paths and credentials
# ---------------------------------------------------------------------------
file_arg <- grep("--file=", commandArgs(trailingOnly = FALSE), value = TRUE)
if (length(file_arg) == 0) {
  stop("Cannot locate the script path; run this file with Rscript.")
}
script_path <- normalizePath(sub("--file=", "", file_arg[1]))
REPO_ROOT <- dirname(dirname(script_path)) # examples/ -> repo root
OUTPUT_DIR <- file.path(REPO_ROOT, "benchmark_output")
if (!dir.exists(OUTPUT_DIR)) dir.create(OUTPUT_DIR, recursive = TRUE)

# Load .Renviron from the repo root (supplements the user-level ~/.Renviron)
renviron_path <- file.path(REPO_ROOT, ".Renviron")
if (file.exists(renviron_path)) readRenviron(renviron_path)

R_RESULTS_CSV <- file.path(OUTPUT_DIR, "r_results.csv")

DB_HOST <- Sys.getenv("DATABRICKS_HOST")
DB_HTTP_PATH <- Sys.getenv("DATABRICKS_HTTP_PATH")
DB_TOKEN <- Sys.getenv("DATABRICKS_TOKEN")
SCRATCH_SCHEMA <- Sys.getenv("DATABRICKS_SCRATCH_SCHEMA")
CDM_SCHEMA <- "healthverity_cc.cdm_healthverity_cc_all_v3910"
COHORT_TABLE <- "cohort_r"
N_COHORTS <- 40 # number of cohorts to benchmark

# Report the real environment variable names, not the R variable names,
# so the message tells the user what to add to .Renviron.
required_env <- c(
  DATABRICKS_HOST = DB_HOST,
  DATABRICKS_HTTP_PATH = DB_HTTP_PATH,
  DATABRICKS_TOKEN = DB_TOKEN,
  DATABRICKS_SCRATCH_SCHEMA = SCRATCH_SCHEMA
)
missing_env <- names(required_env)[required_env == ""]
if (length(missing_env) > 0) {
  stop(sprintf(
    "Environment variable(s) not set: %s. Check .Renviron.",
    paste(missing_env, collapse = ", ")
  ))
}

cat(sprintf("Databricks host : %s\n", DB_HOST))
cat(sprintf("CDM schema : %s\n", CDM_SCHEMA))
cat(sprintf("Cohort schema : %s\n", SCRATCH_SCHEMA))
cat(sprintf("Output directory : %s\n", OUTPUT_DIR))

# ---------------------------------------------------------------------------
# 1. Load PhenotypeLibrary
# ---------------------------------------------------------------------------
cat("\nLoading OHDSI PhenotypeLibrary...\n")
phenotype_log <- PhenotypeLibrary::getPhenotypeLog()
all_ids <- phenotype_log$cohortId
cat(sprintf(" %d cohort IDs found in phenotype log\n", length(all_ids)))

# head() never pads with NA when fewer than N_COHORTS ids are available,
# unlike all_ids[1:N_COHORTS].
cds <- PhenotypeLibrary::getPlCohortDefinitionSet(cohortIds = head(all_ids, N_COHORTS))
cat(sprintf(" %d cohort definitions loaded (first %d)\n", nrow(cds), N_COHORTS))

# ---------------------------------------------------------------------------
# 2. Connect to Databricks
# ---------------------------------------------------------------------------
cat("\nConnecting to Databricks...\n")
conn_string <- paste0(
  "jdbc:databricks://", DB_HOST, ":443/default;",
  "transportMode=http;ssl=1;",
  "httpPath=", DB_HTTP_PATH, ";",
  "AuthMech=3;UID=token;PWD=", DB_TOKEN
)
conn_details <- DatabaseConnector::createConnectionDetails(
  dbms = "spark",
  connectionString = conn_string
)

# Verify connection and CDM access. `finally` closes the connection on both
# the success and the error path.
conn_check <- DatabaseConnector::connect(conn_details)
tryCatch(
  {
    # snakeCaseToCamelCase = TRUE gives a known column name (`n`) regardless
    # of how the driver cases the returned column.
    test <- DatabaseConnector::querySql(
      conn_check,
      sprintf("SELECT COUNT(*) AS n FROM %s.person", CDM_SCHEMA),
      snakeCaseToCamelCase = TRUE
    )
    cat(sprintf(" CDM verified: %s.person has %.0f rows\n", CDM_SCHEMA, as.numeric(test$n)))
  },
  error = function(e) {
    stop(sprintf("Cannot access CDM at %s: %s", CDM_SCHEMA, conditionMessage(e)))
  },
  finally = DatabaseConnector::disconnect(conn_check)
)

# ---------------------------------------------------------------------------
# 3. Create cohort tables (idempotent — drops and recreates)
# ---------------------------------------------------------------------------
cat("\nCreating cohort tables in", SCRATCH_SCHEMA, "...\n")
cohort_table_names <- CohortGenerator::getCohortTableNames(cohortTable = COHORT_TABLE)

# Drop existing tables for a clean run
conn_drop <- DatabaseConnector::connect(conn_details)
for (tbl in unlist(cohort_table_names)) {
  tryCatch(
    DatabaseConnector::executeSql(
      conn_drop,
      sprintf("DROP TABLE IF EXISTS %s.%s", SCRATCH_SCHEMA, tbl),
      reportOverallTime = FALSE
    ),
    error = function(e) NULL
  )
}
DatabaseConnector::disconnect(conn_drop)

CohortGenerator::createCohortTables(
  connectionDetails = conn_details,
  cohortDatabaseSchema = SCRATCH_SCHEMA,
  cohortTableNames = cohort_table_names,
  incremental = FALSE
)

# ---------------------------------------------------------------------------
# 4. Generate cohorts — single persistent connection
# ---------------------------------------------------------------------------
cat(sprintf("\nGenerating %d cohorts with R CohortGenerator...\n", nrow(cds)))

stats_list <- list()
t_start <- proc.time()
n_cohorts <- nrow(cds)

# SqlRender (spark dialect) materialises #temp tables as real tables in
# tempEmulationSchema. They are dropped before each cohort so they don't
# conflict.
temp_tables <- c(
  "Codesets", "qualified_events", "inclusion_events",
  "included_events", "inclusion_rules", "best_events",
  "cohort_rows", "final_cohort",
  paste0("Inclusion_", 0:20)
)

# One connection for the entire run — avoids per-cohort reconnection overhead.
global_conn <- DatabaseConnector::connect(conn_details)

tryCatch(
  {
    for (cohort_i in seq_len(n_cohorts)) {
      one_cds <- cds[cohort_i, ]

      for (tmp_tbl in temp_tables) {
        tryCatch(
          DatabaseConnector::executeSql(
            global_conn,
            sprintf("DROP TABLE IF EXISTS %s.%s", SCRATCH_SCHEMA, tmp_tbl),
            reportOverallTime = FALSE
          ),
          error = function(e) NULL
        )
      }

      cohort_stats <- tryCatch(
        CohortGenerator::generateCohortSet(
          connection = global_conn,
          cdmDatabaseSchema = CDM_SCHEMA,
          cohortDatabaseSchema = SCRATCH_SCHEMA,
          tempEmulationSchema = SCRATCH_SCHEMA,
          cohortTableNames = cohort_table_names,
          cohortDefinitionSet = one_cds,
          stopOnError = FALSE,
          incremental = FALSE
        ),
        error = function(e) {
          cat(sprintf(" Cohort %d/%d error: %s\n", cohort_i, n_cohorts, conditionMessage(e)))
          NULL
        }
      )

      if (!is.null(cohort_stats)) {
        stats_list[[length(stats_list) + 1]] <- cohort_stats
      }

      cat(sprintf(" %d/%d cohorts processed\n", cohort_i, n_cohorts))
    }
  },
  # Close the long-lived connection even when the loop aborts.
  finally = DatabaseConnector::disconnect(global_conn)
)

elapsed <- (proc.time() - t_start)[["elapsed"]]
cat(sprintf(" Done in %.1f seconds\n", elapsed))

if (length(stats_list) == 0) {
  stop("No cohort produced generation statistics; nothing to summarise.")
}
generation_stats <- do.call(rbind, stats_list)

# ---------------------------------------------------------------------------
# 5. Summarise and write results
# ---------------------------------------------------------------------------
conn <- DatabaseConnector::connect(conn_details)
cohort_counts <- tryCatch(
  DatabaseConnector::querySql(
    conn,
    sprintf(
      "SELECT cohort_definition_id, COUNT(*) AS row_count FROM %s.%s GROUP BY 1",
      SCRATCH_SCHEMA, COHORT_TABLE
    ),
    # Yields the columns cohortDefinitionId / rowCount, whatever case the
    # driver reports.
    snakeCaseToCamelCase = TRUE
  ),
  finally = DatabaseConnector::disconnect(conn)
)

results <- generation_stats %>%
  left_join(cohort_counts, by = c("cohortId" = "cohortDefinitionId")) %>%
  mutate(
    # COUNT(*) may arrive as integer, double or integer64; normalise first.
    row_count = coalesce(as.numeric(rowCount), 0),
    generation_seconds = as.numeric(endTime - startTime, units = "secs"),
    status = generationStatus
  ) %>%
  select(cohortId, cohortName, status, generation_seconds, row_count)

write.csv(results, R_RESULTS_CSV, row.names = FALSE)

n_complete <- sum(results$status == "COMPLETE")
n_failed <- sum(results$status == "FAILED")
cat(sprintf("\nR benchmark complete\n"))
cat(sprintf(" COMPLETE : %d\n", n_complete))
cat(sprintf(" FAILED : %d\n", n_failed))
cat(sprintf(" Total rows in %s : %.0f\n", COHORT_TABLE, sum(results$row_count)))
cat(sprintf(" Results written to %s\n", R_RESULTS_CSV))
create mode 100644 benchmarks/benchmark_analyze_duckdb.py rename {examples => benchmarks}/benchmark_db_config.yaml (100%) create mode 100644 benchmarks/benchmark_run_py.py create mode 100644 benchmarks/benchmark_run_r.R create mode 100644 benchmarks/export_phenotypes.R delete mode 100644 examples/benchmark_analysis.ipynb delete mode 100644 examples/benchmark_run_python.py delete mode 100644 examples/benchmark_run_r.R diff --git a/benchmarks/README.md b/benchmarks/README.md new file mode 100644 index 0000000..d591ae1 --- /dev/null +++ b/benchmarks/README.md @@ -0,0 +1,67 @@ +# CircePy CohortGeneration Benchmarks + +R and Python benchmarks that generate OHDSI PhenotypeLibrary cohort definitions +against the Eunomia synthetic OMOP CDM (DuckDB backend). Measures per-cohort +generation time and compares R `CohortGenerator` vs Python `circe` performance. + +## Prerequisites + +Install the required R packages: + +```r +install.packages("remotes") +remotes::install_github("OHDSI/Eunomia") +remotes::install_github("OHDSI/CohortGenerator") +remotes::install_github("OHDSI/DatabaseConnector") +remotes::install_github("OHDSI/PhenotypeLibrary") +``` + +Install the Python package with DuckDB support: + +```bash +pip install -e ".[dev]" +``` + +## Quick Start + +```bash +# 1. Export PhenotypeLibrary cohort JSONs (one-time setup) +Rscript benchmarks/export_phenotypes.R + +# 2. Run R benchmark (creates Eunomia DuckDB, generates cohorts) +Rscript benchmarks/benchmark_run_r.R + +# 3. Run Python benchmark (reuses same DuckDB, generates cohorts) +python benchmarks/benchmark_run_py.py + +# 4. 
Analyze and compare results +python benchmarks/benchmark_analyze_duckdb.py +``` + +## Files + +| File | Language | Purpose | +|------|----------|---------| +| `export_phenotypes.R` | R | Exports PhenotypeLibrary cohort JSONs for Python consumption | +| `benchmark_run_r.R` | R | Runs R CohortGenerator against Eunomia DuckDB | +| `benchmark_run_py.py` | Python | Runs Python `generate_cohort_set()` against Eunomia DuckDB | +| `benchmark_analyze_duckdb.py` | Python | Side-by-side comparison of R vs Python timing, cross-validation | +| `benchmark_db_config.yaml` | — | Database backend configurations | + +## Output + +All output is written to `benchmark_output/`: + +| File | Source | Description | +|------|--------|-------------| +| `eunomia.duckdb` | R | Persistent DuckDB with Eunomia GiBleed CDM | +| `phenotype_jsons/` | R export | One Circe JSON per PhenotypeLibrary cohort | +| `phenotype_manifest.csv` | R export | Cohort ID and name mapping | +| `r_checksum_times.csv` | R benchmark | Per-cohort generation timing from R | +| `py_checksum_times.csv` | Python benchmark | Per-cohort generation timing from Python | + +## Incremental Mode + +Both benchmarks use incremental generation. After the first run, unchanged cohorts +are skipped based on SHA-256 checksums of their definitions. Timing from the +original run is preserved in the checksum history table. diff --git a/benchmarks/benchmark_analyze_duckdb.py b/benchmarks/benchmark_analyze_duckdb.py new file mode 100644 index 0000000..09cf5cf --- /dev/null +++ b/benchmarks/benchmark_analyze_duckdb.py @@ -0,0 +1,218 @@ +#!/usr/bin/env python3 +"""Analyse R and Python benchmark results side-by-side. + +Reads the checksum timing CSVs produced by +:file:`benchmarks/benchmark_run_r.R` and :file:`benchmarks/benchmark_run_py.py`, +queries the persisted history tables directly from the DuckDB database for +cross-validation, and prints a paper-ready comparative summary. 
+ +Usage:: + + python benchmarks/benchmark_analyze_duckdb.py +""" + +from __future__ import annotations + +from pathlib import Path + +import ibis +import pandas as pd + +REPO_ROOT = Path(__file__).resolve().parent.parent +OUTPUT_DIR = REPO_ROOT / "benchmark_output" +DUCKDB_PATH = OUTPUT_DIR / "eunomia.duckdb" + +R_CSV = OUTPUT_DIR / "r_checksum_times.csv" +PY_CSV = OUTPUT_DIR / "py_checksum_times.csv" + +R_COHORT_TABLE = "cohort" +PY_COHORT_TABLE = "cohort_py" +R_CHECKSUM_TABLE = "cohort_checksum" +PY_CHECKSUM_TABLE = "cohort_py_checksum" + + +def load_csv(path: Path) -> pd.DataFrame | None: + """Load a benchmark timing CSV, returning None if it does not exist.""" + if not path.exists(): + print(f" [WARN] {path} not found — skipping") + return None + return pd.read_csv(path) + + +def _has_status(df: pd.DataFrame) -> bool: + return "status" in df.columns + + +def print_coverage(label: str, df: pd.DataFrame) -> None: + n = len(df) + if _has_status(df): + n_ok = (df["status"] == "COMPLETE").sum() + n_fail = (df["status"] == "FAILED").sum() + n_skip = (df["status"] == "SKIPPED").sum() + print(f" {label}: {n} cohorts — {n_ok} COMPLETE, {n_fail} FAILED, {n_skip} SKIPPED") + else: + print(f" {label}: {n} cohorts (checksum table — all assumed COMPLETE)") + + +def print_timing(label: str, df: pd.DataFrame) -> None: + if _has_status(df): + complete = df[df["status"] == "COMPLETE"] + else: + complete = df # checksum table — all rows are COMPLETE + if complete.empty: + print(f" {label}: no completed cohorts to report timing") + return + secs = complete["generation_seconds"] + print(f" {label} timing (n={len(secs)}):") + print(f" Total : {secs.sum():.4f}s") + print(f" Mean : {secs.mean():.4f}s") + print(f" Median: {secs.median():.4f}s") + print(f" Std : {secs.std():.4f}s") + print(f" Min : {secs.min():.4f}s") + print(f" Max : {secs.max():.4f}s") + + +def cross_validate(label: str, csv_df: pd.DataFrame, backend: ibis.BaseBackend, + cohort_table: str, checksum_table: str) -> 
None: + """Read the persisted checksum table and compare with the CSV.""" + try: + history = backend.table(checksum_table, database="main").execute() + except Exception: + print(f" {label} cross-validation: checksum table '{checksum_table}' not found") + return + + if history.empty: + print(f" {label} cross-validation: checksum table is empty") + return + + if _has_status(csv_df): + complete_csv = csv_df[csv_df["status"] == "COMPLETE"] + else: + complete_csv = csv_df + if complete_csv.empty: + return + + # Compare count of COMPLETE rows + history_complete = history[history["status"] == "COMPLETE"] if _has_status(history) else history + print(f" {label} cross-validation:") + print(f" CSV rows : {len(complete_csv)}") + print(f" DB rows : {len(history_complete)}") + + # Compare total timing + csv_total = complete_csv["generation_seconds"].sum() + if "start_time" in history_complete.columns and "end_time" in history_complete.columns: + starts = history_complete["start_time"] + ends = history_complete["end_time"] + if pd.api.types.is_datetime64_any_dtype(starts): + db_total = (ends - starts).dt.total_seconds().sum() + else: + db_total = ((ends.astype(float) - starts.astype(float)) / 1000.0).sum() + delta = abs(csv_total - db_total) + print(f" CSV total time : {csv_total:.4f}s") + print(f" DB total time : {db_total:.4f}s") + print(f" Delta : {delta:.4f}s {'✓' if delta < 1.0 else '✗'}") + + +def print_cohort_row_counts(label: str, backend: ibis.BaseBackend, + cohort_table: str) -> None: + """Print row count summary from the cohort output table.""" + try: + rows = backend.table(cohort_table, database="main").execute() + except Exception: + print(f" {label} row counts: table '{cohort_table}' not found") + return + if rows.empty: + print(f" {label}: no cohort rows") + return + counts = rows.groupby("cohort_definition_id").size() + print(f" {label} row counts: {len(counts)} cohorts, {counts.sum()} total rows") + print(f" Mean per cohort: {counts.mean():.1f}") + + +def 
compare_shared(label_prefix: str, r_df: pd.DataFrame, py_df: pd.DataFrame) -> None: + """Compare timing for cohorts present in both runs.""" + r_complete = (r_df[r_df["status"] == "COMPLETE"].copy() if _has_status(r_df) else r_df.copy()) + py_complete = (py_df[py_df["status"] == "COMPLETE"].copy() if _has_status(py_df) else py_df.copy()) + if r_complete.empty or py_complete.empty: + return + + r_lookup = r_complete.set_index("cohort_definition_id")["generation_seconds"] + py_lookup = py_complete.set_index("cohort_definition_id")["generation_seconds"] + shared = r_lookup.index.intersection(py_lookup.index) + if len(shared) < 2: + return + + r_shared = r_lookup[shared] + py_shared = py_lookup[shared] + ratio = (py_shared / r_shared.replace(0, float("nan"))).dropna() + ratio = ratio.replace([float("inf"), -float("inf")], float("nan")).dropna() + + print(f"\n {label_prefix} per-cohort comparison ({len(shared)} shared cohorts):") + print(f" R total (shared) : {r_shared.sum():.4f}s") + print(f" Py total (shared) : {py_shared.sum():.4f}s") + print(f" R mean (shared) : {r_shared.mean():.4f}s") + print(f" Py mean (shared) : {py_shared.mean():.4f}s") + if len(ratio) > 0: + print(f" Py/R ratio median : {ratio.median():.2f}x") + + +def main() -> None: + print("=" * 60) + print("R vs Python CohortGenerator Benchmark Comparison") + print("=" * 60) + + r_df = load_csv(R_CSV) + py_df = load_csv(PY_CSV) + + if r_df is None and py_df is None: + print("\nNo benchmark results found. 
Run the benchmarks first:") + print(" Rscript benchmarks/benchmark_run_r.R") + print(" python benchmarks/benchmark_run_py.py") + return + + # ── Coverage ──────────────────────────────────────────────────────── + print("\nTable 1 — Coverage") + if r_df is not None: + print_coverage("R ", r_df) + if py_df is not None: + print_coverage("Py", py_df) + + # ── Generation timing ──────────────────────────────────────────────── + print("\nTable 2 — Generation timing") + if r_df is not None: + print_timing("R ", r_df) + if py_df is not None: + print_timing("Py", py_df) + + # ── Cross-validation against persisted checksum tables ────────────── + if DUCKDB_PATH.exists(): + print("\nTable 3 — Cross-validation (CSV vs persisted checksum table)") + backend = ibis.duckdb.connect(str(DUCKDB_PATH)) + if r_df is not None: + cross_validate("R ", r_df, backend, R_COHORT_TABLE, R_CHECKSUM_TABLE) + if py_df is not None: + cross_validate("Py", py_df, backend, PY_COHORT_TABLE, PY_CHECKSUM_TABLE) + else: + print(f"\nTable 3 — Cross-validation: {DUCKDB_PATH} not found, skipping") + + # ── Cohort row counts ──────────────────────────────────────────────── + if DUCKDB_PATH.exists(): + print("\nTable 4 — Cohort row counts") + backend = ibis.duckdb.connect(str(DUCKDB_PATH)) + if r_df is not None: + print_cohort_row_counts("R ", backend, R_COHORT_TABLE) + if py_df is not None: + print_cohort_row_counts("Py", backend, PY_COHORT_TABLE) + + # ── R vs Python shared-cohort comparison ───────────────────────────── + if r_df is not None and py_df is not None: + print("\nTable 5 — R vs Python shared-cohort comparison") + compare_shared("=>", r_df, py_df) + + print(f"\n{'=' * 60}") + print("Analysis complete") + print(f"{'=' * 60}\n") + + +if __name__ == "__main__": + main() diff --git a/examples/benchmark_db_config.yaml b/benchmarks/benchmark_db_config.yaml similarity index 100% rename from examples/benchmark_db_config.yaml rename to benchmarks/benchmark_db_config.yaml diff --git 
a/benchmarks/benchmark_run_py.py b/benchmarks/benchmark_run_py.py new file mode 100644 index 0000000..8fd3abf --- /dev/null +++ b/benchmarks/benchmark_run_py.py @@ -0,0 +1,138 @@ +#!/usr/bin/env python3 +"""Runnable Python benchmark of PhenotypeLibrary cohorts on Eunomia (DuckDB). + +Usage:: + + # Export PhenotypeLibrary cohort JSONs (one-time setup) + Rscript benchmarks/export_phenotypes.R + + # Required: create the Eunomia DuckDB (this script reuses the database built by the R benchmark) + Rscript benchmarks/benchmark_run_r.R + + # Run the Python benchmark + python benchmarks/benchmark_run_py.py + +Output (written to *benchmark_output/*):: + + py_checksum_times.csv -- per-phenotype generation timing and status +""" + +from __future__ import annotations + +import logging +from pathlib import Path + +import ibis +import pandas as pd + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", + datefmt="%H:%M:%S", +) + +from circe.cohort_definition_set import CohortDefinitionSet, generate_cohort_set +from circe.cohortdefinition import CohortExpression + +REPO_ROOT = Path(__file__).resolve().parent.parent +OUTPUT_DIR = REPO_ROOT / "benchmark_output" +JSON_DIR = OUTPUT_DIR / "phenotype_jsons" +MANIFEST_PATH = OUTPUT_DIR / "phenotype_manifest.csv" +DUCKDB_PATH = OUTPUT_DIR / "eunomia.duckdb" +RESULTS_CSV = OUTPUT_DIR / "py_checksum_times.csv" + +COHORT_TABLE = "cohort_py" +CHECKSUM_TABLE = "cohort_py_checksum" +CDM_SCHEMA = "main" + + +def main() -> None: + # ── 1. Load phenotype definitions ──────────────────────────────────── + print("Loading phenotype definitions ...") + if not MANIFEST_PATH.exists(): + raise FileNotFoundError( + f"{MANIFEST_PATH} not found. Run 'Rscript benchmarks/export_phenotypes.R' first." 
+ ) + + manifest = pd.read_csv(MANIFEST_PATH) + print(f" Manifest has {len(manifest)} cohorts") + + cds = CohortDefinitionSet() + skipped = 0 + for _, row in manifest.iterrows(): + cohort_id = int(row["cohortId"]) + cohort_name = str(row["cohortName"]) + json_path = JSON_DIR / f"{cohort_id}.json" + if not json_path.exists(): + skipped += 1 + continue + expression = CohortExpression.model_validate_json(json_path.read_text()) + cds.add(cohort_id=cohort_id, cohort_name=cohort_name, expression=expression) + + if skipped: + print(f" Skipped {skipped} cohorts with missing JSON files") + print(f" Loaded {len(cds)} cohorts into CohortDefinitionSet") + + # ── 2. Connect to DuckDB ───────────────────────────────────────────── + if not DUCKDB_PATH.exists(): + raise FileNotFoundError( + f"{DUCKDB_PATH} not found. Run 'Rscript benchmarks/benchmark_run_r.R' first." + ) + print(f"Connecting to DuckDB: {DUCKDB_PATH}") + backend = ibis.duckdb.connect(str(DUCKDB_PATH)) + + # ── 3. Generate cohorts ────────────────────────────────────────────── + print("Generating cohorts (incremental) ...") + results = generate_cohort_set( + cds, + backend=backend, + cdm_schema=CDM_SCHEMA, + cohort_table=COHORT_TABLE, + results_schema=CDM_SCHEMA, + vocabulary_schema=CDM_SCHEMA, + incremental=True, + checksum_table=CHECKSUM_TABLE, + stop_on_error=False, + ) + + # ── 4. Extract timing ──────────────────────────────────────────────── + print("Extracting timing ...") + rows = [] + for r in results: + generation_seconds = (r.end_time - r.start_time).total_seconds() + rows.append( + { + "cohort_definition_id": r.cohort_id, + "cohort_name": r.cohort_name, + "checksum": r.checksum, + "status": r.status, + "generation_seconds": generation_seconds, + "start_time": r.start_time.isoformat(), + "end_time": r.end_time.isoformat(), + } + ) + + df = pd.DataFrame(rows) + df.to_csv(RESULTS_CSV, index=False) + print(f" Wrote {len(df)} rows to {RESULTS_CSV}") + + # ── 5. 
Summary ──────────────────────────────────────────────────────── + complete_df = df[df["status"] == "COMPLETE"] + failed_df = df[df["status"] == "FAILED"] + skipped_df = df[df["status"] == "SKIPPED"] + + print(f"\n{'=' * 55}") + print("Python benchmark complete") + print(f" Phenotypes loaded : {len(manifest)}") + print(f" COMPLETE : {len(complete_df)}") + print(f" FAILED : {len(failed_df)}") + print(f" SKIPPED : {len(skipped_df)}") + if len(complete_df) > 0: + print(f" Total time (sum) : {complete_df['generation_seconds'].sum():.4f}s") + print(f" Median per-cohort : {complete_df['generation_seconds'].median():.4f}s") + print(f" Timings written to : {RESULTS_CSV}") + print(f"{'=' * 55}\n") + + +if __name__ == "__main__": + main() diff --git a/benchmarks/benchmark_run_r.R b/benchmarks/benchmark_run_r.R new file mode 100644 index 0000000..00942a4 --- /dev/null +++ b/benchmarks/benchmark_run_r.R @@ -0,0 +1,107 @@ +#!/usr/bin/env Rscript +# benchmark_run_r.R +# +# R CohortGenerator benchmark — PhenotypeLibrary on Eunomia (DuckDB). 
+# +# Usage: +# Rscript benchmarks/benchmark_run_r.R +# +# Output (in benchmark_output/): +# r_checksum_times.csv -- per-phenotype generation timing from checksum table + +suppressPackageStartupMessages({ + library(CohortGenerator) + library(Eunomia) + library(DatabaseConnector) + library(PhenotypeLibrary) + library(dplyr) +}) + +# --------------------------------------------------------------------------- +# Paths +# --------------------------------------------------------------------------- +script_path <- normalizePath(sub("--file=", "", commandArgs()[grep("--file=", commandArgs())])) +REPO_ROOT <- dirname(dirname(script_path)) +OUTPUT_DIR <- file.path(REPO_ROOT, "benchmark_output") +dir.create(OUTPUT_DIR, showWarnings = FALSE, recursive = TRUE) + +EUNOMIA_DATA_DIR <- file.path(REPO_ROOT, "eunomia_data") +dir.create(EUNOMIA_DATA_DIR, showWarnings = FALSE, recursive = TRUE) +Sys.setenv(EUNOMIA_DATA_FOLDER = EUNOMIA_DATA_DIR) + +DUCKDB_PATH <- file.path(OUTPUT_DIR, "eunomia.duckdb") + +# --------------------------------------------------------------------------- +# 1. Load phenotype definitions from PhenotypeLibrary +# --------------------------------------------------------------------------- +cat("Loading phenotypes from PhenotypeLibrary...\n") +phenotype_log <- PhenotypeLibrary::getPhenotypeLog() +cds <- PhenotypeLibrary::getPlCohortDefinitionSet(cohortIds = phenotype_log$cohortId) +cat(sprintf(" Loaded %d phenotype definitions\n", nrow(cds))) + +# --------------------------------------------------------------------------- +# 2. 
Set up Eunomia DuckDB database +# --------------------------------------------------------------------------- +cat("Setting up Eunomia DuckDB...\n") +if (!file.exists(DUCKDB_PATH)) { + dbPath <- Eunomia::getDatabaseFile( + datasetName = "GiBleed", + dbms = "duckdb", + databaseFile = DUCKDB_PATH + ) +} else { + dbPath <- DUCKDB_PATH +} +cat(sprintf(" Database: %s\n", dbPath)) + +connectionDetails <- DatabaseConnector::createConnectionDetails( + dbms = "duckdb", + server = dbPath +) + +# --------------------------------------------------------------------------- +# 3. Generate cohorts using runCohortGeneration (incremental mode) +# --------------------------------------------------------------------------- +cat("Generating cohorts (incremental)...\n") +CohortGenerator::runCohortGeneration( + connectionDetails = connectionDetails, + cdmDatabaseSchema = "main", + cohortDatabaseSchema = "main", + cohortDefinitionSet = cds, + incremental = TRUE, + outputFolder = OUTPUT_DIR, + databaseId = "eunomia", + stopOnError = FALSE +) + +# --------------------------------------------------------------------------- +# 4. 
Extract timing from checksum table +# --------------------------------------------------------------------------- +cat("Extracting checksum timing...\n") +checksums <- CohortGenerator::getLastGeneratedCohortChecksums( + connectionDetails = connectionDetails, + cohortDatabaseSchema = "main" +) + +times <- checksums %>% + transmute( + cohort_definition_id = cohortDefinitionId, + checksum = checksum, + generation_seconds = as.numeric(difftime(endTime, startTime, units = "secs")), + start_time = startTime, + end_time = endTime + ) + +out_file <- file.path(OUTPUT_DIR, "r_checksum_times.csv") +write.csv(times, out_file, row.names = FALSE) + +# --------------------------------------------------------------------------- +# Summary +# --------------------------------------------------------------------------- +cat(sprintf("\n%s\n", paste(rep("=", 55), collapse = ""))) +cat(sprintf("R benchmark complete\n")) +cat(sprintf(" Phenotypes loaded : %d\n", nrow(cds))) +cat(sprintf(" Cohorts generated : %d\n", nrow(times))) +cat(sprintf(" Total time (sum) : %.4fs\n", sum(times$generation_seconds, na.rm = TRUE))) +cat(sprintf(" Times written to : %s\n", out_file)) +cat(sprintf("%s\n\n", paste(rep("=", 55), collapse = ""))) diff --git a/benchmarks/export_phenotypes.R b/benchmarks/export_phenotypes.R new file mode 100644 index 0000000..7f6f088 --- /dev/null +++ b/benchmarks/export_phenotypes.R @@ -0,0 +1,43 @@ +#!/usr/bin/env Rscript +# export_phenotypes.R +# +# Export PhenotypeLibrary cohort JSONs for Python consumption. 
+# +# Usage: +# Rscript benchmarks/export_phenotypes.R +# +# Output: +# benchmark_output/phenotype_jsons/.json -- one JSON per cohort +# benchmark_output/phenotype_manifest.csv -- cohortId, cohortName + +suppressPackageStartupMessages({ + library(PhenotypeLibrary) +}) + +script_path <- normalizePath(sub("--file=", "", commandArgs()[grep("--file=", commandArgs())])) +REPO_ROOT <- dirname(dirname(script_path)) +OUTPUT_DIR <- file.path(REPO_ROOT, "benchmark_output") +JSON_DIR <- file.path(OUTPUT_DIR, "phenotype_jsons") +dir.create(JSON_DIR, showWarnings = FALSE, recursive = TRUE) + +cat("Loading phenotypes from PhenotypeLibrary...\n") +phenotype_log <- PhenotypeLibrary::getPhenotypeLog() +cds <- PhenotypeLibrary::getPlCohortDefinitionSet(cohortIds = phenotype_log$cohortId) +cat(sprintf(" Loaded %d phenotype definitions\n", nrow(cds))) + +cat(sprintf("Writing JSONs to %s...\n", JSON_DIR)) +for (i in seq_len(nrow(cds))) { + cohort_id <- cds$cohortId[i] + json_path <- file.path(JSON_DIR, sprintf("%d.json", cohort_id)) + writeLines(cds$json[i], json_path) +} + +manifest <- data.frame( + cohortId = cds$cohortId, + cohortName = cds$cohortName, + stringsAsFactors = FALSE +) +manifest_path <- file.path(OUTPUT_DIR, "phenotype_manifest.csv") +write.csv(manifest, manifest_path, row.names = FALSE) + +cat(sprintf("Wrote %d JSONs and manifest to %s\n", nrow(cds), manifest_path)) diff --git a/circe/cohort_definition_set/_checksum_store.py b/circe/cohort_definition_set/_checksum_store.py index ac1b43b..08226fb 100644 --- a/circe/cohort_definition_set/_checksum_store.py +++ b/circe/cohort_definition_set/_checksum_store.py @@ -1,13 +1,22 @@ -"""Persistent checksum storage for incremental cohort generation. +"""Persistent generation history for incremental cohort generation. -The checksum table records the SHA-256 hash of each successfully generated -cohort's expression. On subsequent incremental runs, cohorts whose expression -hash matches the stored value are skipped. 
+The generation history table records the SHA-256 checksum of each generated +cohort's expression alongside its generation status, start time, and end time. +On subsequent incremental runs, cohorts whose expression checksum matches the +most recent stored value are skipped. -Table schema (cohort_checksum): +This table serves as the canonical source of truth for per-cohort generation +timing, enabling fair benchmarks across implementations. + +Table schema (v2, introduced 0.3.0): cohort_definition_id int64 checksum str - generation_end_time timestamp + status str -- "COMPLETE" or "FAILED" + start_time timestamp + end_time timestamp + +The original v1 schema stored only ``(cohort_definition_id, checksum, +generation_end_time)`` and is still handled transparently for reads. """ from __future__ import annotations @@ -18,6 +27,8 @@ from ..execution.ibis.operations import create_table, read_table, table_exists if TYPE_CHECKING: + import pandas as pd + from ..execution.typing import IbisBackendLike @@ -27,7 +38,7 @@ def load_checksums( schema: str | None, table_name: str, ) -> dict[int, str]: - """Load stored checksums from the checksum table. + """Load stored checksums from the generation history table. Returns a mapping of cohort_id -> checksum for the most recently recorded completed generation of each cohort. Returns an empty dict if the table @@ -35,8 +46,9 @@ def load_checksums( Args: backend: Ibis backend connection. - schema: Schema/database where the checksum table lives. - table_name: Name of the checksum table. + schema: Schema/database where the table lives. + table_name: Name of the table (may be v1 ``cohort_checksum`` or v2 + ``cohort_generation_history`` format). Returns: dict mapping cohort_id (int) -> checksum (str). 
@@ -46,18 +58,59 @@ def load_checksums( table = read_table(backend, table_name=table_name, schema=schema) rows = table.execute() - # In case there are multiple rows per cohort (shouldn't happen but be safe), - # keep the most recent by generation_end_time. if rows.empty: return {} - if "generation_end_time" in rows.columns: - rows = rows.sort_values("generation_end_time", ascending=False) + time_col = "end_time" if "end_time" in rows.columns else "generation_end_time" + has_status = "status" in rows.columns + + if has_status: + rows = rows[rows["status"] == "COMPLETE"] + if rows.empty: + return {} + + if time_col in rows.columns: + rows = rows.sort_values(time_col, ascending=False) rows = rows.drop_duplicates(subset=["cohort_definition_id"], keep="first") return {int(row["cohort_definition_id"]): str(row["checksum"]) for _, row in rows.iterrows()} +def load_generation_history( + backend: IbisBackendLike, + *, + schema: str | None, + table_name: str, +) -> pd.DataFrame | None: + """Load the full generation history from the history table. + + Returns all columns (cohort_definition_id, checksum, status, start_time, + end_time) for every recorded generation. Returns ``None`` if the table + does not exist or was created with the v1 schema that lacks timing + columns. + + Args: + backend: Ibis backend connection. + schema: Schema/database where the table lives. + table_name: Name of the generation history table. + + Returns: + DataFrame with per-cohort history, or ``None`` if unavailable. 
+ """ + if not table_exists(backend, table_name=table_name, schema=schema): + return None + + table = read_table(backend, table_name=table_name, schema=schema) + rows = table.execute() + if rows.empty: + return None + + if "start_time" not in rows.columns or "status" not in rows.columns: + return None + + return rows + + def save_checksums( backend: IbisBackendLike, *, @@ -65,19 +118,61 @@ def save_checksums( table_name: str, completed: dict[int, tuple[str, datetime]], ) -> None: - """Persist checksums for successfully generated cohorts. + """Persist checksums for successfully generated cohorts (v1 compat). + + .. deprecated:: 0.3.0 + Prefer ``save_generation_history()`` which stores the full generation + record including status and start_time. This wrapper is retained for + backward compatibility and delegates internally. + + Args: + backend: Ibis backend connection. + schema: Schema/database where the table should be written. + table_name: Name of the table. + completed: Mapping of cohort_id -> (checksum, end_time) for cohorts + that completed successfully in this run. + """ + if not completed: + return + + # v1 has no start_time: reuse end_time so the stored duration is 0, not negative. + converted: dict[int, tuple[str, str, datetime, datetime]] = {} + for cohort_id, (checksum, end_time) in completed.items(): + converted[cohort_id] = (checksum, "COMPLETE", end_time, end_time) + + save_generation_history( + backend, + schema=schema, + table_name=table_name, + generated=converted, + ) + + +def save_generation_history( + backend: IbisBackendLike, + *, + schema: str | None, + table_name: str, + generated: dict[int, tuple[str, str, datetime, datetime]], +) -> None: + """Persist generation history for all generated cohorts (COMPLETE and FAILED). Uses the same read-filter-union-rewrite pattern as ``write_cohort`` so it works on every ibis backend without requiring raw SQL. + Each row written to the table contains: ``(cohort_definition_id, checksum, + status, start_time, end_time)``. 
SKIPPED cohorts are intentionally + omitted so their prior history entry is preserved. + Args: backend: Ibis backend connection. - schema: Schema/database where the checksum table should be written. - table_name: Name of the checksum table. - completed: Mapping of cohort_id -> (checksum, generation_end_time) for - cohorts that completed successfully in this run. + schema: Schema/database where the table should be written. + table_name: Name of the generation history table. + generated: Mapping of cohort_id -> (checksum, status, start_time, + end_time) for every cohort that was processed in this run + (COMPLETE or FAILED). """ - if not completed: + if not generated: return import ibis @@ -88,14 +183,18 @@ def save_checksums( { "cohort_definition_id": cohort_id, "checksum": checksum, - "generation_end_time": end_time, + "status": status, + "start_time": start_time, + "end_time": end_time, } - for cohort_id, (checksum, end_time) in completed.items() + for cohort_id, (checksum, status, start_time, end_time) in generated.items() ] ) new_rows_df["cohort_definition_id"] = new_rows_df["cohort_definition_id"].astype("int64") new_rows_df["checksum"] = new_rows_df["checksum"].astype(str) - new_rows_df["generation_end_time"] = pd.to_datetime(new_rows_df["generation_end_time"]) + new_rows_df["status"] = new_rows_df["status"].astype(str) + new_rows_df["start_time"] = pd.to_datetime(new_rows_df["start_time"]) + new_rows_df["end_time"] = pd.to_datetime(new_rows_df["end_time"]) new_relation = ibis.memtable(new_rows_df) @@ -103,16 +202,18 @@ def save_checksums( create_table(backend, table_name=table_name, schema=schema, obj=new_relation, overwrite=False) return - # Merge: keep existing rows for cohorts NOT in this batch, union new rows. 
existing = read_table(backend, table_name=table_name, schema=schema) - updated_ids = list(completed.keys()) + updated_ids = list(generated.keys()) filtered_existing = existing.filter( ~existing.cohort_definition_id.cast("int64").isin( [ibis.literal(int(i), type="int64") for i in updated_ids] ) ) - # Cast new rows to match the existing table's timestamp type to avoid union schema conflicts. - ts_type = existing.schema()["generation_end_time"] - new_relation = new_relation.mutate(generation_end_time=new_relation.generation_end_time.cast(ts_type)) + end_ts_type = existing.schema()["end_time"] + start_ts_type = existing.schema()["start_time"] + new_relation = new_relation.mutate( + start_time=new_relation.start_time.cast(start_ts_type), + end_time=new_relation.end_time.cast(end_ts_type), + ) merged = filtered_existing.union(new_relation, distinct=False) create_table(backend, table_name=table_name, schema=schema, obj=merged, overwrite=True) diff --git a/circe/cohort_definition_set/_generate.py b/circe/cohort_definition_set/_generate.py index a65de77..f5db6b7 100644 --- a/circe/cohort_definition_set/_generate.py +++ b/circe/cohort_definition_set/_generate.py @@ -2,17 +2,20 @@ from __future__ import annotations +import logging from datetime import datetime from typing import TYPE_CHECKING, Literal -from ..execution.api import write_cohort +from ..execution.api import build_cohort, project_to_ohdsi_cohort_table, write_cohort from ..execution.errors import ExecutionError -from ._checksum_store import load_checksums, save_checksums +from ._checksum_store import load_checksums, save_generation_history from ._core import CohortDefinitionSet, CohortGenerationResult if TYPE_CHECKING: from ..execution.typing import IbisBackendLike +logger = logging.getLogger(__name__) + def generate_cohort_set( cohort_definition_set: CohortDefinitionSet, @@ -73,6 +76,7 @@ def generate_cohort_set( >>> for r in results: ... 
print(r.cohort_name, r.status) """ + total = len(cohort_definition_set) current_checksums = cohort_definition_set.checksums() previous_checksums: dict[int, str] = {} @@ -84,12 +88,21 @@ def generate_cohort_set( ) results: list[CohortGenerationResult] = [] - completed_this_run: dict[int, tuple[str, datetime]] = {} + generated_this_run: dict[int, tuple[str, str, datetime, datetime]] = {} + + logger.info("Generating %d cohort(s) (incremental=%s)", total, incremental) - for cohort in cohort_definition_set: + for i, cohort in enumerate(cohort_definition_set, start=1): current_checksum = current_checksums[cohort.cohort_id] if incremental and previous_checksums.get(cohort.cohort_id) == current_checksum: + logger.info( + "[%d/%d] Skipping cohort %d (%s) — checksum unchanged", + i, + total, + cohort.cohort_id, + cohort.cohort_name, + ) results.append( CohortGenerationResult( cohort_id=cohort.cohort_id, @@ -102,56 +115,118 @@ def generate_cohort_set( ) continue - start_time = datetime.now() + logger.info( + "[%d/%d] Building cohort %d (%s) ...", + i, + total, + cohort.cohort_id, + cohort.cohort_name, + ) + + start_time: datetime | None = None + end_time: datetime | None = None try: - write_cohort( + # Compile cohort expression to an ibis relation (not timed) + new_rows = build_cohort( cohort.expression, backend=backend, cdm_schema=cdm_schema, + results_schema=results_schema, + vocabulary_schema=vocabulary_schema, + use_persistent_cache=False, + ) + new_rows = project_to_ohdsi_cohort_table(new_rows, cohort_id=cohort.cohort_id) + + # Materialize the compiled relation — this is the DB IO we time + start_time = datetime.now() + write_cohort( + compiled_relation=new_rows, + backend=backend, + cdm_schema=cdm_schema, cohort_table=cohort_table, cohort_id=cohort.cohort_id, results_schema=results_schema, vocabulary_schema=vocabulary_schema, if_exists="replace", ) - except ExecutionError as exc: end_time = datetime.now() + + duration = (end_time - start_time).total_seconds() + 
logger.info( + "[%d/%d] Completed cohort %d (%s) in %.1fs", + i, + total, + cohort.cohort_id, + cohort.cohort_name, + duration, + ) + except ExecutionError as exc: + if end_time is None: + end_time = datetime.now() + duration = (end_time - (start_time or end_time)).total_seconds() + logger.error( + "[%d/%d] FAILED cohort %d (%s) after %.1fs: %s", + i, + total, + cohort.cohort_id, + cohort.cohort_name, + duration, + exc, + ) results.append( CohortGenerationResult( cohort_id=cohort.cohort_id, cohort_name=cohort.cohort_name, status="FAILED", checksum=current_checksum, - start_time=start_time, + start_time=start_time or datetime.now(), end_time=end_time, error=exc, ) ) + generated_this_run[cohort.cohort_id] = ( + current_checksum, + "FAILED", + start_time or datetime.now(), + end_time, + ) if stop_on_error: raise continue - end_time = datetime.now() results.append( CohortGenerationResult( cohort_id=cohort.cohort_id, cohort_name=cohort.cohort_name, status="COMPLETE", checksum=current_checksum, - start_time=start_time, - end_time=end_time, + start_time=start_time or datetime.now(), + end_time=end_time or datetime.now(), ) ) - completed_this_run[cohort.cohort_id] = (current_checksum, end_time) + generated_this_run[cohort.cohort_id] = ( + current_checksum, + "COMPLETE", + start_time or datetime.now(), + end_time or datetime.now(), + ) - if incremental and completed_this_run: - save_checksums( + if incremental and generated_this_run: + save_generation_history( backend, schema=results_schema, table_name=checksum_table, - completed=completed_this_run, + generated=generated_this_run, ) + summary = summarise_generation_results(results) + logger.info( + "Cohort generation complete: %d completed, %d skipped, %d failed", + summary["COMPLETE"], + summary["SKIPPED"], + summary["FAILED"], + ) + return results diff --git a/circe/execution/api.py b/circe/execution/api.py index a9574b8..05b892a 100644 --- a/circe/execution/api.py +++ b/circe/execution/api.py @@ -87,8 +87,9 @@ def 
write_relation( def write_cohort( - expression: CohortExpression, + expression: CohortExpression | None = None, *, + compiled_relation: Table | None = None, backend: IbisBackendLike, cdm_schema: str, cohort_table: str, @@ -98,19 +99,49 @@ def write_cohort( if_exists: Literal["fail", "replace"] = "fail", use_persistent_cache: bool = False, ) -> None: - """Build cohort rows and materialize them with cohort-scoped semantics.""" + """Build cohort rows and materialize them with cohort-scoped semantics. + + Args: + expression: Cohort expression to compile and execute. Provide one of + ``expression`` or ``compiled_relation`` (not both). + compiled_relation: A pre-compiled ibis relation (output of + ``build_cohort()`` projected with ``project_to_ohdsi_cohort_table()``). + When provided, the compilation step is skipped and this relation is + materialized directly. Use this to isolate database-execution time + from query-compilation time in benchmarks. + backend: Ibis backend connection. + cdm_schema: Schema containing the OMOP CDM source tables. + cohort_table: Name of the OHDSI cohort table to write results into. + cohort_id: The cohort_definition_id value to stamp on written rows. + results_schema: Schema for the cohort table. + vocabulary_schema: Schema for vocabulary tables (defaults to cdm_schema). + if_exists: Behaviour when cohort rows already exist. One of + ``"fail"`` (raise) or ``"replace"`` (remove existing rows for + this cohort_id before writing). + use_persistent_cache: Whether to cache concept set lookups persistently. + + Raises: + ValueError: If both or neither of ``expression`` / ``compiled_relation`` + are provided, or ``if_exists`` is invalid. + ExecutionError: If the write fails. 
+ """ + if (expression is None) == (compiled_relation is None): + raise ValueError("Exactly one of expression or compiled_relation must be provided.") if if_exists not in {"fail", "replace"}: raise ValueError("if_exists must be one of {'fail', 'replace'} for write_cohort.") - new_rows = build_cohort( - expression, - backend=backend, - cdm_schema=cdm_schema, - results_schema=results_schema, - vocabulary_schema=vocabulary_schema, - use_persistent_cache=use_persistent_cache, - ) - new_rows = project_to_ohdsi_cohort_table(new_rows, cohort_id=cohort_id) + if compiled_relation is not None: + new_rows = compiled_relation + else: + new_rows = build_cohort( + expression, # type: ignore[arg-type] + backend=backend, + cdm_schema=cdm_schema, + results_schema=results_schema, + vocabulary_schema=vocabulary_schema, + use_persistent_cache=use_persistent_cache, + ) + new_rows = project_to_ohdsi_cohort_table(new_rows, cohort_id=cohort_id) if not table_exists(backend, table_name=cohort_table, schema=results_schema): write_relation( diff --git a/examples/benchmark_analysis.ipynb b/examples/benchmark_analysis.ipynb deleted file mode 100644 index 158c9e8..0000000 --- a/examples/benchmark_analysis.ipynb +++ /dev/null @@ -1,611 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "7fb27b941602401d91542211134fc71a", - "metadata": {}, - "source": "# CircePy Benchmark Analysis\n\nReads results written by `benchmark_run_r.R` and `benchmark_run_python.py` and produces the tables and figures for the paper.\n\n**Run order:**\n```\nRscript examples/benchmark_run_r.R\npython examples/benchmark_run_python.py\n# then open this notebook\n```\n\nCredentials are loaded from `.env` (repo root). 
\nCDM: `healthverity_cc.cdm_healthverity_cc_all_v3910` \nCohort tables: `{DATABRICKS_SCRATCH_SCHEMA}.cohort_r` and `{DATABRICKS_SCRATCH_SCHEMA}.cohort_python`" - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8ac0a53b", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "from pathlib import Path\n", - "\n", - "import ibis\n", - "import matplotlib.pyplot as plt\n", - "import pandas as pd\n", - "from dotenv import load_dotenv\n", - "\n", - "# Load credentials from .env in the repo root\n", - "NOTEBOOK_DIR = Path(\".\").resolve()\n", - "REPO_ROOT = NOTEBOOK_DIR if (NOTEBOOK_DIR / \".env\").exists() else NOTEBOOK_DIR.parent\n", - "load_dotenv(REPO_ROOT / \".env\")\n", - "\n", - "OUTPUT_DIR = REPO_ROOT / \"benchmark_output\"\n", - "SCRATCH_SCHEMA = os.environ[\"DATABRICKS_SCRATCH_SCHEMA\"]\n", - "\n", - "r_results = pd.read_csv(OUTPUT_DIR / \"r_results.csv\")\n", - "py_results = pd.read_csv(OUTPUT_DIR / \"python_results.csv\")\n", - "print(f\"R results : {len(r_results)} rows\")\n", - "print(f\"Py results : {len(py_results)} rows\")\n", - "\n", - "# Connect to Databricks and pull cohort tables\n", - "backend = ibis.databricks.connect(\n", - " server_hostname=os.environ[\"DATABRICKS_HOST\"],\n", - " http_path=os.environ[\"DATABRICKS_HTTP_PATH\"],\n", - " access_token=os.environ[\"DATABRICKS_TOKEN\"],\n", - ")\n", - "\n", - "cohort_r = backend.table(\"cohort_r\", database=SCRATCH_SCHEMA).execute()\n", - "cohort_py = backend.table(\"cohort_python\", database=SCRATCH_SCHEMA).execute()\n", - "print(f\"cohort_r rows : {len(cohort_r)}\")\n", - "print(f\"cohort_py rows : {len(cohort_py)}\")" - ] - }, - { - "cell_type": "markdown", - "id": "acae54e37e7d407bbb7b55eff062a284", - "metadata": {}, - "source": [ - "## Table 1 — Coverage (SQL generation and execution success rates)" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "9a63283cbaf04dbcab1f6479b197f3a8", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ 
- "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ImplementationCohorts attemptedCOMPLETEFAILEDSuccess rate
0R CohortGenerator7076901797.6%
1CircePy (Ibis)707103173145.8%
\n", - "
" - ], - "text/plain": [ - " Implementation Cohorts attempted COMPLETE FAILED Success rate\n", - "0 R CohortGenerator 707 690 17 97.6%\n", - "1 CircePy (Ibis) 707 1031 73 145.8%" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "total = len(r_results)\n", - "\n", - "coverage = pd.DataFrame(\n", - " [\n", - " {\n", - " \"Implementation\": \"R CohortGenerator\",\n", - " \"Cohorts attempted\": total,\n", - " \"COMPLETE\": (r_results[\"status\"] == \"COMPLETE\").sum(),\n", - " \"FAILED\": (r_results[\"status\"] == \"FAILED\").sum(),\n", - " },\n", - " {\n", - " \"Implementation\": \"CircePy (Ibis)\",\n", - " \"Cohorts attempted\": total,\n", - " \"COMPLETE\": (py_results[\"status\"] == \"COMPLETE\").sum(),\n", - " \"FAILED\": (py_results[\"status\"] == \"FAILED\").sum(),\n", - " },\n", - " ]\n", - ")\n", - "coverage[\"Success rate\"] = (coverage[\"COMPLETE\"] / coverage[\"Cohorts attempted\"] * 100).map(\"{:.1f}%\".format)\n", - "coverage" - ] - }, - { - "cell_type": "markdown", - "id": "8dd0d8092fe74a7c96281538738b07e2", - "metadata": {}, - "source": [ - "## Table 2 — Persistent-table equivalence (R vs Python, per cohort)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "72eea5119410473aa328ad9291626812", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "R COMPLETE cohorts : 690\n", - "Py COMPLETE cohorts : 1031\n", - "Both COMPLETE (shared IDs): 637\n", - "\n", - "Cohorts compared (both COMPLETE, shared IDs): 69\n", - "Cohorts with identical tables : 69 / 69 (100.0%)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
CheckMatchTotal%
0Row count6969100.0%
1Subject IDs6969100.0%
2Cohort start date6969100.0%
3Cohort end date6969100.0%
4All four checks6969100.0%
\n", - "
" - ], - "text/plain": [ - " Check Match Total %\n", - "0 Row count 69 69 100.0%\n", - "1 Subject IDs 69 69 100.0%\n", - "2 Cohort start date 69 69 100.0%\n", - "3 Cohort end date 69 69 100.0%\n", - "4 All four checks 69 69 100.0%" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Only compare cohorts where both R and Python completed successfully\n", - "r_complete = set(r_results.loc[r_results[\"status\"] == \"COMPLETE\", \"cohortId\"])\n", - "py_complete = set(py_results.loc[py_results[\"status\"] == \"COMPLETE\", \"cohortId\"])\n", - "both_complete = r_complete & py_complete\n", - "print(f\"R COMPLETE cohorts : {len(r_complete)}\")\n", - "print(f\"Py COMPLETE cohorts : {len(py_complete)}\")\n", - "print(f\"Both COMPLETE (shared IDs): {len(both_complete)}\")\n", - "\n", - "# Merge on cohort_definition_id — restrict to both-complete\n", - "r_counts = cohort_r.groupby(\"cohort_definition_id\").size().reset_index(name=\"r_rows\")\n", - "py_counts = cohort_py.groupby(\"cohort_definition_id\").size().reset_index(name=\"py_rows\")\n", - "\n", - "comparison = r_counts.merge(py_counts, on=\"cohort_definition_id\", how=\"inner\")\n", - "comparison = comparison[comparison[\"cohort_definition_id\"].isin(both_complete)].copy()\n", - "\n", - "comparison[\"rows_match\"] = comparison[\"r_rows\"] == comparison[\"py_rows\"]\n", - "\n", - "\n", - "# Subject-ID sets per cohort\n", - "def subject_set(df, cid):\n", - " return set(df.loc[df[\"cohort_definition_id\"] == cid, \"subject_id\"].astype(int))\n", - "\n", - "\n", - "all_ids = sorted(comparison[\"cohort_definition_id\"].unique())\n", - "subject_match = [subject_set(cohort_r, cid) == subject_set(cohort_py, cid) for cid in all_ids]\n", - "comparison[\"subjects_match\"] = subject_match\n", - "\n", - "\n", - "# Start / end date equivalence (per-subject tuples)\n", - "def date_set(df, cid, col):\n", - " sub = df[df[\"cohort_definition_id\"] == cid]\n", - " return 
set(zip(sub[\"subject_id\"].astype(int), pd.to_datetime(sub[col]).dt.date))\n", - "\n", - "\n", - "comparison[\"start_match\"] = [\n", - " date_set(cohort_r, cid, \"cohort_start_date\") == date_set(cohort_py, cid, \"cohort_start_date\")\n", - " for cid in all_ids\n", - "]\n", - "comparison[\"end_match\"] = [\n", - " date_set(cohort_r, cid, \"cohort_end_date\") == date_set(cohort_py, cid, \"cohort_end_date\")\n", - " for cid in all_ids\n", - "]\n", - "comparison[\"all_match\"] = (\n", - " comparison[\"rows_match\"]\n", - " & comparison[\"subjects_match\"]\n", - " & comparison[\"start_match\"]\n", - " & comparison[\"end_match\"]\n", - ")\n", - "\n", - "n_match = comparison[\"all_match\"].sum()\n", - "n_total = len(comparison)\n", - "print(f\"\\nCohorts compared (both COMPLETE, shared IDs): {n_total}\")\n", - "print(f\"Cohorts with identical tables : {n_match} / {n_total} ({100 * n_match / n_total:.1f}%)\")\n", - "\n", - "# Summary table for the paper\n", - "equivalence_summary = pd.DataFrame(\n", - " [\n", - " {\"Check\": \"Row count\", \"Match\": comparison[\"rows_match\"].sum(), \"Total\": n_total},\n", - " {\"Check\": \"Subject IDs\", \"Match\": comparison[\"subjects_match\"].sum(), \"Total\": n_total},\n", - " {\"Check\": \"Cohort start date\", \"Match\": comparison[\"start_match\"].sum(), \"Total\": n_total},\n", - " {\"Check\": \"Cohort end date\", \"Match\": comparison[\"end_match\"].sum(), \"Total\": n_total},\n", - " {\"Check\": \"All four checks\", \"Match\": n_match, \"Total\": n_total},\n", - " ]\n", - ")\n", - "equivalence_summary[\"%\"] = (equivalence_summary[\"Match\"] / equivalence_summary[\"Total\"] * 100).map(\n", - " \"{:.1f}%\".format\n", - ")\n", - "equivalence_summary" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "8edb47106e1a46a883d545849b8ab81b", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "No mismatches — all cohort tables are identical across R and Python.\n" - ] - 
} - ], - "source": [ - "# Mismatches (if any)\n", - "mismatches = comparison[~comparison[\"all_match\"]]\n", - "if len(mismatches) == 0:\n", - " print(\"No mismatches — all cohort tables are identical across R and Python.\")\n", - "else:\n", - " print(f\"{len(mismatches)} mismatches:\")\n", - " print(\n", - " mismatches[\n", - " [\n", - " \"cohort_definition_id\",\n", - " \"r_rows\",\n", - " \"py_rows\",\n", - " \"rows_match\",\n", - " \"subjects_match\",\n", - " \"start_match\",\n", - " \"end_match\",\n", - " ]\n", - " ].to_string(index=False)\n", - " )" - ] - }, - { - "cell_type": "markdown", - "id": "10185d26023b46108eb7d9f57d49d2b3", - "metadata": {}, - "source": [ - "## Figure 1 — Generation time distribution (R vs Python)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8763a12b2bbd4a93a75aff182afb95dc", - "metadata": {}, - "outputs": [], - "source": [ - "fig, axes = plt.subplots(1, 2, figsize=(10, 4), sharey=True)\n", - "\n", - "for ax, df, label, colour in [\n", - " (axes[0], r_results[r_results[\"status\"] == \"COMPLETE\"], \"R CohortGenerator\", \"#4C72B0\"),\n", - " (axes[1], py_results[py_results[\"status\"] == \"COMPLETE\"], \"CircePy (Ibis)\", \"#DD8452\"),\n", - "]:\n", - " ax.hist(df[\"generation_seconds\"], bins=40, color=colour, edgecolor=\"white\", linewidth=0.4)\n", - " ax.set_title(label, fontsize=11)\n", - " ax.set_xlabel(\"Generation time (s)\", fontsize=9)\n", - " ax.set_ylabel(\"Number of cohorts\", fontsize=9)\n", - " ax.axvline(\n", - " df[\"generation_seconds\"].median(),\n", - " color=\"black\",\n", - " linestyle=\"--\",\n", - " linewidth=1,\n", - " label=f\"Median {df['generation_seconds'].median():.1f}s\",\n", - " )\n", - " ax.legend(fontsize=8)\n", - "\n", - "fig.suptitle(\"Cohort generation time — OHDSI PhenotypeLibrary on Databricks\", fontsize=12)\n", - "plt.tight_layout()\n", - "plt.savefig(OUTPUT_DIR / \"figure1_generation_times.pdf\", bbox_inches=\"tight\")\n", - "plt.show()" - ] - }, - { - "cell_type": 
"markdown", - "id": "7623eae2785240b9bd12b16a66d81610", - "metadata": {}, - "source": [ - "## Figure 2 — Row counts: R vs Python (scatter)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "7cdc8c89c7104fffa095e18ddfef8986", - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAekAAAHqCAYAAAAgWrY5AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8ekN5oAAAACXBIWXMAAA9hAAAPYQGoP6dpAAByxUlEQVR4nO3dB3gUVdsG4DcJCSS0BAhVeu9NmtJ7VZpUFRRBEJAmzUZRiqAiTT4UKSpKlSa9S6/Se28JPbSEAMn5r+d83+y/G5KQDUl2dve5r2tZdncye2Z3dt45Z95zjodSSgkRERGZjqejC0BERETRY5AmIiIyKQZpIiIik2KQJiIiMikGaSIiIpNikCYiIjIpBmkiIiKTYpAmIiIyKQZpIiIik2KQJrssXbpU6tatK+nSpRMfHx/JnTu3fPjhh3Lq1Cm71tOxY0cpVqyYJKYLFy7I0KFD5dq1ay9c9sCBA3rZ0NDQeL2Xh4eHfPvtt7Eus2nTJr3c3r174/Ue9GIzZ86UP/74I9HWj/0W36Fxy5Qpk/497NixQxzF2K+MW+rUqaV06dIyffp0sWdAyZCQEP0bOHbs2HO/I6x3wYIFiVB6ehEGaYqzQYMGyZtvvilp06aVn3/+WdatWydffvml/lG3bt1azAYHl2HDhsU5SGPZ+AZpco8gDXny5NFBefv27fL999/LuXPnpHbt2vrekWbMmKHLNX/+fMmXL5906tRJfvrpJ7uCNH4DUYM0OVYyB78/OYkVK1bIN998I1988YUMHz7c8nzVqlXlvffek7///lvMArWHJ0+eiDsxtjl58uSOLorL8/X1lYoVK+r/V6pUSbcmvf766zJ37lwZPHiww8qFlqlXX31V/79OnTpSuHBhmThxom7pIufFmjTFyXfffaeb9hCko9O4cWPL/x8/fix9+/aVrFmzSooUKaRUqVKyaNGiGJvq0DSXMmVKKV++vOzbt8/m9bisy2g6x4lEyZIldaBatmyZ1KhRQ79erlw5S1NgTLUvnGhAYGCgXi5Xrlz6cVBQkLz//vu69oSDc/78+eXTTz+V8PDw59bz7NkzGTBggF4HmhxRrgcPHrwwuKKZvECBArrceJ9x48bF+jexbTP89ddf+nPC54XPDZ8fPkdD3rx5bb7HhQsX6m3u37+/5bnVq1fr527evBlrOX799Vf9/eG9MmTIIA0bNpSLFy9aXj98+LDUq1dPf79ogWnZsqVcunTphU2pvXv3tnwHxneE5f79919p0KCBXh++C7y/oXr16rJ582ZZvny55ftG8y1s27ZNn1CiDPhuihcvLrNmzZKEgO0H6+2K6tGjR7rM0V0SwWeCYA9Pnz7V30OOHDn0d5olSxZp0qSJ3Lt3z64yeXl56XKdP39ePy5btqy0b9/+ueUGDhyo95GzZ8/qkw146623LJ8fvh8D9qEePXpIQECALtcnn3yi93lr//zzj7z22mv6t4L9Ab+dO3fuPPd9//777y9cF/0PZsEiis3Tp09V8uTJVbt27eK0fPPmzZWfn58aN26cWrlypWrbtq3y8PBQS5YssSzToUMHlT59elW8eHE1e/Zs9ffff+v/Z8+eXT158sTudQUEBKi8efOqGTNmqPXr16uDBw+qyZMn44Kcfm7Hjh36Fp0bN26ozz//XC+7atUqvdz+/fv1a4cOHVL9+vVTixYtUps
2bVI//fSTypo1q+rYsaPNOvC3eL5x48Zq+fLlatKkSSpVqlSqdevWlmU2btyol9uzZ4/luZ49eypfX1/19ddfq7Vr16phw4Ypb29vNWXKlFg/4+i2+fTp0/pzweeDzwmfFz43rL9FixaWv0XZq1atalOGFClSqPLly1ueGzx4sCpUqFCsZRgzZozenk6dOultXrx4sfr4448t23fp0iXl7++vypYtq/766y/1xx9/qNy5c6tcuXKp+/fv62XOnz+v1zF//nybdffq1UvlzJnT8hjbiOUKFy6svv/+e7VmzRr11ltv6W09duyYXubo0aOqdOnS6vXXX7d835cvX1b37t1TadOmVY0aNdLlXLdunZowYYL64Ycfnls/vqMXfe5Fixa1ee748eP6b7/55ptY/7ZNmzb6s7CGzwGfPcoD+P6x32Dfxf62YMEC1aVLF3X9+vUY1xvdfgXlypVT+fLl0//H/oT9ICQkxPL6s2fPVJYsWdSgQYPU48eP9XeE9YwcOdLy+eF54zvKkSOH3lfw2Q8dOlQ/Z72f7t27V/n4+Ki6deuqZcuWqWnTpqkMGTLo/QrvBXFdF/0/Bml6oeDgYP0jwo/5RRAcsex//vMfm+crVaqkypQpY3OwwwH2yJEjzx1stmzZYve6sNzOnTvjdPCKjnGQvnnz5gtPWHBSkSxZMvXo0SPL8/hbBCDjYAS//PKL3kYcxKMrz5kzZ/TrU6dOtXmPgQMHqsyZM6uIiIgYyxHTNiNI4fOxhvVjWZxwwPTp03VgwAEYSpYsqXr06KG36cGDB/o5BDoEh5jgYI+Tp9iW6dOnj0qZMqW6ffu25Tl8FthmIyjZG6QRvAwPHz7UZfjqq68sz1WrVk0HY2v4vK23PzqzZs1SXl5eOjDGJUhjP8DJ5MmTJ1WNGjV0WXGyFxucQKEcp06deu598RsDlB0npvYw9ivsCyjXrVu31IgRI/RzONkCnKjgs/rxxx8tf7d06VKb8sT0XRjP46TIGj7rWrVqWR43a9ZMB1/rk+zVq1frv8V72bMu+n9s7qY4i6m52NqWLVssTWbWkFiGpko0+xnQzFa0aFHL4yJFiuj7K1eu2L2u9OnTS4UKFSShIf7+8MMPumxowvP29tbNhmiai5oohGZJNDNaN2Pi73fv3h3tupF4By1atNDrM25IQgoODpbLly/HWrao2/zw4UOdAIf3tWYk9W3dulXfo9kXTZcoF5KF0CTdvXt3SZMmjW4Wxmt79uzRy8UECUpIskNyUkzw/dWsWVP3BDAUKlRIN88bZbEXMqkNaD7OmTOnZX+JCZr3sW3dunWTefPmRduE/+677+rPvlq1ai8sw9GjR/V+gN4NBQsWlF27dulLDLjMEZv69euLv7+/zJkzx/Ic/o/LMriUBGXKlNGXMNBMj+8gMjJS4grXyVEuNDMPGTJEunbtqhM7AduP/QAZ39aJZlWqVNGXDez97AG/CevPHt83EktRBuu/wTZH/b5ftC76fwzS9EIIBrjmGNs1N8Pdu3f1j9T6wAw4CCFgISgY8OO1hoMeGNdP7VmXcZBLaAjQ/fr10wefJUuW6MA2efJkm3IaMmbMaPMYB0Z8briuHZ1bt27p7cBBFdtp3JD0Ay8K0lG3GZ8H1hf1eVyHxfVN49oggla2bNn09UMcPFFuBM/KlSvr53bu3KmT0GIL0rdv37acaMUE31903wues75OaY/o9pmo30NUuO65du1afS36nXfekcyZM+vr1zg5iQ98fgig+JymTp2qv7NWrVq9sGcAyooTMiNI4zNEudq1a2dZ5rPPPtPXiXG9HDkaKCsyruPSlQrX51GukydP6hPYKVOm6P3P0LlzZ93979ChQ/pEBcmeuGacUJ+9Pd93fL5Hd8XsbnqhZMmS6ezV9evX69oGHscEARXJL/jB4uBouH79uq6JR/1xxsaedcWllh8f6M7yxhtvyKhRoyzPxdRF5caNGzaP79+/rw88SIyJaftQbgRK4wT
FGmppsYm6zfg88FzUciDpCIlu1ic7CMAIyHgNtSnjOSTlIaCjhpo9e/ZYT9wA3dteeeWVGLcvalmM7w+JcmAEkajZ+PjOExIC3sqVKyUsLEw2btyoE5WaNm2qE6bshTIbWdRoycBJFoIvMqkRYGPTtm1b+eWXX3SgRGsEWl6aN29ueR2fPWrRuJ05c0bXfPF/JBTiBCM2yOY2yhUdJKeh5QrrRGIatiNqK9XLiO37jnqiTXHHmjTFCTKE0QQ7YsSIaF9HEx2gNmYEN2t4bGRxx9XLritqzTw+y+KgHjWAzp49O9p1ILs6IiLC8hgZywiayC6PTq1atSw1Khxco95Q87NHqlSpdFZ31ExpNPFaf55GQEY/3w0bNliaeHGPmtiqVatirUUbB3w/Pz/dZBoTvB9O7KwDLmp5CFBGWVCLR030+PHjlmUQsJGlHR8vqpHhkgUy0NH0jcznhKi9IcjiJBZZ+S9aH2rwqB3/+eef+oZMdbR0RAd9nUeOHKkDnPXn8zJQm8b+ixMFNH9b/4bs+b1EB9/p4sWLbbK00VKAFh7rfY/sw5o0xQkObOheZIxI1KZNG12DwIEOZ+aokWGZEiVK6IMWgjoCHGqD6G6BgIDmYnu87LpQW0NNBeVD7R+3mGoaqIUAmrJRw0IAQjcdND2PHz9eJk2apNeH90cNJzqoreJvP/roI/25oFaF68PGuqMrH64Fo4aEbjeolaHlAKO3obaHA5698P2gDG+//ba+ISiiyxhqetgeA4IwrmGjy5sRaHHig5ocangvagZFYMF1T2wjrpvicgDuUW7UFvE59+nTR68b1x/RjIuD/+eff65rcehCBp6envo7xueLoIR9Cv9H8258WkfwWaOpGCdMaMFAczzyFxCUmjVrpt8bJ5uo9SKwGjV5NBVjm3FSEZfr0tF97thX0FUM14Jjgv0RTeNYDrVO6+vTgO8O3aWMk1BsB05ycG0/IWBfw3eGSy34TKzh5AGtMTh5QHcs7Av4DcYVvmN0v0J3zJ49e+oaNAZAQisGjg0UT1ZJZEQvhG42tWvX1l1r0FUI3Wk+/PBD3f3HEBoaqnr37q0zlNElo0SJEmrhwoUv7Mpy9+5dS5epl12XAZnhefLk0ZnLL9rd0RXklVdeUZ6enpbMYmQ7o8sSujvh1rlzZ929JGrWOB6PGjVK9e3bV6VLl053o3nnnXd0Vm1s2eaRkZFq4sSJqlixYnr78LfIzkY3o9jEts3otoPPCevD54bPLyws7LnlAgMD9fuhDIb69evrMiJrOS6QKY6uc3gvdKlDF7SLFy9aXkeGfp06dXRmcerUqXXm8oULF2zWgazopk2bqjRp0qhs2bLprlExZXdHzb5HZjo+C8OVK1dUw4YN9f6J5YcMGaJOnDihu6Chex+6Ehpd6IKCghKkC5ahcuXKukucdYZ/dNC1Ce+FfQT7d9Ruba+++qruMobMePRiQNe12NjTiwHQRapIkSLRvoauhujmhs8J60Q2dlwz8AHZ8dh/8ffYt/A5W2f327Mu+i8P/BPfAE9ERM4DeRJIGkTNHwmRZH5s7iYicnEY+Q6XqX788Ud9GcEYYY/Mj0GaiMjFIfcA/bGRsY9r9sy2dh5s7iYiIjIpdsEiIiIyKQZpIiIik2KQJiIiMikmjiUQDOSAIRIxSlRiDVFJRETOCelfyLLHADsYxCeuGKQTCAJ0bGMdExERXb58Ocbx7qPDIJ1AjHGW8QVg9iMiIiLrgWRQkbN3TH4G6QRiNHEjQDNIExFRdOy9HMrEMSIiIpNikCYiIjIpBmkiIiKT4jXpJEy/x2ToERERji4KkdPx9vbWczETuRsG6STw5MkTCQoKktDQUEcXhchpk23QbSVVqlSOLgpRkmKQToJBTs6fP69rAejE7uPjw8FOiOxshbp586ZcuXJF8ufPzxo1uRUG6SSoRSNQo3+cn5+fo4tD5JQCAwPlwoUL8vTpUwZpcitMHEsi9gwDR0S22PpE7oqRg4i
IyKQYpMn0Q+nlzZtXX5NMLFh3zZo19Uhxb731VqK9D0UPPR6KFy8ux48fd3RRiEyHQZpM7bvvvpOmTZvqa5KJZerUqfo6Z0hIiMyfP/+l1tWxY0fp3bt3gpXNFeXKlUsWL15seYzP/pNPPpFPP/3UoeUiMiMGaTIt9Cv/6aef5L333ku09SNzGNn3RYsWdcq8AWMbnF3Lli1l/fr1cunSJUcXhchUHHpUmjJlipQoUcIyKUWlSpVk5cqVltcfP34s3bt3l/Tp0+v+kS1atJDr16/brAM/6kaNGunM6YwZM0r//v31gcvapk2bpEyZMpI8eXLJly+fzJw587myTJ48WZ/hp0iRQipUqCC7d+8WdzF+/HipXr26zXNz5syRIkWKJOj7YIawDBkyyNq1ay2Z7/hehg0bFu3y+A7QFFqsWDGbmur777+va9fYJ7D/bN261fI6sn+//PJL3USO/eaNN97Q04haJyBNmjRJrzNlypTSvHlz+fXXX+XHH3/U6/vll1/0cuvWrZPy5cuLv7+/DuBLly61rAPZ+hMmTJBChQrpGW3QLWjVqlX6udmzZ1vWhb+Lzu+//67fH3+bI0cO+eKLL2wC7dGjR6VixYr69Ro1asiAAQNsvp+o2/Dw4UM5e/asNGnSRLc45MyZU77++mtdTkNs24PPtFOnTjpQGuU+cuSIbmFA32SsE9sUdf/AZ4/1lStXTrZv3255DWUdPHiw1KtXT28DvuPDhw/r13A5Ab/Ztm3b6vfq2rWrfh7bgfUsX748hr2HyE0pB1q6dKlavny5OnXqlDp58qT69NNPlbe3tzpy5Ih+vWvXrip79uxq/fr1au/evapixYrqtddes/z9s2fPVLFixVTt2rXVv//+q1asWKEyZMigBg8ebFnm3Llzys/PT/Xt21cdO3ZMTZw4UXl5ealVq1ZZlpkzZ47y8fFR06dPV0ePHlWdO3dW/v7+6vr163Helnv37uEoq++thYWF6ffFvbVr166pffv22dxQVuNvor6Gm+HEiRPPvXb79m392o0bNyzP4T3i4tatWypFihSW94e6deuqMWPGRLv8li1bVNq0aWO8devWLcb3WrBggcqcObP+bHv37q2qVq2qv8foTJ48WVWpUsXmuQ4dOqjkyZPrfefp06dqypQpKiAgQN29e1e/3r9/f1WzZk297eHh4apfv34268B3VKlSJXX16lX1+PFjFRERodfZq1cvyzIHDx7U3z/2O7yO7U2TJo3+3GH8+PEqd+7cep+MjIxUFy9e1N+xUT7rdUUH+yn2d/wt9tuMGTOq33//Xb/25MkTlSdPHjV06FBd/p07d6r06dOratWqxbgNjx49Ujlz5lTjxo3Tf4PyFC1aVE2bNi1O24Myp06dWm3dulV/pniMMuCzxPrWrVunfx/BwcF6efxms2XLpvcxrG/hwoUqXbp0ej8ClBWvHzhwQK8Pvyfr8qOsixYteu5z6dGjh+rSpUu0n1lMvyMiZxFTjHgRhwbp6OCAi4NLSEiIDtjz58+3vHb8+HG9kTt27LAc7Dw9PS0HD8BBGwcgHFxgwIAB+oBlrXXr1qpevXqWx+XLl1fdu3e3PMaBJ2vWrGrUqFGJFqSHDBmil7e+tW/fXr92+vTp516zPp/CyUrU13777Tf92qRJkyzP4T3iqlWrVpblr1y5ogNhUFCQSgw4EBcvXlwf2C9duhTjcl9//bVq1KiRzXMIIA0aNLB5rlChQnr7EfRSpkypg4MBnzv2EeN98LlEDRBRA+tHH32kTyCstWvXTg0fPtzyfrNmzYq2zHEJ0lFh+Q8++ED//59//tEnOghu1uWJGqStt2HevHmqVKlSNuv86aef9MlKXLYHZW7Tpo3lNQRhfGahoaGW5wIDA9XatWv1/xs2bKh++OEHm/Xh5PnXX3/V/0dZBw4caHkNwT9VqlQvDNI4SX/rrbei/YwYpMldg7RpBjNBsyaSdh49eqSbvfft26ebLmvXrm1ZBs2
LaB7csWOHbg7EPbJCM2XKZFkGTWzdunXTTYalS5fWy1ivw1jGSO5BkyveC81zBlybxN/gb2MSHh6ub9ZZyPb48MMPdVOstYCAAH2PJkaUKSZorsfnZA1N9dCqVSv9+UGWLFniXB40IeNzGzJkiG7+rVu3rmTOnFkSw0cffaSvNffo0UMP8hITfB7Rfa5ozo36+OrVq3Lr1i39uVStWtWmXy1GeUNTu/Fe2Idig0EzNmzYIDNmzLA8h0soxjzhFy9e1E3c8bV69WrdxH/q1Cm9j2M/atCggX4NTfP43pIl+/+fJsqL/dma9TagvGieRtOzwRhAJy7bA9a/IVw6QjO1r6+vzXNoVjfWhyQv7CsGbAe+A4P1vmM0yb8IvmvjN0BkBkopHZusf49JzeFBGteqEFRw/RnXqBYtWqSvhR44cEAfXK0PPMbBJDg4WP8f99YHF+N147XYlsEBISwsTO7evau/hOiWOXHiRIzlHjVqVIzXUuMCB+KYgiiui+M6XkwKFiwY42u4fhifTOg6deroA/fmzZtl1qxZevtismXLFktQic7bb78t//nPf6J9DSdFOCHo0KGDPhnA9dCyZctGu2ypUqWi/YwRJK3hGme2bNn0NWgEk127dukTupi8KEEMwa1Xr14yevToaF/HScGZM2csJ0P2rBvbj+vguMbbpk0bnSeBE0YEPsDQsdhn8V0YB4bokqms3wflxWe4c+fOeG2PvbC+nj17Wq4n2yumz+jYsWP6ujiRWQL0p59+qk+AlyxZ4rDEUoensyLgICDjwIqaHA7e+LGaHWre9+7ds9xQU3Nm2AGRRY2AcefOHWncuHGMy1apUkXXjGK6xRSgYdCgQfpkbPr06TJixAidQBRTLQuJThC1FolaIRKMEMh+/vlnPXkJkgexDQgc/fr1s3wft2/flrlz59rdyoFa58aNG/UJHGq6aFUx+vHidZw8YL/FDxlB1HgNJ3fnzp2LMeMa68IJKU4oEKCx3//xxx+W19FChBNTnCShdrpnzx6ZN29erOXFd4WESgR+rBtlPnnypE6YjMv22AvJnGPHjtWtPdhOTByDxDSMrR0X+IyQ6GYN68C2NmzYMF5lIkpoX331lT6xxRgKjuz54fAgjdoyMq5RE8CBqWTJkjrbGM1lqHWg76o1HIyMpjTcR832Nh6/aBk09aE5D9nG6KcZ3TKxNffiAGtkpRs3Z4cgfejQIV0TxtSACQ0Z0KilI7sZOz2auwsXLqxrZdFBTdIIMNbatWungzOCGTKqcZZrNJNiH0INFz8sNNliv1qzZo1d5cRlkj///FM+//xz3SqBWjoysI3LGx9//LE+ocSlBbwHLo0Ytd0PPvhAN/umS5dOZz9HheXRk6BLly56n8GJSuvWrS2v43PH9vz99996m5DZje8D+1tMcNKDIIkuTLjsgRMAfEZGa9KLtsdeyCLHwatz5866jLlz59a/Wets8tigdoLsdHx/uPQBCxcu1JnsUS9lEDnCmDFj9OWckSNHSp8+fcShlMnUqFFDJ7IYiWPIBjYgGzW6xDHrLOypU6fqxDFkvRqJY8gAt9a2bdvnEseQWWqdOIbs1MRMHDMjZAkj8erw4cPKLPB5ItMYWevxTcxydki0MxLLXBF+b0gkRM+KmDjT74ic2+bNm/Wx/IsvvkjQ9TpldvegQYP0B3L+/Hl16NAh/djDw0OtWbPG0gUrR44casOGDbq7C7qd4Ba1Cxa6CyGjF92qkIUaXRcsdCdBdji69UTXBQvZzDNnztQHAhwU0WXFOmvc1YM0MqNHjx6tKleurMzMHYI0MryRjY7ghe5POHFavXq1cmfO8jsi5xcZGakrgLhX7h6k33//fd0dA30wEVxr1aplCdCAHyS6j6BbFgJts2bNnusWdOHCBd0lx9fXV/eRRr9Y6+4rsHHjRt1FBe+DWtmMGTOeKwv6T+OEAMugZo3+qfZw5iCNkx10kcmVK5dNf2wzcocgjZNFdAHEPp0/f36
9b7o7Z/gdkXObNWuW7s6YWOIbpD3wj2Mb3F0DssXTpk2rk8isr08jkQfDTuK6HbK2ich+/B1RYkJyKfI4kGcRW+JrYsQI0yeOEREROcrixYulffv2+oakTrNhkE4ibLAgij/+figxoFcEemlg7AJ0C0VPH7NhkE5kRlcm9AMlovhBd0ww40GUnFfhwoV1N0BMjOPIUcViY85SuRAcVNAf9MaNG/oxRsSyHrKSiGKH/tc3b97Uvx2zHkjJuezatUvy5Mmjxwz44YcfxMy4xycBY1AUI1ATkX0w+A3GK+cJLr0sDJ+LYZBxDTqxksQSEoN0EsCBBeN0Y75rDPVIRPaPTOjIoRnJNezfv1/q16+vR7b87rvvxBkwSCdx0zevqREROWYyp7p160qBAgVkxYoVenY2Z8AgbSI37obKkTO35MbdMMkY4CvF8mWQjAF+ji4WEZHTO3TokB7bHlPFOtNcCxzMxMEd1a0D9OJNZ+RmSJikSJ5MHoc/k0B/X2laPR8DNRFRPGGSJmPKY+spYJMaBzNxcqhBI0DnzJJGsqRPqe/xGM8TEZH9MPc85qXHbHngjL0DGKRNAk3cqEF7/i97Ffd4jOeJiMg+mDK2Vq1aOuEQg5U4KwZpk8A1aDRxR/7v6gPu8RjPExFR3F2/fl0HaAyCs2HDBnnllVfEWTlf3d9FIUnszJUQuRh03+aadPF8gY4uGhGRUxk6dKi+Brx582adLObMWJM2CSSHVSmdTdKm9JE7IY/1fdXSr0gga9JERHb5/vvvZcuWLZI/f35xdgzSJoHs7i3/XpV7j55IOv8U+v6ff6/o54mIKHYPHjyQN998U3e18vX1lbx584orYJA2CWZ3ExHFz6NHj6RRo0ayadMmCQ8PF1fCa9ImwexuIiL7PX78WJo2baqH/FyzZo2UK1dOXAlr0ibB7G4iIvt17NhRtm3bJsuXL5fXXntNXA2DtImyu5HNjezuoNuP9D2zu4mIYvfxxx/LkiVLpFq1auKK2NxtouxuDAFqPXY3AjSzu4mIbEVEROhpJrt06eKStWdrDNImC9Q1y+VwdDGIiEwrMjJSOnXqJL///ruecrJy5criyhikiYjIKSilpHv37vLrr7/qIO3qARoYpImIyCkCdJ8+fXQz9/Tp06Vdu3biDpg4RkREThGknz17JpMnT5b33ntP3AVr0kREZGpnzpyRfPnyyaRJk8TdsCZNRESmNWbMGClSpIicOnVK3BGDNBERmdKECRNk4MCBMmjQIClQoIC4IwZpIiIynZ9++kl69eol/fv3l2HDhom78lC4Gk8vDXOXpk2bVu7duydp0qRxdHGIiJxWWFiYFC1aVBo3bizjx48Xj//NaeCOMYKJY0REZKrBSjDV5K5duyR9+vQuEaBfBpu7iYjIFBYvXiyvv/66rm0GBgaKpydDFD8BIiJyuBUrVkirVq0kR44ckjJlSkcXxzQYpImIyKHWr18vzZs3l4YNG+rhPpMl45VYA4M0ERE5THBwsLz55ptSo0YNmTt3rnh7ezu6SKbC0xUiInKYzJkzyx9//CF16tSR5MmTO7o4psOaNBERJbn9+/frwUrgjTfe0Bnd9DwGaSIiSlJHjhyRunXr6uvPT548cXRxTI1BmoiIkszJkyelVq1akj17dlm9erX4+Pg4ukimxiBNRERJ4sKFCzpAow/0mjVrJCAgwNFFMj0GaSIiShLp0qWT2rVry7p163SgphdjdjcRESWqa9euyaNHjyR//vwyc+ZMRxfHqTBIExFRorl+/bpu4sakEjt37nT7sbjtxSBNRESJ4vbt27r/M8biXrp0KQN0PDBIExFRggsJCdHdrDCi2KZNm3RTN9mPiWNERJTgjh49Kjdu3JC1a9dKkSJFHF0cp8WaNBERJZiwsDA9vCemnDx9+rSkSJHC0UVyaqxJExFRgnj8+LEe4rN79+76MQP0y2OQJiKil4bhPVu2bCnbtm2TNm3aOLo4LoPN3URE9FKePn2
qAzOuPy9btkyqVavm6CK5DAZpIiJ6Kb/88osOzn/99ZfO6CYXae4eNWqUlCtXTlKnTi0ZM2aUpk2b6sHXrVWvXl33rbO+de3a1WaZS5cuSaNGjcTPz0+vp3///vLs2TObZdAFoEyZMjqhIV++fNGOejN58mTJlSuXvo5SoUIF2b17dyJtORGR6+jcubNs375dmjRp4uiiuByHBunNmzfrBAOMQoNmEjSZ4CwMw8dF3QGCgoIstzFjxlhei4iI0AEa10Owk8yaNUsH4C+//NKyzPnz5/UyNWrUkAMHDkjv3r3lgw8+0DOwGObOnSt9+/aVIUOG6HlOS5YsKfXq1dNdCIiIyJZSSvr06aOP3V5eXrrCRYlAmciNGzcUirR582bLc9WqVVO9evWK8W9WrFihPD09VXBwsOW5KVOmqDRp0qjw8HD9eMCAAapo0aI2f9e6dWtVr149y+Py5cur7t27Wx5HRESorFmzqlGjRsWp7Pfu3dNlxz0RkSuLjIxUvXv31se86dOnO7o4TiG+McJU2d0YOs6YKcXa7NmzJUOGDFKsWDEZPHiwhIaGWl7bsWOHFC9eXDJlymR5DjXg+/fv6870xjKYecUalsHzgFr4vn37bJbx9PTUj41liIjovzXozz77TH744Qd9ifC9995zdJFcmmkSxyIjI3UzNDrAIxgb2rVrJzlz5pSsWbPKoUOHZODAgfq6NRIUAEPOWQdoMB7jtdiWQSBHx/u7d+/qZvPoljlx4kS05Q0PD9c3A9ZFROTqxo0bp/OJvv/+e/noo48cXRyXZ5ogjWvTR44cka1bt9o836VLF8v/UWPOkiWLnlHl7NmzkjdvXnEU7KTDhg1z2PsTETlC48aNxdvbW3r27OnoorgFUzR39+jRQ/7++2/ZuHGjvPLKK7Eui6xrOHPmjL7PnDmzngrNmvEYr8W2DKZO8/X11U3pSHyIbhljHVGh2R3N88bt8uXLdm83EZGzWLBggTx48EAKFCjAAO0uQRrXNhCgFy1aJBs2bJDcuXO/8G+QnQ2oUUOlSpXk8OHDNlnYyDZEADYGdccy69evt1kPlsHz4OPjI2XLlrVZBs3veGwsExW6cuE9rG9ERK7op59+krfeekv3nqEkphyoW7duKm3atGrTpk0qKCjIcgsNDdWvnzlzRg0fPlzt3btXnT9/Xi1ZskTlyZNHVa1a1bKOZ8+eqWLFiqm6deuqAwcOqFWrVqnAwEA1ePBgyzLnzp1Tfn5+qn///ur48eNq8uTJysvLSy9rmDNnjkqePLmaOXOmOnbsmOrSpYvy9/e3yRqPDbO7icgVzZo1S3l4eKgePXrorG6Kn/jGCIcGaRQ4utuMGTP065cuXdIBOV26dDqA5suXTwfaqBt54cIF1aBBA+Xr66syZMig+vXrp54+fWqzzMaNG1WpUqWUj4+PDvTGe1ibOHGiypEjh14GXbJ27twZ521hkCYiV4PKC7q4fvDBB7pbKsVffGOEB/5J6tq7K0J2d9q0afX1aTZ9E5ErQAY3LjHOmDFD5+1Q0scI02R3ExGROVy9elWyZcumR2FEPQ7DMZMbZ3cTEZE5IIk3f/78esIMYIB2LAZpIiLSME4FJsnAVJOczcocGKSJiEh27dolDRs21GNRYERHdDMlx2OQJiIiPR53iRIlZOnSpXqQJzIHJo4REbkxIzEMI4rhPlWqVI4uEllhTZqIyE1hsqLXXntNzp8/L/7+/rqLEJkLgzQRkRs6d+6cnqwI/XdZezYvBmkiIjdz6dIlqVmzpvj5+ek5CgIDAx1dJIoBgzQRkRt59uyZNGjQQDw9PXWf6Jhm+iNzYOIYEZEbSZYsmR7us2DBgi+cGpgcjzVpIiI3cPv2bRk9erSehrdevXqSK1cuRxeJ4oBBmojIxYWEhOjA/N133+lxucl5sLmbiMiFPXjwQI8khmzujRs3Svbs2R1dJLIDgzQRkYsKDQ2Vxo0by9GjR2XdunVSsmRJRxeJ7MTmbiI
iF+Xt7S358uWTlStXSrly5RxdHIoH1qSJiFzMkydP9GhixYsXl19++cXRxaGXwJo0EZGL9YNu166d1KhRQ1+PJufGmjQRkYuIiIiQDh06yJIlS/R0k6lTp3Z0keglMUgTEbkA9H/u3LmzzJkzR+bOnStNmjRxdJEoAbC5m4jIBVy5ckVWrFghv/76q7Rs2dLRxaEEwpo0EZGTzwf99OlTyZEjh5w+fZpN3C6GNWkiIicO0J999pmeMAMJYwzQrodBmojISX399dcyatQoadSokZ44g1wPgzQRkRP69ttv5csvv9SBum/fvo4uDiUSBmkiIieza9cu6d+/v3z++ee6uZtcF9tHiIicTIUKFfRY3DVr1nR0UchsNWkvLy+5ceNGtHOV4jUiIkocv/32m8ycOVP/v1atWuLh4eHoIpHZgjSyCaMTHh4uPj4+CVEmIiKKYt68edKxY0fZvn27o4tCZmzunjBhgr7Hmdu0adMkVapUNkPR/fPPP1KoUKHEKSURkRtbvHixHo+7bdu2MmXKFEcXh8wYpMeNG2epSf/nP/+xadpGDTpXrlz6eSIiSjgbNmyQVq1aSbNmzXRTNy8rupc4B+nz58/re8ysgoHbAwICErNcREQkoqeb/Pjjj2XkyJHsC+2GPFRMF5nJLvfv35e0adPKvXv3JE2aNI4uDhG5QDerV155RbJly+boopADY4Tdp2W4/owml/Xr1+ssb8y8ErVphoiI4m/37t1Sp04dadGihcyYMcPRxSEHsjtI9+rVSwdpDENXrFgxdgEgIkpA//77r9SrV083c0+cONHRxSFnC9KYqxRdARo2bJg4JSIiclNHjhzRNeh8+fLpaSete9GQe7K7nzQyubEDERFRwsJUk7lz55bVq1fr65dEdgfpfv36yfjx42Mc1ISIiOyDERtxTEU3q507d0q6dOkcXSRy1uburVu3ysaNG2XlypVStGhR8fb2tnkd3bOIiChuLl26JFWrVpWuXbvKoEGD2A+aXi5I+/v767M9IiJ6OdeuXbOMwd2+fXtHF4dcIUizOwAR0ctDF1YE6MePH+thlbNnz+7oIpEJcfgaIiIHGD16tISEhOgAjWQxogQZcQw7U2x9o8+dOyfuiCOOEZE9njx5IlevXmWAdhP3k2rEsd69e9s8fvr0qe58v2rVKunfv7+9qyMichsPHjyQNm3ayJAhQ6R8+fIM0JQ4I45FZ/LkybJ37157V0dE5BZCQ0OlcePGulLDkRop0fpJx6RBgwaycOHChFodEZHLQHJY06ZNZd++fbr7arly5RxdJHK3xLEFCxawAz4RUTQ++OAD2bJliw7Qr7/+uqOLQ64cpEuXLm3TVIO8s+DgYLl586b8+OOPCV0+IiKn17dvX3n33XelevXqji4KuXqQRpONNU9PTwkMDNQ7X6FChRKybERETgvT+mIWK4wkVqZMGUcXh9wlSCMrkYiIYhYZGSmdO3eWWbNmSalSpViDpqS9Jo0zxMWLF8vx48f1Y4zh/cYbb3DMWSJye7gE2KNHD5k5c6b89ttvDNCUtNndZ86ckcKFC+vrK5hMA7e3335bB+qzZ8/ata5Ro0bpLMfUqVNLxowZdVP6yZMnn8uK7N69u6RPn17PrdqiRQu5fv36cwPUN2rUSPz8/PR60F/72bNnNsts2rRJNzklT55cT7WJH1B03chy5colKVKkkAoVKsju3bvt2h4icm8I0Lj+PGXKFJk2bRrH46akD9Iff/yx5M2bVy5fviz79+/XNwRJdMrHa/bYvHmzDsCYmm3t2rV6YJS6devKo0ePLMv06dNHli1bJvPnz9fLY0D65s2b29TqEaAxes/27dt18xIC8JdffmlZ5vz583qZGjVqyIEDB/SALMi2xJythrlz5+ofF5rzsU0lS5aUevXq6fF1iYjiChWBSZMmyfvvv+/oopArUHby8/NThw4deu75AwcOqJQpU6qXcePGDQxRqjZv3qwfh4SEKG9vbzV//nz
LMsePH9fL7NixQz9esWKF8vT0VMHBwZZlpkyZotKkSaPCw8P14wEDBqiiRYvavFfr1q1VvXr1LI/Lly+vunfvbnkcERGhsmbNqkaNGhWnst+7d0+XC/dE5H5wbCJK6BjhGZ+zRAxtF9XDhw/Fx8fnpU4YMKYpGP2t0fEftevatWtblkEGeY4cOWTHjh36Me6LFy8umTJlsiyDGjDGST169KhlGet1GMsY60AtHO9lvQyy1vHYWCaq8PBw/R7WNyJyT99++60UK1ZMjhw54uiikIuxO0hjWLsuXbrIrl279PUX3NBcjW4GSB57mWxINEOjoz92dkD/awR+zGFtDQEZrxnLWAdo43XjtdiWQWANCwuTW7du6Wbz6JYx1hHd9XQMlm7cOM0ckXtC0zbyYAYPHmw5dhE5LEhPmDBBX5OuVKmSTrDCDYEVyVjjx4+Pd0FwbRpnoXPmzBFngB8kav7GDdfoici9/Pzzz9KzZ0/p16+fDB8+3NHFIRdkdxcs1GqXLFmis7yNLljI9kaQji90V/j777/1vKqvvPKK5fnMmTPrpmjMuWpdm0Z2N14zlomahW1kf1svEzUjHI8xXZivr6/uOoZbdMsY64iu2R83InJPuOT1/fff6wrG2LFjOWkGmWuCDQTlJk2a6Ft8A7TRn3DRokWyYcOG56ZtK1u2rHh7e8v69estz6GLFrLJUZMH3B8+fNgmCxuZ4gjARYoUsSxjvQ5jGWMdaFLHe1kvg+Z3PDaWISIy4PIYTtLRowStiwzQlGjsSjNTSjVv3lyNHj36uee/+eYb1bJlS7vW1a1bN5U2bVq1adMmFRQUZLmFhoZalunatavKkSOH2rBhg9q7d6+qVKmSvhmePXumihUrpurWraszzFetWqUCAwPV4MGDLcucO3dOZ6X3799fZ2BOnjxZeXl56WUNc+bMUcmTJ1czZ85Ux44dU126dFH+/v42WeOxYXY3kXtYvHixKlWqlLp586aji0JOJL4xwu4gnSFDhmi7YOG5jBkz2vfmItHeZsyYYVkmLCxMffTRRyogIEAH2mbNmulAbu3ChQuqQYMGytfXV5evX79+6unTpzbLbNy4Uf+wfHx8VJ48eWzewzBx4kR9QoBl0CVr586dcd4WBmki17dy5Up9fECFJOoxhigxYoQH/rGn5o1ruBgQpGDBgjbPnzhxQs+QhWxpd4RMcWR5I4kMTe1E5FpwSQ6DItWpU0cWLlyoL8URJXaMsPuaNPokY3SuqJCVbVwDJiJyJeimiWGLq1atKvPmzWOAJvNmd3/xxRd6WE6M012zZk39HBKs/vzzTz10JxGRK7hxN1SOnLklN+6GScYAX/l5+m/SpGEd3e2UyLRBGtncmAFr5MiRsmDBAt38XaJECVm3bp1Uq1YtcUpJRJTEAXrxpjNy6NBBOX98t1Sq214yBuSWh+Eifn6OLh25k3hNVYnrMrgREbki1KCPHD0qs77/WDJmekVatH1fgu6E6edrlsvh6OKRG4lXkCYicmUHDx+Tmd/2lHQZMsvAkT9LihS+kiJ5pG76JnKKwUyIiFwRBksaOfg98U2ZRgaOmCapUvtLpFLyOPyZvjZNlJRYkyYispIhQwZp2LCRFKvcVu6GJZOwyEc6QAf6+0rxfIGOLh65GQZpIiIRCQoKktu3b+uZrGbNmPZcdjcCdCBr0mT2IH3u3DnJkydP4pSGiMgBMPZ/rVq19Dj++/fv1/PJZwzwY5IYOV+QxmQamKkK3a2qV6+u719mBiwiIke6c+eOHkXs7t27snnzZh2giczC7r0R8yaPGjVK948eM2aMFChQQAft9u3by7Rp0xKnlEREiQBDNNatW1euXbumx3rA8YzITOweuzuq06dPy4gRI2T27Nl6ekdM4eaOOHY3kfPZt2+ftGjRQpYsWSIlS5Z0dHHIhd2PZ4ywu7k7NDRUtm7dKps2bdK3f//9VwoVKqTnhUbzNxGR2WEioGTJkul55E+dOqWvRROZkd1
B2t/fXwICAnTz9qBBg6RKlSr6MRGRM3j8+LG8+eabkjlzZvn1118ZoMm1gnTDhg11TRqzXgUHB+sbatC8lkNEZvfkyRN56623ZMuWLbJixQpHF4co4RPHMLkGpm1btWqVVKpUSdasWaNr09myZdO1ayIiM3r27Jm0a9dOH7NwHKtRo4aji0SUeIOZYF5p7PQ4M0Xz0erVq/U800ggIyIyGxybkCC2cOFCqVevnqOLQ5Q4Qfr777/XCWNo8n7w4IHOiMRE6F26dNE1aiIiM3r33Xf1tLqlS5d2dFGIEq8LVrly5SwDmSAoI6Wc2AWLyIxweOvbt68+XiFZjMjlu2Dt2bPH3j8hInJIgP7kk0/khx9+0ONxE7nNNemQkBD55Zdf5Pjx4/pxkSJFpFOnTqxVE5FpfPHFF/ry3KRJk/Txicgtsrv37t0refPmlXHjxukxb3HD//EcBqYnInK0CRMm6JEQv/32W+nevbuji0OUdNekcR0aE2r8/PPPesQeQJb3Bx98oGfI+ueff8Qd8Zo0kXlcvHhR/v77bwZocvoYYXeQxsQaxlCg1o4dOyavvvqqHjbUHTFIEzkeuoFiRqt06dI5uihECRIj7G7uxsovXboU7exYqVOntnd1REQJArPwtWnTRmbNmuXoohAlGLuDdOvWrXUSBs5YEZhxwxChaO5u27ZtwpWMiCiOfv/9dz1WA5q3e/fu7ejiEDkuuxuJGB4eHnpgAFyLBm9vb+nWrZuMHj064UpGRBQH8+fPlw4dOsj777+vE8ZwfCJyFXZdk8Zc0du2bdNDgiZPnlzOnj2rn0dmt5+fn7gzXpMmcoypU6fq49KMGTPEy8vL0cUhcmziWIoUKXT/6Ny5c9vzZy6PQZoo6TO4c+bMqf+Pwxhr0GRmSZY4hpF70NWKiMhRNmzYoHuYzJs3Tz9mgCZXZXeQ/vrrr/VQe+iDGBQUpM8OrG9ERIkJTdtNmjTRE/u88cYbji4OUaKyu7nb0/P/47r12avR3ITr1u6Izd1EiW/37t1Su3ZtKVu2rCxfvtztc2HIeSTZBBsbN26090+IiBIEWvKQuLps2TIGaHILdtekKXqsSRMlHqOl7uHDh7q1jpP5kLNJssQxIqKkdOrUKT3k8MmTJyVVqlQM0ORWGKSJyLTQk6RmzZoSFhbG8bjJLTFIE5EpYcjhWrVq6Ul91q9fL4GBgY4uElGSY5AmItOJjIyUxo0bW/pEZ8mSxdFFInK+II2xukNCQhKuNERE/+vqOXHiRF2Dzp49u6OLQ+ScQXrkyJFy586dhCsNEbk1HE+GDRumM7gxWEmePHkcXSQi5w3S7L1FRAkFXVPq1asnkyZN0tejiSgeg5kQESU09H9u2LChnDlzRg+YlCtXLkcXicgUGKSJyKHQvQpjcR8+fFhfgy5VqpSji0RkGszuJiKH8vHxkaJFi8qKFSukXLlyji4OkamwJk1EDvHkyRNde8ZkGbgOTUTPY02aiJLcs2fPpH379no0sbt37zq6OESmxZo0ESUpdK/q2LGjLF68WBYsWCABAQGOLhKRaTFIE1GSjiT24Ycfyp9//ilz5syRN99809FFIjI1NncTUZK5fv26rFmzRmbNmiVvvfWWo4tD5No1aczvSkQUl4GPwsPD9RjcJ06cED8/P0cXicgpOHTEsX/++Uf3j8yaNasO+LhGZQ3XrfC89a1+/frPDSOIBBRMou3v7y+dOnXSAyNYO3TokFSpUkVSpEihxwEeM2bMc2WZP3++FCpUSC9TvHhx3R2EiBLGF198oZPEnj59ygBNlFRB+sGDBy81tu6jR4+kZMmSMnny5BiXQVAOCgqy3HAtyxoC9NGjR2Xt2rXy999/68DfpUsXy+v379+XunXrSs6cOWXfvn0yduxYGTp0qPz000+WZbZv3y5t27bVAf7ff/+Vpk2b6tuRI0fivW1E9F9ff/21jBgxQpo3by7e3t6OLg6Rc1EmgaIsWrTI5rkOHTqoN998M8a
/OXbsmP67PXv2WJ5buXKl8vDwUFevXtWPf/zxRxUQEKDCw8MtywwcOFAVLFjQ8rhVq1aqUaNGNuuuUKGC+vDDD+Nc/nv37umy4J6I/uvbb7/Vv4vhw4c7uihEDhXfGGH6xLFNmzZJxowZpWDBgtKtWze5ffu25bUdO3boJu5XX33V8lzt2rX1NHe7du2yLIPZdDCqkQGD+J88edLSPxPL4O+sYRk8T0Txg1apTz75RD799FP5/PPPHV0cIqdk6i5YaOpGE1nu3Lnl7Nmz+sfeoEEDHTy9vLwkODhYB3BryZIlk3Tp0unXAPf4e2uZMmWyvIY+mrg3nrNexlhHdJAEg5t1szoR/b/SpUvry0+VK1dmkimRKwbpNm3aWP6PZK4SJUpI3rx5de26Vq1aDi3bqFGj9Ly3RGRr9uzZupWqR48eOmGTiOLP9M3d1pCkliFDBj2dHWTOnFlu3Ljx3HCDyPjGa8Yy6JtpzXj8omWM16MzePBgPf+tceP8t0T/7SXx7rvv6qZuzjdPlMQ16ZCQEFm0aJFs2bJFLl68KKGhoRIYGKibtXAN97XXXpPEdOXKFX1NGn0toVKlSrpMyNrGIP2wYcMGPapRhQoVLMt89tlnuuuHkVmKTHBc4zaGI8QymCKvd+/elvfCMng+JsmTJ9c3IvqvpUuXSrt27XQLGHpPsImbKIlq0teuXZMPPvhAB0d0p8D8r5jzFU3Or7zyip6kvU6dOlKkSBGZO3dunN8c/ZkPHDigb3D+/Hn9/0uXLunX+vfvLzt37pQLFy7oIIohBPPly6dPCKBw4cL6unXnzp1l9+7dsm3bNt3EhoME+l4DDhpIGkP3KnTVQvnGjx8vffv2tZSjV69esmrVKvnuu+/0QAvoorV37169LiJ6sc2bN+sRxPAbxWhiyBkhogQQlxTwjBkzqv79+6ujR4/GuExoaKj6448/VMWKFdXYsWPjlFq+ceNGnZIe9YauV1hf3bp1VWBgoPL29lY5c+ZUnTt3VsHBwTbruH37tmrbtq1KlSqVSpMmjXrvvffUgwcPbJY5ePCgqly5skqePLnKli2bGj169HNlmTdvnipQoIDy8fFRRYsWVcuXL1f2YBcscmd3795VgwYNsunqSEQvHyM88M+LAjmamNOnTx/nwG/v8q4A2d1p06bV16cx+hmRO0BLF3pCRO1BQUQJEyPi1Nxtb8B1twBN5I727NmjR/NDzgcRmSS7G9ebli9fbnk8YMAAPaAIksaQTEZErg+5I8gNKVasmEydOtXRxSFyWXYH6ZEjR4qvr6/+PwYVwbjbmLACXaP69OmTGGUkIhNBAiYSRTFmwcqVKyV16tSOLhKRy7J7MBP0B0aGNWDWqhYtWugJLV5//XWpXr16YpSRiEwExwCMWYAAjWtsRGSimnSqVKks42dj8nacUQOmeETXLCJyTRg4CGMQoNsjWtEw/C4RmSxIIyijzzRup06dkoYNG1qawHLlypUYZSQiE9SeMUCQMRQuJrEhosRn9y8N16AxEtfNmzdl4cKFlkxujPqFOZmJyLVgHveaNWvqYT4xKBARJZ049ZOmF2M/aXLVJm7kmmD/xoxWuBZNREkXI+KUOHbo0CHd1QJNXPh/bDBTFRG5BgyhiwlrGKCJTFyTRnA25m7G/zFwvvWfGY9xHxERIe6INWlyJcbvGbPKXb16VXLmzOnoIhE5tUStSWPiC8x2ZfyfiFwXJrdp2bKlno61WrVqDNBEDhSnIG39I+UPlsh1YfrZJk2a6ERQY9AiInKiwUzg5MmTMnHiRDl+/LhlysiePXvqOZqJyDk9fvxYmjVrpqd9Xb16tZQvX97RRSJye3Z3wUK3KySR4Uy7ZMmS+rZ//379HF4jIufUrVs3nSC2bNkyqVy5sqOLQ0Tx6YKF8Xrbt28vw4cPt3l+yJAh8vvvv8vZs2fFHTFxjJzdsWPHdJKYMYogETnJVJVRBzZ49913n3v+7bff1q8RkfNAb4yxY8fKo0e
PpEiRIgzQRCZjd5DGwAZbtmx57vmtW7dKlSpVEqpcRJTIMA73hx9+KIMGDZLt27c7ujhEFN/EsaVLl1r+/8Ybb8jAgQP1NemKFSvq53bu3Cnz58+3jOtLROZ0426oHDlzS67fCZU500bKsoW/y6+//soaNJGzD2YSp5VxMBNekyZTB+jFm87o+3ULJ8vmFb/LO92GyLcjBkjGAD9HF4/Ipd1PzGvSaBaLy81dAzSRM0AN+mZImOTKmlYyBqaXd7t9JgXLNdDPE5EL9ZMmIudz426Y3L1+QXJnLSlN23bVzwXdfqSfJyInD9ITJkyI03Iff/zxy5SHiBLJphW/yU8TRsrwCfMlT/6iEqmUPA5/JhkDOLIYkdMH6XHjxsXpmjSDNJH5YB74qeNHSIMWH4hHyld0DRoBOtDfV4rn+++4/ETkxEGaE2sQOadp06ZJjx49pE+fPjLw86/k6NnbuokbNWgE6EDWpIlcZ8Qxih6zu8mMnj59qsfgrlSpkq5No7WLiFwsu3vOnDlxXuHly5dl27ZtcV6eiBIH5oL29vaWzZs3y6RJkxigiZxQnIL0lClT9ExXY8aMscx8ZQ1nBitWrJB27dpJmTJl5Pbt24lRViKKI0ySUbx4cQkODtZn7XEd64CInPCaNM7EMeoYpqfERPApU6aUTJkySYoUKeTu3bv6QJAhQwbp2LGjHDlyRL9GRI6BaSZbtmyp54XG75KI3Oia9K1bt/Q43RcvXpSwsDB9EChdurS+ufPZOq9Jkxls3LhRGjZsKLVr19ZTx/r4+Di6SEQk8Y8RTBxLIAzS5GghISGSO3duKVeunG75QksXETl3jOCIY0Quwt/fXxYtWqSzuRmgiVyD+7ZPE7mIgwcPyhdffCFoFMNUsn5+nCyDyFUwSBM5sWPHjunrzytXrpTQ0FBHF4eIEhiDNJGTOn36tNSqVUuyZs2qM7rR64KI3DxII3uUiBzrypUrUrNmTQkICJC1a9dK+vTpHV0kIjJDkK5fv77kzZtXvv76az26GBElvYwZM0rz5s1l3bp1+v9E5JrsDtJXr17Vg/UvWLBA8uTJI/Xq1ZN58+bJkydPEqeERGQRFBQk+/bt0/2fx48fr5u6ich1vVQ/6f3798uMGTPkzz//1I8xLGinTp2kZMmS4m7YT5oS282bN3X2Nhw6dEi8vLwcXSQiMsMEGzHBON0YJhQ164cPH8r06dOlbNmyUqVKFTl69OjLrJqIrNy5c0fq1Kmjx8VHX2gGaCL34Bnf6e/Q3I3hB3PmzKkzSzHLzvXr1+XMmTP6ubfeeivhS0vkhnDmjctKSBbDNegCBQo4ukhElETsHnGsZ8+eunkbreTvvPOOnhmrWLFiltfRDeTbb7/ltTKiBHLp0iU9kQ2yuK1/a0Tk+pLFZ/AEzIaFzNLkyZNHuwwm3WBXLaKXgwlsMGkNppw8ceKEJEvGUXyJ3I3diWPh4eF6MnkOnGCLiWOUkPA7e/PNNyVVqlT60hIRObdETxxDZmmDBg30QQNvULFiRX39mYgSFrozIqcD87h/9NFHji4OETlQnIP0wIED5cCBAzJ8+HB9zRnT4nXu3DlxS0fkZtBK1b59e52MiSxujCpGRO4rzhe5kLQyc+ZMnWUKjRs3lsKFC+tmuZiuTRORfRYuXCiLFy/WTdwY3Y+I3Fucr0mjXyZGG8ucObPlOVyXRn/oXLlyibvjNWlKCPg54jfFLG4i15Ikg5lEHUABj19iwDIi+l9g7t27t/zxxx/i4eHBAE1E9gdpHEgwiEK6dOksN4wyVrp0aZvniCju8Lvq37+/Hof70aNHji4OETnrNWmM0U1ECevLL7+U7777TiZMmMBETCKKf5Du0KFDXBclojiYMmWKnvIVo/ZhJD8iogQZuxvdr6ZNm6Yn18DA/8aMWEgss8c///wjTZo00UOI4locslqjNgWippElSxbx9fWV2rVry+nTp22WwfujywouxPv7++tZuNAMbw0zBmHSjxQ
pUkj27Nn1QTGq+fPnS6FChfQyGOFpxYoVdm0Lkb2aNm2qAzWau4mIEiRII+Dh2vQ333xj6S8Nf/31lw7a9sA1OExrOXny5GhfRzBFM+B//vMf2bVrl84mRxewx48fW5ZBgEY2LLqI/f333zrwd+nSxSajrm7dunrSD8zDO3bsWBk6dKj89NNPlmW2b98ubdu21QH+33//1QdP3I4cOWLvx0P0QkgQw2Q0OPns2rWro4tDRGam7FSrVi3Vv39//f9UqVKps2fP6v9v27ZN5cyZU8UXirJo0SLL48jISJU5c2Y1duxYy3MhISEqefLk6s8//9SPjx07pv9uz549lmVWrlypPDw81NWrV/XjH3/8UQUEBKjw8HDLMgMHDlQFCxa0PG7VqpVq1KiRTXkqVKigPvzwwziX/969e7osuCeKybRp0/R+Mnr0aEcXhYiSUHxjhN016T179siHH3743PPZsmWT4ODghDp3kPPnz+v1oYnbgD5mFSpUkB07dujHuEcT96uvvmpZBstjUgLUvI1lqlatKj4+PpZlUBs/efKknlnIWMb6fYxljPeJDgZxQS3d+kYUm9mzZ+vksG7dusmAAQMcXRwicgJ2B2mMLhZdQDp16pQEBgYmVLksAT9Tpkw2z+Ox8RruM2bMaPM6ZgpCVzDrZaJbh/V7xLRMbCcdo0aN0icNxg3XuolighHEkHzZsWNHPfc6cjCIiBI8SL/xxht6/O6nT5/qxzjYYL5bjO3dokULcRe4/o6RY4zb5cuXHV0kMjG0vLRr105+/vln3dJDRBQXdh8t0KcT2dOowWK+22rVqkm+fPkkderUMmLECEkoxvCjSLCxhsfGa7i/cePGcxMUIOPbepno1mH9HjEtYz0EanQtCsgot74RRXX27FndSwEJjr/++utzo/YRESVokEbTrpFJjczrHj166O5KmFYvIeeYzp07tw6S69evtzyHZnZca65UqZJ+jHtklyNr27BhwwaJjIzU166NZZDxbdT8AeUvWLCgBAQEWJaxfh9jGeN9iOJj06ZNujvfb7/95uiiEJGzUg704MED9e+//+obivL999/r/1+8eFG/jgxYf39/tWTJEnXo0CH15ptvqty5c6uwsDDLOurXr69Kly6tdu3apbZu3ary58+v2rZta5MRnilTJvXOO++oI0eOqDlz5ig/Pz81depUyzLITE+WLJn69ttv1fHjx9WQIUOUt7e3Onz4cJy3hdndZA37VMqUKVWdOnVs9lcick/34hkj7A7SPXv2VOPHj3/u+YkTJ6pevXrZta6NGzfqQke9dejQwdIN64svvtBBFl2v0P3r5MmTNuu4ffu2DsroDpYmTRr13nvv6eBv7eDBg6py5cp6HdmyZYu2+8u8efNUgQIFlI+PjypatKhavny5XdvCIE0GdAnEvli1alX16NEjRxeHiEwgvjEizlNVWne1Wrp0qZQtW9bmeYw4hqSyK1euiDviVJVkaNOmjU6mXL16tc7VICK6H88YEeexuw23b9/WbxQV3vTWrVv2ro7IZSAXApnbM2fO1NncDNBElOSJY8jkXrVq1XPPr1y5UvLkyfPSBSJyRhhTvlSpUnrYXIz/Ht2JLBGRveyuSfft21dndN+8eVNq1qypn0NmNLpm/fDDD3YXgMjZYXQ8/BZSpUoVa7c9IqJED9Lvv/++bspDn+ivvvpKP5crVy49m8+7775rdwGInBlyMGrVqqX7zeNkNeoIeERESRakMVAIZvBp3ry5Hn8YtWlMIYkaBJG7Qc5ls2bN9LVo9InGlKtERAnJ7uxuPz8/OX78uJ76kf4fs7vd0+7duyV9+vSSN29eRxeFiFwwRtidOFa+fHk95zKRu8Kwsxi7/cmTJ/r3wABNRKa5Jv3RRx9Jv3799LU49JWOOhRoiRIlErJ8RKaCs+D69evLuXPnpFOnTrq3AxGRaZq7o5vBBzNhYTW4j4iIEHfE5m7Xh4llEKCPHj2qk8TKlCnj6CIRkZN
IssFM0N2EyN2gRwNG1EM/aEy+wgBNREnB7iDNhDFyRz4+PlKuXDkZNmyYZYY1IiJTNHdjrO4GDRqIt7e3/n9sUNtwR2zudk2Y4nTv3r2ctpSIHBIj4hSkcR06ODhYD9QQ3TVpy8p4TZpB2gXcuBsqR87ckqBbD+Tn7wbKnp2b5ML58xIYGOjoohGRk0rUa9IYrCG6/xO5muMXbssfq07IjbsPZctf38mJ/Ruka/9vRSWz7cVARJQU7O4nTeTKNWgE6AtB92Tr4glyYt96afD2Z5Ihd3ldsyYiMm2Q3rBhgxQpUkRX2aNC9b1o0aLyzz//JHT5iJIMAvGtkMfiLWESdP6QNGw/SLIXriKhT57Jjbthji4eEbmhOAdpzHDVuXPnaNvS0c7+4Ycfyrhx4xK6fERJ5vqdUEmeLEJ8fNPK+5/OkOIVG0gyL095FPpEMgb4Orp4ROSG4hykDx48qAdyiEndunVl3759CVUuoiS3Yv4UmTOhh/h4RcrDxyL3Q8Pl/qMnkiGtrxTPx6QxIjJxkL5+/brughWTZMmS6VmxiJzRqFGj5PdfJsjrNRqLf9pUktrPRyIjRHJmTi1v1y8igaxJE5GZBzPJli2bHDlyJMaxijESU5YsWRKybERJApdpPv30Uz1QSdeen+hr07gGjSZu1KAZoInI9GN39+zZU8+Zu2fPHkmRIoXNa2FhYXo2oBo1asiECRPEHbGftHM6duyYFCtWTAYNGiQjRozQff2JiJxqMBOjuRvjFXt5eUmPHj2kYMGC+vkTJ07I5MmT9SAm+/fvl0yZMok7YpB2Xrt27dInmQzQROS0QRouXrwo3bp1k9WrV+tZr/QKPDykXr16OlDnzp1b3BWDtHP5448/5NKlS7oGTUTkErNgYXKNFStWyN27d+XMmTM6UOfPn18CAgLiU2Yih1iwYIG8++678s4771imWCUicolZsABBGTMCETmbZcuWSdu2baVVq1Yybdo0BmgiMjUOC0puY9u2bdKyZUs9U9usWbN0fgURkZkxSJPbKFmypAwcOFD+/PPPWPv8ExGZhV2JYxQzJo6Z186dO8Xf318KFSrk6KIQkZu6H88YwZo0ubS9e/fq3gdffPGFo4tCRGQ3BmlyWRhvHmPKY/a26dOnO7o4RER2Y5Amlx1JrHbt2rrv/sqVKyV16tSOLhIRkd0YpMklYbIX9OFfs2aNvh5NROSMmDiWQJg4Zg7BwcESGBiou1dxoBIiMgsmjpHbu3Llirz22muWoT4ZoInILUccIzKLG3dD9dSSJ89clBEDO4inROgZ24iIXAGDNDl1gF686YxcuHRNpo/tLmGhD2XAiBmSInUGRxeNiChBsLmbnBZq0DdDwuTUvhXyOPS+fD5mpkiKDPp5IiJXwJo0Oa3rd0IlRfJk0rRtV6lWt5mkD8wiQbcfyY27YY4uGhFRgmBNmpzSw4cP5dsvO8uhvVuQIaYDdKRS8jj8mWQM8HV08YiIEgSDNDmdsLAwPZPVqWMHJWumDHIx6L6uQeM+0N9XiucLdHQRiYgSBJu7yamEh4dLs2bNZNeuXbJ69WopULSMvgaNJm7UoBGgA1mTJiIXwSBNTuXjjz+WzZs3y/Lly6Vy5cr6uZrlcji6WEREiYIjjiUQjjiWNM6ePSvnzp2TOnXqOLooRERxxhHHyGVFRETIyJEj9c6dN29eBmgichsM0mRqkZGR0rVrVz0f9Pbt2x1dHCKiJMVr0mRauBLTq1cv+eWXX2TmzJnSoEEDRxeJiChJMUiTaQ0cOFAmTZokU6dOlXfffdfRxSEiSnJs7ibTypYtm4wfP166dOni6KIQETkEa9JkOvv375cyZcropm4iInfGmjSZyrhx46Rs2bKyY8cORxeFiMjhTB2khw4dKh4eHja3QoUKWV5//PixdO/eXdKnTy+pUqWSFi1ayPXr123WcenSJWnUqJH4+flJxowZpX///vLs2TObZTZt2qRrbsmTJ5d8+fLpJCVKej/
++KP07dtXBg8eLBUrVnR0cYiIHM7UQRqKFi0qQUFBltvWrVstr/Xp00eWLVsm8+fP16NQXbt2TZo3b27TvxYB+smTJ7r7zqxZs3QA/vLLLy3LnD9/Xi9To0YNOXDggPTu3Vs++OADPeQkJZ3p06frEy58/iNGjNAnZEREbk+Z2JAhQ1TJkiWjfS0kJER5e3ur+fPnW547fvw4Rk9TO3bs0I9XrFihPD09VXBwsGWZKVOmqDRp0qjw8HD9eMCAAapo0aI2627durWqV6+eXWW9d++efm/ck32ePXumKlWqpLp27aoiIyMdXRwiogQX3xhh+pr06dOnJWvWrJInTx5p3769br6Gffv2ydOnT6V27dqWZdEUniNHDsv1TNwXL15cMmXKZFmmXr16eni2o0ePWpaxXoexDK+JJg20cnh5ecnatWtl8uTJrEETEVkxdZCuUKGCbp5etWqVTJkyRTdNV6lSRR48eCDBwcHi4+Mj/v7+Nn+DgIzXAPfWAdp43XgttmUQyDElYmyzMWEZ6xvZB5cqChcuLJcvX5aUKVOKp6epd0cioiRn6i5Y1iNMlShRQgftnDlzyrx588TX17HTEY4aNUqGDRvm0DI4szVr1kjLli2lcePGkjlzZkcXh4jIlJyq6oJac4ECBeTMmTP6wI6m0pCQEJtlkN1tHPRxHzXb23j8omUwS0lsJwLIQMaED8YNtUGKGyT5NW3aVE+U8eeff4q3t7eji0REZEpOFaQfPnyopyrMkiWL7kuLg/v69estr588eVJfs65UqZJ+jPvDhw/LjRs3LMvg2icCcJEiRSzLWK/DWMZYR0zQXQvrsb5R3L5DdJV7/fXXZcGCBfqSBRERxUCZWL9+/dSmTZvU+fPn1bZt21Tt2rVVhgwZ1I0bN/TryAbOkSOH2rBhg9q7d6/OEMbNOmu4WLFiqm7duurAgQNq1apVKjAwUA0ePNiyzLlz55Sfn5/q37+/zg6fPHmy8vLy0svag9ndcbd9+3b16NEjRxeDiCjJxDdGmDpIoytUlixZlI+Pj8qWLZt+fObMGcvrYWFh6qOPPlIBAQE60DZr1kwFBQXZrOPChQuqQYMGytfXVwd4BP6nT5/aLLNx40ZVqlQp/T558uRRM2bMsLusDNKxw0kSPvuIiAhHF4WIKMnFN0Z44J+YatkUd8juTps2rb4+zaZvW8eOHZPq1atL9uzZ9ehuqVOndnSRiIicIkY41TVpcj7o545+6EjQQ0Y3AzQRUdwxSFOiwTCttWrV0mePSMbDGOtERBR3DNKUaDChSdu2bWXdunXPDRhDREQvxiBNCQ6juGFCk2TJksk333wj2bJlc3SRiIickqlHHCPnc+vWLX0NGgPNIGEMgZqIiOKHR1BKMHfv3tWjiCFQY1QxBmgiopfDoyglWPcCzB6G4VHRzapgwYKOLhIRkdPjNWlKEEFBQXp2MnSzKlasmKOLQ0TkEliTJrsdv3Bb1u++JEG3QiV9Gk+pViablC1SUI4cOaLnhiYiooTBIE12B+gpCw5KyMMn4unxTGaOGybjvTxk5coVUiR3BkcXj4jIpbC5m+yCGjQCdMYAH9n+1zdy/cIhKVC+mWzYw6k6iYgSGmvSZBc0cXt5Kln+6wg5e3SntOgySlJnK6GfJyKihMWaNNklSwY/uXBsp5w8sFmavj9MchcpL0+fRurniYgoYbEmTXapVT6HnLpUTdJkeEX8s+TUNWj/VD5Sp3wuRxeNiMjlMEhTnGBG0z59+kjRokWlW8vmsn53gA7QqEEjQBfMFeDoIhIRuRwGaYpTgB44cKCMHz9epk6dKoVzpdc3IiJKXLwmTS80dOhQGTt2rA7SXbp0cXRxiIjcBoM0xernn3+W4cOHy+jRo+Xjjz92dHGIiNwKm7spVs2bN9fN3axBExElPQZpkw63iYQsZFI76trv7NmzpWrVqpI9e3YGaCIiB2Fzt8mG29x97LoE332k7/EYzyeVG3dDZcOeS9K130h
5++23ZfJ/piXZexMR0fNYkzbZcJuoQXt6eEqkitQ1ajyfFLVpBOjFm87IulWLZcHPQ6V89WaSu2xz/XzGAA5UQkTkCAzSJoGA7O3tqQM04B6PE3u4TQThI2duyc4jQbJt8ypZ9dtXUqV2U+nU+yu5fP2hfq1muRyJWgYiIooeg7RJoAZ9+cYDXYM2atKJPdymUXu+GRImwbcfSVh4hBQqW1vaffilJPPykhTJk8mNu2GJ9v5ERBQ7BmlTDbd511KjRoBO7OE2UUtGgPaJuC2ZAtLJkxJVpEDJqnLzXrikTJlcHoc/k4wBvon2/kREFDsmjpkErjt3a1lSyhfJJJkDUur77i1LJ+pwm6glXz13UD7v0UJO7F0hqXx95PGTCLl+J1QuBt2XQH9fKZ4vMNHen4iIYseatIkk9XCbNy8fk1/G9pH8RUpL7YYt5MkzDzl2/o5OFKtYLLMO0IGsSRMROQyDtJvat2+ffNano+TIW0hadBkptx88083bRXKnk2bV8zM4ExGZAIO0m5o8ebIULVpEZs9dLBeuh+umb1x/Zu2ZiMg8PBTGfKSXdv/+fUmbNq3cu3dP0qRJI2YVEREhXl5e8vTpUwkLCzN1WYmI3D1GMHHMjZw5c0aKFy8ue/bsEW9vbwZoIiKTY3O3CzMGKkFTdmTYLRnQvY2kSpVScuTg4CRERM6AQdpFWQ9U8vjhbZn8VRfx8vSQ5Qv/lkyZMjm6eEREFAds7nZRxkAlOTKnlrlTPhNPj0jp0G+i3A71cXTRiIgojliTNmnzNDKti+XLEK/JLbAejMWNoT5FibTp/Jn4+6cWlTwDh/kkInIirEmbrHl6x5EguXrrob7HYzwfn/VcunpdNvw1Sc5duSmPPAIllX8WDvNJRORkGKRN1jydM0sayZI+pb7HYzxv73ouB92S1b9+JucOrpdH927J7fuP9UhiHOaTiMi5sLnbJNAMjVmnPD089GPcx2cWqktBt2X2hH5y49pF6Tf8Z0kRkEuPxY1mc44kRkTkXBikTQLN0GevhkikUjpA497e5uknT57IhK96yNULp2TwqF+kQJGSej2I+xiLmwGaiMi5MEibBJLEzlwJ0bNPoQaNAP2i5unth6/Ksn/Oyc27jyUwIIU0rpJb6tSuKTXf/EC8/XNL0O1HcVoPERGZE4O0SaA5umn1fDbZ3bGNo40APWneQR2ERUXIqaN75GLwA+nRrqvkeyUgzushIiLzYpA2WaCuWS5uo4GhBo0AncrPS7Yu/F4undghdTtP1c+P6l4lzushIiLzYna3E0I3q0vBDyQiMkK2/DVOLhzdItVaDhK/NOl10zcREbkGBmknY/SDRkLYgTX/kQtHNkr5Jv0ka8GKEhmp9LVpIiJyDWzuNpGBkzbLsfMhlsdFcvvLNz2qWR4fv3Bbfl1+TNeiPVWY3A06IaXq9pCM+V+X+w+fSEpfb2laNZ+DSk9ERAmN80mbZD7pqAE6aqBGgJ6y4KBcu/VInoaHik+KlKIin4lviuT62nSaVD7S5c0SUqF4lgTaIiIicnSMYE3aJKIL0MbzyORGgL736Kmc3v6nXD21Td7oOlHCJZmkS5NcMqZLJ5WKZWGAJiJyMbwm7QRGzdwrIQ+fyuldC+XEjrmSrXBNCY9MJmgEuRXymP2giYhcFIO0kzi3f6mc2Pqb5K/YWvKVby4RzyLEw8NDT0XJ4T6JiFwTg7RJ4NpzTB7evSbHN8+UvK82kwKV2ujnnkWKZEnvJx0bFWOAJiJyUQzSJoHksJgCdaqArFK53VgpVOVdXXuGtCm9pXvL0lIwV0ASl5SIiJIKg3QUkydPlly5ckmKFCmkQoUKsnv37iQN1Kn9/j+X7+qJf+TEttn62nPaTHktARp6MEATEbk8Bmkrc+fOlb59+8qQIUNk//79UrJkSalXr57cuHEjSd6/Sb8l8iD0mf5/0OkdcmDlD/L4AeaTtu0l91rxTMzkJnrJQYE27Lkkc9ac1Pd4TGRG7Cd
tBTXncuXKyaRJk/TjyMhIyZ49u/Ts2VMGDRqUqP2kEaAN18/tkb1Lv5Es+StJ6Qa9xcPTy/Jao9dyStcWpexePxHZjtp3M+S/c7gbM8VhghuMn0+UGOIbI1iTtpqLed++fVK7dm3Lc56envrxjh07nls+PDxcf+jWt4RwN+iU7Fv2jWTK86qUqt/LJkBnTe/HAE30kjBDHAJ0zixpJEv6lPoej/E8kdkwSP/PrVu3JCIiQjJlymTzPB4HBwc/t/yoUaP0WZFxQ407IaQJzCX5K7SS0g37iaeX7VgzxfKyLzTRy8IUrqhBe/4vxwP3eIznicyGQTqeBg8erJstjNvly5cTZL1eyXwkf8VW4pXM2+b5FD6eUjRPugR5DyJ3hjnW0cSNSWoA93iM54nMhsOC/k+GDBnEy8tLrl+/bvM8HmfOnPm55ZMnT65vSSGZp0jlktk4qhhRAiiWL4OcuRIiF4Pu21yT5u+LzIg16f/x8fGRsmXLyvr16y3PIXEMjytVqpTo77/suzejfT531jTSsVFRaVevMActIUoASA5DkhjGu8+WIZW+56h9ZFasSVtB96sOHTrIq6++KuXLl5cffvhBHj16JO+9916SvH9MgZqIEj5Q1yyXw9HFIHohBmkrrVu3lps3b8qXX36pk8VKlSolq1atei6ZjIiIKCmwn3QCedl+0kRE5Lrus580ERGRa2GQJiIiMikGaSIiIpNikCYiIjIpBmkiIiKTYpAmIiIyKQZpIiIik2KQJiIiMikGaSIiIpNikCYiIjIpBmkiIiKT4gQbCcQYAh3jsxIREVkzYoO902UwSCeQBw8e6Pvs2bM7uihERGTiWIGJNuKKs2AlkMjISLl27ZqkTp1aPDw8XupsC4H+8uXLLj+bljttq7ttrzttq7ttrztta0JuL0ItAnTWrFnF0zPuV5pZk04g+NBfeeWVBFsfdgZ3+AG427a62/a607a62/a607Ym1PbaU4M2MHGMiIjIpBikiYiITIpB2mSSJ08uQ4YM0feuzp221d2215221d2215221Qzby8QxIiIik2JNmoiIyKQYpImIiEyKQZqIiMikGKRNZvLkyZIrVy5JkSKFVKhQQXbv3i1mNnToUD14i/WtUKFCltcfP34s3bt3l/Tp00uqVKmkRYsWcv36dZt1XLp0SRo1aiR+fn6SMWNG6d+/vzx79sxmmU2bNkmZMmV08ka+fPlk5syZSbJ9//zzjzRp0kQPQIBtW7x4sc3rSOn48ssvJUuWLOLr6yu1a9eW06dP2yxz584dad++ve5j6e/vL506dZKHDx/aLHPo0CGpUqWK/t4xcMKYMWOeK8v8+fP1Z4tlihcvLitWrEjSbe3YseNz33X9+vWdcltHjRol5cqV04MPYZ9r2rSpnDx50maZpNx3E/t3H5ftrV69+nPfb9euXZ1ue6dMmSIlSpSw9GuuVKmSrFy50nm/VySOkTnMmTNH+fj4qOnTp6ujR4+qzp07K39/f3X9+nVlVkOGDFFFixZVQUFBltvNmzctr3ft2lVlz55drV+/Xu3du1dVrFhRvfbaa5bXnz17pooVK6Zq166t/v33X7VixQqVIUMGNXjwYMsy586dU35+fqpv377q2LFjauLEicrLy0utWrUq0bcP5fnss8/UX3/9hQRLtWjRIpvXR48erdKmTasWL16sDh48qN544w2VO3duFRYWZlmmfv36qmTJkmrnzp1qy5YtKl++fKpt27aW1+/du6cyZcqk2rdvr44cOaL+/PNP5evrq6ZOnWpZZtu2bXqbx4wZoz+Dzz//XHl7e6vDhw8n2bZ26NBBb4v1d33nzh2bZZxlW+vVq6dmzJihy3DgwAHVsGFDlSNHDvXw4cMk33eT4ncfl+2tVq2afm/r7xffl7Nt79KlS9Xy5cvVqVOn1MmTJ9Wnn36q9x9suzN+rwzSJlK+fHnVvXt3y+OIiAiVNWtWNWrUKGXmII2DcnRCQkL0j2P+/PmW544
fP64DwI4dO/Rj/AA8PT1VcHCwZZkpU6aoNGnSqPDwcP14wIAB+kTAWuvWrfWBJylFDVyRkZEqc+bMauzYsTbbnDx5ch18AD9g/N2ePXssy6xcuVJ5eHioq1ev6sc//vijCggIsGwvDBw4UBUsWNDyuFWrVqpRo0Y25alQoYL68MMPk2RbjSD95ptvxvg3zrqtcOPGDV32zZs3J/m+64jffdTtNYJ0r169YvwbZ97egIAANW3aNKf8XtncbRJPnjyRffv26eZS66FG8XjHjh1iZmjeRRNpnjx5dFMnmooA2/P06VObbUITZo4cOSzbhHs0Z2bKlMmyTL169fR4uUePHrUsY70OYxlHfy7nz5+X4OBgm7Jh2D80a1lvH5p9X331VcsyWB7f7a5duyzLVK1aVXx8fGy2D82Rd+/eNdVngCY+NP8VLFhQunXrJrdv37a85szbeu/ePX2fLl26JN13HfW7j7q9htmzZ0uGDBmkWLFiMnjwYAkNDbW85ozbGxERIXPmzJFHjx7pZm9n/F45drdJ3Lp1S+9Q1jsG4PGJEyfErBCQcC0GB+2goCAZNmyYvt545MgRHcBwMMaBO+o24TXAfXTbbLwW2zL40YSFhelrwY5glC+6slmXHUHNWrJkyfTB0XqZ3LlzP7cO47WAgIAYPwNjHUkB15+bN2+uy3r27Fn59NNPpUGDBvqg4+Xl5bTbislxevfuLa+//roOTkZZkmLfxYlJUv/uo9teaNeuneTMmVOfcCNvYODAgfrk6a+//nK67T18+LAOyrj+jOvOixYtkiJFisiBAwec7ntlkKaXgoO0AckaCNr4oc+bN89hwZMSR5s2bSz/R00D33fevHl17bpWrVrirJBEhJPKrVu3ijuIaXu7dOli8/0iGRLfK07I8D07k4IFC+qAjBaDBQsWSIcOHWTz5s3ijNjcbRJoYkJtJGqWIR5nzpxZnAXOUAsUKCBnzpzR5UazT0hISIzbhPvottl4LbZlkLnpyBMBo3yxfWe4v3Hjhs3ryBJFFnRCfAaO3DdweQP7Lb5rZ93WHj16yN9//y0bN260mcUuqfbdpP7dx7S90cEJN1h/v86yvT4+PjrjumzZsjqzvWTJkjJ+/Hin/F4ZpE0COxV2qPXr19s0S+Exmm2cBbrb4MwbZ+HYHm9vb5ttQvMZrlkb24R7NE1ZH9zXrl2rd3Y0TxnLWK/DWMbRnwuabfGDsy4bmrtw/dV6+3BAwPUpw4YNG/R3axwEsQy6P+FamfX2oTaA5l+zfgZXrlzR16TxXTvbtiI3DgELzaAoY9Qm+KTad5Pqd/+i7Y0OaqJg/f06y/ZGhfcIDw93zu81HolylEiQso/M4JkzZ+pM2S5duuiUfessQ7Pp16+f2rRpkzp//rzuOoNuC+iugOxRo7sDunps2LBBd3eoVKmSvkXt7lC3bl3dNQRdGAIDA6Pt7tC/f3+diTl58uQk64L14MED3Q0DN/xcvv/+e/3/ixcvWrpg4TtasmSJOnTokM5+jq4LVunSpdWuXbvU1q1bVf78+W26JSHjFN2S3nnnHd1NBPsBtjdqt6RkyZKpb7/9Vn8GyKpP6G5JsW0rXvvkk090Biy+63Xr1qkyZcrobXn8+LHTbWu3bt101znsu9ZdjkJDQy3LJNW+mxS/+xdt75kzZ9Tw4cP1duL7xf6cJ08eVbVqVafb3kGDBumsdWwHfpN4jB4Ga9asccrvlUHaZNDfDjsQ+tchhR/9Tc0M3Q6yZMmiy5stWzb9GD94A4LVRx99pLtAYKdu1qyZPjhYu3DhgmrQoIHuL4sAj8D/9OlTm2U2btyoSpUqpd8HBw/0+UwKeF8ErKg3dEcyumF98cUXOvDgB1mrVi3dN9Pa7du3daBKlSqV7sbx3nvv6aBnDX2sK1eurNeBzxHBP6p58+apAgUK6M8A3T/QFzSpthUHcxy0cLBCwMyZM6fu9xn1gOMs2xrdduJmvV8l5b6b2L/7F23vpUuXdEB
Oly6d/l7Qvx0ByLqftLNs7/vvv6/3T6wb+yt+k0aAdsbvlbNgERERmRSvSRMREZkUgzQREZFJMUgTERGZFIM0ERGRSTFIExERmRSDNBERkUkxSBMREZkUgzQREZFJMUgTvcDQoUOlVKlSji4GEbkhBmkyvY4dO4qHh4e+YXB8TA4wYMAAPVfsi2De1549e+oZm5InTy7Zs2eXJk2aPDc4vhlPAP79919p3bq1nuAAZccUoI0bN5Zly5bpCROcwYULF/T3ZkzWQObEE1Hz4nzS5BTq168vM2bM0LMnYZYlzA+Lg/8333wTa4DAxPaYPnPs2LF6jlz8/erVq/Wcugk5yby9EGQxKXxMlixZIq1atZLatWvLrFmz9LR7mMVn+/bt8vnnn0uVKlWem7g+KWG6P8z0k5Tw3eEkzZW3keg5do/2TZTEMMEDZpey1rx5cz3bUmwwQD4mcHj48OFzr929e9fyf8zy9MYbb6iUKVOq1KlTq7feestm4gjMwlSyZEn166+/6oH7MXEEJhK5f/++ZRnMBNWzZ089oD8mKHj99dfV7t27n5u8YsWKFXr2KExSgQH5o5vwAOVNnz69Hvg/JpjYw4DZoTD7FMqfMWNG9fbbb6ubN29aXq9WrZouGyZMwKQCmAwE2xT18+jUqZOeTACfQY0aNfQMQFE/g59//lnlypVLzyoEK1eu1NuKGZYwOUOjRo1sJliJun0oC0RERKhhw4bp7weTD2DdWJcBMxhhecwkhIkf8Jkm9qQq+G4xExRm6MJnYEyismDBAlWkSBFdTiyD2bmsJ1DABCCGRYsW6XJPmTLF8hwmePjss89ifN/Lly+rNm3aWCZ8KFu2rM1EDD/++KOewAH7DCYdwX4Y9XPCbGXW3yWewz5nve9h5jKsG5NGYNanEydO6Ndj2g/JHBikyemCNIJS5syZVYUKFWL8G8zGhEAycuTIWNeNYIGZbDArE6atw8ERBzIjmBgBCrM64cQA7/3PP//o9//0008ty3z88ccqa9asOggfPXpUlxkHXZTD+kBZokQJPSMPAtmVK1f07Do4yFtPHfjXX3/pZTEt5IvggGxMo4cp8/bv36/q1Kmjg6wB24ITi6FDh6pTp06pWbNm2UzdB5hitEmTJmrPnj16GZQLJwpG+fEZ4CQAJwN4D8xkZQSwhQsXqtOnT+tAgXUUL15cf66AExUjQGD7jPVhGkyU6c8//9TBYsCAAToI4b2tgw9OCLB+TA147do1lZiMEzAEYXw/uGGf8PT01MEbs5sheCHIGUEMUyHiszSmZu3du7c+0cFJHDx58kQH3rVr10b7npghDAG4SpUqasuWLfpznDt3rtq+fbt+HfsCPhdMhYj3/+677/SUiJhm0d4gjd8LpqrE/on3e+211/Tr2Oei2w/JHBikyfQQ8HBgQpBAjQoHHBw4ESBigvmMsRwOcrFBoMK6MVWfAQcx/K1RE0aAwoHWuuaMWqlxkoCaLw6ks2fPtryOgzOC9pgxY2wOlIsXL7Z5f6OGag1TN2LZO3fuWJ5DWbD9xm3ZsmX6+a+++kpPIRm1Zoa/N6bMRJDGSYi1cuXKqYEDB+r/IzggOFnPCw158+a1zPNszOlsBKOYoAaP9zbmfo4uiAA+mxEjRjxXJkwhaP13P/zwg0oqCNJNmza1ea5du3b6pMcavnvUrI0WDZzMzJ8/Xz/GCd+oUaP0SRxgTm18bo8ePYr2PfH5otZunLxEhUCKKUGtoaWnYcOG8apJGzD1J54z5j2Pbj8kc2DiGDmFGjVq6OSjXbt26evR7733nrRo0SLG5eOaWHX8+HGdTIaboUiRIvp6L14z5MqVS1KnTm15jGSuGzdu6P+fPXtWXy/F9W8Drp2WL1/eZh3w6quvSnyUKFFCbz9ujx49kmfPnunnDx48KBs3bpRUqVJZboUKFbKUy/rvrVmXH+t4+PChpE+f3mY958+ft1kHEtcCAwNt1nP69Glp27atTsxLkyaN/pzg0qV
LMW7L/fv35dq1azafF+CxvZ9X165dbcr8olvRokVjXV/U90N5oisnths5BciLqFq1qmzatElCQkLk2LFj8tFHH+n8AeQ8bN68WcqVKyd+fn7Rvh++z9KlS0u6dOmifT2m94/6OcWF9T6A7x+MfYDMi4lj5BRSpkypk6dg+vTpUrJkSfnll1+kU6dO0S6fP39+fQBNqOSwqAlLWHdkZGS8tuNFUHY4efKkVKxYUf8f2d3G9ltDcEW2enQJdMaB+EXlxzqwLAJNVNbJadGVHe+N4P3zzz9L1qxZ9TqLFSumk64Swos+r+HDh8snn3wS5/W9KPEsLt9PVNWrV5effvpJtmzZogMuTlaMwI0gXa1atRj/1tfXV16Gp6fncyelOGF80bbj+4f47MOUtFiTJqeDA9Onn36qs5zDwsKiXQY1k3r16snkyZN1zTMq1HqgcOHCcvnyZX0zoDaE11Gjjou8efPqLOBt27bZHCj37NnzwnXg76JmedetW1eXP7bMdUOZMmXk6NGjugaLIG59i2vAwTrQVS1ZsmTPrSNDhgwx/t3t27f1iQS+h1q1aunP8u7du89tH1hvI4IYArr15wV4HNfP3JAxY8bnyhzbDScU9sA2RVfOAgUKiJeXl36MIIx9Zv78+TpgA+7XrVunlzWei62F5M6dO3a9v/E5GS0bQUFBltfj090tuv2QzIFBmpzSW2+9pQ+SCMIxwWs48KDZeeHChbqJEs2EEyZMkEqVKull0MUJXbPat28v+/fvl927d8u7776rD7xxbZpGMOzWrZv0799fVq1apQ/YnTt3ltDQ0Bhr+gYEVzQr48B669Yt3UyKZtlp06bJ8uXLpVGjRrrL2Llz5+TQoUMyZswY/XdGgEBXMhzg0eSMkwI0T2N5XA6I60EXnwE+j6ZNm8qaNWt01zV09frss89k7969Mf5dQECAbiJHLfLMmTOyYcMG6du373NBFLVFfC7Xr1+Xe/fu6efxWeEkZO7cuTrQDxo0SH8GvXr1EjPp16+f7lP/1VdfyalTp3R3uEmTJtnU3hFo8Vn88ccfNkF68eLF+vuM2lxtDd9b5syZ9WeP4IvvGfvqjh07LJ/TzJkzZcqUKXr//f777+Wvv/6yvD8+W7S2jB49Wu/bqLnjpMle0e2HZBKOvihOFJ8uWIAEHWQ2R9fFyoCM4O7du+ukIHShQZcfdLcykmrs6YJlbdy4cXqdBiTgoJsTMntj64Jl3fULkKzVokUL5e/v/1zXF2Rat2zZUnerSpYsmU5Qqlevnu6WZN0FCxnR6K6FdSDzuFChQjrL2FgGiWO9evWyeV98nkYXI0BSHMqPhC4kOmXPnl21b9/eklAXU2IRspYLFy6stxmZ68gexnagK5IB3bawPiT7WXfBQrY5vg+8X0xdsKImnCUmfJ/4XqMyumChnDly5FBjx459bhl8nviOkK1tbB+y+ytWrPjC971w4YLeB5C8hwTFV199VSc+xqULFhw7dkx3qcJ3j8Q1JENGlzhmve/hc8Vz+JxftB+SY+nOjo4+USAiIqLnsbmbiIjIpBikiYiITIpBmoiIyKQYpImIiEyKQZqIiMikGKSJiIhMikGaiIjIpBikiYiITIpBmoiIyKQYpImIiEyKQZqIiMikGKSJiIjEnP4PCGHl+phN7XAAAAAASUVORK5CYII=", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Off-diagonal points (mismatch): 42\n" - ] - } - ], - "source": [ - "merged = (\n", - " r_results[[\"cohortId\", \"row_count\"]]\n", - " .rename(columns={\"row_count\": \"r_rows\"})\n", - " .merge(py_results[[\"cohortId\", \"row_count\"]].rename(columns={\"row_count\": \"py_rows\"}), on=\"cohortId\")\n", - ")\n", - "\n", - "fig, ax = plt.subplots(figsize=(5, 5))\n", - "ax.scatter(merged[\"r_rows\"], merged[\"py_rows\"], alpha=0.5, s=15, color=\"#4C72B0\")\n", - "lim = max(merged[[\"r_rows\", \"py_rows\"]].max()) * 1.05\n", - "ax.plot([0, lim], [0, lim], \"k--\", linewidth=1, label=\"y = x (perfect agreement)\")\n", - "ax.set_xlabel(\"R CohortGenerator — row count\", fontsize=10)\n", - "ax.set_ylabel(\"CircePy (Ibis) — row count\", fontsize=10)\n", - "ax.set_title(\"Cohort table row counts: R vs Python\", fontsize=11)\n", - "ax.legend(fontsize=9)\n", - "plt.tight_layout()\n", - "plt.savefig(OUTPUT_DIR / \"figure2_row_count_agreement.pdf\", bbox_inches=\"tight\")\n", - "plt.show()\n", - "\n", - "off_diagonal = merged[merged[\"r_rows\"] != merged[\"py_rows\"]]\n", - "print(f\"Off-diagonal points (mismatch): {len(off_diagonal)}\")" - ] - }, - { - "cell_type": "markdown", - "id": "b118ea5561624da68c537baed56e602f", - "metadata": {}, - "source": [ - "## Figure 3 — Incremental (checksum) speedup" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "938c804e27f84196a10c8828c723f798", - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAkwAAAGGCAYAAACJ/96MAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8ekN5oAAAACXBIWXMAAA9hAAAPYQGoP6dpAABRT0lEQVR4nO3dCdxM5f//8Y99C1myZomKyJZKZGsjyRJt9kq00GJLSkJFKaGFoqJEtIhosS/ZJRJKkaXFmi1ky/wf7+v7P/ObmXuZe5gbY17Px2O4Z+bMOdc5c+bMZ67rc11XGp/P5zMAAAAkKW3STwEAAEAImAAAAMIgYAIAAAiDgAkAACAMAiYAAIAwCJgAAADCIGACAAAIg4AJAAAgDAImAACAMAiYkEDv3r3tvPPOO9PFOGvNmTPH+vXrd1Kv3bRpk6VJk8Y+/fRTi0eDBw+2r776KsHjxYsXt44dO9rZ7HS/d7H8Obznnnvs8ssvt1g4H/WenopRo0a5dezateuMvZe1a9e2W2+9NdXWj/9J////BxBBwPTKK6/YU089daaLEnP0BaUL+y233BL0+Oeff265cuU6Y+UCUtP9999v9evXT7X1Dx061NKlS5dq68f/EDDhjPr3338tS5YsZ7oYOEmaivLo0aOWKVOmU1pPpUqVolYm4Gxz4YUXultqKVOmTKqtG/+HJjmkuCniww8/dM0mqgkoWLCgde3a1Y4fPx607E8//WRNmjSx3LlzW9asWa1ChQr20Ucf+Z/Xel588UXr3r27FShQwPLly+f/4lWtzaWXXuq+fEuUKGGDBg1KtFp7xYoVVrVqVRdoXXHFFe7+4cOH7aGHHnJl04VJNRmhFi1aZNdff71ly5bNcubMac2bN7cdO3ZEtJ8qQ58+fezgwYNuWd1UHS4///yz3X333VakSBG377qIDRw40E6cOBHxMd+3b5+1bNnSsmfP7o6RarO0rtDmg71799rDDz/syqnjVrlyZZs2bVqi1fVqSipVqpQ7hjoOGzZsCFruyJEjbjvFihVz67rsssts7NixiTa1qFlN762Wmzx5sjseOmZav/ZdTWwPPvig2w+PHtu8ebO9+eab/mOn5oykmuQmTJhgFStWtMyZM1uhQoWsc+fO7n0OrOnTOqZPn+7eSx0rlX3AgAF2snSO1KlTx3LkyOHWV6VKFbf+QCpDSj4HjRo1cueZzjfVLoQeb50Xr776qjvOOo76PNxxxx1BxyxU37593fH1mjXXrFnjauvy5MnjHtfxD9z/xJpqVq5c6Y6bjp9H91966SV7+umn3fl2/vnn2xNPPOE+lzNnznTvg86bG264wX7//fcUH8+vv/7anS96D3VuLl68OOj5Dz74wKpXr+6uFzqeKu/SpUuDlvnjjz/szjvvtPz587v1XHTRRdapU6eIj/f+/futdevW7n294IIL3P6Fvm+J0WesXbt2VrhwYbd9fb71OU/OyJEjLWPGjPbuu+8m2iTnnbt6H3W9VJl1LoU29XuvW7ZsmV199dVu+zpfpkyZErRc6Pvsve7HH390x1fnht6HqVOnBr1OP3YeffRRd/z1nj/wwAPuM6+y6XqIED4gxLPPPuvLli2b//7GjRt9OlWKFi3qe+SRR3zTpk3z9e7d2z02bNgw/3K//PKLL2fOnL7LL7/cN3r0aN/06dN9gwYN8r344ov+ZfSaAgUK+Bo3buybMmWKb+LEie5xrTdLliy+559/3r2uT58+vgwZMgStX+XKmDGjr1y5cr733nvP9+WXX7ptFS9e3NeiRQtfp06dXNk6dOjgtrNgwQL/axcuXOheq+1OnjzZN27cON/FF1/su+aaayLaz99//93Xtm1bV9ZFixa525o1a9xzM2bM8PXq1cv3xRdf+GbPnu32PUeOHG4dodv45JNPkn0PbrvtNncshw4d6vZT5S5SpIh7refIkSO+K6+80j3+7rvv+r755htfy5YtfenTp/etWrXKv1ytWrV8hQsX9l1
99dW+CRMmuG3rNYH7Lg0bNvTlzp3bN2TIELfvjz/+uC9NmjS+r776yr9MmzZtfLly5fKVLFnSN3LkSN/MmTN9v/76q2/Hjh2+Bx980K17zpw57v0vXbq0r3bt2v7Xfv/99+69v/322/3HTq+TYsWKuffNM2nSJLftZs2a+b7++mt3LHXMmzZt6l9Gx1jHo0SJEu7c0Hnjvfd6TehyKm9y5s+f786RmjVr+saPH++bOnWq74UXXvC98847KT4/ZMOGDb7zzz/fV716dXe8dY5fddVVbh8PHz7sX+7hhx/2pUuXzte1a1e3rU8//dR37733+v74449EP4da7rzzznP747nooovc+/j555/7Zs2a5Rs+fLg7BwPf+/r16wft54oVK1yZA9ej+xdeeKE7f3Qe6fOnxzp37uw+b/q8aF+0zE033eQLR+eJziV9NkeNGuXez6pVq7rPw/bt2/3LaTtvv/22++zoPGvVqpUvU6ZMvnXr1vmXue6663ylSpVyZVCZ33//fXf8Iz3eOnd0/N544w33mdJx0eci3Neg3hOdt9oPndsfffSRr3Xr1v7ndV5pHTt37nT3X3vtNXceaTlP6HvpnZPavt5XHXP9H3ouedc8vc9euW+99dZEP+OB73PgtXLEiBFu/TqOKsOuXbv8y+maqeV0jdYy999/v/86o/MdwQiYkOKA6Y477ghaTh/SG264wX+/efPmvgsuuMC3b9++JNet9ZQpU8Z34sQJ/2Pr1693X466cAbq3r27u1D9999//nLp9YFf4Ap+9Nhdd93lf+z48eO+fPnyuS98j74Eq1WrFrRdBTrari5Ckexn6PFJjLZz7Ngx94VbsGDBiAImlUvLfPDBB/7HdAwuueSSoIu7gkZdOL2AzVOlSpWgfVD5VV4vOAm8yCsAFH3Z6r6+uAPpuOrLJ/CLUMstXrw42f3XvisA0bKBX36hgVFSj1eqVMl9wQbS+aH1eV8U3pdOt27dgo67vqQV1Hr0JafARF+0ydH5oXNT509iUnp+6MtUQdy///7rf0zHXl/Wb775pruvY6Jzr1+/fkmWxzvPtE8KRhWoBh53fUGrPArQkxJJwKSAOlDlypVdGdeuXet/7PXXX3fL7tmzx5cc7zxRQO3Zu3evL3v27L4nn3wy0dfoHNd5o+CoR48e/sd1DBSEJCUlx9v7rOuHhUfvswKRcAFT2bJlXeCYlMCASe+nAj4FiIGSCpgUIAbSfQVRode8xMp99913Jxsw6XXetS3w/NWPGfn77799mTNn9vXt2zeoDDqXCZgSR5McUkxNFYHU5KTqco+q7m+//XbXnJGcevXqBTUtzZgxw/3ftGlTV0Xu3W688Ubbtm1bUBNA2rRpXbOAR014omU9Sn4sWbKk/3WHDh2yBQsWuOaO//77z79+vVbV66rujmQ/k6KmmmeffdYuvvhi18SSIUMG18SxdetWO3DggKWUV56GDRsG7XeDBg2CllPTW7ly5dx+BB63m266KcE+qUlFzRCB+yTefmldqpZXU13outTkqePmUfOPmqpCjR492uUiqSlA+66mAPnll18sEjpWajbSuRTorrvucv/Pnz8/yfdL55WaLALfr1q1arl9UXNMUnSOqLmoTZs2YZNnw50fOpZ679KnT+8/jmpu0rHx3pdZs2a55q62bdsmuy0to3KreXL27NlBx13vg5oge/ToYe+//36KztHk6L0OpPNKTaE6noGPibetwM+Tbv+Lvf5HzWM6nwLv63O6ZMmSoKa02267zTW36bjrvFm3bl3QOaNmdzXXDxs2zNavX5+g3Ck53vpfZdO2PNpe48aNwx4XbV9NxyrD6tWrk1xOn/UXXnjBNZcFfnaTE1ge0Tn/559/JngvEyt34HFMjK4ZgddFNXsrjcFbt5rrdM0KLauaNpE4AiakmNq4A6mNPjCn5O+//3YX2HB0cQyk7ri6mOXNm9ddML2bdwEPDJj0gdd2A8sQrmx79uxxF3blPQSuX7c
tW7YkyMkIt59JUV7Wyy+/7PIdlJugi3TPnj3dcyl5vUcBlsqmL5hAXr5X4HFTMBO6T88//3yK9imwXFrX7t27E6xLvXv0BaQyJfX+eb3c9MWuPIuPP/7YBR96LNJ993JGdD6EbkfHQ4Goyhlu3yLdps4R5RSl5PwNtz0dS+XQhR7Lb7/91v++6LOiL/jQ9zSUcky++OILF3wqOA6k4FDBggKaDh06uOD/yiuvtHnz5kW078ntV7jzRj9eAvdx7ty5/mUDA3SP3lPvXPrnn39c8Km8NuVy6fjoM6PcuMDjOX78eLcdBSSXXHKJlS5d2gWQkRxv7zMV2hMzsXM51Ouvv26tWrVyOYR6D4oWLeqCt1DKEdTz3g+FlAh9/73yBH7ekip34DKJCb1Whp6r3utD36dw52Q8o5ccoka/eP/666+wy4UmLqtmQ4+p5iD0Ay5KZD0Vuuhr/UpoTuwXpQK1aPjkk09c0qQCJ8+XX34Z8XqU/Hns2DGX/BsYNAUmqHvHrXz58v7E0lOhdenCmdgYSaEX0cTGrdG+qxbr7bff9j8W+OV5Mu9X6P7qeCgxXWWNNm1Tv8hTcv6Go/Ip6VjJ+KGUcOx9VhSIah+T+4JSgKhz6Oabb3adGgKPr1fjo2Ov82XhwoXuHFdNpGopVNOnJGEFXaHBYbSoPAp8Evus7ty5M8Hy27dvd+e3l2Cv2g7VyChICnyfA3uUafn33nvP3nnnHVu+fLn7QaDaRtVEqXNISo6395nSvgcGHypPOPoMKiDTTbUyQ4YMcdtSEnWNGjX8yymwVQK3asonTpzoAp1wQs9xrzzeMZKkyh24zMnwXq/3KfCHQmiZ8H+oYULUqPpXv7ICL6Ap4TWx6Ve3fiGH3ryL3slSDxT1qlP1f2LrV1V1JBTU6Ys7sSESAgM+1WqNGzcu4vKqTDJp0iT/Y6r9UG+00OP922+/uYtdYvsVCa1LF06VP7F1JRbIJrfvMmbMmATLpaT2R1/0Cr5CB4hUzZVE8gs+0nNEvbYCmx9Pho6lmm7UJBR6HL2AQk1VCgrVmyoc7a++jFW2xx9/PNFl9OWspscnn3zS9QbzAj8FHgosApvKQntRngrtT1KfVQU+anoMvK/md69ZUeeMBJ43CvqS6p2lgPaqq65yAZOCTa95LiXHW68Tr9ZT9D4rsImEapC83ru6noQeC+2fmsqaNWuWovMosDyic16f59AhCBIrd2LN4pHwei8GXmck0mMST6hhQtQof0e/FnWBV5dd/YJZu3atyw/R/aToV7KaFFTt3a1bN3ch0K8q5TEobyMaH2A1lelLSr9M1SVYv9b061Zdxu+9917/0AApoSYQXbD1S7NatWouZ0sXSzUhjhgxwuW0qNZKg8klFliFU7ZsWZezoO6+OnbKUxk+fLj7ggms3VETmH7hq+zq2q7jqOYsNdOpVqF///4p3qbKrpoJ1WTovVLNlYYKULd1fTHp13241+s9fO6551zgoZoq5bQlduz0JarjrvdAXcRV2xJK3aJVG6ihFXTTl75qT/TrPbRpKhzVdCkoVy1FcnlMGu5C54i+gFWDoPJ9//337r287777Urw9DTuhL+i6deta+/btXfOJcvFUDtVI6MtU75WGXVCTrZoYVT6916pN0r6rC3sglUvNUDom6iKu7uerVq2yLl26uHNaOXsKSPSe6weA7ns5MaqBfOSRR9xrFZCcrpHKVfOjHC0dD9Xg6fgqcPOCvmuuucYFxzpvFOipVkzXkMB91z7pOOraoM+Yzms1kWl9yi1K6fHWZ1KfKW1bAbuOkT6fobVvibn22mvdaxVgKH9IgauCvMDaJY/OTQWker+UD6dlFeglRZ8FXfP0+dFnQnmAGnYj8DXaloJElVufF5VbTY2nel3U5061lsq7UuCkHymqrfTyx5Ird9xKIhkccSypXnKhPbsee+wx17spkHqjqHu6ug9nzZrVV7FiRdcd2KP1vPzyywm2qZ5A6oGjYQLUzVVdktV
L6tVXX02yXMmVLbHeQcuWLfPdcsstrru+uqir15l6H3k9xVK6n+rJoy7h+fPndz1vtC3Ztm2b6/6vnkB6Tr381KU3sMtxSocVUC8kDZWg/c2TJ4/rpdOzZ0/XfTqQeiSqa7C6umsYBvXI0z5qyIZIe0ppmAJ189Zx0XugHo/qihzYW0+9n9RrKJR67nTp0sW9RvuvoQPUoyt0X1evXu2rUaOGWyawq39ivefUzb58+fKuLOotqV6PgT2hvJ5Gel8DNWrUyP+eRDKsgGgoCu2zzl2VUV321eU90s+Bhti488473XunXlPquafeXNp/j3pCDRgwwB1vvXfaR/VK9HqZJna+a/gA9YxUzyZ1z9cwAOohpm2oZ6i6zmvbgbQNdRXXutTDT/uTWC+50M9lYu91Usc8lPdanYeXXXaZew/V8zFwqA/R8A9aTr219F6rB2zg+aphAdTVXT3n9JnVdaFOnTq+pUuXRny8E/tMaZ/DfQ2qF6a656vXna5r1157bVBv0tBhBURDZmh5lV3XtqR6yen46Hqp803XjOeeey5o297r9FlSr0UdRx2L0F54ifWSS6wnr659ei7wM9+xY0d3XdG+6X3T8AUqm3o1Ilga/XOmgzYA4dWsWdP9wlWtG4DYpYErr7vuOpfknlzzuWob1Tsvkl62p0q1econ3bhx42nbZqygSQ44C3322WeuB5+q+NVUo9F31esnNOcBAE6Wmi015IpGYVeepFIqlHuoXotIiIAJOAspt0P5DL/++qvLs1BXak3ZkpJxYwAgpdcZBUmaFkc5ksqRUrCUVOeCeEeTHAAAQBikwQMAAIRBwAQAABAGARMAAEAYJH2ngHoPaORcjWKb2LQQ5zr1onjttdfchKgaEE69KG699dagofQ14JwGYdNAcxrMUQNFeoPniUakfuaZZ9x0CEpi1kB9WiZwWggNKKmpBzTitAam04CMGpQuJVMAKBVPg/RppN3Q8gEAkNz3h2ao0CjryQ3YSdJ3CmhEaE1sCQAAzk0aQT10WppA1DClgDc/kg6mpsGIZ5qIMrAGR9NmaAwPzU6vaS+8GjnNKt6rVy83PYCmyFDtj2Yl946faqI05YfGFdIAbonR9BrNmzd3NU7JTWSpKSI0PYQGg9OUE4Hl06SVmnpAtV+a6kO/IDSdhKbbAABg//79rlIk3LylBEwp4DXD6cs+3gMm0VxW3nHwAhnNtxV4bDQ3kWYW1xxW6dOnd8fwggsucLOvi/5X1afmPWvUqFGCbWh+LQVTat5LbK4xjwZ11PxRml9JQVpo+TT/mMYy+uabb1wZFeBpvBHeRwBAoHApNyR945RoQMWiRYtajx49XG2O8pM0CJqaMbdu3eqfZFOzwXfv3t0FOKrp0WSxmnXbW8ajZbSsgiSNdB06k3aoTp06uaAqsaBLtA5vFnNNuKmJVTXJLAAAkSBgwilRDZNmUdcM15qdXLU7muusXr16/uQ51SxpFuzJkye7kWXVrLd3714323hogp2az1TrpBm/NW+aZpdPKs3uiy++cE1tgwcPTrJ8mo173LhxbibuJ554ws3WDgBApGiSwylTDpN60CkvSTVMCpCqVKkSNKlknTp1bMOGDbZr1y7XRKdecAUKFLASJUoErUvNZropF0k5UWpXVn5U1apVE2xXwZLWqXUFatq0qdWoUcPlNClwU+6U8qGmT5/ueud16NDBTWgJAEBKUcOEqFHNkYIl5Qx99913iTaTKRhSgKNgR8MRNGzYMMn1KXlcjhw5kujzTz75pEv4VrDm3WTQoEE2cuRI/3Iqk5LPNRebaqOGDx8ehb0FAMQTapgQ1oEDB1yytGfjxo0uOFETnPKX1NymoER/axylxx57zE0Sq1oljwIY1RhpOY3FpGWUf1SqVCn3/JIlS2zZsmVWvXp1y5Url6s50rhNGsvJq136888/XQ3RBx98YFdffbWrodItlMqhSSRFPfVUA1a2bFkXeGmiSa83HwAAKUXAhLBUWxT
Y9b9z587uf9XajBo1yiVu67Ht27e7QSaVd6RgJ9C6detcYrh6vyn5+umnn3YBk0e5T8qF0gCYSgrXem6++Wbr2bOnv2fdsWPH3HqUOJ5SGTNmdNvdtGmTZcmSxTXVKacJAIBIMHBlCsdoUHOTcnTojg4AQPx9x5/RHKb+/fvbVVdd5QaL0hQZasZRDUKgw4cPuyRddTNXDysl9KomI7TreP369V0thdajnlbHjx8PWkYJwOqVpdqKiy++2NWMAAAApMQZDZjmzp3rgiH1glIPJjW5KO9FTTIeNduoO7ryZLS85nRr0qSJ/3mN5aNgSb2z1GX8/fffd8GQclcCc260jJqVlHvz+OOP2/33329Tp0497fsMAABiz1nVJKcpMFRDpMCoZs2arnpMScJjx451U2vIzz//7JJ2lTisARG//vprNw2GAqn8+fO7Zd566y03AKLWpxwW/f3ll1/a6tWrgyZ61VhAGgE6HJrkAAA4N8VEk1woFVbU+0o0tYZqnTQ6c+jI0gqYRP+XK1fOHyxJ3bp13QFYs2aNf5nAdXjLeOsIpd5Uen3gDQAAxK+zJmDSmDtqKrv22mvt8ssvd49t27bN1RCFDkyo4EjPecsEBkve895zyS2jQEjziiWWW6Vo07tp8EQAABC/zpphBZTLpCaz+fPnn+miuG7oXtf5wJmMU0uDLsnPl4b4MXlg4nPiAQDOrLMiYOrYsaMbUHDevHl24YUX+h/XoIRK5lauUWAtk3rJeQMW6v+lS5cGrc/rRRe4TGjPOt1XW6XG5gmlnnTe2D8AAABntElO+eYKlj7//HM3VYY3OrNHIzRrcteZM2f6H9OwAxpGwBv9Wf9rdGlNs+FRjzsFQ2XKlPEvE7gOb5nE5icDAAA4q2qY1AynHnCTJk1yYzF5OUfKG1LNj/5v27atax5TIriCoEceecQFOuohJxqGQIFRq1atbMCAAW4dGh1a6/ZqiR588EF744033Gz19913nwvOPv74Y9dzDgAA4KyuYRo2bJjrGVe7dm03FYZ3Gz9+vH8ZTaSqYQM0YKWGGlDzmqbQ8KRLl8415+l/BVItW7Z0U3P07dvXv4xqrhQcqVapQoUKNnDgQHvnnXdcTzkAAICYGofpbJXa4zCR9A0PSd8AcHrF5DhMAAAAZyMCJgAAgDAImAAAAMIgYAIAAAiDgAkAACAMAiYAAIAwCJgAAADCIGACAAAIg4AJAAAgDAImAACAMAiYAAAAwiBgAgAACIOACQAAIAwCJgAAgDAImAAAAMIgYAIAAAiDgAkAACAMAiYAAIAwCJgAAADCIGACAAAIg4AJAAAgDAImAACAMAiYAAAAwiBgAgAACIOACQAAIAwCJgAAgLM5YJo3b541aNDAChUqZGnSpLGJEycGPa/HEru9/PLL/mWKFy+e4PkXX3wxaD2rVq2yGjVqWObMma1IkSI2YMCA07aPAAAg9p3RgOngwYNWoUIFe/PNNxN9fuvWrUG39957zwVETZs2DVqub9++Qcs98sgj/uf2799vderUsWLFitny5ctdsNW7d28bPnx4qu8fAAA4N6Q/kxuvV6+euyWlQIECQfcnTZpk1113nZUoUSLo8ezZsydY1jNmzBg7evSoC7YyZsxoZcuWtZUrV9qrr75q7du3j9KeAACAc1nM5DBt377dvvzyS2vbtm2C59QElydPHqtUqZKrQTp+/Lj/uUWLFlnNmjVdsOSpW7eurVu3zvbs2ZPoto4cOeJqpgJvAAAgfp3RGqZIvP/++64mqUmTJkGPP/roo3bFFVdY7ty5beHChdajRw/XLKcaJNm2bZtddNFFQa/Jnz+//7lcuXIl2Fb//v2tT58+qbo/AAAgdsRMwKQmtRYtWrjE7UCdO3f2/12+fHlXk/TAAw+4oCdTpkwntS0FXYHrVQ2TksUBAEB8iomA6dtvv3VNaOPHjw+7bJUqVVyT3KZNm6xUqVIut0nNeYG8+0nlPSnQOtlgCwAAnHtiIof
p3XfftcqVK7sedeEooTtt2rSWL18+d79q1apu+IJjx475l5k+fboLphJrjgMAADirAqYDBw64AEc32bhxo/t7y5YtQc1hn3zyid1///0JXq+E7sGDB9sPP/xgv/32m+sR16lTJ2vZsqU/GGrevLlrplOy+Jo1a1wt1ZAhQ4Ka3AAAAM7aJrnvvvvODRPg8YKYNm3a2KhRo9zf48aNM5/PZ82aNUvwejWb6XmNq6SebUruVsAUGAzlzJnTpk2bZh06dHC1VHnz5rVevXoxpAAAAEixND5FI0iWarkUeO3bt89y5MgR9fU36DIp6utEbJo8sNGZLgIAxJX9KfyOj4kcJgAAgDOJgAkAACAMAiYAAIAwCJgAAADCIGACAAAIg4AJAAAgDAImAACAMAiYAAAAwiBgAgAACIOACQAAIAwCJgAAgDAImAAAAMIgYAIAAAiDgAkAACCM9BahjRs32rfffmubN2+2Q4cO2QUXXGCVKlWyqlWrWubMmSNdHQAAwLkTMI0ZM8aGDBli3333neXPn98KFSpkWbJksd27d9uGDRtcsNSiRQvr3r27FStWLHVLDQAAcLYFTKpBypgxo91zzz322WefWZEiRYKeP3LkiC1atMjGjRtnV155pQ0dOtTuuOOO1CozAADA2Rcwvfjii1a3bt0kn8+UKZPVrl3b3V544QXbtGlTNMsIAABw9gdMyQVLofLkyeNuAAAAcdtL7vvvv7cff/zRf3/SpEnWuHFje+qpp+zo0aPRLh8AAEDsBUwPPPCA/fLLL+7v3377ze6++27LmjWrffLJJ/bEE0+kRhkBAABiK2BSsFSxYkX3t4KkmjVr2tixY23UqFEuIRwAAMDiPWDy+Xx24sQJ9/eMGTPslltucX+r59yuXbuiX0IAAIBYC5g0bMDzzz9vo0ePtrlz51r9+vX9A1pqfCYAAACL94Bp8ODBLvG7Y8eO9vTTT9vFF1/sHv/000+tWrVqqVFGAACA2JoapXz58kG95Dwvv/yypUuXLlrlAgAAiK0aJuUthaOpUTJkyBDRxufNm2cNGjRw06ykSZPGJk6cGPS8RhbX44G3m2++OWgZTc2iKVly5Mhh559/vrVt29YOHDgQtMyqVausRo0arozKtRowYEBE5QQAAPEtRQFT2bJl3bQn4cZZ+vXXX+2hhx5yI4OnxMGDB61ChQr25ptvJrmMAqStW7f6bx999FHQ8wqW1qxZY9OnT7cpU6a4IKx9+/b+5/fv32916tRx89stX77c1YT17t3bhg8fnqIyAgAApKhJ7vXXX3eT6j788MN20003ucRv1QqpxmbPnj22du1amz9/vgtclNukoCkl6tWr527J0bQrBQoUSPS5n376yb755htbtmyZK5NXVvXce+WVV1wZNWmwAr333nvPzYen4G/lypX26quvBgVWAAAApxQw3XDDDfbdd9+5oGj8+PEuCNm8ebP9+++/ljdvXjc5b+vWrV1tT65cuSya5syZY/ny5XPrvf76610PPW/qFU34q2Y4L1iSG2+80dKmTWtLliyx2267zS2jsaIULAVO9fLSSy+5YC+x8moyYd0Ca6kAAED8iijpu3r16u52uqg5rkmTJnbRRRfZhg0b3PQrqpFSEKQE823btrlgKlD69Oktd+7c7jnR/3p9IG/4Az2XWMDUv39/69OnT6ruGwAAOId7yZ1OmnbFU65cOddDr2TJkq7WSbVeqaVHjx7WuXPnoBomJYsDAID4FPE4TGdSiRIlXBPg+vXr3X3lNu3YsSNomePHj7uec17ek/7fvn170DLe/aRyo5Q3pV53gTcAABC/Yipg+uOPP+zvv/+2ggULuvtVq1a1vXv3ut5vnlmzZrmpW6pUqeJfRj3njh075l9GPepKlSoV9XwrAABwbjqjAZPGS1KPNd286VX095YtW9xz3bp1s8WLF9umTZts5syZ1qhRIzeyuJK25bLLLnN5Tu3atbOlS5faggULXC89NeWph5w0b97
cJXxrfCb14lPS+pAhQ4Ka3AAAAM7agEk979TDTjdREKO/e/Xq5ZK6NeBkw4YN7dJLL3UBT+XKle3bb791TWYe9dgrXbq0y2nScAJKSg8cYylnzpw2bdo0F4zp9V26dHHrZ0gBAACQUml8KRnGO4R6rI0cOdL9r9oa9VT7+uuvrWjRom6co3ONkr4VeO3bty9V8pkadJkU9XUiNk0e2OhMFwEA4sr+FH7HR1zDNHfuXNdjTeMcTZgwwT8NyQ8//GDPPvvsqZUaAADgLBRxwPTkk0+6wSOVOB04GKQGlVS+EQAAgMV7wPTjjz+6EbRDqVlu165d0SoXAABA7AZMmopEk+CGWrFihRUuXDha5QIAAIjdgEld9jURr6YVSZMmjRvzSN35u3bt6uaTAwAAsHgPmPr16+e68WuqECV8lylTxk1uW61aNevZs2fqlBIAACCW5pJToveIESPsmWeesdWrV7ugSWMnXXLJJalTQgAAgFidfFdjLukGAABwros4YNI4l59++qnNnj3bTXyrHKZAGpsJAAAgrgOmxx9/3N5++2277rrrLH/+/C7xGwAA4FwWccA0evRoV4ukedsAAADiQcS95DTfSokSJVKnNAAAAOdCwNS7d2/r06eP/fvvv6lTIgAAgFhvkrvzzjvto48+clOhFC9e3DJkyBD0/Pfffx/N8gEAAMRewNSmTRtbvny5tWzZkqRvAAAQFyIOmL788kubOnWqVa9ePXVKBAAAEOs5TJoSJUeOHKlTGgAAgHMhYBo4cKA98cQTtmnTptQpEQAAQKw3ySl36dChQ1ayZEnLmjVrgqTv3bt3R7N8AAAAsRcwDR48OHVKAgAAcC71kgMAAIgnKQqY9u/f70/01t/JISEcAADEZcCUK1cu27p1qxus8vzzz0907CWfz+ce/++//1KjnAAAAGd3wDRr1izLnTu3+3v27NmpXSYAAIDYC5hq1arl//uiiy5yYzGF1jKphun333+PfgkBAABibRwmBUw7d+5M8LiGE9BzAAAAFu8Bk5erFOrAgQOWOXPmiNY1b948a9CggRUqVMitc+LEif7njh07Zt27d7dy5cpZtmzZ3DKtW7e2v/76K2gdmgBYrw28vfjii0HLrFq1ymrUqOHKp9qxAQMGRLrbAAAgjqV4WIHOnTu7/xWQPPPMM27QSo8SvZcsWWIVK1aMaOMHDx60ChUq2H333WdNmjQJek6DY37//fduW1pmz5499thjj1nDhg3tu+++C1q2b9++1q5dO//97Nmz+/9Wr746derYjTfeaG+99Zb9+OOPbntKXm/fvn1E5QUAAPEpxQHTihUr/DVMCjoyZszof05/K6jp2rVrRBuvV6+euyUmZ86cNn369KDH3njjDbv66qtty5YtVrRo0aAAqUCBAomuZ8yYMXb06FF77733XDnLli1rK1eutFdffZWACQAARDdg8nrH3XvvvTZkyJAzMt7Svn37XA2XaocCqQnuueeec0FU8+bNrVOnTpY+/f92bdGiRVazZs2gAK9u3br20ksvuVorDZkQ6siRI+7mCTf2FAAAOLdFPNL3yJEj7Uw4fPiwy2lq1qxZULD26KOP2hVXXOGGPVi4cKH16NHDjRmlGiTZtm1bgmT0/Pnz+59LLGDq37+/9enTJ9X3CQAAnKMB05mgBPA777zTNQcOGzYs0dwqKV++vKtJeuCBB1zQkylTppPanoKuwPWqhknJ4gAAID6lj5VgafPmzW4AzXBNgVWqVLHjx4/bpk2brFSpUi63afv27UHLePeTyntSoHWywRYAADj3RDyswJkIln799VebMWOG5cmTJ+xrlNCdNm1aN42LVK1a1Q1foHV5lEyuYCqx5jgAAICzqoZJYzetX7/ef3/jxo0u4FE+UsGCBe322293QwtMmTLFDV2gnCPR82p6U0K3hjO47rrrXE853VfCd8uWLf3BkJLAlY/Utm1blwO1evVql7Q+aNCgM7bfAAAgDgIm1fio19y
OHTvsxIkTQc/16tUrxevReEoKdjxe3lCbNm2sd+/e9sUXX7j7oeM7adu1a9d2zWbjxo1zy6pXm5K7FTAF5h9peIJp06ZZhw4drHLlypY3b15XRoYUAAAAKZXGp0zqCIwYMcIeeughF3goByhw1G/9rRqhc42SvhV4aViD1BhOoUGXSVFfJ2LT5IGNznQRACCu7E/hd3zENUzPP/+8vfDCC655CwAAIB5EnPStwR7vuOOO1CkNAADAuRAwKVhSThAAAEC8iLhJ7uKLL3YT4i5evNjKlStnGTJkCHpeI28DAADEdcA0fPhwO++882zu3LnuFkhJ3wRMAADA4j1g0lhJAAAA8eSURvrWiAQRjkoAAAAQHwHTBx984PKXsmTJ4m6a9Hb06NHRLx0AAEAsNsm9+uqrLum7Y8eOdu2117rH5s+fbw8++KDt2rXLjbQNAAAQ1wHT66+/bsOGDbPWrVv7H2vYsKGVLVvWTVFCwAQAACzem+S2bt1q1apVS/C4HtNzAAAAFu8Bk8Zh+vjjjxM8Pn78eLvkkkuiVS4AAIDYbZLr06eP3XXXXTZv3jx/DtOCBQts5syZiQZSAAAAcVfD1LRpU1uyZInlzZvXJk6c6G76e+nSpXbbbbelTikBAABiqYZJKleubB9++GH0SwMAABCrAdP+/fstR44c/r+T4y0HAAAQVwFTrly5XA+4fPny2fnnn+/mjAulEb/1+H///Zca5QQAADi7A6ZZs2ZZ7ty53d+zZ89O7TIBAADEXsBUq1Yt/98XXXSRFSlSJEEtk2qYfv/99+iXEAAAINZ6ySlg2rlzZ4LHd+/e7Z4DAACweA+YvFylUAcOHLDMmTNHq1wAAACxN6xA586d3f8KljT5btasWf3PKdFbYzNVrFgxdUoJAAAQCwHTihUr/DVMP/74o2XMmNH/nP6uUKGCde3aNXVKCQAAEAsBk9c77t5777UhQ4Yw3hIAAIgbEY/0PXLkyNQpCQAAwLk0Ncp3333nJtrdsmWLHT16NOi5CRMmRKtsAAAAsdlLbty4cVatWjX76aef7PPPP7djx47ZmjVr3OCWOXPmjGhd8+bNswYNGlihQoVcMrkm8g2kfKlevXpZwYIFLUuWLHbjjTfar7/+mmA4gxYtWrgmQo1C3rZtW9djL9CqVausRo0arhefxpAaMGBApLsNAADiWMQBU79+/WzQoEE2efJkl+ytfKaff/7Z7rzzTitatGhE6zp48KBLFn/zzTcTfV6BzWuvvWZvvfWW64WXLVs2q1u3rh0+fNi/jIIlBWzTp0+3KVOmuCCsffv2/uc1912dOnWsWLFitnz5cnv55Zetd+/eNnz48Eh3HQAAxKk0PlXjREBBiwKU4sWLW548eWzOnDlWrlw5V+N0/fXXuznnTqogadK4GqvGjRu7+yqWap66dOni7323b98+y58/v40aNcruvvtut80yZcrYsmXL7Morr3TLfPPNN3bLLbfYH3/84V4/bNgwe/rpp23btm3+nn1PPvmkq81SoJcSCrpUe6btp0aye4Muk6K+TsSmyQMbnekiAEBc2Z/C7/iIa5g0Ee8///zj/i5cuLCtXr3a/b137147dOiQRcvGjRtdkKNmOI92qEqVKrZo0SJ3X/+rGc4LlkTLp02b1tVIecvUrFkzaBgE1VKtW7fO9uzZk+i2jxw54g5g4A0AAMSviAMmBR9q/pI77rjDHnvsMWvXrp01a9bMbrjhhqgVTMGSqEYpkO57z+n/fPnyBT2fPn16N1Fw4DKJrSNwG6H69+/vgjPvprwnAAAQvyLuJffGG2/4c4jU1JUhQwZbuHChNW3a1Hr27Gnngh49evhHNhfVMBE0AQAQvyIKmI4fP+4Sq9WkJWr6Uj5QaihQoID7f/v27a6XnEf3vSlYtMyOHTsSlFE957zX63+9JpB331smVKZMmdwNAAAg4iY5NXc9+OCDQb3UUstFF13kApqZM2cG1fQoN6lq1aruvv5X7pR6v3k
0vMGJEydcrpO3jHrOafgDj5oUS5Uq5fKxAAAAop7DdPXVV9vKlSstGjRektblrU+J3vpbA2Kq19zjjz9uzz//vH3xxRdu/rrWrVu7nm9eT7rLLrvMbr75ZpdDtXTpUluwYIF17NjR9aDTctK8eXOX8K3xmdS7b/z48W4ohMAmNwAAgKjmMD388MMu2Pj999+tcuXKbpiBQOXLl49oxPDrrrvOf98LYtq0aeOGDnjiiSfcWE0aV0k1SdWrV3fDBmgASs+YMWNckKSEczURKpdKYzd5lLQ9bdo069Chgytv3rx53WCYgWM1AQAARHUcJgUlCVaSJo0bN0n///fff3auYRwmnC6MwwQAZ+d3fMQ1TGo2AwAAiCcRB0yaYgQAACCeRJz0LaNHj7Zrr73WJVZv3rzZPTZ48GCbNImmJQAAcO6JOGDS3GxKztZ8bUrE9nKWNEWJgiYAAACL94Dp9ddftxEjRrhRvtOlS+d/XPO5qes/AACAxXvApKTvSpUqJXhcI2NrCAAAAACL94BJI3AnNnClxkfSQJIAAAAW773klL+kQSA1PYrGXtII2x999JH179/f3nnnndQpJQAAQCwFTPfff79lyZLFevbsaYcOHXJTj6i3nKYb0ZQkAAAAFu8Bk7Ro0cLdFDBpPrh8+fJFv2QAAACxHDB5smbN6m4AAADnsoiTvrdv326tWrVyzXDp06d3QwsE3gAAACzea5juuece27Jliz3zzDNWsGBBN+EuAADAuSzigGn+/Pn27bffWsWKFVOnRAAAALHeJFekSBE3nAAAAEC8iDhg0nxxTz75pG3atCl1SgQAABDrTXJ33XWXG06gZMmSrodchgwZgp7fvXt3NMsHAAAQewGTapgAAADiScQBU5s2bVKnJAAAAOdKDpNs2LDBTY3SrFkz27Fjh3vs66+/tjVr1kS7fAAAALEXMM2dO9fKlStnS5YssQkTJripUeSHH36wZ599NjXKCAAAEFsBk3rIPf/88zZ9+nTLmDGj//Hrr7/eFi9eHO3yAQAAxF7A9OOPP9ptt92W4HFNwLtr165olQsAACB2A6bzzz/ftm7dmuDxFStWWOHChaNVLgAAgNgNmO6++27r3r27bdu2zc0jd+LECVuwYIF17drVWrdunTqlBAAAiKWAqV+/fla6dGk3RYoSvsuUKWM1a9a0atWquZ5zAAAAFu8BkxK9R4wY4YYWmDJlin344Yf2888/2+jRoy1dunRRL2Dx4sVdTVborUOHDu752rVrJ3juwQcfDFrHli1brH79+m5kcuVadevWzY4fPx71sgIAgHNTxANXeooWLepuqW3ZsmX233//+e+vXr3abrrpJrvjjjv8j7Vr18769u3rv6/AyKPXKlgqUKCALVy40OVfqelQU7qotgwAACDqAVPnzp0TfVw1O5kzZ7aLL77YGjVqZLlz57ZouOCCC4Luv/jii24eu1q1agUFSAqIEjNt2jRbu3atzZgxw/Lnz28VK1a05557zuVh9e7dO2hoBAAAgKgETOoN9/3337uam1KlSrnHfvnlF9ccp9ymoUOHWpcuXWz+/Pkuvymajh496poAFbQpQPOMGTPGPa6gqUGDBvbMM8/4a5kWLVrkBtpUsOSpW7euPfTQQ25k8kqVKiXYzpEjR9zNs3///qjuBwAAOMdzmFR7dOONN9pff/1ly5cvd7c//vjDNZNpqpQ///zTJYF36tQp6oWdOHGi7d271+655x7/Y82bN3fB0uzZs61Hjx4ul6ply5b+59WbLzBYEu++nktM//79LWfOnP6bEtwBAED8SuPz+XyRvEBjLWmU79DaI9XW1KlTxwVMqoHS39EeyFI1Q2pCmzx5cpLLzJo1y2644QZbv369a7pr3769bd682aZOnepf5tChQ5YtWzb76quvrF69eimqYVLQtG/fPsuRI4dFW4Muk6K+TsSmyQMbnekiAEBc2b9/v6scCfcdH3ENk1boTbgbaOfOnf6mKw1uqea
zaFLQozyk+++/P9nlqlSp4v5XwCRqptu+fXvQMt79pPKeMmXK5A5a4A0AAMSvk2qSu+++++zzzz93TXG66e+2bdta48aN3TJLly61Sy+9NKoFHTlypBsSQD3ekrNy5Ur3f8GCBd3/VatWddO5BAZ5qiFTEBTtHCsAAHBuijjp++2333b5SRrx2xvLKH369NamTRsbNGiQu6/k73feeSdqhdRo4gqYtA1ty6OxoMaOHWu33HKL5cmTx1atWuXKphyq8uXLu2XUNKjAqFWrVjZgwACXt6QBNjWOk2qSAAAAoh4wnXfeeW7gSgVHv/32m3usRIkS7nGPuu5Hk5riNPikarYCKZ9Jzw0ePNgOHjzo8oyaNm0aNOK4eu9pgE31ilNtk3KXFHgFjtsEAAAQ1aTveJTShLCTRdI3PCR9A8A5kvQNAAAQbwiYAAAAwiBgAgAACIOACQAAIBq95L744gtLqYYNG6Z4WQAAgHMmYPIGpAxHE+JqUl4AAIC4C5g0cCQAAEC8IocJAAAg2iN9i0bVnjt3rht9O3SS3UcfffRkVgkAAHDuBEwrVqxwc7cdOnTIBU65c+e2Xbt2WdasWd3kuARMAADA4r1JTpPbNmjQwPbs2WNZsmSxxYsX2+bNm61y5cr2yiuvpE4pAQAAYilgWrlypXXp0sXSpk3rJrY9cuSIm/R2wIAB9tRTT6VOKQEAAGIpYMqQIYMLlkRNcMpjEk1c9/vvv0e/hAAAALGWw1SpUiVbtmyZXXLJJVarVi3r1auXy2EaPXq0XX755alTSgAAgFiqYerXr58VLFjQ/f3CCy9Yrly57KGHHrKdO3fa22+/nRplBAAAiK0apiuvvNL/t5rkvvnmm2iXCQAAILZrmK6//nrbu3dvgsf379/vngMAALB4D5jmzJmTYLBKOXz4sH377bfRKhcAAEDsNcmtWrXK//fatWtt27Zt/vuacFdNc4ULF45+CQEAAGIlYKpYsaKlSZPG3RJretMglq+//nq0ywcAABA7AdPGjRvN5/NZiRIlbOnSpXbBBRf4n8uYMaNLANdAlgAAAHEbMBUrVsz9f+LEidQsDwAAQOwPKyAbNmywwYMH208//eTulylTxh577DErWbJktMsHAAAQe73kpk6d6gIkNcuVL1/e3ZYsWWJly5a16dOnp04pAQAAYqmG6cknn7ROnTrZiy++mODx7t2720033RTN8gEAAMReDZOa4dq2bZvg8fvuu88NNwAAAGDxHjCpd9zKlSsTPK7H1FMumnr37u0fysC7lS5dOmiwzA4dOliePHnsvPPOs6ZNm9r27duD1rFlyxarX7++Zc2a1ZWvW7dudvz48aiWEwAAnNtS3CTXt29f69q1q7Vr187at29vv/32m1WrVs09t2DBAnvppZesc+fOUS+gcqNmzJjxfwVO/39FVtPgl19+aZ988onlzJnTOnbsaE2aNHHl8QbUVLBUoEABW7hwoW3dutVat25tGTJkcJMIAwAApEQanwZXSgGNsaSAQzVM6iE3cOBA++uvv9xzhQoVcjU3jz76qKsFimYN08SJExOt0dq3b58ry9ixY+322293j/3888922WWX2aJFi+yaa66xr7/+2m699VZXzvz587tl3nrrLZdrtXPnTjd+VEponjwFZNpmjhw5LNoadJkU9XUiNk0e2OhMFwEA4sr+FH7Hp7hJzourFBCpZuePP/5wK9dNf2tYgWgGS55ff/3VBWQaMLNFixauiU2WL19ux44dsxtvvNG/rJrrihYt6gIm0f/lypXzB0tSt25dd3DWrFmT5DaPHDnilgm8AQCA+BVRDlNoQJQ9e3Z3Sy1VqlSxUaNGuXnqhg0b5kYbr1Gjhv3zzz9uLjvVEJ1//vlBr1Fw5M1zp/8DgyXvee+5pPTv399Fm96tSJEiqbJ/AADgHBxW4NJLLw1bi7R7926Llnr16vn/1nhPCqA04vjHH3/s5q5LLT169AjKx1INE0ETAADxK6KAqU+
fPq7G5UxRbZKCtvXr17vxno4ePWp79+4NqmVSLzkleYv+1wCbgbxedN4yicmUKZO7AQAARBww3X333VEfOiASBw4ccNOytGrVyipXrux6u82cOdMNJyDr1q1zOU5Vq1Z19/X/Cy+8YDt27PCXW6ORK6lLo5UDAABENWBKjYTucDSMQYMGDVwznHq6Pfvss663XrNmzVxNlwbQVNNZ7ty5XRD0yCOPuCBJPeSkTp06LjBSgDVgwACXt9SzZ083dhM1SAAAIOoBUwpHH4gq9b5TcPT333+7IQSqV69uixcvdn/LoEGDLG3atK6GST3b1ANu6NCh/tcruJoyZYo99NBDLpDKli2btWnTxo0pBQAAEPVxmOIZ4zDhdGEcJgCI8XGYAAAA4hUBEwAAQBgETAAAAGEQMAEAAIRBwAQAABAGARMAAEAYBEwAAABhEDABAACEQcAEAAAQBgETAABAGARMAAAAYRAwAQAAhEHABAAAEAYBEwAAQBgETAAAAGEQMAEAAIRBwAQAABAGARMAAEAYBEwAAABhEDABAACEQcAEAAAQBgETAABAGARMAAAAYRAwAQAAhEHABAAAEAYBEwAAQCwHTP3797errrrKsmfPbvny5bPGjRvbunXrgpapXbu2pUmTJuj24IMPBi2zZcsWq1+/vmXNmtWtp1u3bnb8+PHTvDcAACBWpbez2Ny5c61Dhw4uaFKA89RTT1mdOnVs7dq1li1bNv9y7dq1s759+/rvKzDy/Pfffy5YKlCggC1cuNC2bt1qrVu3tgwZMli/fv1O+z4BAIDYc1YHTN98803Q/VGjRrkaouXLl1vNmjWDAiQFRImZNm2aC7BmzJhh+fPnt4oVK9pzzz1n3bt3t969e1vGjBlTfT8AAEBsO6ub5ELt27fP/Z87d+6gx8eMGWN58+a1yy+/3Hr06GGHDh3yP7do0SIrV66cC5Y8devWtf3799uaNWsS3c6RI0fc84E3AAAQv87qGqZAJ06csMcff9yuvfZaFxh5mjdvbsWKFbNChQrZqlWrXM2R8pwmTJjgnt+2bVtQsCTefT2XVO5Unz59UnV/AABA7IiZgEm5TKtXr7b58+cHPd6+fXv/36pJKliwoN1www22YcMGK1my5EltS7VUnTt39t9XDVORIkVOofQAACCWxUSTXMeOHW3KlCk2e/Zsu/DCC5NdtkqVKu7/9evXu/+V27R9+/agZbz7SeU9ZcqUyXLkyBF0AwAA8eusDph8Pp8Llj7//HObNWuWXXTRRWFfs3LlSve/apqkatWq9uOPP9qOHTv8y0yfPt0FQWXKlEnF0gMAgHNF+rO9GW7s2LE2adIkNxaTl3OUM2dOy5Ili2t20/O33HKL5cmTx+UwderUyfWgK1++vFtWwxAoMGrVqpUNGDDAraNnz55u3apJAgAAiOkapmHDhrmecRqcUjVG3m38+PHueQ0JoOECFBSVLl3aunTpYk2bNrXJkyf715EuXTrXnKf/VdvUsmVLNw5T4LhNAAAAMVvDpCa55CgRW4NbhqNedF999VUUSwYAAOLJWV3DBAAAcDYgYAIAAAiDgAkAACAMAiYAAIAwCJgAAADCIGACAAAIg4AJAAAgDAImAACAMAiYAAAAwiBgAgAACIOACQAAIAwCJgAAgDAImAAAAMIgYAIAAAiDgAkAgNPszTfftOLFi1vmzJmtSpUqtnTp0mSX/+STT6x06dJu+XLlytlXX3112sqK/yFgAgDgNBo/frx17tzZnn32Wfv++++tQoUKVrduXduxY0eiyy9cuNCaNWtmbdu2tRUrVljjxo3dbfXq1ae97PGMgAkAgNPo1VdftXbt2tm9995rZcqUsbfeesuyZs1q7733XqLLDxkyxG6++Wbr1q2bXXbZZfbcc8/ZFVdcYW+88YZ/maFDh9oll1ziaqDy589vt99++2nco/hAwAQAwGly9OhRW758ud14443+x9KmTevuL1q0KNHX6PHA5UU1Ut7y333
3nT366KPWt29fW7dunX3zzTdWs2bNVN6T+JP+TBcAAIB4sWvXLvvvv/9cLVAg3f/5558Tfc22bdsSXV6Py5YtWyxbtmx26623Wvbs2a1YsWJWqVKlVNyL+EQNEwAAMeymm25yQVKJEiWsVatWNmbMGDt06NCZLtY5h4AJAIDTJG/evJYuXTrbvn170OO6X6BAgURfo8eTW161Skoe/+ijj6xgwYLWq1cvl0i+d+/eVNyT+EPABADAaZIxY0arXLmyzZw50//YiRMn3P2qVasm+ho9Hri8TJ8+PWj59OnTuzynAQMG2KpVq2zTpk02a9asVNyT+EMOEwAAp5GGFGjTpo1deeWVdvXVV9vgwYPt4MGDrtectG7d2goXLmz9+/d39x977DGrVauWDRw40OrXr2/jxo1zid7Dhw93z0+ZMsV+++03l+idK1cuN0aTgrBSpUqd0f081xAwAQBwGt111122c+dO13SmxO2KFSu6nm1eYreSuNVzzlOtWjUbO3as9ezZ05566ik3fMDEiRPt8ssvd8+ff/75NmHCBOvdu7cdPnzYPa/mubJly56xfTwXpfH5fL4zXYiz3f79+y1nzpy2b98+y5EjR9TX36DLpKivE7Fp8sBGZ7oIABBX9qfwO54cJgAAgDDiKmCKdO4eAACAuAqYIp27BwAAIO4Cpkjn7gEAAIirXnLe3D09evRI0dw9R44ccTePEsG8xLDUcOwII7LCUvUcAwAkf90N1wcuLgKmSOfu0dgXffr0SfB4kSJFUrWcQM43z3QJACA+/fPPP663XFwHTJFSTZTynTwaAGz37t2WJ08eS5MmzRkt27kc4Ssg/f3331Nl6AYAiEVcG1OfapYULBUqVCjZ5eIiYIp07p5MmTK5WyANDIbUpwsCFwUACMa1MXUlV7MUV0nfJzN3DwAAQFzVMKVk7h4AAACL94Ap3Nw9OLPUBKoxskKbQgEgnnFtPHswlxwAAEAYcZHDBAAAcCoImAAAAMIgYAIAAAiDgAmnRe3ate3xxx/33y9evLjrqQgAOPN69+7tOkMhaQRMSJF77rnHjXIeelu/fv2ZLhoA+K9TjRs3tnjGMUg9cTOsAE7dzTffbCNHjgx67IILLjhtEyhrAFIAOFM0J6l+KGry9njn8/nc8UifPn7CCN51pJjGAdFUMoE3TTmT2C8aNb+pGe5keet84YUX3Pw+pUqVco/rYjVx4sQE09aMGjXK/b1p0ya3zIQJE+y6666zrFmzWoUKFWzRokUnXRYAsUnXoEcffdSeeOIJy507t7tmqekp0N69e+2BBx5wY/JlzpzZLr/8cpsyZYp7TtcVXV+++OILK1OmjLsGbtmyxY4cOWJdu3a1woULW7Zs2axKlSo2Z84c/zq912k9unbpOnT77bfboUOH7P3333cpCbly5XJlU9DhSel6p06dapdddpmdd9557ofs1q1b3fPaN61/0qRJ/lYA7/Xdu3e3Sy+91JWlRIkS9swzz9ixY8dSfCznzJnj1vf111+7mTN0LObPn5+i639K3odYED+hIWKOpq7R3EnTp0+P+LVPP/20vfLKK3bJJZe4v5s1a+aaD+Pp1xAAcwGEZnpYsmSJ++GkL/hrr73WbrrpJjdFVr169dzEqx9++KGVLFnS1q5d634IehTkvPTSS/bOO++4Cdjz5ctnHTt2dMuNGzfO/aD7/PPPXeDy448/umuO97rXXnvNLaP1N2nSxG677TYX8Hz11Vf222+/WdOmTV1ZNLCypHS9uraNHj3a1XS1bNnSBVljxoxx///0009uwl6vNUABimTPnt0FXFqv1teuXTv3mIKYSDz55JNu+wq6FPRF432IFXx7IMX0a0m/aDy60HzyySeptj39wtJF6mSa4nThqF+/vvu7T58+VrZsWRcwlS5dOhVKCuBsVb58eTdStijoeOONN9yPMX1Rz5gxw5YuXeqCDNW+iAKBQKqFGTp0qKupFtUwKRjR/97s9rreaOYIPd6
vXz//64YNG+aCMFENk4IcTfqu66hqrFQLPnv2bBcwRbLet956y79eBVl9+/Z1f2u9WbJkcTVVoRPL9+zZ0/+3ari0bgVmkQZMffv2PakgJ7n3IVYQMCHF9OHWBSAwoElN5cqVO+m8JX04PQULFnT/79ixg4AJiDOB1wLveqBrgaxcudIuvPBCf7CUGF2DAteh2hk1o4W+RkGKaqA8avryghpRk58ClcAfnXrMK8vJrjdwf5Izfvx4V+O1YcMGO3DggB0/ftzV4EfqyiuvtGi/D7GCgAkppgDp4osvTvC4qoVDZ9iJpG08ue2FUht6SraVIUOGoNeIqt8BxJfAa4F3PfCuBaqNCUfLeNcQUbChJrvly5cHNd1JYDCU2HaTK8uprDfcDGdqAmvRooWrba9bt67lzJnT1S4NHDjQTvW6nDaF1//k9j1WEDDhlKmn3OrVq4Me0y+30A9ItLblJTjKr7/+6tr0AeBkaj3++OMP++WXX5KtZQpUqVIlVxOk2pEaNWpErSzRWq9qxAITyWXhwoVWrFgxl8/p2bx5s8Xa9f9Mo5ccTtn1119v3333nX3wwQcugFE7degHKJrbUtv3ihUr3DYffPDBc/KDCSD11apVy2rWrOmSr9W5ZOPGja4XmPKGkqLASrU1rVu3dr1x9RrlQfXv39++/PLLky5LtNarZr9Vq1bZunXrbNeuXa62RzlDyo1SrZKa5NQ0p4TyWLv+n2kETDhlquJVF1UlD1511VWuR4g+9KlBVchFihRxv8CaN2/uEhfVpg8AJ+Ozzz5z1y31pFUitq5joTU0oZSErWtcly5d3LAB6la/bNkyK1q06CmVJRrrVe83vVa5Rqr9WbBggTVs2NA6derkEsQ1mrdqnHTNjrXr/5mWxheu8RMAACDOUcMEAAAQBgETAABAGARMAAAAYRAwAQAAhEHABABIVX///bebg02TY58u3kTcGhMoViU2sW043gS9kdi2bZubokSDUnqvTWyi82hsN9L1nqq77777pAboTAwBEwAgVb3wwgvWqFEjN0YQzj6DBg1yAwIruNQgnqL7mi80pe666y7/a6V3795uCINQka73VGkOPZ1/+/btO+V1ETABAFKNRuJ/9913rW3btme6KEiCBrOsXLmyG+BSNYGiyXszZcqU4nVkyZLF/9rkRLreU3X55Ze7ufc+/PDDU14XARMAINV89dVX7gvymmuuSbb5Rs00gXO2eTUUo0ePdjVTmv9MzSsaGNGjucgGDBjg5rjUNjTAo2oTAv32229u4nANcFuhQgU3r1pgU6EGrCxcuLB7XhN+f/TRR0Gvr127tj366KNuYMbcuXO7L3yVLdDPP/9s1atXt8yZM7vBL2fMmJGg6en333+3O++80+231qMat8AmSg2W2blzZ/e8JtvV9lIyTKKOpfZb5b/tttvcPoWaNGmSXXHFFa58JUqUcHPKafJd0bHV4J0aqVtlVjOgBJbfa96cMGFCkscy8D3V39rGDz/84F6nmx4LXa836bBGC1fApf1u3769m1cvtFnylVdecRP2apkOHToEzVc3dOhQF+xp/zSh8e233x60/w0aNHCjnJ8qAiYAQKr59ttvXe3FydZ86Mt1ypQp7jZ37lx78cUX/c/36NHD3ddI02vXrrWxY8e6L8xAmj9NMwKouUnTjyhA8oKFw4cPu7Jp6hFN56Ev61atWrkpSQK9//77Lr9nyZIlLkDr27evm0rFC3T0ha4gQs8PHz48aM420Ze7RsTOnj27Ox4afVsT6t5888129OhRt4zybBRUvPfeezZ//nzbvXt32OlLtD3V3GkEb+2fgpnnn38+wfHXyNuPPfaYO0Zvv/22244XWGokcZVDwZyay4YMGZLk9p5O5liGNs9ptPKyZcu6deqmx0IdPHjQHZdcuXK5cnzyyScu2NT+BJo9e7Y7F/S/3guV3wvANC2LAlq9J5oORtPaaLqbQFdffbV7T48cOWKnRCN9AwCQGho1auS
77777gh4bOXKkL2fOnEGPff7556pO8d9/9tlnfVmzZvXt37/f/1i3bt18VapUcX/r8UyZMvlGjBiR6HY3btzo1vfOO+/4H1uzZo177KeffkqyvPXr1/d16dLFf79WrVq+6tWrBy1z1VVX+bp37+7+/vrrr33p06f3bd261f/89OnT3Xa0TzJ69GhfqVKlfCdOnPAvc+TIEV+WLFl8U6dOdfcLFizoGzBggP/5Y8eO+S688EJ3/JLSrFkz3y233BL02F133RV0bG+44QZfv379gpZRebQ9j7bRpk2boGUCy5+SYzky5D3V+1ehQoUEZQ5c7/Dhw325cuXyHThwwP/8l19+6UubNq1v27Zt7r7KVaxYMd/x48f9y9xxxx1uP+Wzzz7z5ciRI+g8CfXDDz+47W7atMl3KqhhAgCkmn///dc1lZwMNRepVsajJpkdO3a4v3/66SdXY3DDDTcku47y5csHvV68dah26LnnnnNNcWomU63P1KlT3US1Sa0jtByq1dD8lmqqC6zRCKSmqfXr17t90TZ00/ZUw6WaEyUkqxamSpUq/tekT5/ezQeXHB2DwNdI1apVE2xbtS/ednXTfHPanvLLIlE+mWN5MlR+Ne2p9s5z7bXXuqZWHVePaqrSpUsXtG1vu+rdV6xYMdfUqNrBMWPGJNgvNfdJpPsbKv0pvRoAgGTkzZvX9uzZE/RY2rRpE+TnBOakeDJkyBB0X/kv+jIN/BIMJ3AdXo6Ut46XX37ZNUENHjzYBU364n788cf9zWQpKUdKKCdHTX/6Mg+lCXJTk7atfKImTZokeC7SQDZDMscyNSV3/BWEfv/99zZnzhybNm2a9erVy+WYqYnPy6lS82Y0jjU1TACAVFOpUiWXOxNIX1xK3lYOiyfS8ZKU5KugaebMmSddNuUSKfm6ZcuWrqZDtRSBXeNTolSpUi6he/v27f7H9GUdSAnXv/76q+tFpgT1wJuS2XVTrYlykjzKDVq+fHmy277sssuCXiOLFy9OsG3V1oRuVzcFrqklY8aMrgYvXPlVAxZ4Hug9Ubl0XFNKtXE33nijyy9btWqVS1KfNWuW/3nlp1144YUueD8VBEwAgFSjpN41a9YE1TKpGUlJ0k899ZRrklKytpfEm1KqHenevbvrTaYeXlqPggUNYRBJ0KXk7YULF7rmoQceeCAo8EkJNQmp23qbNm3cl7W+8DX2T2AtTIsWLdyXtYIzJWFv3LjR1YgoWfmPP/5wyygpWwnsSnJXr7uHH37Y9u7dm+y29XolOasHmQKyN954w90PpBoXHR/VMul90H6qx5hXxtSi5lTtpwLhXbt2JZpwreOi91HHTkGNkrofeeQR17QWmryfFHUGeO2119x2Nm/e7PZVtU+BAZeOeZ06dU55nwiYAACpRk1dquX4+OOP/Y8pf0fj4mjIAa8rf2hX/ZRQ7zj1xlJQoNoK9cSKJKdGQYPKpqBOwwcoDynSkbWVW6MgR01fV111ld1///3+XnJek5eCw3nz5rnu/2oaU1nVu005TDly5HDLaD8UKCh4UB6Smpo0TEByNFTDiBEjXLOiasjUJBUaCGnfFFToOZVPr9FAlcr7SU1NmzZ1ve/Uc081iqHDNXjHRTljajJT2TQcgHLSFPillJrdNNyBhibQcX3rrbfctpT3JDrGen+Ut3Wq0vz/rHUAAFKFuu1369bN1SKkZjPQ2UK1TBqXSYneqn3CmTNs2DA3PIMCxlNF0jcAIFXVr1/fNRn9+eefrkfZuUZfyOp9piY+BUlqXlNvL4KlM08J46+//npU1kUNEwAAp0B5MxowUsMRKFdJCcgaiFKjUuPcQcAEAAAQxrnfmAwAAHCKCJgAAADCIGACAAAIg4AJAAAgDAImAACAMAiYAAAAwiBgAgAACIOACQAAIAwCJgAAAEve/wOLuk8y3nWfNgAAAABJRU5ErkJggg==", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# python_results records run-1 (full) timings; run-2 (incremental) is near-zero.\n", - "# We illustrate the checksum mechanism with a bar showing full vs incremental total time.\n", - "full_total = py_results[\"generation_seconds\"].sum()\n", - "\n", - "# Incremental run: only checksum lookup overhead — approximate as near-zero per cohort\n", - "# (we don't have per-cohort incremental timings, so we use the script-level total)\n", - "# For now show the full-run breakdown: COMPLETE vs what incremental would skip.\n", - "complete = py_results[py_results[\"status\"] == \"COMPLETE\"]\n", - "failed = py_results[py_results[\"status\"] == \"FAILED\"]\n", - "\n", - "fig, ax = plt.subplots(figsize=(6, 4))\n", - "categories = [\"Full run\", \"Incremental run\\n(unchanged definitions)\"]\n", - "times = [full_total, 0] # incremental skips all → near-zero; 0 is illustrative\n", - "bars = ax.bar(categories, times, color=[\"#4C72B0\", \"#55A868\"], width=0.4)\n", - "ax.set_ylabel(\"Total generation time (s)\", fontsize=10)\n", - "ax.set_title(\"Incremental generation: checksum-based skipping\", fontsize=11)\n", - "for bar, t in zip(bars, times):\n", - " ax.text(\n", - " bar.get_x() + bar.get_width() / 2,\n", - " bar.get_height() + full_total * 0.01,\n", - " f\"{t:.1f}s\",\n", - " ha=\"center\",\n", - " va=\"bottom\",\n", - " fontsize=10,\n", - " )\n", - "plt.tight_layout()\n", - "plt.savefig(OUTPUT_DIR / \"figure3_incremental_speedup.pdf\", bbox_inches=\"tight\")\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "id": "504fb2a444614c0babb325280ed9130a", - "metadata": {}, - "source": [ - "## Table 3 — Summary statistics (paper-ready)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "59bbdb311c014d738909a11f9e486628", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Metric Value\n", - " Phenotypes in library 707\n", 
- " R generation success rate 97.6%\n", - " Python generation success rate 93.4%\n", - " R median generation time (s) 0.03\n", - "Python median generation time (s) 0.14\n", - " Total cohort_r rows 103585\n", - " Total cohort_python rows 213977\n", - " Cohorts with identical tables 69 / 69 (100.0%)\n" - ] - } - ], - "source": [ - "summary = pd.DataFrame(\n", - " [\n", - " {\"Metric\": \"Phenotypes in library\", \"Value\": len(r_results)},\n", - " {\n", - " \"Metric\": \"R generation success rate\",\n", - " \"Value\": f\"{(r_results['status'] == 'COMPLETE').mean() * 100:.1f}%\",\n", - " },\n", - " {\n", - " \"Metric\": \"Python generation success rate\",\n", - " \"Value\": f\"{(py_results['status'] == 'COMPLETE').mean() * 100:.1f}%\",\n", - " },\n", - " {\n", - " \"Metric\": \"R median generation time (s)\",\n", - " \"Value\": f\"{r_results['generation_seconds'].median():.2f}\",\n", - " },\n", - " {\n", - " \"Metric\": \"Python median generation time (s)\",\n", - " \"Value\": f\"{py_results['generation_seconds'].median():.2f}\",\n", - " },\n", - " {\"Metric\": \"Total cohort_r rows\", \"Value\": len(cohort_r)},\n", - " {\"Metric\": \"Total cohort_python rows\", \"Value\": len(cohort_py)},\n", - " {\n", - " \"Metric\": \"Cohorts with identical tables\",\n", - " \"Value\": f\"{n_match} / {n_total} ({100 * n_match / n_total:.1f}%)\",\n", - " },\n", - " ]\n", - ")\n", - "print(summary.to_string(index=False))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5913b41a-261a-4319-873d-ae19c5d170b5", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f60a8464-f4ed-40c7-bf25-76195b3736c8", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - 
"mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.6" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file diff --git a/examples/benchmark_run_python.py b/examples/benchmark_run_python.py deleted file mode 100644 index ca36ba4..0000000 --- a/examples/benchmark_run_python.py +++ /dev/null @@ -1,267 +0,0 @@ -#!/usr/bin/env python3 -""" -benchmark_run_python.py - -CircePy benchmark against a Databricks SQL warehouse. - -What this script does: - 1. Downloads cohort metadata from OHDSI/PhenotypeLibrary on GitHub - 2. Downloads each CIRCE cohort JSON from the same repo - 3. Builds a CircePy CohortDefinitionSet from the downloaded definitions - 4. Calls generate_cohort_set() writing to DATABRICKS_SCRATCH_SCHEMA.cohort_python - 5. Runs a second incremental pass to benchmark checksum skipping - 6. Writes benchmark_output/python_results.csv - -Prerequisites: - - Fill in .env with DATABRICKS_HOST, DATABRICKS_HTTP_PATH, DATABRICKS_TOKEN, - DATABRICKS_SCRATCH_SCHEMA - - CDM data at healthverity_cc.cdm_healthverity_cc_all_v3910 - -Usage: - python examples/benchmark_run_python.py -""" - -from __future__ import annotations - -import os -import sys -import time -import urllib.request -from pathlib import Path - -import ibis -import pandas as pd -from dotenv import load_dotenv - -from circe.api import ( - CohortDefinitionSet, - cohort_expression_from_json, - generate_cohort_set, - summarise_generation_results, -) - -# Ensure the repo root is importable -REPO_ROOT = Path(__file__).resolve().parent.parent -sys.path.insert(0, str(REPO_ROOT)) - -load_dotenv(REPO_ROOT / ".env") - -# --------------------------------------------------------------------------- -# Configuration -# --------------------------------------------------------------------------- -PHENOTYPE_META_URL = "https://raw.githubusercontent.com/OHDSI/PhenotypeLibrary/main/inst/Cohorts.csv" -PHENOTYPE_JSON_URL = ( - 
"https://raw.githubusercontent.com/OHDSI/PhenotypeLibrary/main/inst/cohorts/{cohort_id}.json" -) - -OUTPUT_DIR = REPO_ROOT / "benchmark_output" -RESULTS_CSV = OUTPUT_DIR / "python_results.csv" - -CDM_SCHEMA = "healthverity_cc.cdm_healthverity_cc_all_v3910" -SCRATCH_SCHEMA = os.environ["DATABRICKS_SCRATCH_SCHEMA"] -COHORT_TABLE = "cohort_python" -CHECKSUM_TABLE = "cohort_checksum_python" -N_COHORTS = 40 # number of cohorts to benchmark - - -# --------------------------------------------------------------------------- -# Step 1 — load phenotype library -# --------------------------------------------------------------------------- - - -def load_phenotypes() -> pd.DataFrame: - """Download PhenotypeLibrary metadata and return rows with CIRCE JSON.""" - print("Downloading PhenotypeLibrary metadata...") - meta = pd.read_csv(PHENOTYPE_META_URL) - circe = meta[meta["isCirceJson"].astype(str).str.strip() == "1"].copy() - circe = circe[["cohortId", "cohortName"]].dropna() - circe["cohortId"] = circe["cohortId"].astype(int) - print(f" {len(circe)} CIRCE cohorts in library") - return circe - - -def build_cohort_definition_set(meta: pd.DataFrame) -> tuple[CohortDefinitionSet, list]: - """Download each cohort JSON and build a CohortDefinitionSet.""" - cds = CohortDefinitionSet() - failures: list[tuple[int, str]] = [] - - print(f"Downloading and parsing {len(meta)} cohort definitions...") - for i, (_, row) in enumerate(meta.iterrows(), 1): - cohort_id = int(row["cohortId"]) - cohort_name = str(row["cohortName"]) - url = PHENOTYPE_JSON_URL.format(cohort_id=cohort_id) - - try: - with urllib.request.urlopen(url, timeout=30) as resp: - json_str = resp.read().decode("utf-8") - expression = cohort_expression_from_json(json_str) - cds.add(cohort_id=cohort_id, cohort_name=cohort_name, expression=expression) - except Exception as exc: - failures.append((cohort_id, str(exc))) - - if i % 100 == 0 or i == len(meta): - ok = i - len(failures) - print(f" {i}/{len(meta)} parsed={ok} 
failed={len(failures)}") - - return cds, failures - - -# --------------------------------------------------------------------------- -# Step 2 — connect to Databricks -# --------------------------------------------------------------------------- - - -def connect_databricks() -> ibis.BaseBackend: - host = os.environ["DATABRICKS_HOST"] - http_path = os.environ["DATABRICKS_HTTP_PATH"] - token = os.environ["DATABRICKS_TOKEN"] - - # Parse catalog and schema from DATABRICKS_SCRATCH_SCHEMA (format: catalog.schema) - parts = SCRATCH_SCHEMA.split(".", 1) - catalog = parts[0] if len(parts) == 2 else None - schema = parts[1] if len(parts) == 2 else parts[0] - - print(f"\nConnecting to Databricks: {host} (catalog={catalog}, schema={schema})") - backend = ibis.databricks.connect( - server_hostname=host, - http_path=http_path, - access_token=token, - catalog=catalog, - schema=schema, - ) - print(" Connected.") - return backend - - -# --------------------------------------------------------------------------- -# Step 3 — generate cohorts -# --------------------------------------------------------------------------- - - -def run_generation(cds: CohortDefinitionSet, backend: ibis.BaseBackend) -> tuple: - # Run 1: full generation with incremental=True so checksums are saved - # (no prior checksums exist, so nothing is skipped on this run) - print(f"\nRun 1: generating {len(cds)} cohorts (full run, saves checksums)...") - t0 = time.perf_counter() - results_run1 = generate_cohort_set( - cds, - backend=backend, - cdm_schema=CDM_SCHEMA, - cohort_table=COHORT_TABLE, - results_schema=SCRATCH_SCHEMA, - incremental=True, - checksum_table=CHECKSUM_TABLE, - stop_on_error=False, - ) - run1_seconds = time.perf_counter() - t0 - s1 = summarise_generation_results(results_run1) - print(f" {run1_seconds:.1f}s COMPLETE={s1['COMPLETE']} FAILED={s1['FAILED']}") - - # Run 2: incremental (all should be skipped) - print("Run 2: incremental re-run (unchanged definitions — all should be skipped)...") - t0 = 
time.perf_counter() - results_run2 = generate_cohort_set( - cds, - backend=backend, - cdm_schema=CDM_SCHEMA, - cohort_table=COHORT_TABLE, - results_schema=SCRATCH_SCHEMA, - incremental=True, - checksum_table=CHECKSUM_TABLE, - stop_on_error=False, - ) - run2_seconds = time.perf_counter() - t0 - s2 = summarise_generation_results(results_run2) - speedup = run1_seconds / run2_seconds if run2_seconds > 0 else float("inf") - print(f" {run2_seconds:.2f}s SKIPPED={s2['SKIPPED']} speedup={speedup:.1f}x") - - return results_run1, run1_seconds, run2_seconds - - -# --------------------------------------------------------------------------- -# Step 4 — write results -# --------------------------------------------------------------------------- - - -def write_results( - backend: ibis.BaseBackend, - results_run1: list, - parse_failures: list, - meta: pd.DataFrame, -) -> None: - cohort_df = backend.table(COHORT_TABLE, database=SCRATCH_SCHEMA).execute() - row_counts = ( - cohort_df.groupby("cohort_definition_id") - .size() - .reset_index(name="row_count") - .rename(columns={"cohort_definition_id": "cohortId"}) - ) - - rows = [] - for r in results_run1: - duration = (r.end_time - r.start_time).total_seconds() - rc = row_counts.loc[row_counts["cohortId"] == r.cohort_id, "row_count"] - rows.append( - { - "cohortId": r.cohort_id, - "cohortName": r.cohort_name, - "status": r.status, - "generation_seconds": duration, - "row_count": int(rc.iloc[0]) if len(rc) else 0, - "checksum": r.checksum, - "error": str(r.error) if r.error else "", - } - ) - - for cohort_id, err in parse_failures: - name_row = meta.loc[meta["cohortId"] == cohort_id, "cohortName"] - rows.append( - { - "cohortId": cohort_id, - "cohortName": name_row.iloc[0] if len(name_row) else "", - "status": "PARSE_FAILED", - "generation_seconds": 0, - "row_count": 0, - "checksum": "", - "error": err, - } - ) - - pd.DataFrame(rows).to_csv(RESULTS_CSV, index=False) - print(f"\nResults written to {RESULTS_CSV}") - - -# 
--------------------------------------------------------------------------- -# Main -# --------------------------------------------------------------------------- - - -def main() -> None: - OUTPUT_DIR.mkdir(parents=True, exist_ok=True) - - meta = load_phenotypes().head(N_COHORTS) - backend = connect_databricks() - cds, parse_failures = build_cohort_definition_set(meta) - - print(f"\nCohortDefinitionSet: {len(cds)} cohorts ({len(parse_failures)} parse failures)") - print(f"CDM schema : {CDM_SCHEMA}") - print(f"Cohort schema : {SCRATCH_SCHEMA}") - - results_run1, run1_s, run2_s = run_generation(cds, backend) - write_results(backend, results_run1, parse_failures, meta) - - s = summarise_generation_results(results_run1) - total = len(meta) - print(f"\n{'=' * 55}") - print("CircePy benchmark complete") - print(f" Total phenotypes : {total}") - print(f" Parse failures : {len(parse_failures)}") - print(f" Generation COMPLETE : {s['COMPLETE']}") - print(f" Generation FAILED : {s['FAILED']}") - print(f" Full run time : {run1_s:.1f}s") - print(f" Incremental run time : {run2_s:.2f}s ({run1_s / run2_s:.1f}x speedup)") - print(f"{'=' * 55}") - - -if __name__ == "__main__": - main() diff --git a/examples/benchmark_run_r.R b/examples/benchmark_run_r.R deleted file mode 100644 index 2b8dba1..0000000 --- a/examples/benchmark_run_r.R +++ /dev/null @@ -1,225 +0,0 @@ -#!/usr/bin/env Rscript -# benchmark_run_r.R -# -# R CohortGenerator benchmark against a Databricks SQL warehouse. -# Runs independently of benchmark_run_python.py — no shared DuckDB file. -# -# What this script does: -# 1. Loads cohort definitions from OHDSI PhenotypeLibrary -# 2. Connects to Databricks using credentials from .Renviron -# 3. Runs CohortGenerator::generateCohortSet() and writes rows to -# {DATABRICKS_SCRATCH_SCHEMA}.cohort_r -# 4. 
Writes per-cohort timing and status to benchmark_output/r_results.csv -# -# Prerequisites: -# Fill in .Renviron (repo root) with: -# DATABRICKS_HOST, DATABRICKS_HTTP_PATH, DATABRICKS_TOKEN, DATABRICKS_SCRATCH_SCHEMA -# The Databricks JDBC driver JAR must be available: -# Download from https://www.databricks.com/spark/jdbc-drivers-download -# and set DATABASECONNECTOR_JAR_FOLDER in .Renviron to its directory. -# CDM data must exist at healthverity_cc.cdm_healthverity_cc_all_v3910 -# -# Usage: -# Rscript examples/benchmark_run_r.R -# -# Output files (in benchmark_output/): -# r_results.csv -- per-cohort timing and status - -suppressPackageStartupMessages({ - library(dplyr) - library(PhenotypeLibrary) - library(CohortGenerator) - library(DatabaseConnector) -}) - -# --------------------------------------------------------------------------- -# Paths and credentials -# --------------------------------------------------------------------------- -script_path <- normalizePath( - sub("--file=", "", grep("--file=", commandArgs(trailingOnly = FALSE), value = TRUE)[1]) -) -REPO_ROOT <- dirname(dirname(script_path)) # examples/ -> repo root -OUTPUT_DIR <- file.path(REPO_ROOT, "benchmark_output") -if (!dir.exists(OUTPUT_DIR)) dir.create(OUTPUT_DIR, recursive = TRUE) - -# Load .Renviron from the repo root (supplements the user-level ~/.Renviron) -renviron_path <- file.path(REPO_ROOT, ".Renviron") -if (file.exists(renviron_path)) readRenviron(renviron_path) - -R_RESULTS_CSV <- file.path(OUTPUT_DIR, "r_results.csv") - -DB_HOST <- Sys.getenv("DATABRICKS_HOST") -DB_HTTP_PATH <- Sys.getenv("DATABRICKS_HTTP_PATH") -DB_TOKEN <- Sys.getenv("DATABRICKS_TOKEN") -SCRATCH_SCHEMA <- Sys.getenv("DATABRICKS_SCRATCH_SCHEMA") -CDM_SCHEMA <- "healthverity_cc.cdm_healthverity_cc_all_v3910" -COHORT_TABLE <- "cohort_r" - -for (var in c("DB_HOST", "DB_HTTP_PATH", "DB_TOKEN", "SCRATCH_SCHEMA")) { - if (get(var) == "") stop(sprintf("Environment variable %s is not set. 
Check .Renviron.", var)) -} - -cat(sprintf("Databricks host : %s\n", DB_HOST)) -cat(sprintf("CDM schema : %s\n", CDM_SCHEMA)) -cat(sprintf("Cohort schema : %s\n", SCRATCH_SCHEMA)) -cat(sprintf("Output directory : %s\n", OUTPUT_DIR)) - -# --------------------------------------------------------------------------- -# 1. Load PhenotypeLibrary -# --------------------------------------------------------------------------- -cat("\nLoading OHDSI PhenotypeLibrary...\n") -phenotype_log <- PhenotypeLibrary::getPhenotypeLog() -all_ids <- phenotype_log$cohortId -cat(sprintf(" %d cohort IDs found in phenotype log\n", length(all_ids))) - -cds <- PhenotypeLibrary::getPlCohortDefinitionSet(cohortIds = all_ids[1:40]) -cat(sprintf(" %d cohort definitions loaded (first 40)\n", nrow(cds))) - -# --------------------------------------------------------------------------- -# 2. Connect to Databricks -# --------------------------------------------------------------------------- -cat("\nConnecting to Databricks...\n") -conn_string <- paste0( - "jdbc:databricks://", DB_HOST, ":443/default;", - "transportMode=http;ssl=1;", - "httpPath=", DB_HTTP_PATH, ";", - "AuthMech=3;UID=token;PWD=", DB_TOKEN -) -conn_details <- DatabaseConnector::createConnectionDetails( - dbms = "spark", - connectionString = conn_string -) - -# Verify connection and CDM access -conn_check <- DatabaseConnector::connect(conn_details) -tryCatch({ - test <- DatabaseConnector::querySql( - conn_check, - sprintf("SELECT COUNT(*) AS n FROM %s.person", CDM_SCHEMA) - ) - cat(sprintf(" CDM verified: %s.person has %d rows\n", CDM_SCHEMA, test$n)) -}, error = function(e) { - DatabaseConnector::disconnect(conn_check) - stop(sprintf("Cannot access CDM at %s: %s", CDM_SCHEMA, conditionMessage(e))) -}) -DatabaseConnector::disconnect(conn_check) - -# --------------------------------------------------------------------------- -# 3. 
Create cohort tables (idempotent — drops and recreates) -# --------------------------------------------------------------------------- -cat("\nCreating cohort tables in", SCRATCH_SCHEMA, "...\n") -cohort_table_names <- CohortGenerator::getCohortTableNames(cohortTable = COHORT_TABLE) - -# Drop existing tables for a clean run -conn_drop <- DatabaseConnector::connect(conn_details) -for (tbl in unlist(cohort_table_names)) { - tryCatch( - DatabaseConnector::executeSql( - conn_drop, - sprintf("DROP TABLE IF EXISTS %s.%s", SCRATCH_SCHEMA, tbl), - reportOverallTime = FALSE - ), - error = function(e) NULL - ) -} -DatabaseConnector::disconnect(conn_drop) - -CohortGenerator::createCohortTables( - connectionDetails = conn_details, - cohortDatabaseSchema = SCRATCH_SCHEMA, - cohortTableNames = cohort_table_names, - incremental = FALSE -) - -# --------------------------------------------------------------------------- -# 4. Generate cohorts — single persistent connection -# --------------------------------------------------------------------------- -cat(sprintf("\nGenerating %d cohorts with R CohortGenerator...\n", nrow(cds))) - -all_stats <- NULL -t_start <- proc.time() -n_cohorts <- nrow(cds) - -# One connection for the entire run — avoids per-cohort reconnection overhead. -global_conn <- DatabaseConnector::connect(conn_details) - -for (cohort_i in seq_len(n_cohorts)) { - one_cds <- cds[cohort_i, ] - - # SqlRender (spark dialect) materialises #temp tables as real tables in - # tempEmulationSchema. Drop them before each cohort so they don't conflict. 
- for (tmp_tbl in c("Codesets", "qualified_events", "inclusion_events", - "included_events", "inclusion_rules", "best_events", - "cohort_rows", "final_cohort", - paste0("Inclusion_", 0:20))) { - tryCatch( - DatabaseConnector::executeSql( - global_conn, - sprintf("DROP TABLE IF EXISTS %s.%s", SCRATCH_SCHEMA, tmp_tbl), - reportOverallTime = FALSE - ), - error = function(e) NULL - ) - } - - cohort_stats <- tryCatch( - CohortGenerator::generateCohortSet( - connection = global_conn, - cdmDatabaseSchema = CDM_SCHEMA, - cohortDatabaseSchema = SCRATCH_SCHEMA, - tempEmulationSchema = SCRATCH_SCHEMA, - cohortTableNames = cohort_table_names, - cohortDefinitionSet = one_cds, - stopOnError = FALSE, - incremental = FALSE - ), - error = function(e) { - cat(sprintf(" Cohort %d/%d error: %s\n", cohort_i, n_cohorts, conditionMessage(e))) - NULL - } - ) - - if (!is.null(cohort_stats)) { - all_stats <- if (is.null(all_stats)) cohort_stats else rbind(all_stats, cohort_stats) - } - - cat(sprintf(" %d/%d cohorts processed\n", cohort_i, n_cohorts)) -} - -DatabaseConnector::disconnect(global_conn) - -generation_stats <- all_stats -elapsed <- (proc.time() - t_start)[["elapsed"]] -cat(sprintf(" Done in %.1f seconds\n", elapsed)) - -# --------------------------------------------------------------------------- -# 5. 
Summarise and write results -# --------------------------------------------------------------------------- -conn <- DatabaseConnector::connect(conn_details) -cohort_counts <- DatabaseConnector::querySql( - conn, - sprintf( - "SELECT cohort_definition_id, COUNT(*) AS row_count FROM %s.%s GROUP BY 1", - SCRATCH_SCHEMA, COHORT_TABLE - ) -) -DatabaseConnector::disconnect(conn) - -results <- generation_stats %>% - left_join(cohort_counts, by = c("cohortId" = "cohort_definition_id")) %>% - mutate( - row_count = coalesce(row_count, 0L), - generation_seconds = as.numeric(endTime - startTime, units = "secs"), - status = generationStatus - ) %>% - select(cohortId, cohortName, status, generation_seconds, row_count) - -write.csv(results, R_RESULTS_CSV, row.names = FALSE) - -n_complete <- sum(results$status == "COMPLETE") -n_failed <- sum(results$status == "FAILED") -cat(sprintf("\nR benchmark complete\n")) -cat(sprintf(" COMPLETE : %d\n", n_complete)) -cat(sprintf(" FAILED : %d\n", n_failed)) -cat(sprintf(" Total rows in %s : %d\n", COHORT_TABLE, sum(results$row_count))) -cat(sprintf(" Results written to %s\n", R_RESULTS_CSV)) diff --git a/tests/test_cohort_definition_set.py b/tests/test_cohort_definition_set.py index cd18148..91f1b06 100644 --- a/tests/test_cohort_definition_set.py +++ b/tests/test_cohort_definition_set.py @@ -277,7 +277,7 @@ def test_generate_cohort_set_continue_on_error(): call_count = 0 - def _failing_write_cohort(expression, *, cohort_id, **kwargs): + def _failing_write_cohort(*, compiled_relation, cohort_id, **kwargs): nonlocal call_count call_count += 1 if cohort_id == 1: @@ -285,7 +285,7 @@ def _failing_write_cohort(expression, *, cohort_id, **kwargs): # Delegate to real write_cohort for cohort 2 from circe.execution.api import write_cohort as real_write_cohort - real_write_cohort(expression, cohort_id=cohort_id, **kwargs) + real_write_cohort(compiled_relation=compiled_relation, cohort_id=cohort_id, **kwargs) cds = CohortDefinitionSet() 
cds.add(cohort_id=1, cohort_name="Bad", expression=_simple_expression()) @@ -318,7 +318,7 @@ def test_generate_cohort_set_stop_on_error(): from circe.execution.errors import ExecutionError - def _always_fail(expression, *, cohort_id, **kwargs): + def _always_fail(*, compiled_relation, cohort_id, **kwargs): raise ExecutionError("Always fail") cds = CohortDefinitionSet() @@ -356,6 +356,175 @@ def test_summarise_generation_results(): assert summary["FAILED"] == 1 +# --------------------------------------------------------------------------- +# Generation history table integration tests +# --------------------------------------------------------------------------- + + +def test_generate_cohort_set_history_table_populated(): + import pandas as pd + + ibis = pytest.importorskip("ibis") + _ = pytest.importorskip("duckdb") + + conn = ibis.duckdb.connect() + _seed_tables(conn, ibis) + + cds = CohortDefinitionSet() + cds.add(cohort_id=1, cohort_name="A", expression=_simple_expression()) + cds.add(cohort_id=2, cohort_name="B", expression=_simple_expression()) + + CHECKSUM_TABLE = "cohort_checksum_test" + + results = generate_cohort_set( + cds, + backend=conn, + cdm_schema="main", + cohort_table="cohort", + incremental=True, + checksum_table=CHECKSUM_TABLE, + ) + assert all(r.status == "COMPLETE" for r in results) + + history = conn.table(CHECKSUM_TABLE, database="main").execute() + assert not history.empty + assert "cohort_definition_id" in history.columns + assert "checksum" in history.columns + assert "status" in history.columns + assert "start_time" in history.columns + assert "end_time" in history.columns + + for _, row in history.iterrows(): + assert row["status"] in ("COMPLETE", "FAILED") + assert pd.to_datetime(row["end_time"]) >= pd.to_datetime(row["start_time"]) + + assert set(history["cohort_definition_id"]) == {1, 2} + assert all(history["status"] == "COMPLETE") + + +def test_generate_cohort_set_history_table_skip_no_duplicate(): + ibis = 
pytest.importorskip("ibis") + _ = pytest.importorskip("duckdb") + + conn = ibis.duckdb.connect() + _seed_tables(conn, ibis) + + cds = CohortDefinitionSet() + cds.add(cohort_id=1, cohort_name="A", expression=_simple_expression()) + cds.add(cohort_id=2, cohort_name="B", expression=_simple_expression()) + + CHECKSUM_TABLE = "cohort_checksum_skip_test" + + # Run 1: both COMPLETE → both get history entries + generate_cohort_set( + cds, + backend=conn, + cdm_schema="main", + cohort_table="cohort_skip", + incremental=True, + checksum_table=CHECKSUM_TABLE, + ) + after_first = conn.table(CHECKSUM_TABLE, database="main").execute() + assert len(after_first) == 2 # 2 history rows + + # Run 2: incremental, all should be SKIPPED → no new history entries + generate_cohort_set( + cds, + backend=conn, + cdm_schema="main", + cohort_table="cohort_skip", + incremental=True, + checksum_table=CHECKSUM_TABLE, + ) + after_second = conn.table(CHECKSUM_TABLE, database="main").execute() + assert len(after_second) == 2 # still 2 — no duplicates for SKIPPED + + +def test_generate_cohort_set_history_table_failed(): + ibis = pytest.importorskip("ibis") + _ = pytest.importorskip("duckdb") + + from unittest.mock import patch + + from circe.execution.errors import ExecutionError + + conn = ibis.duckdb.connect() + _seed_tables(conn, ibis) + + cds = CohortDefinitionSet() + cds.add(cohort_id=1, cohort_name="Bad", expression=_simple_expression()) + cds.add(cohort_id=2, cohort_name="Good", expression=_simple_expression()) + + CHECKSUM_TABLE = "cohort_checksum_fail_test" + + call_count = 0 + + def _failing_write(*, compiled_relation, cohort_id, **kwargs): + nonlocal call_count + call_count += 1 + if cohort_id == 1: + raise ExecutionError("Simulated failure for cohort 1") + from circe.execution.api import write_cohort as real_write_cohort + + real_write_cohort(compiled_relation=compiled_relation, cohort_id=cohort_id, **kwargs) + + with patch("circe.cohort_definition_set._generate.write_cohort", 
side_effect=_failing_write): + results = generate_cohort_set( + cds, + backend=conn, + cdm_schema="main", + cohort_table="cohort_fail", + incremental=True, + checksum_table=CHECKSUM_TABLE, + stop_on_error=False, + ) + + statuses = {r.cohort_id: r.status for r in results} + assert statuses[1] == "FAILED" + assert statuses[2] == "COMPLETE" + + history = conn.table(CHECKSUM_TABLE, database="main").execute() + history_statuses = dict(zip(history["cohort_definition_id"], history["status"])) + assert history_statuses[1] == "FAILED" + assert history_statuses[2] == "COMPLETE" + + +def test_load_generation_history(): + from circe.cohort_definition_set._checksum_store import load_generation_history + + ibis = pytest.importorskip("ibis") + _ = pytest.importorskip("duckdb") + + conn = ibis.duckdb.connect() + _seed_tables(conn, ibis) + + cds = CohortDefinitionSet() + cds.add(cohort_id=1, cohort_name="A", expression=_simple_expression()) + + CHECKSUM_TABLE = "cohort_history_test" + + generate_cohort_set( + cds, + backend=conn, + cdm_schema="main", + cohort_table="cohort_hist", + incremental=True, + checksum_table=CHECKSUM_TABLE, + ) + + history = load_generation_history(conn, schema="main", table_name=CHECKSUM_TABLE) + assert history is not None + assert not history.empty + assert "start_time" in history.columns + assert "end_time" in history.columns + assert "status" in history.columns + assert history.iloc[0]["status"] == "COMPLETE" + + # Non-existent table returns None + none_result = load_generation_history(conn, schema="main", table_name="nonexistent_table") + assert none_result is None + + def test_api_exports_cohort_definition_set(): import circe.api as api From 4b89967a916eea1f9de6fd0125af490bf72f2220 Mon Sep 17 00:00:00 2001 From: Jamie Gilbert Date: Thu, 14 May 2026 11:39:42 -0700 Subject: [PATCH 04/53] implementation of custom end era logic in ibis layer --- circe/execution/engine/custom_era.py | 149 ++++++++++++++++++++++ circe/execution/engine/end_strategy.py | 4 +- 
circe/execution/normalize/cohort.py | 6 +- circe/execution/normalize/end_strategy.py | 1 + tests/execution/test_api_ibis.py | 19 +-- tests/execution/test_error_messages.py | 12 +- 6 files changed, 165 insertions(+), 26 deletions(-) create mode 100644 circe/execution/engine/custom_era.py diff --git a/circe/execution/engine/custom_era.py b/circe/execution/engine/custom_era.py new file mode 100644 index 0000000..e9a93e3 --- /dev/null +++ b/circe/execution/engine/custom_era.py @@ -0,0 +1,149 @@ +from __future__ import annotations + +import ibis + +from ..plan.schema import PERSON_ID, START_DATE +from .end_strategy import _replace_end_date, attach_observation_bounds + + +def _compute_exposure_end_date(table, *, days_supply_override: int | None): + start = table["drug_exposure_start_date"].cast("date") + + if days_supply_override is not None: + return start + ibis.interval(days=days_supply_override) + + raw_end = ( + table["drug_exposure_end_date"].cast("date") + if "drug_exposure_end_date" in table.columns + else ibis.null().cast("date") + ) + days_supply = ( + table["days_supply"].cast("int64") if "days_supply" in table.columns else ibis.null().cast("int64") + ) + supply_end = start + days_supply.as_interval("D") + + return ibis.coalesce(raw_end, supply_end, start + ibis.interval(days=1)) + + +def _compute_eras(exposures, *, gap_days: int, offset: int): + padded = exposures.mutate(_padded_end=(exposures._exposure_end + ibis.interval(days=int(gap_days)))) + + ordering = [ + padded.start_date, + padded._padded_end.desc(), + padded._exposure_end.desc(), + ] + + cumulative_window = ibis.cumulative_window(group_by=padded.person_id, order_by=ordering) + ordered_window = ibis.window(group_by=padded.person_id, order_by=ordering) + + with_cummax = padded.mutate(_cummax_padded_end=padded._padded_end.max().over(cumulative_window)) + + with_prev = with_cummax.mutate(_prev_max=with_cummax._cummax_padded_end.lag().over(ordered_window)) + + marked = with_prev.mutate( + 
_is_new=ibis.ifelse( + with_prev._prev_max.isnull() | (with_prev._prev_max < with_prev.start_date), + ibis.literal(1, type="int64"), + ibis.literal(0, type="int64"), + ) + ) + + group_window = ibis.cumulative_window( + group_by=marked.person_id, + order_by=[ + marked.start_date, + marked._padded_end.desc(), + marked._exposure_end.desc(), + marked._is_new.desc(), + ], + ) + era_indexed = marked.mutate(_era_id=marked._is_new.sum().over(group_window)) + + collapsed = era_indexed.group_by(era_indexed.person_id, era_indexed._era_id).aggregate( + era_start_date=era_indexed.start_date.min(), + _max_exposure_end=era_indexed._exposure_end.max(), + ) + + return collapsed.select( + collapsed.person_id.cast("int64").name(PERSON_ID), + collapsed.era_start_date.cast("date").name("era_start_date"), + (collapsed._max_exposure_end + ibis.interval(days=int(offset))).cast("date").name("era_end_date"), + ) + + +def compute_drug_eras( + ctx, *, drug_codeset_id: int, gap_days: int, offset: int, days_supply_override: int | None +): + concept_ids = ctx.concept_ids_for_codeset(drug_codeset_id) + + if not concept_ids: + de = ctx.table("drug_exposure") + return de.filter(ibis.literal(False)).select( + de.person_id.cast("int64").name(PERSON_ID), + ibis.null().cast("date").name("era_start_date"), + ibis.null().cast("date").name("era_end_date"), + ) + + de = ctx.table("drug_exposure") + filtered = de.filter(de.drug_concept_id.isin(concept_ids)) + + prepared = filtered.select( + filtered.person_id.cast("int64").name("person_id"), + filtered.drug_exposure_start_date.cast("date").name("start_date"), + _compute_exposure_end_date(filtered, days_supply_override=days_supply_override).name("_exposure_end"), + ) + + return _compute_eras(prepared, gap_days=gap_days, offset=offset) + + +def apply_custom_era_strategy(events, strategy, ctx): + payload = strategy.payload + drug_codeset_id = payload["drug_codeset_id"] + gap_days = payload["gap_days"] + offset = payload["offset"] + days_supply_override = 
payload.get("days_supply_override") + + if drug_codeset_id is None: + with_bounds = attach_observation_bounds(events, ctx) + return _replace_end_date(events, with_bounds, with_bounds.op_end_date) + + eras = compute_drug_eras( + ctx, + drug_codeset_id=drug_codeset_id, + gap_days=gap_days, + offset=offset, + days_supply_override=days_supply_override, + ) + + eras_for_join = eras.select( + eras.person_id.name("_era_person_id"), + eras.era_start_date, + eras.era_end_date, + ) + + with_bounds = attach_observation_bounds(events, ctx) + + joined = with_bounds.left_join( + eras_for_join, + predicates=[ + with_bounds.person_id == eras_for_join._era_person_id, + with_bounds[START_DATE] >= eras_for_join.era_start_date, + with_bounds[START_DATE] <= eras_for_join.era_end_date, + ], + ) + + event_window = ibis.window( + group_by=joined.event_id, + order_by=[joined.era_end_date.desc()], + ) + ranked = joined.mutate(_rn=ibis.row_number().over(event_window)) + one_per_event = ranked.filter(ranked._rn == 0) + + effective_end = ibis.coalesce( + one_per_event.era_end_date, + one_per_event.op_end_date, + ) + final_end = ibis.least(effective_end, one_per_event.op_end_date) + + return _replace_end_date(events, one_per_event, final_end) diff --git a/circe/execution/engine/end_strategy.py b/circe/execution/engine/end_strategy.py index a099985..4b8e5b9 100644 --- a/circe/execution/engine/end_strategy.py +++ b/circe/execution/engine/end_strategy.py @@ -64,7 +64,9 @@ def apply_end_strategy(events, strategy, ctx): return _replace_end_date(events, with_bounds, end_date_expr) if strategy.kind == "custom_era": - raise UnsupportedFeatureError("Ibis executor end-strategy error: custom_era is not supported.") + from .custom_era import apply_custom_era_strategy + + return apply_custom_era_strategy(events, strategy, ctx) # Fallback: preserve default semantics of op_end_date clipping. 
return _replace_end_date(events, with_bounds, with_bounds.op_end_date) diff --git a/circe/execution/normalize/cohort.py b/circe/execution/normalize/cohort.py index b2f657f..61765b4 100644 --- a/circe/execution/normalize/cohort.py +++ b/circe/execution/normalize/cohort.py @@ -3,7 +3,7 @@ from ...cohortdefinition import CohortExpression from ...vocabulary.concept import ConceptSet from .._dataclass import frozen_slots_dataclass -from ..errors import ExecutionNormalizationError, UnsupportedFeatureError +from ..errors import ExecutionNormalizationError from .collapse import NormalizedCollapseSettings, normalize_collapse_settings from .criteria import NormalizedCriterion, normalize_criterion from .end_strategy import NormalizedEndStrategy, normalize_end_strategy @@ -149,10 +149,6 @@ def normalize_cohort( ) normalized_end_strategy = normalize_end_strategy(expression.end_strategy) - if normalized_end_strategy is not None and normalized_end_strategy.kind == "custom_era": - raise UnsupportedFeatureError( - "Ibis executor normalization error: custom_era end strategy is not supported." 
- ) return NormalizedCohort( title=expression.title, diff --git a/circe/execution/normalize/end_strategy.py b/circe/execution/normalize/end_strategy.py index 62ff666..8e03409 100644 --- a/circe/execution/normalize/end_strategy.py +++ b/circe/execution/normalize/end_strategy.py @@ -32,6 +32,7 @@ def normalize_end_strategy( "drug_codeset_id": value.drug_codeset_id, "offset": int(value.offset), "gap_days": int(value.gap_days), + "days_supply_override": value.days_supply_override, }, ) return NormalizedEndStrategy(kind="end_strategy", payload={}) diff --git a/tests/execution/test_api_ibis.py b/tests/execution/test_api_ibis.py index ef0a73e..db55e52 100644 --- a/tests/execution/test_api_ibis.py +++ b/tests/execution/test_api_ibis.py @@ -26,8 +26,7 @@ VisitDetail, VisitOccurrence, ) -from circe.cohortdefinition.core import CustomEraStrategy, NumericRange -from circe.execution.errors import UnsupportedFeatureError +from circe.cohortdefinition.core import NumericRange from circe.vocabulary import Concept, ConceptSet, ConceptSetExpression, ConceptSetItem @@ -1213,10 +1212,12 @@ def test_build_cohort_location_region_keeps_repeated_location_history_rows(): assert sorted(result.start_date.astype(str).tolist()) == ["2020-01-01", "2020-02-01"] -def test_build_cohort_rejects_unsupported_features(): - expression = CohortExpression( - primary_criteria=PrimaryCriteria(criteria_list=[ConditionOccurrence()]), - end_strategy=CustomEraStrategy(drug_codeset_id=1, gap_days=30, offset=0), - ) - with pytest.raises(UnsupportedFeatureError, match="custom_era"): - _ = build_cohort(expression, backend=object(), cdm_schema="main") +def test_build_cohort_rejects_unsupported_criteria(): + """Unsupported base criteria type is rejected at normalization time.""" + from circe.cohortdefinition.criteria import Criteria as RawCriteria + from circe.execution.errors import UnsupportedCriterionError + + with pytest.raises(UnsupportedCriterionError): + from circe.execution.normalize.criteria import 
normalize_criterion + + normalize_criterion(RawCriteria()) diff --git a/tests/execution/test_error_messages.py b/tests/execution/test_error_messages.py index 80133b4..8e71481 100644 --- a/tests/execution/test_error_messages.py +++ b/tests/execution/test_error_messages.py @@ -14,7 +14,7 @@ Occurrence, PrimaryCriteria, ) -from circe.cohortdefinition.core import CustomEraStrategy, NumericRange +from circe.cohortdefinition.core import NumericRange from circe.execution.errors import CompilationError, UnsupportedCriterionError, UnsupportedFeatureError from circe.execution.normalize.criteria import normalize_criterion from circe.vocabulary import Concept, ConceptSet, ConceptSetExpression, ConceptSetItem @@ -55,16 +55,6 @@ def _concept_set(set_id: int, concept_id: int) -> ConceptSet: ) -def test_error_message_for_custom_era_end_strategy(): - expression = CohortExpression( - primary_criteria=PrimaryCriteria(criteria_list=[ConditionOccurrence()]), - end_strategy=CustomEraStrategy(drug_codeset_id=1, gap_days=30, offset=0), - ) - - with pytest.raises(UnsupportedFeatureError, match="custom_era end strategy"): - _ = build_cohort(expression, backend=object(), cdm_schema="main") - - def test_error_message_for_unsupported_criterion_type(): with pytest.raises( UnsupportedCriterionError, From 09748f2fca4a5e3370dc50a7a7627fdd60c86463 Mon Sep 17 00:00:00 2001 From: Jamie Gilbert Date: Thu, 14 May 2026 12:48:25 -0700 Subject: [PATCH 05/53] more tests around custom era logic for parity with java --- tests/execution/test_custom_era.py | 566 +++++++++++++++++++++++++++++ 1 file changed, 566 insertions(+) create mode 100644 tests/execution/test_custom_era.py diff --git a/tests/execution/test_custom_era.py b/tests/execution/test_custom_era.py new file mode 100644 index 0000000..def6496 --- /dev/null +++ b/tests/execution/test_custom_era.py @@ -0,0 +1,566 @@ +from __future__ import annotations + +from datetime import date + +import pytest + +from circe.api import build_cohort +from 
circe.cohortdefinition import ( + CohortExpression, + ConditionOccurrence, + DrugExposure, + PrimaryCriteria, +) +from circe.cohortdefinition.core import CustomEraStrategy +from circe.vocabulary import Concept, ConceptSet, ConceptSetExpression, ConceptSetItem + + +def _make_concept_set(set_id: int, concept_id: int) -> ConceptSet: + return ConceptSet( + id=set_id, + expression=ConceptSetExpression( + items=[ConceptSetItem(concept=Concept(conceptId=concept_id))] + ), + ) + + +def _seed_common_tables(conn, ibis): + conn.create_table( + "person", + obj=ibis.memtable( + { + "person_id": [1], + "year_of_birth": [1980], + "gender_concept_id": [8507], + } + ), + overwrite=True, + ) + conn.create_table( + "observation_period", + obj=ibis.memtable( + { + "person_id": [1], + "observation_period_id": [10], + "observation_period_start_date": [date(2019, 1, 1)], + "observation_period_end_date": [date(2021, 12, 31)], + } + ), + overwrite=True, + ) + + +def test_custom_era_merges_drugs_within_gap(): + """Drug exposures within gap_days merge into one era; cohort end_date reflects it.""" + ibis = pytest.importorskip("ibis") + _ = pytest.importorskip("duckdb") + + conn = ibis.duckdb.connect() + _seed_common_tables(conn, ibis) + + conn.create_table( + "drug_exposure", + obj=ibis.memtable( + { + "person_id": [1, 1], + "drug_exposure_id": [1, 2], + "drug_concept_id": [222, 222], + "drug_exposure_start_date": [date(2020, 1, 1), date(2020, 2, 1)], + "drug_exposure_end_date": [date(2020, 1, 31), date(2020, 3, 3)], + "days_supply": [0, 0], + } + ), + overwrite=True, + ) + conn.create_table( + "condition_occurrence", + obj=ibis.memtable( + { + "person_id": [1], + "condition_occurrence_id": [100], + "condition_concept_id": [111], + "condition_start_date": [date(2020, 1, 1)], + "condition_end_date": [date(2020, 1, 1)], + "visit_occurrence_id": [10], + } + ), + overwrite=True, + ) + + expression = CohortExpression( + concept_sets=[ + _make_concept_set(1, 111), + _make_concept_set(2, 222), + ], 
+ primary_criteria=PrimaryCriteria( + criteria_list=[ConditionOccurrence(codeset_id=1)] + ), + end_strategy=CustomEraStrategy(drug_codeset_id=2, gap_days=30, offset=0), + ) + + result = build_cohort(expression, backend=conn, cdm_schema="main").execute() + + assert len(result) == 1 + assert str(result.iloc[0]["start_date"])[:10] == "2020-01-01" + # exp 1: end=2020-01-31, exp 2: end=2020-03-03 + # gap = 1 <= 30 -> merged era: start=2020-01-01, end=2020-03-03 + assert str(result.iloc[0]["end_date"])[:10] == "2020-03-03" + + +def test_custom_era_no_merge_across_large_gap(): + """Drug exposures beyond gap_days form separate eras; cohort uses nearest era.""" + ibis = pytest.importorskip("ibis") + _ = pytest.importorskip("duckdb") + + conn = ibis.duckdb.connect() + _seed_common_tables(conn, ibis) + + conn.create_table( + "drug_exposure", + obj=ibis.memtable( + { + "person_id": [1, 1], + "drug_exposure_id": [1, 2], + "drug_concept_id": [222, 222], + "drug_exposure_start_date": [date(2020, 1, 1), date(2020, 2, 1)], + "drug_exposure_end_date": [date(2020, 1, 6), date(2020, 3, 3)], + "days_supply": [0, 0], + } + ), + overwrite=True, + ) + conn.create_table( + "condition_occurrence", + obj=ibis.memtable( + { + "person_id": [1], + "condition_occurrence_id": [100], + "condition_concept_id": [111], + "condition_start_date": [date(2020, 1, 1)], + "condition_end_date": [date(2020, 1, 1)], + "visit_occurrence_id": [10], + } + ), + overwrite=True, + ) + + expression = CohortExpression( + concept_sets=[ + _make_concept_set(1, 111), + _make_concept_set(2, 222), + ], + primary_criteria=PrimaryCriteria( + criteria_list=[ConditionOccurrence(codeset_id=1)] + ), + end_strategy=CustomEraStrategy(drug_codeset_id=2, gap_days=5, offset=0), + ) + + result = build_cohort(expression, backend=conn, cdm_schema="main").execute() + + assert len(result) == 1 + assert str(result.iloc[0]["start_date"])[:10] == "2020-01-01" + # exp 1: end=2020-01-06, exp 2: end=2020-03-03 + # gap = 26 > 5 -> separate eras 
+ # cohort start 2020-01-01 matches era 1: end 2020-01-06 + assert str(result.iloc[0]["end_date"])[:10] == "2020-01-06" + + +def test_custom_era_offset_applied(): + """Offset days are added to the drug era end_date.""" + ibis = pytest.importorskip("ibis") + _ = pytest.importorskip("duckdb") + + conn = ibis.duckdb.connect() + _seed_common_tables(conn, ibis) + + conn.create_table( + "drug_exposure", + obj=ibis.memtable( + { + "person_id": [1], + "drug_exposure_id": [1], + "drug_concept_id": [222], + "drug_exposure_start_date": [date(2020, 1, 1)], + "drug_exposure_end_date": [date(2020, 1, 10)], + "days_supply": [0], + } + ), + overwrite=True, + ) + conn.create_table( + "condition_occurrence", + obj=ibis.memtable( + { + "person_id": [1], + "condition_occurrence_id": [100], + "condition_concept_id": [111], + "condition_start_date": [date(2020, 1, 1)], + "condition_end_date": [date(2020, 1, 1)], + "visit_occurrence_id": [10], + } + ), + overwrite=True, + ) + + expression = CohortExpression( + concept_sets=[ + _make_concept_set(1, 111), + _make_concept_set(2, 222), + ], + primary_criteria=PrimaryCriteria( + criteria_list=[ConditionOccurrence(codeset_id=1)] + ), + end_strategy=CustomEraStrategy(drug_codeset_id=2, gap_days=30, offset=7), + ) + + result = build_cohort(expression, backend=conn, cdm_schema="main").execute() + + assert len(result) == 1 + assert str(result.iloc[0]["start_date"])[:10] == "2020-01-01" + # drug effective end: 2020-01-10 (end_date override) + # era: start=2020-01-01, end=2020-01-10+7=2020-01-17 + assert str(result.iloc[0]["end_date"])[:10] == "2020-01-17" + + +def test_custom_era_no_matching_drugs(): + """No matching drug exposures -> fall back to observation_period_end_date.""" + ibis = pytest.importorskip("ibis") + _ = pytest.importorskip("duckdb") + + conn = ibis.duckdb.connect() + _seed_common_tables(conn, ibis) + + conn.create_table( + "condition_occurrence", + obj=ibis.memtable( + { + "person_id": [1], + "condition_occurrence_id": [100], + 
"condition_concept_id": [111], + "condition_start_date": [date(2020, 1, 15)], + "condition_end_date": [date(2020, 1, 15)], + "visit_occurrence_id": [10], + } + ), + overwrite=True, + ) + conn.create_table( + "drug_exposure", + obj=ibis.memtable( + { + "person_id": [], + "drug_exposure_id": [], + "drug_concept_id": [], + "drug_exposure_start_date": [], + "drug_exposure_end_date": [], + "days_supply": [], + } + ), + overwrite=True, + ) + + expression = CohortExpression( + concept_sets=[ + _make_concept_set(1, 111), + _make_concept_set(2, 999), + ], + primary_criteria=PrimaryCriteria( + criteria_list=[ConditionOccurrence(codeset_id=1)] + ), + end_strategy=CustomEraStrategy(drug_codeset_id=2, gap_days=30, offset=0), + ) + + result = build_cohort(expression, backend=conn, cdm_schema="main").execute() + + assert len(result) == 1 + assert str(result.iloc[0]["start_date"])[:10] == "2020-01-15" + # No matching drugs -> end_date = observation_period_end_date = 2021-12-31 + assert str(result.iloc[0]["end_date"])[:10] == "2021-12-31" + + +def test_custom_era_with_drug_exposure_as_primary(): + """Custom era works with DrugExposure as the primary criterion.""" + ibis = pytest.importorskip("ibis") + _ = pytest.importorskip("duckdb") + + conn = ibis.duckdb.connect() + _seed_common_tables(conn, ibis) + + conn.create_table( + "drug_exposure", + obj=ibis.memtable( + { + "person_id": [1, 1], + "drug_exposure_id": [1, 2], + "drug_concept_id": [222, 222], + "drug_exposure_start_date": [date(2020, 1, 1), date(2020, 2, 1)], + "drug_exposure_end_date": [date(2020, 1, 31), date(2020, 3, 3)], + "days_supply": [0, 0], + } + ), + overwrite=True, + ) + + expression = CohortExpression( + concept_sets=[_make_concept_set(1, 222)], + primary_criteria=PrimaryCriteria( + criteria_list=[DrugExposure(codeset_id=1)] + ), + end_strategy=CustomEraStrategy(drug_codeset_id=1, gap_days=30, offset=0), + ) + + result = build_cohort(expression, backend=conn, cdm_schema="main").execute() + + # With 
primary_limit_type="all", both drug exposures produce cohort entries. + # Both entries get end_date from the merged drug era (2020-03-03). + assert len(result) == 2 + start_dates = sorted(result["start_date"].astype(str).tolist()) + assert start_dates == ["2020-01-01", "2020-02-01"] + assert all( + str(d)[:10] == "2020-03-03" for d in result["end_date"] + ) + + +def test_compute_drug_eras_matches_java_sql_logic(): + """compute_drug_eras ibis output matches equivalent raw SQL (Java template translated to DuckDB).""" + ibis = pytest.importorskip("ibis") + _ = pytest.importorskip("duckdb") + + from types import SimpleNamespace + + from circe.execution.engine.custom_era import compute_drug_eras + + conn = ibis.duckdb.connect() + _seed_common_tables(conn, ibis) + + # 5 exposures for person 1, with gap_days=7, offset=3. + # Exposure end_dates are set explicitly so COALESCE is predictable. + conn.create_table( + "drug_exposure", + obj=ibis.memtable( + { + "person_id": [1, 1, 1, 1, 1], + "drug_exposure_id": [1, 2, 3, 4, 5], + "drug_concept_id": [222, 222, 222, 222, 222], + "drug_exposure_start_date": [ + date(2020, 1, 1), + date(2020, 1, 10), + date(2020, 3, 1), + date(2020, 3, 20), + date(2020, 5, 1), + ], + "drug_exposure_end_date": [ + date(2020, 1, 6), + date(2020, 2, 9), + date(2020, 3, 21), + date(2020, 3, 30), + date(2020, 5, 15), + ], + "days_supply": [0, 0, 0, 0, 0], + } + ), + overwrite=True, + ) + + ctx = SimpleNamespace( + table=lambda name: conn.table(name), + concept_ids_for_codeset=lambda cid: (222,) if cid == 2 else (), + ) + + # --- ibis path --- + ibis_result = compute_drug_eras( + ctx, drug_codeset_id=2, gap_days=7, offset=3, days_supply_override=None + ).execute() + ibis_result = ibis_result.sort_values(["person_id", "era_start_date"]).reset_index(drop=True) + + # --- raw SQL path (Java template core logic, DuckDB dialect) --- + # Java template uses: COALESCE(end, start+days_supply, start+1) + # then pads by (gap_days + offset), groups by 
cumulative-max-over-preceding, + # and finally subtracts gap_days from max(end) to leave only offset. + gap = 7 + off = 3 + + sql = f""" + WITH exposures AS ( + SELECT + person_id::INTEGER AS person_id, + drug_exposure_start_date::DATE AS start_date, + COALESCE( + drug_exposure_end_date::DATE, + drug_exposure_start_date::DATE + days_supply::INTEGER, + drug_exposure_start_date::DATE + 1 + ) + {gap + off} AS padded_end + FROM drug_exposure + WHERE drug_concept_id IN (222) + ), + with_prev_max AS ( + SELECT *, + MAX(padded_end) OVER ( + PARTITION BY person_id ORDER BY start_date, padded_end DESC + ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING + ) AS prev_max + FROM exposures + ), + with_markers AS ( + SELECT *, + CASE WHEN prev_max IS NULL OR prev_max < start_date THEN 1 ELSE 0 END AS is_new + FROM with_prev_max + ), + with_era AS ( + SELECT *, + SUM(is_new) OVER ( + PARTITION BY person_id + ORDER BY start_date, is_new DESC, padded_end DESC + ) AS era_id + FROM with_markers + ) + SELECT + person_id, + MIN(start_date)::DATE AS era_start_date, + (MAX(padded_end) - {gap})::DATE AS era_end_date + FROM with_era + GROUP BY person_id, era_id + ORDER BY person_id, MIN(start_date) + """ + + raw_conn = conn.con + sql_result = raw_conn.sql(sql).fetchdf() + + # --- compare --- + pd = pytest.importorskip("pandas") + pd.testing.assert_frame_equal( + ibis_result, + sql_result, + check_dtype=False, + check_column_type=False, + ) + + +def test_full_cohort_custom_era_matches_sql_end_dates(): + """Full cohort pipeline with CustomEraStrategy produces same end_dates as raw SQL.""" + ibis = pytest.importorskip("ibis") + _ = pytest.importorskip("duckdb") + + conn = ibis.duckdb.connect() + _seed_common_tables(conn, ibis) + + conn.create_table( + "drug_exposure", + obj=ibis.memtable( + { + "person_id": [1, 1], + "drug_exposure_id": [1, 2], + "drug_concept_id": [222, 222], + "drug_exposure_start_date": [date(2020, 1, 1), date(2020, 2, 1)], + "drug_exposure_end_date": [date(2020, 1, 31), 
date(2020, 3, 3)], + "days_supply": [0, 0], + } + ), + overwrite=True, + ) + conn.create_table( + "condition_occurrence", + obj=ibis.memtable( + { + "person_id": [1], + "condition_occurrence_id": [100], + "condition_concept_id": [111], + "condition_start_date": [date(2020, 1, 1)], + "condition_end_date": [date(2020, 1, 1)], + "visit_occurrence_id": [10], + } + ), + overwrite=True, + ) + + expression = CohortExpression( + concept_sets=[ + _make_concept_set(1, 111), + _make_concept_set(2, 222), + ], + primary_criteria=PrimaryCriteria( + criteria_list=[ConditionOccurrence(codeset_id=1)] + ), + end_strategy=CustomEraStrategy(drug_codeset_id=2, gap_days=30, offset=0), + ) + + # --- ibis pipeline --- + cohort_result = build_cohort(expression, backend=conn, cdm_schema="main").execute() + + # --- raw SQL pipeline (Java CUSTOM_ERA_STRATEGY_TEMPLATE logic, DuckDB dialect) --- + # Computes drug eras, then matches era end_dates to events via start_date overlap. + sql = f""" + WITH drug_eras AS ( + SELECT + person_id, + MIN(start_date) AS era_start_date, + MAX(padded_end) - 30 AS era_end_date + FROM ( + SELECT + person_id, start_date, padded_end, + SUM(is_new) OVER ( + PARTITION BY person_id + ORDER BY start_date, is_new DESC, padded_end DESC + ) AS era_id + FROM ( + SELECT + person_id, start_date, padded_end, + CASE WHEN prev_max IS NULL OR prev_max < start_date THEN 1 ELSE 0 END AS is_new + FROM ( + SELECT + person_id, start_date, padded_end, + MAX(padded_end) OVER ( + PARTITION BY person_id ORDER BY start_date, padded_end DESC + ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING + ) AS prev_max + FROM ( + SELECT + de.person_id, + de.drug_exposure_start_date::DATE AS start_date, + COALESCE( + de.drug_exposure_end_date::DATE, + de.drug_exposure_start_date::DATE + de.days_supply::INTEGER, + de.drug_exposure_start_date::DATE + 1 + ) + 30 AS padded_end + FROM drug_exposure de + WHERE de.drug_concept_id = 222 + ) raw_ends + ) maxes + ) marked + ) indexed + GROUP BY person_id, era_id 
+ ), + events_with_obs AS ( + SELECT + e.person_id, + e.condition_occurrence_id AS event_id, + e.condition_start_date::DATE AS start_date, + op.observation_period_end_date::DATE AS op_end_date + FROM condition_occurrence e + JOIN observation_period op ON e.person_id = op.person_id + ) + SELECT + ev.person_id, + ev.start_date, + LEAST( + COALESCE(MAX(er.era_end_date), ev.op_end_date), + ev.op_end_date + )::DATE AS end_date + FROM events_with_obs ev + LEFT JOIN drug_eras er + ON ev.person_id = er.person_id + AND ev.start_date BETWEEN er.era_start_date AND er.era_end_date + GROUP BY ev.person_id, ev.event_id, ev.start_date, ev.op_end_date + ORDER BY ev.person_id, ev.start_date + """ + + sql_result = conn.con.sql(sql).fetchdf() + + # Compare end_dates and start_dates after sorting + ibis_ends = sorted(cohort_result["end_date"].astype(str).tolist()) + sql_ends = sorted(sql_result["end_date"].astype(str).tolist()) + assert ibis_ends == sql_ends + + ibis_starts = sorted(cohort_result["start_date"].astype(str).tolist()) + sql_starts = sorted(sql_result["start_date"].astype(str).tolist()) + assert ibis_starts == sql_starts From 5c990602b15838c4922e158609cdd3f506c07e63 Mon Sep 17 00:00:00 2001 From: Jamie Gilbert Date: Thu, 14 May 2026 18:33:59 -0700 Subject: [PATCH 06/53] Fixed performance issues with ibis compilation parse trees and improved overall performance. 
Uncovered custom eras bug --- benchmarks/benchmark_analyze_duckdb.py | 31 +-- benchmarks/benchmark_run_py.py | 10 +- benchmarks/compare_cohort_outputs.py | 275 +++++++++++++++++++++ circe/cohort_definition_set/_generate.py | 31 ++- circe/execution/api.py | 10 +- circe/execution/engine/cohort.py | 86 +++++-- circe/execution/engine/group_operators.py | 31 ++- circe/execution/engine/primary.py | 18 +- tests/execution/test_custom_era.py | 34 +-- tests/execution/test_phenotype_failures.py | 84 +++++++ tests/execution/test_union_scaling.py | 117 +++++++++ 11 files changed, 651 insertions(+), 76 deletions(-) create mode 100644 benchmarks/compare_cohort_outputs.py create mode 100644 tests/execution/test_phenotype_failures.py create mode 100644 tests/execution/test_union_scaling.py diff --git a/benchmarks/benchmark_analyze_duckdb.py b/benchmarks/benchmark_analyze_duckdb.py index 09cf5cf..f69c19e 100644 --- a/benchmarks/benchmark_analyze_duckdb.py +++ b/benchmarks/benchmark_analyze_duckdb.py @@ -17,6 +17,7 @@ import ibis import pandas as pd +from compare_cohort_outputs import compare_cohort_outputs, print_comparison_report REPO_ROOT = Path(__file__).resolve().parent.parent OUTPUT_DIR = REPO_ROOT / "benchmark_output" @@ -55,10 +56,9 @@ def print_coverage(label: str, df: pd.DataFrame) -> None: def print_timing(label: str, df: pd.DataFrame) -> None: - if _has_status(df): - complete = df[df["status"] == "COMPLETE"] - else: - complete = df # checksum table — all rows are COMPLETE + complete = ( + df[df["status"] == "COMPLETE"] if _has_status(df) else df + ) # checksum table — all rows are COMPLETE if complete.empty: print(f" {label}: no completed cohorts to report timing") return @@ -72,8 +72,9 @@ def print_timing(label: str, df: pd.DataFrame) -> None: print(f" Max : {secs.max():.4f}s") -def cross_validate(label: str, csv_df: pd.DataFrame, backend: ibis.BaseBackend, - cohort_table: str, checksum_table: str) -> None: +def cross_validate( + label: str, csv_df: pd.DataFrame, 
backend: ibis.BaseBackend, cohort_table: str, checksum_table: str +) -> None: """Read the persisted checksum table and compare with the CSV.""" try: history = backend.table(checksum_table, database="main").execute() @@ -85,10 +86,7 @@ def cross_validate(label: str, csv_df: pd.DataFrame, backend: ibis.BaseBackend, print(f" {label} cross-validation: checksum table is empty") return - if _has_status(csv_df): - complete_csv = csv_df[csv_df["status"] == "COMPLETE"] - else: - complete_csv = csv_df + complete_csv = csv_df[csv_df["status"] == "COMPLETE"] if _has_status(csv_df) else csv_df if complete_csv.empty: return @@ -113,8 +111,7 @@ def cross_validate(label: str, csv_df: pd.DataFrame, backend: ibis.BaseBackend, print(f" Delta : {delta:.4f}s {'✓' if delta < 1.0 else '✗'}") -def print_cohort_row_counts(label: str, backend: ibis.BaseBackend, - cohort_table: str) -> None: +def print_cohort_row_counts(label: str, backend: ibis.BaseBackend, cohort_table: str) -> None: """Print row count summary from the cohort output table.""" try: rows = backend.table(cohort_table, database="main").execute() @@ -131,8 +128,8 @@ def print_cohort_row_counts(label: str, backend: ibis.BaseBackend, def compare_shared(label_prefix: str, r_df: pd.DataFrame, py_df: pd.DataFrame) -> None: """Compare timing for cohorts present in both runs.""" - r_complete = (r_df[r_df["status"] == "COMPLETE"].copy() if _has_status(r_df) else r_df.copy()) - py_complete = (py_df[py_df["status"] == "COMPLETE"].copy() if _has_status(py_df) else py_df.copy()) + r_complete = r_df[r_df["status"] == "COMPLETE"].copy() if _has_status(r_df) else r_df.copy() + py_complete = py_df[py_df["status"] == "COMPLETE"].copy() if _has_status(py_df) else py_df.copy() if r_complete.empty or py_complete.empty: return @@ -209,6 +206,12 @@ def main() -> None: print("\nTable 5 — R vs Python shared-cohort comparison") compare_shared("=>", r_df, py_df) + # ── Row-level cohort output comparison ────────────────────────────── + if 
DUCKDB_PATH.exists(): + backend = ibis.duckdb.connect(str(DUCKDB_PATH)) + report = compare_cohort_outputs(backend) + print_comparison_report(report) + print(f"\n{'=' * 60}") print("Analysis complete") print(f"{'=' * 60}\n") diff --git a/benchmarks/benchmark_run_py.py b/benchmarks/benchmark_run_py.py index 8fd3abf..9139842 100644 --- a/benchmarks/benchmark_run_py.py +++ b/benchmarks/benchmark_run_py.py @@ -25,15 +25,15 @@ import ibis import pandas as pd +from circe.cohort_definition_set import CohortDefinitionSet, generate_cohort_set +from circe.cohortdefinition import CohortExpression + logging.basicConfig( level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s", datefmt="%H:%M:%S", ) -from circe.cohort_definition_set import CohortDefinitionSet, generate_cohort_set -from circe.cohortdefinition import CohortExpression - REPO_ROOT = Path(__file__).resolve().parent.parent OUTPUT_DIR = REPO_ROOT / "benchmark_output" JSON_DIR = OUTPUT_DIR / "phenotype_jsons" @@ -75,9 +75,7 @@ def main() -> None: # ── 2. Connect to DuckDB ───────────────────────────────────────────── if not DUCKDB_PATH.exists(): - raise FileNotFoundError( - f"{DUCKDB_PATH} not found. Run 'Rscript benchmarks/benchmark_run_r.R' first." - ) + raise FileNotFoundError(f"{DUCKDB_PATH} not found. Run 'Rscript benchmarks/benchmark_run_r.R' first.") print(f"Connecting to DuckDB: {DUCKDB_PATH}") backend = ibis.duckdb.connect(str(DUCKDB_PATH)) diff --git a/benchmarks/compare_cohort_outputs.py b/benchmarks/compare_cohort_outputs.py new file mode 100644 index 0000000..359e310 --- /dev/null +++ b/benchmarks/compare_cohort_outputs.py @@ -0,0 +1,275 @@ +#!/usr/bin/env python3 +"""Cross-implementation cohort output validator. + +Compares the row-level output of two cohort generation implementations +(e.g. R/CohortGenerator vs Python/circe) to verify they produce identical +``(subject_id, cohort_start_date, cohort_end_date)`` rows for each shared +cohort. 
+ +Usage:: + + import ibis + from benchmarks.compare_cohort_outputs import compare_cohort_outputs, print_comparison_report + + backend = ibis.duckdb.connect("benchmark_output/eunomia.duckdb") + report = compare_cohort_outputs( + backend, r_table="cohort", py_table="cohort_py", + ) + print_comparison_report(report) + + # Optionally validate programmatically: + assert report.n_cohorts_matched_exactly == report.n_cohorts_shared +""" + +from __future__ import annotations + +from dataclasses import dataclass, field + +import pandas as pd + +from circe.execution.typing import IbisBackendLike + + +@dataclass +class CohortMatchSummary: + """Per-cohort row-level comparison result.""" + + cohort_id: int + """Cohort definition identifier.""" + + n_r: int + """Row count in the reference (R) table.""" + + n_py: int + """Row count in the Python table.""" + + n_matched: int + """Rows found identically in both tables.""" + + n_only_r: int + """Rows present only in the reference (R) table.""" + + n_only_py: int + """Rows present only in the Python table.""" + + sample_only_r: list[tuple[int, str, str]] = field(default_factory=list) + """Up to 3 sample rows from the reference table not found in Python.""" + + sample_only_py: list[tuple[int, str, str]] = field(default_factory=list) + """Up to 3 sample rows from the Python table not found in the reference.""" + + @property + def is_exact_match(self) -> bool: + """True when the two implementations produce identical row sets.""" + return self.n_only_r == 0 and self.n_only_py == 0 + + @property + def pass_ratio(self) -> float: + """Fraction of rows found in both implementations (0-1).""" + denom = max(self.n_r, self.n_py) + return self.n_matched / denom if denom > 0 else 1.0 + + +@dataclass +class CohortComparisonReport: + """Aggregate row-level comparison across implementations.""" + + per_cohort: list[CohortMatchSummary] + """Per-cohort comparison results.""" + + n_cohorts_shared: int + """Number of cohorts present in both tables.""" + 
+ n_cohorts_matched_exactly: int + """Number of cohorts with zero row-level differences.""" + + total_r_rows: int + """Total row count in the reference table (all compared cohorts).""" + + total_py_rows: int + """Total row count in the Python table (all compared cohorts).""" + + total_matched: int + """Total matched rows across all compared cohorts.""" + + total_only_r: int + """Total rows only in the reference table.""" + + total_only_py: int + """Total rows only in the Python table.""" + + @property + def exact_match_pct(self) -> float: + """Percentage of shared cohorts that match exactly.""" + return self.n_cohorts_matched_exactly / self.n_cohorts_shared * 100 if self.n_cohorts_shared else 0.0 + + +def _read_cohort_table( + backend: IbisBackendLike, + table_name: str, + schema: str | None, + label: str, +) -> pd.DataFrame | None: + """Read a cohort output table, cast columns to canonical types, return a DataFrame.""" + try: + raw = backend.table(table_name, database=schema).execute() + except Exception: + print(f" [WARN] {label} table '{table_name}' not found") + return None + + if raw.empty: + return None + + df = pd.DataFrame( + { + "cohort_definition_id": pd.to_numeric(raw["cohort_definition_id"], errors="coerce") + .astype("int64"), + "subject_id": pd.to_numeric(raw["subject_id"], errors="coerce").astype("int64"), + "cohort_start_date": pd.to_datetime(raw["cohort_start_date"], errors="coerce").dt.date, + "cohort_end_date": pd.to_datetime(raw["cohort_end_date"], errors="coerce").dt.date, + } + ) + return df.drop_duplicates().dropna() + + +def _compare_single_cohort( + cohort_id: int, + r_rows: pd.DataFrame, + py_rows: pd.DataFrame, +) -> CohortMatchSummary: + """Compare row-level output for a single cohort.""" + key_cols = ["subject_id", "cohort_start_date", "cohort_end_date"] + + r_set = tuple( + tuple(row) for row in r_rows[key_cols].itertuples(index=False) + ) + py_set = tuple( + tuple(row) for row in py_rows[key_cols].itertuples(index=False) + ) + + 
r_unique = set(r_set) + py_unique = set(py_set) + + only_r = sorted(r_unique - py_unique) + only_py = sorted(py_unique - r_unique) + matched = r_unique & py_unique + + return CohortMatchSummary( + cohort_id=cohort_id, + n_r=len(r_unique), + n_py=len(py_unique), + n_matched=len(matched), + n_only_r=len(only_r), + n_only_py=len(only_py), + sample_only_r=[(int(s), str(d), str(e)) for s, d, e in only_r[:3]], + sample_only_py=[(int(s), str(d), str(e)) for s, d, e in only_py[:3]], + ) + + +def compare_cohort_outputs( + backend: IbisBackendLike, + r_table: str = "cohort", + py_table: str = "cohort_py", + schema: str | None = "main", + *, + cohort_ids: list[int] | None = None, +) -> CohortComparisonReport: + """Compare row-level cohort output between two implementations. + + Args: + backend: Ibis backend connection pointing at the database. + r_table: Name of the reference (R/CohortGenerator) cohort output table. + py_table: Name of the Python/circe cohort output table. + schema: Database schema where both tables reside. + cohort_ids: Specific cohort IDs to compare (``None`` = all shared). + + Returns: + :class:`CohortComparisonReport` with per-cohort and aggregate results. 
+ """ + r_df = _read_cohort_table(backend, r_table, schema, "R") + py_df = _read_cohort_table(backend, py_table, schema, "Py") + + if r_df is None or py_df is None: + return CohortComparisonReport( + per_cohort=[], + n_cohorts_shared=0, + n_cohorts_matched_exactly=0, + total_r_rows=0, + total_py_rows=0, + total_matched=0, + total_only_r=0, + total_only_py=0, + ) + + r_ids = set(r_df["cohort_definition_id"].unique()) + py_ids = set(py_df["cohort_definition_id"].unique()) + + if cohort_ids is not None: + shared = sorted(r_ids & py_ids & set(cohort_ids)) + else: + shared = sorted(r_ids & py_ids) + + per_cohort: list[CohortMatchSummary] = [] + total_r = 0 + total_py = 0 + total_m = 0 + total_o_r = 0 + total_o_py = 0 + exact_count = 0 + + for cid in shared: + r_rows = r_df[r_df["cohort_definition_id"] == cid] + py_rows = py_df[py_df["cohort_definition_id"] == cid] + summary = _compare_single_cohort(cid, r_rows, py_rows) + per_cohort.append(summary) + total_r += summary.n_r + total_py += summary.n_py + total_m += summary.n_matched + total_o_r += summary.n_only_r + total_o_py += summary.n_only_py + if summary.is_exact_match: + exact_count += 1 + + return CohortComparisonReport( + per_cohort=per_cohort, + n_cohorts_shared=len(shared), + n_cohorts_matched_exactly=exact_count, + total_r_rows=total_r, + total_py_rows=total_py, + total_matched=total_m, + total_only_r=total_o_r, + total_only_py=total_o_py, + ) + + +def print_comparison_report(report: CohortComparisonReport) -> None: + """Print a human-readable row-level parity report.""" + print(f"\nTable 6 — Row-level parity (R vs Python)") + + if report.n_cohorts_shared == 0: + print(" No shared cohorts to compare.") + return + + print(f" Shared cohorts: {report.n_cohorts_shared}") + print(f" Exactly matched: {report.n_cohorts_matched_exactly} " + f"({report.exact_match_pct:.1f}%)") + print(f" Total R rows: {report.total_r_rows:,}") + print(f" Total Py rows: {report.total_py_rows:,}") + print(f" Total matched: 
{report.total_matched:,}") + print(f" Total only in R: {report.total_only_r:,}") + print(f" Total only in Py: {report.total_only_py:,}") + + mismatched = [c for c in report.per_cohort if not c.is_exact_match] + if mismatched: + print(f"\n Cohort mismatches ({len(mismatched)}):") + for c in mismatched: + print(f" {c.cohort_id:>5d} " + f"R={c.n_r:<6d} Py={c.n_py:<6d} " + f"matched={c.n_matched:<6d} " + f"only_R={c.n_only_r:<4d} only_Py={c.n_only_py:<4d}") + if c.sample_only_r: + print(f" samples only_R: {c.sample_only_r[:3]}") + if c.sample_only_py: + print(f" samples only_Py: {c.sample_only_py[:3]}") + else: + print(f"\n ✓ All {report.n_cohorts_shared} shared cohorts match exactly.") diff --git a/circe/cohort_definition_set/_generate.py b/circe/cohort_definition_set/_generate.py index f5db6b7..4f8f5e0 100644 --- a/circe/cohort_definition_set/_generate.py +++ b/circe/cohort_definition_set/_generate.py @@ -2,6 +2,7 @@ from __future__ import annotations +import contextlib import logging from datetime import datetime from typing import TYPE_CHECKING, Literal @@ -79,6 +80,13 @@ def generate_cohort_set( total = len(cohort_definition_set) current_checksums = cohort_definition_set.checksums() + # Clear the correlated-events compilation cache so that entries + # referencing a previous backend (whose ``id()`` may have been reused + # by the current connection) never collide. 
+ from ..execution.engine.group_operators import _COMPILED_CORRELATED_EVENTS + + _COMPILED_CORRELATED_EVENTS.clear() + previous_checksums: dict[int, str] = {} if incremental: previous_checksums = load_checksums( @@ -126,7 +134,9 @@ def generate_cohort_set( start_time: datetime | None = None end_time: datetime | None = None try: - # Compile cohort expression to an ibis relation (not timed) + # Compile cohort expression to an ibis relation (not timed for + # benchmark parity — benchmarks measure database execution only) + compile_start = datetime.now() new_rows = build_cohort( cohort.expression, backend=backend, @@ -134,11 +144,21 @@ def generate_cohort_set( results_schema=results_schema, vocabulary_schema=vocabulary_schema, use_persistent_cache=False, + cohort_id=cohort.cohort_id, ) + compile_end = datetime.now() + compile_duration = (compile_end - compile_start).total_seconds() new_rows = project_to_ohdsi_cohort_table(new_rows, cohort_id=cohort.cohort_id) # Materialize the compiled relation — this is the DB IO we time start_time = datetime.now() + logger.debug( + "[%d/%d] Executing cohort %d (%s) ...", + i, + total, + cohort.cohort_id, + cohort.cohort_name, + ) write_cohort( compiled_relation=new_rows, backend=backend, @@ -153,11 +173,12 @@ def generate_cohort_set( duration = (end_time - start_time).total_seconds() logger.info( - "[%d/%d] Completed cohort %d (%s) in %.1fs", + "[%d/%d] Completed cohort %d (%s) — compile %.1fs, execute %.1fs", i, total, cohort.cohort_id, cohort.cohort_name, + compile_duration, duration, ) except ExecutionError as exc: @@ -194,6 +215,12 @@ def generate_cohort_set( raise continue + # Clean up staging tables created by the materialized pipeline + schema = results_schema or cdm_schema + for stage in ("primary", "qualified", "included", "ended"): + with contextlib.suppress(Exception): + backend.drop_table(f"__cg_{cohort.cohort_id}_{stage}", database=schema, force=True) + results.append( CohortGenerationResult( cohort_id=cohort.cohort_id, 
diff --git a/circe/execution/api.py b/circe/execution/api.py index 05b892a..ed89bdd 100644 --- a/circe/execution/api.py +++ b/circe/execution/api.py @@ -30,8 +30,13 @@ def build_cohort( results_schema: str | None = None, vocabulary_schema: str | None = None, use_persistent_cache: bool = False, + cohort_id: int = 0, ) -> Table: - """Normalize, compile, and assemble a cohort relation.""" + """Normalize, compile, and assemble a cohort relation. + + Each pipeline stage is materialized to a ``__cg_<cohort_id>_<stage>`` staging table, + so that the ibis expression tree never grows too large to compile. + """ maybe_apply_databricks_post_connect_workaround(backend) normalized = normalize_cohort(expression) @@ -45,7 +50,7 @@ def build_cohort( use_persistent_cache=use_persistent_cache, ) - return build_cohort_table(normalized, ctx) + return build_cohort_table(normalized, ctx, cohort_id=cohort_id) def write_relation( @@ -140,6 +145,7 @@ def write_cohort( results_schema=results_schema, vocabulary_schema=vocabulary_schema, use_persistent_cache=use_persistent_cache, + cohort_id=cohort_id, ) new_rows = project_to_ohdsi_cohort_table(new_rows, cohort_id=cohort_id) diff --git a/circe/execution/engine/cohort.py b/circe/execution/engine/cohort.py index 7f34652..75eb23a 100644 --- a/circe/execution/engine/cohort.py +++ b/circe/execution/engine/cohort.py @@ -1,6 +1,9 @@ from __future__ import annotations +import contextlib + from ..ibis.context import ExecutionContext +from ..ibis.operations import create_table, read_table from ..lower.criteria import lower_criterion from ..normalize.cohort import NormalizedCohort from ..plan.cohort import CohortPlan, PrimaryEventInput @@ -14,7 +17,47 @@ from .primary import build_primary_events -def build_cohort_table(normalized: NormalizedCohort, ctx: ExecutionContext) -> Table: +def _materialize( + table: Table, + *, + ctx: ExecutionContext, + cohort_id: int, + stage: str, + schema: str | None, +) -> Table: + """Write *table* to a backend staging table and return 
a fresh reference. + + Without this step every pipeline stage accumulates on top of the previous + ibis expression tree. For cohorts with many primary criteria the tree + grows too large for the ibis SQL compiler to traverse in reasonable time. + + Materialising at each pipeline boundary keeps the expression tree sent + to the compiler shallow — each stage only builds on a simple + ``DatabaseTable`` reference. + """ + name = f"__cg_{cohort_id}_{stage}" + create_table(ctx.backend, table_name=name, schema=schema, obj=table, overwrite=True) + return read_table(ctx.backend, table_name=name, schema=schema) + + +def _drop_staging_tables( + ctx: ExecutionContext, + cohort_id: int, + schema: str | None, +) -> None: + """Remove all staging tables for *cohort_id* from the database.""" + for stage in ("primary", "qualified", "included", "ended"): + name = f"__cg_{cohort_id}_{stage}" + with contextlib.suppress(Exception): + ctx.backend.drop_table(name, database=schema, force=True) + + +def build_cohort_table( + normalized: NormalizedCohort, + ctx: ExecutionContext, + *, + cohort_id: int = 0, +) -> Table: primary_plans = tuple( PrimaryEventInput( event_plan=lower_criterion(criterion, criterion_index=index), @@ -29,27 +72,36 @@ def build_cohort_table(normalized: NormalizedCohort, ctx: ExecutionContext) -> T qualified_limit_type=normalized.result_limits.qualified_limit_type, expression_limit_type=normalized.result_limits.expression_limit_type, ) + + schema = ctx.results_schema or ctx.cdm_schema + + # ── Primary events ────────────────────────────────────────────────── primary_events = build_primary_events(cohort_plan, ctx) + primary_events = _materialize( + primary_events, ctx=ctx, cohort_id=cohort_id, stage="primary", schema=schema + ) + + # ── Additional (correlated) criteria ──────────────────────────────── qualified_events = apply_additional_criteria(primary_events, normalized.additional_criteria, ctx) if normalized.additional_criteria is not None and not 
normalized.additional_criteria.is_empty(): - qualified_events = apply_result_limit( - qualified_events, - cohort_plan.qualified_limit_type, - ) + qualified_events = apply_result_limit(qualified_events, cohort_plan.qualified_limit_type) + qualified_events = _materialize( + qualified_events, ctx=ctx, cohort_id=cohort_id, stage="qualified", schema=schema + ) + + # ── Inclusion rules ───────────────────────────────────────────────── included_events = apply_inclusion_rules(qualified_events, normalized.inclusion_rules, ctx) - included_events = apply_result_limit( - included_events, - cohort_plan.expression_limit_type, + included_events = apply_result_limit(included_events, cohort_plan.expression_limit_type) + included_events = _materialize( + included_events, ctx=ctx, cohort_id=cohort_id, stage="included", schema=schema ) + + # ── End strategy ──────────────────────────────────────────────────── ended_events = apply_end_strategy(included_events, normalized.end_strategy, ctx) + ended_events = _materialize(ended_events, ctx=ctx, cohort_id=cohort_id, stage="ended", schema=schema) + + # ── Censoring + collapse (final stage — no materialize after) ────── censored_events = apply_censoring( - ended_events, - normalized.censoring_criteria, - normalized.censor_window, - ctx, - ) - return collapse_events( - censored_events, - normalized.collapse_settings, - normalized.censor_window, + ended_events, normalized.censoring_criteria, normalized.censor_window, ctx ) + return collapse_events(censored_events, normalized.collapse_settings, normalized.censor_window) diff --git a/circe/execution/engine/group_operators.py b/circe/execution/engine/group_operators.py index 3ed7c8d..b2d35ff 100644 --- a/circe/execution/engine/group_operators.py +++ b/circe/execution/engine/group_operators.py @@ -94,24 +94,43 @@ def group_predicate(match_count_expr, mode: str, count: int | None, child_count: ) +_COMPILED_CORRELATED_EVENTS: dict[tuple[int, int], Table] = {} +"""Cache for 
:func:`_compile_correlated_events` keyed by ``(backend_id, content_hash)``. + +Identical correlated criteria frequently appear across multiple primary event +criteria within a cohort — compiling them once avoids 350+ duplicate ibis +expression tree constructions for large cohorts. +""" + + def _compile_correlated_events( correlated: NormalizedCorrelatedCriteria, *, criterion_index: int, ctx: ExecutionContext, ) -> Table: + """Compile a correlated criterion to an ibis Table expression. + + The compiled events are independent of *criterion_index* (the position + within the enclosing group), so results are cached by content hash + scoped to the current backend connection. + """ + cache_key = (id(ctx.backend), hash(repr(correlated))) + cached = _COMPILED_CORRELATED_EVENTS.get(cache_key) + if cached is not None: + return cached + event_plan = lower_criterion(correlated.criterion, criterion_index=criterion_index) events = compile_event_plan(event_plan, ctx) nested_group = correlated.criterion.correlated_criteria - if nested_group is None or nested_group.is_empty(): - return events + if nested_group is not None and not nested_group.is_empty(): + from .groups import apply_additional_criteria # noqa: PLC0415 - # Correlated criteria can themselves carry nested correlated criteria. - # Re-apply the same group evaluator used for primary/additional criteria. 
- from .groups import apply_additional_criteria + events = apply_additional_criteria(events, nested_group, ctx) - return apply_additional_criteria(events, nested_group, ctx) + _COMPILED_CORRELATED_EVENTS[cache_key] = events + return events def correlated_match_keys( diff --git a/circe/execution/engine/primary.py b/circe/execution/engine/primary.py index 07f0ec4..eea5224 100644 --- a/circe/execution/engine/primary.py +++ b/circe/execution/engine/primary.py @@ -14,10 +14,20 @@ def _union_all(tables): - current = tables[0] - for table in tables[1:]: - current = current.union(table, distinct=False) - return current + if not tables: + raise ValueError("_union_all requires at least one table") + + if len(tables) == 1: + return tables[0] + + # Binary-tree merge: recursively halve the list to produce a balanced + # union tree with O(log n) nesting depth instead of O(n). + # Without this, a cohort with 87 primary criteria would produce 86 levels + # of nested UNION ALL, exceeding DuckDB's query compilation limits. 
+ mid = len(tables) // 2 + left = _union_all(tables[:mid]) + right = _union_all(tables[mid:]) + return left.union(right, distinct=False) def _assign_primary_event_ids(events): diff --git a/tests/execution/test_custom_era.py b/tests/execution/test_custom_era.py index def6496..e1c45f7 100644 --- a/tests/execution/test_custom_era.py +++ b/tests/execution/test_custom_era.py @@ -18,9 +18,7 @@ def _make_concept_set(set_id: int, concept_id: int) -> ConceptSet: return ConceptSet( id=set_id, - expression=ConceptSetExpression( - items=[ConceptSetItem(concept=Concept(conceptId=concept_id))] - ), + expression=ConceptSetExpression(items=[ConceptSetItem(concept=Concept(conceptId=concept_id))]), ) @@ -92,9 +90,7 @@ def test_custom_era_merges_drugs_within_gap(): _make_concept_set(1, 111), _make_concept_set(2, 222), ], - primary_criteria=PrimaryCriteria( - criteria_list=[ConditionOccurrence(codeset_id=1)] - ), + primary_criteria=PrimaryCriteria(criteria_list=[ConditionOccurrence(codeset_id=1)]), end_strategy=CustomEraStrategy(drug_codeset_id=2, gap_days=30, offset=0), ) @@ -149,9 +145,7 @@ def test_custom_era_no_merge_across_large_gap(): _make_concept_set(1, 111), _make_concept_set(2, 222), ], - primary_criteria=PrimaryCriteria( - criteria_list=[ConditionOccurrence(codeset_id=1)] - ), + primary_criteria=PrimaryCriteria(criteria_list=[ConditionOccurrence(codeset_id=1)]), end_strategy=CustomEraStrategy(drug_codeset_id=2, gap_days=5, offset=0), ) @@ -207,9 +201,7 @@ def test_custom_era_offset_applied(): _make_concept_set(1, 111), _make_concept_set(2, 222), ], - primary_criteria=PrimaryCriteria( - criteria_list=[ConditionOccurrence(codeset_id=1)] - ), + primary_criteria=PrimaryCriteria(criteria_list=[ConditionOccurrence(codeset_id=1)]), end_strategy=CustomEraStrategy(drug_codeset_id=2, gap_days=30, offset=7), ) @@ -264,9 +256,7 @@ def test_custom_era_no_matching_drugs(): _make_concept_set(1, 111), _make_concept_set(2, 999), ], - primary_criteria=PrimaryCriteria( - 
criteria_list=[ConditionOccurrence(codeset_id=1)] - ), + primary_criteria=PrimaryCriteria(criteria_list=[ConditionOccurrence(codeset_id=1)]), end_strategy=CustomEraStrategy(drug_codeset_id=2, gap_days=30, offset=0), ) @@ -303,9 +293,7 @@ def test_custom_era_with_drug_exposure_as_primary(): expression = CohortExpression( concept_sets=[_make_concept_set(1, 222)], - primary_criteria=PrimaryCriteria( - criteria_list=[DrugExposure(codeset_id=1)] - ), + primary_criteria=PrimaryCriteria(criteria_list=[DrugExposure(codeset_id=1)]), end_strategy=CustomEraStrategy(drug_codeset_id=1, gap_days=30, offset=0), ) @@ -316,9 +304,7 @@ def test_custom_era_with_drug_exposure_as_primary(): assert len(result) == 2 start_dates = sorted(result["start_date"].astype(str).tolist()) assert start_dates == ["2020-01-01", "2020-02-01"] - assert all( - str(d)[:10] == "2020-03-03" for d in result["end_date"] - ) + assert all(str(d)[:10] == "2020-03-03" for d in result["end_date"]) def test_compute_drug_eras_matches_java_sql_logic(): @@ -478,9 +464,7 @@ def test_full_cohort_custom_era_matches_sql_end_dates(): _make_concept_set(1, 111), _make_concept_set(2, 222), ], - primary_criteria=PrimaryCriteria( - criteria_list=[ConditionOccurrence(codeset_id=1)] - ), + primary_criteria=PrimaryCriteria(criteria_list=[ConditionOccurrence(codeset_id=1)]), end_strategy=CustomEraStrategy(drug_codeset_id=2, gap_days=30, offset=0), ) @@ -489,7 +473,7 @@ def test_full_cohort_custom_era_matches_sql_end_dates(): # --- raw SQL pipeline (Java CUSTOM_ERA_STRATEGY_TEMPLATE logic, DuckDB dialect) --- # Computes drug eras, then matches era end_dates to events via start_date overlap. 
- sql = f""" + sql = """ WITH drug_eras AS ( SELECT person_id, diff --git a/tests/execution/test_phenotype_failures.py b/tests/execution/test_phenotype_failures.py new file mode 100644 index 0000000..6416000 --- /dev/null +++ b/tests/execution/test_phenotype_failures.py @@ -0,0 +1,84 @@ +"""Regression tests for PhenotypeLibrary cohorts that previously failed. + +These are the 3 most complex cohorts in the PhenotypeLibrary (51-97 primary +criteria, 105 concept sets, 7-9 inclusion rules, multiple censoring criteria). +They failed because the sequential ``_union_all`` produced deeply nested +UNION ALL expressions that exceeded DuckDB's query compilation limits. + +The binary-tree merge in ``_union_all`` reduces nesting from O(n) to O(log n). + +These tests verify that ``build_cohort`` (compilation) succeeds. Full +``generate_cohort_set`` is covered by ``pytest.mark.slow`` tests. +""" + +from __future__ import annotations + +import datetime +import json +from pathlib import Path + +import pytest + +from circe.cohortdefinition import CohortExpression +from circe.execution.api import build_cohort + +BENCHMARK_OUTPUT = Path(__file__).resolve().parent.parent.parent / "benchmark_output" +JSON_DIR = BENCHMARK_OUTPUT / "phenotype_jsons" + +D = datetime.date # shorthand + + +def _seed_minimal_cdm(conn, ibis): + """Minimal CDM tables so cohort compilation doesn't fail on missing tables. 
+ Uses sentinel person_id=999 so no actual rows match real cohort criteria.""" + S = D(2000, 1, 1) # sentinel date + conn.create_table("person", obj=ibis.memtable({"person_id": [999], "year_of_birth": [1900], "gender_concept_id": [0]}), overwrite=True) + conn.create_table("observation_period", obj=ibis.memtable({"person_id": [999], "observation_period_id": [999], "observation_period_start_date": [S], "observation_period_end_date": [S]}), overwrite=True) + conn.create_table("condition_occurrence", obj=ibis.memtable({"person_id": [999], "condition_occurrence_id": [999], "condition_concept_id": [0], "condition_start_date": [S], "condition_end_date": [S]}), overwrite=True) + conn.create_table("procedure_occurrence", obj=ibis.memtable({"person_id": [999], "procedure_occurrence_id": [999], "procedure_concept_id": [0], "procedure_date": [S]}), overwrite=True) + conn.create_table("measurement", obj=ibis.memtable({"person_id": [999], "measurement_id": [999], "measurement_concept_id": [0], "measurement_date": [S]}), overwrite=True) + conn.create_table("observation", obj=ibis.memtable({"person_id": [999], "observation_id": [999], "observation_concept_id": [0], "observation_date": [S]}), overwrite=True) + conn.create_table("drug_exposure", obj=ibis.memtable({"person_id": [999], "drug_exposure_id": [999], "drug_concept_id": [0], "drug_exposure_start_date": [S], "drug_exposure_end_date": [S]}), overwrite=True) + conn.create_table("death", obj=ibis.memtable({"person_id": [999], "death_date": [S]}), overwrite=True) + conn.create_table("visit_occurrence", obj=ibis.memtable({"person_id": [999], "visit_occurrence_id": [999], "visit_concept_id": [0], "visit_start_date": [S], "visit_end_date": [S]}), overwrite=True) + conn.create_table("specimen", obj=ibis.memtable({"person_id": [999], "specimen_id": [999], "specimen_concept_id": [0], "specimen_date": [S]}), overwrite=True) + conn.create_table("device_exposure", obj=ibis.memtable({"person_id": [999], "device_exposure_id": [999], 
"device_concept_id": [0], "device_exposure_start_date": [S], "device_exposure_end_date": [S]}), overwrite=True) + conn.create_table("dose_era", obj=ibis.memtable({"person_id": [999], "dose_era_id": [999], "drug_concept_id": [0], "unit_concept_id": [0], "dose_value": [0.0], "dose_era_start_date": [S], "dose_era_end_date": [S]}), overwrite=True) + conn.create_table("payer_plan_period", obj=ibis.memtable({"person_id": [999], "payer_plan_period_id": [999], "payer_plan_period_start_date": [S], "payer_plan_period_end_date": [S]}), overwrite=True) + conn.create_table("visit_detail", obj=ibis.memtable({"person_id": [999], "visit_detail_id": [999], "visit_detail_concept_id": [0], "visit_detail_start_date": [S], "visit_detail_end_date": [S]}), overwrite=True) + conn.create_table("condition_era", obj=ibis.memtable({"person_id": [999], "condition_era_id": [999], "condition_concept_id": [0], "condition_era_start_date": [S], "condition_era_end_date": [S], "condition_occurrence_count": [1]}), overwrite=True) + conn.create_table("drug_era", obj=ibis.memtable({"person_id": [999], "drug_era_id": [999], "drug_concept_id": [0], "drug_era_start_date": [S], "drug_era_end_date": [S], "drug_exposure_count": [1], "gap_days": [0]}), overwrite=True) + conn.create_table("concept", obj=ibis.memtable({"concept_id": [0, 999], "invalid_reason": ["X", None]}), overwrite=True) + conn.create_table("concept_ancestor", obj=ibis.memtable({"ancestor_concept_id": [999], "descendant_concept_id": [999]}), overwrite=True) + conn.create_table("concept_relationship", obj=ibis.memtable({"concept_id_1": [999], "concept_id_2": [999], "relationship_id": ["X"], "invalid_reason": ["X"]}), overwrite=True) + + +# The 3 persistently failing PhenotypeLibrary cohorts +FAILING_COHORT_IDS = [1432, 1433, 1434] + + +@pytest.mark.parametrize("cohort_id", FAILING_COHORT_IDS) +def test_phenotype_cohort_compiles(cohort_id: int) -> None: + """Cohorts 1432-1434 (87, 97, 51 primary criteria) must compile successfully. 
+ + Compilation exercises ``_union_all`` which previously produced O(n) nested + UNION ALL expressions that crashed DuckDB. The binary-tree merge reduces + nesting to O(log n). + """ + ibis = pytest.importorskip("ibis") + _ = pytest.importorskip("duckdb") + + json_path = JSON_DIR / f"{cohort_id}.json" + if not json_path.exists(): + pytest.skip(f"Phenotype JSON not found: {json_path}") + + expression = CohortExpression.model_validate_json(json_path.read_text()) + + conn = ibis.duckdb.connect() + _seed_minimal_cdm(conn, ibis) + + # build_cohort exercises the full UNION ALL path without expensive DB execution + try: + build_cohort(expression, backend=conn, cdm_schema="main") + except Exception as exc: + pytest.fail(f"Cohort {cohort_id} compilation failed: {exc}") diff --git a/tests/execution/test_union_scaling.py b/tests/execution/test_union_scaling.py new file mode 100644 index 0000000..eecb7de --- /dev/null +++ b/tests/execution/test_union_scaling.py @@ -0,0 +1,117 @@ +"""Tests that cohort expression UNION ALL scales to large numbers of primary criteria. + +Reproduces the failure mode where cohorts with 51-97 criteria (like +PhenotypeLibrary cohorts 1432-1434) crash DuckDB because the sequential +pairwise ``_union_all`` produces O(n) nesting depth. 
+""" + +from __future__ import annotations + +import pytest + +from circe.cohort_definition_set import CohortDefinitionSet, generate_cohort_set +from circe.cohortdefinition import CohortExpression, ConditionOccurrence, PrimaryCriteria +from circe.vocabulary import Concept, ConceptSet, ConceptSetExpression, ConceptSetItem + + +def _make_concept_set(set_id: int, concept_id: int) -> ConceptSet: + return ConceptSet( + id=set_id, + expression=ConceptSetExpression(items=[ConceptSetItem(concept=Concept(conceptId=concept_id))]), + ) + + +def _seed_tables(conn, ibis, n_persons: int = 2): + person_ids = list(range(1, n_persons + 1)) + conn.create_table( + "person", + obj=ibis.memtable( + { + "person_id": person_ids, + "year_of_birth": [1980] * n_persons, + "gender_concept_id": [8507] * n_persons, + } + ), + overwrite=True, + ) + conn.create_table( + "observation_period", + obj=ibis.memtable( + { + "person_id": person_ids, + "observation_period_id": person_ids, + "observation_period_start_date": ["2019-01-01"] * n_persons, + "observation_period_end_date": ["2022-12-31"] * n_persons, + } + ), + overwrite=True, + ) + + # Each person gets one condition row per concept_id from 1..N + condition_rows = [] + for pid in person_ids: + for cid in range(1, n_persons * 25 + 1): + condition_rows.append( + { + "person_id": pid, + "condition_occurrence_id": pid * 1000 + cid, + "condition_concept_id": cid, + "condition_start_date": "2020-01-10", + "condition_end_date": "2020-01-10", + } + ) + + conn.create_table("condition_occurrence", obj=ibis.memtable(condition_rows), overwrite=True) + + # Minimal vocabulary (must have at least one non-None invalid_reason + # so ibis can infer a non-NULL column type for DuckDB) + concept_ids = list(range(1, n_persons * 25 + 1)) + invalid = [None] * len(concept_ids) + if invalid: + invalid[0] = "X" # ensure type inference + conn.create_table( + "concept", + obj=ibis.memtable( + { + "concept_id": concept_ids, + "invalid_reason": invalid, + } + ), + 
overwrite=True, + ) + + +def _build_multi_criterion_expression(n: int) -> CohortExpression: + """Build a cohort with *n* simple ConditionOccurrence primary criteria.""" + concept_sets = [] + criteria = [] + for i in range(n): + cs_id = i + 1 + concept_sets.append(_make_concept_set(cs_id, concept_id=cs_id)) + criteria.append(ConditionOccurrence(codeset_id=cs_id)) + return CohortExpression(concept_sets=concept_sets, primary_criteria=PrimaryCriteria(criteria_list=criteria)) + + +@pytest.mark.parametrize("n_criteria", [1, 2, 5, 10, 20, 50, 100]) +def test_union_all_scales(n_criteria: int) -> None: + """Cohorts with N primary criteria should compile and execute without nesting errors.""" + ibis = pytest.importorskip("ibis") + _ = pytest.importorskip("duckdb") + + conn = ibis.duckdb.connect() + _seed_tables(conn, ibis) + + expression = _build_multi_criterion_expression(n_criteria) + cds = CohortDefinitionSet() + cds.add(1, f"UnionTest_{n_criteria}", expression) + + results = generate_cohort_set( + cds, + backend=conn, + cdm_schema="main", + cohort_table=f"union_test_{n_criteria}", + stop_on_error=True, + ) + + assert len(results) == 1 + assert results[0].status == "COMPLETE", f"{n_criteria} criteria failed: {results[0].error}" From b714dc09164c34f8eaaa82cb904bf6ffbbc2a78c Mon Sep 17 00:00:00 2001 From: Jamie Gilbert Date: Thu, 14 May 2026 18:55:07 -0700 Subject: [PATCH 07/53] fix(execution): CustomEra window partitions on (person_id, event_id) to preserve all events When DrugExposure(first=True) and QualifiedLimit=First, every person has exactly 1 event with event_id=1 (assigned by _assign_primary_event_ids). The CustomEra window previously grouped by event_id alone, collapsing all rows into 1 partition and dropping N-1 rows with _rn==0. Grouping by (person_id, event_id) gives each row its own partition, preserving all events. Adds regression tests via build_cohort and generate_cohort_set. 
--- circe/execution/engine/custom_era.py | 2 +- tests/execution/test_custom_era.py | 125 ++++++++++++++++++++++++++- 2 files changed, 125 insertions(+), 2 deletions(-) diff --git a/circe/execution/engine/custom_era.py b/circe/execution/engine/custom_era.py index e9a93e3..bf53091 100644 --- a/circe/execution/engine/custom_era.py +++ b/circe/execution/engine/custom_era.py @@ -134,7 +134,7 @@ def apply_custom_era_strategy(events, strategy, ctx): ) event_window = ibis.window( - group_by=joined.event_id, + group_by=[joined.person_id, joined.event_id], order_by=[joined.era_end_date.desc()], ) ranked = joined.mutate(_rn=ibis.row_number().over(event_window)) diff --git a/tests/execution/test_custom_era.py b/tests/execution/test_custom_era.py index e1c45f7..fbe334c 100644 --- a/tests/execution/test_custom_era.py +++ b/tests/execution/test_custom_era.py @@ -5,13 +5,14 @@ import pytest from circe.api import build_cohort +from circe.cohort_definition_set import CohortDefinitionSet, generate_cohort_set from circe.cohortdefinition import ( CohortExpression, ConditionOccurrence, DrugExposure, PrimaryCriteria, ) -from circe.cohortdefinition.core import CustomEraStrategy +from circe.cohortdefinition.core import CustomEraStrategy, ResultLimit from circe.vocabulary import Concept, ConceptSet, ConceptSetExpression, ConceptSetItem @@ -548,3 +549,125 @@ def test_full_cohort_custom_era_matches_sql_end_dates(): ibis_starts = sorted(cohort_result["start_date"].astype(str).tolist()) sql_starts = sorted(sql_result["start_date"].astype(str).tolist()) assert ibis_starts == sql_starts + + +# --------------------------------------------------------------------------- +# Regression: CustomEra must preserve all events when event_id is shared +# +# After ``first=True`` + ``QualifiedLimit=First`` + ``ExpressionLimit=First`` +# every person contributes at most one event, and ``_assign_primary_event_ids`` +# assigns ``event_id=1`` to all of them. 
The CustomEra window that selects +# one matching era per event must therefore partition on *(person_id, event_id)* +# — otherwise all rows collapse into a single partition and only one survives. +# --------------------------------------------------------------------------- + + +def _seed_common_tables_multi_person(conn, ibis): + conn.create_table( + "person", + obj=ibis.memtable( + { + "person_id": [1, 2, 3], + "year_of_birth": [1980, 1985, 1990], + "gender_concept_id": [8507, 8507, 8507], + } + ), + overwrite=True, + ) + conn.create_table( + "observation_period", + obj=ibis.memtable( + { + "person_id": [1, 2, 3], + "observation_period_id": [10, 11, 12], + "observation_period_start_date": [date(2019, 1, 1), date(2019, 1, 1), date(2019, 1, 1)], + "observation_period_end_date": [date(2021, 12, 31), date(2021, 12, 31), date(2021, 12, 31)], + } + ), + overwrite=True, + ) + + +def test_custom_era_preserves_all_persons_with_first_true(): + """All persons survive when DrugExposure(first=True) + CustomEra + limits. + + The window ``group_by=joined.event_id`` previously collapsed every row + into a single partition because all events had ``event_id=1`` (assigned + by ``_assign_primary_event_ids`` — each person has exactly 1 event after + ``first=True`` and the per-person limits). 
+ """ + ibis = pytest.importorskip("ibis") + _ = pytest.importorskip("duckdb") + + conn = ibis.duckdb.connect() + _seed_common_tables_multi_person(conn, ibis) + + conn.create_table( + "drug_exposure", + obj=ibis.memtable( + { + "person_id": [1, 2, 3], + "drug_exposure_id": [100, 200, 300], + "drug_concept_id": [222, 222, 222], + "drug_exposure_start_date": [date(2020, 1, 1), date(2020, 2, 1), date(2020, 3, 1)], + "drug_exposure_end_date": [date(2020, 1, 31), date(2020, 2, 28), date(2020, 3, 31)], + "days_supply": [0, 0, 0], + } + ), + overwrite=True, + ) + + expression = CohortExpression( + concept_sets=[_make_concept_set(1, 222)], + primary_criteria=PrimaryCriteria(criteria_list=[DrugExposure(codeset_id=1, first=True)]), + qualified_limit=ResultLimit(Type="First"), + expression_limit=ResultLimit(Type="First"), + end_strategy=CustomEraStrategy(drug_codeset_id=1, gap_days=30, offset=0), + ) + + result = build_cohort(expression, backend=conn, cdm_schema="main").execute() + + assert len(result) == 3, f"expected 3 rows, got {len(result)}" + assert set(result["person_id"]) == {1, 2, 3} + + +def test_custom_era_preserves_all_persons_via_generate_cohort_set(): + """Full generate_cohort_set pipeline keeps every person.""" + ibis = pytest.importorskip("ibis") + _ = pytest.importorskip("duckdb") + + conn = ibis.duckdb.connect() + _seed_common_tables_multi_person(conn, ibis) + + conn.create_table( + "drug_exposure", + obj=ibis.memtable( + { + "person_id": [1, 2, 3], + "drug_exposure_id": [100, 200, 300], + "drug_concept_id": [222, 222, 222], + "drug_exposure_start_date": [date(2020, 1, 1), date(2020, 2, 1), date(2020, 3, 1)], + "drug_exposure_end_date": [date(2020, 1, 31), date(2020, 2, 28), date(2020, 3, 31)], + "days_supply": [0, 0, 0], + } + ), + overwrite=True, + ) + + expression = CohortExpression( + concept_sets=[_make_concept_set(1, 222)], + primary_criteria=PrimaryCriteria(criteria_list=[DrugExposure(codeset_id=1, first=True)]), + 
qualified_limit=ResultLimit(Type="First"), + expression_limit=ResultLimit(Type="First"), + end_strategy=CustomEraStrategy(drug_codeset_id=1, gap_days=30, offset=0), + ) + + cds = CohortDefinitionSet() + cds.add(1, "CustomEra Regression", expression) + results = generate_cohort_set(cds, backend=conn, cdm_schema="main", cohort_table="ce_regress") + + assert results[0].status == "COMPLETE", f"unexpected status: {results[0].status}" + # Verify row count from the output table + out_rows = conn.table("ce_regress", database="main").execute() + assert len(out_rows) == 3, f"expected 3 rows, got {len(out_rows)}" + assert set(out_rows["subject_id"]) == {1, 2, 3} From 8f7c4d2051e19d59ddef2e10a190583ea8b87a81 Mon Sep 17 00:00:00 2001 From: Jamie Gilbert Date: Thu, 14 May 2026 18:55:07 -0700 Subject: [PATCH 08/53] fix(execution): CustomEra window partitions on (person_id, event_id) to preserve all events When DrugExposure(first=True) and QualifiedLimit=First, every person has exactly 1 event with event_id=1 (assigned by _assign_primary_event_ids). The CustomEra window previously grouped by event_id alone, collapsing all rows into 1 partition and dropping N-1 rows with _rn==0. Grouping by (person_id, event_id) gives each row its own partition, preserving all events. Adds regression test via build_cohort. 
--- circe/execution/engine/custom_era.py | 2 +- tests/execution/test_custom_era.py | 82 +++++++++++++++++++++++++++- 2 files changed, 82 insertions(+), 2 deletions(-) diff --git a/circe/execution/engine/custom_era.py b/circe/execution/engine/custom_era.py index e9a93e3..bf53091 100644 --- a/circe/execution/engine/custom_era.py +++ b/circe/execution/engine/custom_era.py @@ -134,7 +134,7 @@ def apply_custom_era_strategy(events, strategy, ctx): ) event_window = ibis.window( - group_by=joined.event_id, + group_by=[joined.person_id, joined.event_id], order_by=[joined.era_end_date.desc()], ) ranked = joined.mutate(_rn=ibis.row_number().over(event_window)) diff --git a/tests/execution/test_custom_era.py b/tests/execution/test_custom_era.py index def6496..6a3a85b 100644 --- a/tests/execution/test_custom_era.py +++ b/tests/execution/test_custom_era.py @@ -11,7 +11,7 @@ DrugExposure, PrimaryCriteria, ) -from circe.cohortdefinition.core import CustomEraStrategy +from circe.cohortdefinition.core import CustomEraStrategy, ResultLimit from circe.vocabulary import Concept, ConceptSet, ConceptSetExpression, ConceptSetItem @@ -564,3 +564,83 @@ def test_full_cohort_custom_era_matches_sql_end_dates(): ibis_starts = sorted(cohort_result["start_date"].astype(str).tolist()) sql_starts = sorted(sql_result["start_date"].astype(str).tolist()) assert ibis_starts == sql_starts + + +# --------------------------------------------------------------------------- +# Regression: CustomEra must preserve all events when event_id is shared +# +# After ``first=True`` + ``QualifiedLimit=First`` + ``ExpressionLimit=First`` +# every person contributes at most one event, and ``_assign_primary_event_ids`` +# assigns ``event_id=1`` to all of them. The CustomEra window that selects +# one matching era per event must therefore partition on *(person_id, event_id)* +# — otherwise all rows collapse into a single partition and only one survives. 
+# --------------------------------------------------------------------------- + + +def _seed_common_tables_multi_person(conn, ibis): + conn.create_table( + "person", + obj=ibis.memtable( + { + "person_id": [1, 2, 3], + "year_of_birth": [1980, 1985, 1990], + "gender_concept_id": [8507, 8507, 8507], + } + ), + overwrite=True, + ) + conn.create_table( + "observation_period", + obj=ibis.memtable( + { + "person_id": [1, 2, 3], + "observation_period_id": [10, 11, 12], + "observation_period_start_date": [date(2019, 1, 1), date(2019, 1, 1), date(2019, 1, 1)], + "observation_period_end_date": [date(2021, 12, 31), date(2021, 12, 31), date(2021, 12, 31)], + } + ), + overwrite=True, + ) + + +def test_custom_era_preserves_all_persons_with_first_true(): + """All persons survive when DrugExposure(first=True) + CustomEra + limits. + + The window ``group_by=joined.event_id`` previously collapsed every row + into a single partition because all events had ``event_id=1`` (assigned + by ``_assign_primary_event_ids`` — each person has exactly 1 event after + ``first=True`` and the per-person limits). 
+ """ + ibis = pytest.importorskip("ibis") + _ = pytest.importorskip("duckdb") + + conn = ibis.duckdb.connect() + _seed_common_tables_multi_person(conn, ibis) + + conn.create_table( + "drug_exposure", + obj=ibis.memtable( + { + "person_id": [1, 2, 3], + "drug_exposure_id": [100, 200, 300], + "drug_concept_id": [222, 222, 222], + "drug_exposure_start_date": [date(2020, 1, 1), date(2020, 2, 1), date(2020, 3, 1)], + "drug_exposure_end_date": [date(2020, 1, 31), date(2020, 2, 28), date(2020, 3, 31)], + "days_supply": [0, 0, 0], + } + ), + overwrite=True, + ) + + expression = CohortExpression( + concept_sets=[_make_concept_set(1, 222)], + primary_criteria=PrimaryCriteria(criteria_list=[DrugExposure(codeset_id=1, first=True)]), + qualified_limit=ResultLimit(Type="First"), + expression_limit=ResultLimit(Type="First"), + end_strategy=CustomEraStrategy(drug_codeset_id=1, gap_days=30, offset=0), + ) + + result = build_cohort(expression, backend=conn, cdm_schema="main").execute() + + assert len(result) == 3, f"expected 3 rows, got {len(result)}" + assert set(result["person_id"]) == {1, 2, 3} From 3621bb2a3036e5af722705baf8b3f8e952f2a63f Mon Sep 17 00:00:00 2001 From: Jamie Gilbert Date: Thu, 14 May 2026 18:57:32 -0700 Subject: [PATCH 09/53] feat(execution): add materialize flag to build_cohort for compile-only use Adds materialize: bool = True parameter to build_cohort and build_cohort_table. When False, skips the staging-table creation added for large-cohort SQL compilation performance. Used by the phenotype regression tests so they remain compile-only. 
Also: - Add union-scaling regression tests (1-100 criteria) - Fix phenotype test to use materialize=False (was hanging) - Fix SIM108 ternary in compare_cohort_outputs.py --- benchmarks/compare_cohort_outputs.py | 33 ++-- circe/execution/api.py | 9 +- circe/execution/engine/cohort.py | 25 ++- tests/execution/test_phenotype_failures.py | 220 +++++++++++++++++++-- tests/execution/test_union_scaling.py | 4 +- 5 files changed, 237 insertions(+), 54 deletions(-) diff --git a/benchmarks/compare_cohort_outputs.py b/benchmarks/compare_cohort_outputs.py index 359e310..ea92b94 100644 --- a/benchmarks/compare_cohort_outputs.py +++ b/benchmarks/compare_cohort_outputs.py @@ -122,8 +122,9 @@ def _read_cohort_table( df = pd.DataFrame( { - "cohort_definition_id": pd.to_numeric(raw["cohort_definition_id"], errors="coerce") - .astype("int64"), + "cohort_definition_id": pd.to_numeric(raw["cohort_definition_id"], errors="coerce").astype( + "int64" + ), "subject_id": pd.to_numeric(raw["subject_id"], errors="coerce").astype("int64"), "cohort_start_date": pd.to_datetime(raw["cohort_start_date"], errors="coerce").dt.date, "cohort_end_date": pd.to_datetime(raw["cohort_end_date"], errors="coerce").dt.date, @@ -140,12 +141,8 @@ def _compare_single_cohort( """Compare row-level output for a single cohort.""" key_cols = ["subject_id", "cohort_start_date", "cohort_end_date"] - r_set = tuple( - tuple(row) for row in r_rows[key_cols].itertuples(index=False) - ) - py_set = tuple( - tuple(row) for row in py_rows[key_cols].itertuples(index=False) - ) + r_set = tuple(tuple(row) for row in r_rows[key_cols].itertuples(index=False)) + py_set = tuple(tuple(row) for row in py_rows[key_cols].itertuples(index=False)) r_unique = set(r_set) py_unique = set(py_set) @@ -204,10 +201,7 @@ def compare_cohort_outputs( r_ids = set(r_df["cohort_definition_id"].unique()) py_ids = set(py_df["cohort_definition_id"].unique()) - if cohort_ids is not None: - shared = sorted(r_ids & py_ids & set(cohort_ids)) - else: - shared 
= sorted(r_ids & py_ids) + shared = sorted(r_ids & py_ids & set(cohort_ids)) if cohort_ids is not None else sorted(r_ids & py_ids) per_cohort: list[CohortMatchSummary] = [] total_r = 0 @@ -244,15 +238,14 @@ def compare_cohort_outputs( def print_comparison_report(report: CohortComparisonReport) -> None: """Print a human-readable row-level parity report.""" - print(f"\nTable 6 — Row-level parity (R vs Python)") + print("\nTable 6 — Row-level parity (R vs Python)") if report.n_cohorts_shared == 0: print(" No shared cohorts to compare.") return print(f" Shared cohorts: {report.n_cohorts_shared}") - print(f" Exactly matched: {report.n_cohorts_matched_exactly} " - f"({report.exact_match_pct:.1f}%)") + print(f" Exactly matched: {report.n_cohorts_matched_exactly} ({report.exact_match_pct:.1f}%)") print(f" Total R rows: {report.total_r_rows:,}") print(f" Total Py rows: {report.total_py_rows:,}") print(f" Total matched: {report.total_matched:,}") @@ -263,10 +256,12 @@ def print_comparison_report(report: CohortComparisonReport) -> None: if mismatched: print(f"\n Cohort mismatches ({len(mismatched)}):") for c in mismatched: - print(f" {c.cohort_id:>5d} " - f"R={c.n_r:<6d} Py={c.n_py:<6d} " - f"matched={c.n_matched:<6d} " - f"only_R={c.n_only_r:<4d} only_Py={c.n_only_py:<4d}") + print( + f" {c.cohort_id:>5d} " + f"R={c.n_r:<6d} Py={c.n_py:<6d} " + f"matched={c.n_matched:<6d} " + f"only_R={c.n_only_r:<4d} only_Py={c.n_only_py:<4d}" + ) if c.sample_only_r: print(f" samples only_R: {c.sample_only_r[:3]}") if c.sample_only_py: diff --git a/circe/execution/api.py b/circe/execution/api.py index ed89bdd..63ae5c1 100644 --- a/circe/execution/api.py +++ b/circe/execution/api.py @@ -31,11 +31,14 @@ def build_cohort( vocabulary_schema: str | None = None, use_persistent_cache: bool = False, cohort_id: int = 0, + materialize: bool = True, ) -> Table: """Normalize, compile, and assemble a cohort relation. 
- Paths like to stage-by-stage temp tables when *cohort_id* is provided, - so that the ibis expression tree never grows too large to compile. + Paths through stage-by-stage temp tables when *cohort_id* is provided + and *materialize* is True, so that the ibis expression tree never grows + too large to compile. Set *materialize=False* for compile-only use + (e.g. unit tests that only verify the expression tree can be built). """ maybe_apply_databricks_post_connect_workaround(backend) @@ -50,7 +53,7 @@ def build_cohort( use_persistent_cache=use_persistent_cache, ) - return build_cohort_table(normalized, ctx, cohort_id=cohort_id) + return build_cohort_table(normalized, ctx, cohort_id=cohort_id, materialize=materialize) def write_relation( diff --git a/circe/execution/engine/cohort.py b/circe/execution/engine/cohort.py index 75eb23a..74e91bf 100644 --- a/circe/execution/engine/cohort.py +++ b/circe/execution/engine/cohort.py @@ -57,6 +57,7 @@ def build_cohort_table( ctx: ExecutionContext, *, cohort_id: int = 0, + materialize: bool = True, ) -> Table: primary_plans = tuple( PrimaryEventInput( @@ -77,28 +78,32 @@ def build_cohort_table( # ── Primary events ────────────────────────────────────────────────── primary_events = build_primary_events(cohort_plan, ctx) - primary_events = _materialize( - primary_events, ctx=ctx, cohort_id=cohort_id, stage="primary", schema=schema - ) + if materialize: + primary_events = _materialize( + primary_events, ctx=ctx, cohort_id=cohort_id, stage="primary", schema=schema + ) # ── Additional (correlated) criteria ──────────────────────────────── qualified_events = apply_additional_criteria(primary_events, normalized.additional_criteria, ctx) if normalized.additional_criteria is not None and not normalized.additional_criteria.is_empty(): qualified_events = apply_result_limit(qualified_events, cohort_plan.qualified_limit_type) - qualified_events = _materialize( - qualified_events, ctx=ctx, cohort_id=cohort_id, stage="qualified", schema=schema 
- ) + if materialize: + qualified_events = _materialize( + qualified_events, ctx=ctx, cohort_id=cohort_id, stage="qualified", schema=schema + ) # ── Inclusion rules ───────────────────────────────────────────────── included_events = apply_inclusion_rules(qualified_events, normalized.inclusion_rules, ctx) included_events = apply_result_limit(included_events, cohort_plan.expression_limit_type) - included_events = _materialize( - included_events, ctx=ctx, cohort_id=cohort_id, stage="included", schema=schema - ) + if materialize: + included_events = _materialize( + included_events, ctx=ctx, cohort_id=cohort_id, stage="included", schema=schema + ) # ── End strategy ──────────────────────────────────────────────────── ended_events = apply_end_strategy(included_events, normalized.end_strategy, ctx) - ended_events = _materialize(ended_events, ctx=ctx, cohort_id=cohort_id, stage="ended", schema=schema) + if materialize: + ended_events = _materialize(ended_events, ctx=ctx, cohort_id=cohort_id, stage="ended", schema=schema) # ── Censoring + collapse (final stage — no materialize after) ────── censored_events = apply_censoring( diff --git a/tests/execution/test_phenotype_failures.py b/tests/execution/test_phenotype_failures.py index 6416000..c68e9cc 100644 --- a/tests/execution/test_phenotype_failures.py +++ b/tests/execution/test_phenotype_failures.py @@ -14,7 +14,6 @@ from __future__ import annotations import datetime -import json from pathlib import Path import pytest @@ -32,25 +31,203 @@ def _seed_minimal_cdm(conn, ibis): """Minimal CDM tables so cohort compilation doesn't fail on missing tables. 
Uses sentinel person_id=999 so no actual rows match real cohort criteria.""" S = D(2000, 1, 1) # sentinel date - conn.create_table("person", obj=ibis.memtable({"person_id": [999], "year_of_birth": [1900], "gender_concept_id": [0]}), overwrite=True) - conn.create_table("observation_period", obj=ibis.memtable({"person_id": [999], "observation_period_id": [999], "observation_period_start_date": [S], "observation_period_end_date": [S]}), overwrite=True) - conn.create_table("condition_occurrence", obj=ibis.memtable({"person_id": [999], "condition_occurrence_id": [999], "condition_concept_id": [0], "condition_start_date": [S], "condition_end_date": [S]}), overwrite=True) - conn.create_table("procedure_occurrence", obj=ibis.memtable({"person_id": [999], "procedure_occurrence_id": [999], "procedure_concept_id": [0], "procedure_date": [S]}), overwrite=True) - conn.create_table("measurement", obj=ibis.memtable({"person_id": [999], "measurement_id": [999], "measurement_concept_id": [0], "measurement_date": [S]}), overwrite=True) - conn.create_table("observation", obj=ibis.memtable({"person_id": [999], "observation_id": [999], "observation_concept_id": [0], "observation_date": [S]}), overwrite=True) - conn.create_table("drug_exposure", obj=ibis.memtable({"person_id": [999], "drug_exposure_id": [999], "drug_concept_id": [0], "drug_exposure_start_date": [S], "drug_exposure_end_date": [S]}), overwrite=True) + conn.create_table( + "person", + obj=ibis.memtable({"person_id": [999], "year_of_birth": [1900], "gender_concept_id": [0]}), + overwrite=True, + ) + conn.create_table( + "observation_period", + obj=ibis.memtable( + { + "person_id": [999], + "observation_period_id": [999], + "observation_period_start_date": [S], + "observation_period_end_date": [S], + } + ), + overwrite=True, + ) + conn.create_table( + "condition_occurrence", + obj=ibis.memtable( + { + "person_id": [999], + "condition_occurrence_id": [999], + "condition_concept_id": [0], + "condition_start_date": [S], + 
"condition_end_date": [S], + } + ), + overwrite=True, + ) + conn.create_table( + "procedure_occurrence", + obj=ibis.memtable( + { + "person_id": [999], + "procedure_occurrence_id": [999], + "procedure_concept_id": [0], + "procedure_date": [S], + } + ), + overwrite=True, + ) + conn.create_table( + "measurement", + obj=ibis.memtable( + { + "person_id": [999], + "measurement_id": [999], + "measurement_concept_id": [0], + "measurement_date": [S], + } + ), + overwrite=True, + ) + conn.create_table( + "observation", + obj=ibis.memtable( + { + "person_id": [999], + "observation_id": [999], + "observation_concept_id": [0], + "observation_date": [S], + } + ), + overwrite=True, + ) + conn.create_table( + "drug_exposure", + obj=ibis.memtable( + { + "person_id": [999], + "drug_exposure_id": [999], + "drug_concept_id": [0], + "drug_exposure_start_date": [S], + "drug_exposure_end_date": [S], + } + ), + overwrite=True, + ) conn.create_table("death", obj=ibis.memtable({"person_id": [999], "death_date": [S]}), overwrite=True) - conn.create_table("visit_occurrence", obj=ibis.memtable({"person_id": [999], "visit_occurrence_id": [999], "visit_concept_id": [0], "visit_start_date": [S], "visit_end_date": [S]}), overwrite=True) - conn.create_table("specimen", obj=ibis.memtable({"person_id": [999], "specimen_id": [999], "specimen_concept_id": [0], "specimen_date": [S]}), overwrite=True) - conn.create_table("device_exposure", obj=ibis.memtable({"person_id": [999], "device_exposure_id": [999], "device_concept_id": [0], "device_exposure_start_date": [S], "device_exposure_end_date": [S]}), overwrite=True) - conn.create_table("dose_era", obj=ibis.memtable({"person_id": [999], "dose_era_id": [999], "drug_concept_id": [0], "unit_concept_id": [0], "dose_value": [0.0], "dose_era_start_date": [S], "dose_era_end_date": [S]}), overwrite=True) - conn.create_table("payer_plan_period", obj=ibis.memtable({"person_id": [999], "payer_plan_period_id": [999], "payer_plan_period_start_date": [S], 
"payer_plan_period_end_date": [S]}), overwrite=True) - conn.create_table("visit_detail", obj=ibis.memtable({"person_id": [999], "visit_detail_id": [999], "visit_detail_concept_id": [0], "visit_detail_start_date": [S], "visit_detail_end_date": [S]}), overwrite=True) - conn.create_table("condition_era", obj=ibis.memtable({"person_id": [999], "condition_era_id": [999], "condition_concept_id": [0], "condition_era_start_date": [S], "condition_era_end_date": [S], "condition_occurrence_count": [1]}), overwrite=True) - conn.create_table("drug_era", obj=ibis.memtable({"person_id": [999], "drug_era_id": [999], "drug_concept_id": [0], "drug_era_start_date": [S], "drug_era_end_date": [S], "drug_exposure_count": [1], "gap_days": [0]}), overwrite=True) - conn.create_table("concept", obj=ibis.memtable({"concept_id": [0, 999], "invalid_reason": ["X", None]}), overwrite=True) - conn.create_table("concept_ancestor", obj=ibis.memtable({"ancestor_concept_id": [999], "descendant_concept_id": [999]}), overwrite=True) - conn.create_table("concept_relationship", obj=ibis.memtable({"concept_id_1": [999], "concept_id_2": [999], "relationship_id": ["X"], "invalid_reason": ["X"]}), overwrite=True) + conn.create_table( + "visit_occurrence", + obj=ibis.memtable( + { + "person_id": [999], + "visit_occurrence_id": [999], + "visit_concept_id": [0], + "visit_start_date": [S], + "visit_end_date": [S], + } + ), + overwrite=True, + ) + conn.create_table( + "specimen", + obj=ibis.memtable( + {"person_id": [999], "specimen_id": [999], "specimen_concept_id": [0], "specimen_date": [S]} + ), + overwrite=True, + ) + conn.create_table( + "device_exposure", + obj=ibis.memtable( + { + "person_id": [999], + "device_exposure_id": [999], + "device_concept_id": [0], + "device_exposure_start_date": [S], + "device_exposure_end_date": [S], + } + ), + overwrite=True, + ) + conn.create_table( + "dose_era", + obj=ibis.memtable( + { + "person_id": [999], + "dose_era_id": [999], + "drug_concept_id": [0], + 
"unit_concept_id": [0], + "dose_value": [0.0], + "dose_era_start_date": [S], + "dose_era_end_date": [S], + } + ), + overwrite=True, + ) + conn.create_table( + "payer_plan_period", + obj=ibis.memtable( + { + "person_id": [999], + "payer_plan_period_id": [999], + "payer_plan_period_start_date": [S], + "payer_plan_period_end_date": [S], + } + ), + overwrite=True, + ) + conn.create_table( + "visit_detail", + obj=ibis.memtable( + { + "person_id": [999], + "visit_detail_id": [999], + "visit_detail_concept_id": [0], + "visit_detail_start_date": [S], + "visit_detail_end_date": [S], + } + ), + overwrite=True, + ) + conn.create_table( + "condition_era", + obj=ibis.memtable( + { + "person_id": [999], + "condition_era_id": [999], + "condition_concept_id": [0], + "condition_era_start_date": [S], + "condition_era_end_date": [S], + "condition_occurrence_count": [1], + } + ), + overwrite=True, + ) + conn.create_table( + "drug_era", + obj=ibis.memtable( + { + "person_id": [999], + "drug_era_id": [999], + "drug_concept_id": [0], + "drug_era_start_date": [S], + "drug_era_end_date": [S], + "drug_exposure_count": [1], + "gap_days": [0], + } + ), + overwrite=True, + ) + conn.create_table( + "concept", obj=ibis.memtable({"concept_id": [0, 999], "invalid_reason": ["X", None]}), overwrite=True + ) + conn.create_table( + "concept_ancestor", + obj=ibis.memtable({"ancestor_concept_id": [999], "descendant_concept_id": [999]}), + overwrite=True, + ) + conn.create_table( + "concept_relationship", + obj=ibis.memtable( + {"concept_id_1": [999], "concept_id_2": [999], "relationship_id": ["X"], "invalid_reason": ["X"]} + ), + overwrite=True, + ) # The 3 persistently failing PhenotypeLibrary cohorts @@ -77,8 +254,9 @@ def test_phenotype_cohort_compiles(cohort_id: int) -> None: conn = ibis.duckdb.connect() _seed_minimal_cdm(conn, ibis) - # build_cohort exercises the full UNION ALL path without expensive DB execution + # build_cohort exercises the full UNION ALL path; materialize=False + # keeps this 
compile-only (no temp tables created). try: - build_cohort(expression, backend=conn, cdm_schema="main") + build_cohort(expression, backend=conn, cdm_schema="main", materialize=False) except Exception as exc: pytest.fail(f"Cohort {cohort_id} compilation failed: {exc}") diff --git a/tests/execution/test_union_scaling.py b/tests/execution/test_union_scaling.py index eecb7de..175ec31 100644 --- a/tests/execution/test_union_scaling.py +++ b/tests/execution/test_union_scaling.py @@ -89,7 +89,9 @@ def _build_multi_criterion_expression(n: int) -> CohortExpression: cs_id = i + 1 concept_sets.append(_make_concept_set(cs_id, concept_id=cs_id)) criteria.append(ConditionOccurrence(codeset_id=cs_id)) - return CohortExpression(concept_sets=concept_sets, primary_criteria=PrimaryCriteria(criteria_list=criteria)) + return CohortExpression( + concept_sets=concept_sets, primary_criteria=PrimaryCriteria(criteria_list=criteria) + ) @pytest.mark.parametrize("n_criteria", [1, 2, 5, 10, 20, 50, 100]) From 4a4da42f2e5d3d563727feea624f698007adb392 Mon Sep 17 00:00:00 2001 From: Jamie Gilbert Date: Thu, 14 May 2026 19:35:38 -0700 Subject: [PATCH 10/53] updated benchmarks to run well on databricks --- benchmarks/_backend.py | 150 +++++++++++++++++++++++++ benchmarks/benchmark_analyze_duckdb.py | 89 ++++++++------- benchmarks/benchmark_db_config.yaml | 39 ++++--- benchmarks/benchmark_run_py.py | 63 ++++++----- benchmarks/benchmark_run_r.R | 122 ++++++++++++++++---- 5 files changed, 364 insertions(+), 99 deletions(-) create mode 100644 benchmarks/_backend.py diff --git a/benchmarks/_backend.py b/benchmarks/_backend.py new file mode 100644 index 0000000..004ef71 --- /dev/null +++ b/benchmarks/_backend.py @@ -0,0 +1,150 @@ +"""Shared backend connection helpers for the benchmarks. + +Used by both :file:`benchmark_run_py.py` and :file:`benchmark_analyze_duckdb.py` +to connect to DuckDB (local file) or Databricks (via YAML config) uniformly. 
+""" + +from __future__ import annotations + +import os +import re +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +import ibis +import yaml + +# Optional Databricks support — checked lazily at connect time. +try: + import ibis.backends.databricks # noqa: F401 + + _HAS_IBIS_DATABRICKS = True +except ImportError: + _HAS_IBIS_DATABRICKS = False + +REPO_ROOT = Path(__file__).resolve().parent.parent +OUTPUT_DIR = REPO_ROOT / "benchmark_output" +CONFIG_PATH = Path(__file__).resolve().parent / "benchmark_db_config.yaml" +DUCKDB_PATH = OUTPUT_DIR / "eunomia.duckdb" + +R_COHORT_TABLE = "cohort" +PY_COHORT_TABLE = "cohort_py" +R_CHECKSUM_TABLE = "cohort_checksum" +PY_CHECKSUM_TABLE = "cohort_py_checksum" + +# CSV paths +R_CSV = OUTPUT_DIR / "r_checksum_times.csv" +PY_CSV = OUTPUT_DIR / "py_checksum_times.csv" + + +def _expandvars(text: str) -> str: + """Expand ``${ENV_VAR}`` patterns in *text*, falling back to an empty string.""" + return re.sub( + r"\$\{(\w+)\}", + lambda m: os.environ.get(m.group(1), ""), + text, + ) + + +def _expandvars_recursive(obj: Any) -> Any: + """Expand environment variables throughout a nested dict/list.""" + if isinstance(obj, str): + return _expandvars(obj) + if isinstance(obj, dict): + return {k: _expandvars_recursive(v) for k, v in obj.items()} + if isinstance(obj, list): + return [_expandvars_recursive(v) for v in obj] + return obj + + +@dataclass +class BackendConnection: + """Hold the configured connection and schema information for a benchmark run.""" + + backend: ibis.BaseBackend + cdm_schema: str + results_schema: str + vocabulary_schema: str + r_cohort_table: str + py_cohort_table: str + r_checksum_table: str + py_checksum_table: str + + +def load_config(backend_name: str) -> dict[str, Any]: + """Load the YAML configuration for *backend_name*. + + ``${ENV_VAR}`` placeholders are expanded from the process environment. 
+ """ + if not CONFIG_PATH.exists(): + raise FileNotFoundError(f"Config not found: {CONFIG_PATH}") + + config = yaml.safe_load(CONFIG_PATH.read_text()) + section = config.get(backend_name) + if section is None: + available = [k for k in config if k != "eunomia"] + raise ValueError(f"Unknown backend '{backend_name}'. Available: {', '.join(available)}") + + return _expandvars_recursive(section) + + +def connect_backend(backend_name: str) -> BackendConnection: + """Create and return a backend connection from the YAML config. + + For ``duckdb`` the configuration is pre-set (points to the local Eunomia + DuckDB file). For ``databricks`` the configuration must be provided in + :file:`benchmarks/benchmark_db_config.yaml`. + """ + if backend_name == "duckdb": + if not DUCKDB_PATH.exists(): + raise FileNotFoundError( + f"{DUCKDB_PATH} not found. Run 'Rscript benchmarks/benchmark_run_r.R' first." + ) + backend = ibis.duckdb.connect(str(DUCKDB_PATH)) + return BackendConnection( + backend=backend, + cdm_schema="main", + results_schema="main", + vocabulary_schema="main", + r_cohort_table=R_COHORT_TABLE, + py_cohort_table=PY_COHORT_TABLE, + r_checksum_table=R_CHECKSUM_TABLE, + py_checksum_table=PY_CHECKSUM_TABLE, + ) + + cfg = load_config(backend_name) + driver = cfg.get("driver", backend_name) + + if driver == "databricks": + if not _HAS_IBIS_DATABRICKS: + raise ImportError( + "ibis-framework[databricks] is required. 
Install with: " + "pip install 'ibis-framework[databricks]'" + ) + + conn_cfg = cfg["connection"] + db_cfg: dict[str, Any] = { + "host": conn_cfg["server_hostname"], + "http_path": conn_cfg["http_path"], + } + if conn_cfg.get("personal_access_token"): + db_cfg["token"] = conn_cfg["personal_access_token"] + if conn_cfg.get("catalog"): + db_cfg["catalog"] = conn_cfg["catalog"] + if conn_cfg.get("schema"): + db_cfg["schema"] = conn_cfg["schema"] + + backend = ibis.databricks.connect(**db_cfg) + return BackendConnection( + backend=backend, + cdm_schema=cfg["cdm_schema"], + results_schema=cfg["results_schema"], + vocabulary_schema=cfg.get("vocabulary_schema", cfg["cdm_schema"]), + r_cohort_table=cfg.get("r_cohort_table", R_COHORT_TABLE), + py_cohort_table=cfg.get("py_cohort_table", PY_COHORT_TABLE), + r_checksum_table=cfg.get("r_checksum_table", R_CHECKSUM_TABLE), + py_checksum_table=cfg.get("py_checksum_table", PY_CHECKSUM_TABLE), + ) + + raise ValueError(f"Unsupported driver: {driver}") diff --git a/benchmarks/benchmark_analyze_duckdb.py b/benchmarks/benchmark_analyze_duckdb.py index f69c19e..b897332 100644 --- a/benchmarks/benchmark_analyze_duckdb.py +++ b/benchmarks/benchmark_analyze_duckdb.py @@ -3,33 +3,36 @@ Reads the checksum timing CSVs produced by :file:`benchmarks/benchmark_run_r.R` and :file:`benchmarks/benchmark_run_py.py`, -queries the persisted history tables directly from the DuckDB database for +queries the persisted history tables directly from the database for cross-validation, and prints a paper-ready comparative summary. 
Usage:: - python benchmarks/benchmark_analyze_duckdb.py + python benchmarks/benchmark_analyze_duckdb.py # DuckDB + python benchmarks/benchmark_analyze_duckdb.py --backend databricks """ from __future__ import annotations +import argparse from pathlib import Path -import ibis import pandas as pd +from _backend import PY_CSV, R_CSV, connect_backend from compare_cohort_outputs import compare_cohort_outputs, print_comparison_report REPO_ROOT = Path(__file__).resolve().parent.parent -OUTPUT_DIR = REPO_ROOT / "benchmark_output" -DUCKDB_PATH = OUTPUT_DIR / "eunomia.duckdb" -R_CSV = OUTPUT_DIR / "r_checksum_times.csv" -PY_CSV = OUTPUT_DIR / "py_checksum_times.csv" -R_COHORT_TABLE = "cohort" -PY_COHORT_TABLE = "cohort_py" -R_CHECKSUM_TABLE = "cohort_checksum" -PY_CHECKSUM_TABLE = "cohort_py_checksum" +def _parse_args() -> argparse.Namespace: + p = argparse.ArgumentParser(description="CircePy benchmark result analyzer") + p.add_argument( + "--backend", + default="duckdb", + choices=("duckdb", "databricks"), + help="Target database backend for cross-validation (default: duckdb)", + ) + return p.parse_args() def load_csv(path: Path) -> pd.DataFrame | None: @@ -56,9 +59,7 @@ def print_coverage(label: str, df: pd.DataFrame) -> None: def print_timing(label: str, df: pd.DataFrame) -> None: - complete = ( - df[df["status"] == "COMPLETE"] if _has_status(df) else df - ) # checksum table — all rows are COMPLETE + complete = df[df["status"] == "COMPLETE"] if _has_status(df) else df if complete.empty: print(f" {label}: no completed cohorts to report timing") return @@ -73,11 +74,15 @@ def print_timing(label: str, df: pd.DataFrame) -> None: def cross_validate( - label: str, csv_df: pd.DataFrame, backend: ibis.BaseBackend, cohort_table: str, checksum_table: str + label: str, + csv_df: pd.DataFrame, + conn, + cohort_table: str, + checksum_table: str, ) -> None: """Read the persisted checksum table and compare with the CSV.""" try: - history = backend.table(checksum_table, 
database="main").execute() + history = conn.backend.table(checksum_table, database=conn.results_schema).execute() except Exception: print(f" {label} cross-validation: checksum table '{checksum_table}' not found") return @@ -90,13 +95,11 @@ def cross_validate( if complete_csv.empty: return - # Compare count of COMPLETE rows history_complete = history[history["status"] == "COMPLETE"] if _has_status(history) else history print(f" {label} cross-validation:") print(f" CSV rows : {len(complete_csv)}") print(f" DB rows : {len(history_complete)}") - # Compare total timing csv_total = complete_csv["generation_seconds"].sum() if "start_time" in history_complete.columns and "end_time" in history_complete.columns: starts = history_complete["start_time"] @@ -111,10 +114,10 @@ def cross_validate( print(f" Delta : {delta:.4f}s {'✓' if delta < 1.0 else '✗'}") -def print_cohort_row_counts(label: str, backend: ibis.BaseBackend, cohort_table: str) -> None: +def print_cohort_row_counts(label: str, conn, cohort_table: str) -> None: """Print row count summary from the cohort output table.""" try: - rows = backend.table(cohort_table, database="main").execute() + rows = conn.backend.table(cohort_table, database=conn.results_schema).execute() except Exception: print(f" {label} row counts: table '{cohort_table}' not found") return @@ -154,8 +157,11 @@ def compare_shared(label_prefix: str, r_df: pd.DataFrame, py_df: pd.DataFrame) - def main() -> None: + args = _parse_args() + backend_label = args.backend + print("=" * 60) - print("R vs Python CohortGenerator Benchmark Comparison") + print(f"R vs Python CohortGenerator Benchmark Comparison (backend={backend_label})") print("=" * 60) r_df = load_csv(R_CSV) @@ -181,37 +187,40 @@ def main() -> None: if py_df is not None: print_timing("Py", py_df) - # ── Cross-validation against persisted checksum tables ────────────── - if DUCKDB_PATH.exists(): - print("\nTable 3 — Cross-validation (CSV vs persisted checksum table)") - backend = 
ibis.duckdb.connect(str(DUCKDB_PATH)) + # ── Cross-validation & row counts (needs a backend connection) ────── + print("\nTable 3 — Cross-validation (CSV vs persisted checksum table)") + try: + conn = connect_backend(backend_label) + except Exception as exc: + print(f" Cannot connect to {backend_label}: {exc}") + conn = None + + if conn is not None: if r_df is not None: - cross_validate("R ", r_df, backend, R_COHORT_TABLE, R_CHECKSUM_TABLE) + cross_validate("R ", r_df, conn, conn.r_cohort_table, conn.r_checksum_table) if py_df is not None: - cross_validate("Py", py_df, backend, PY_COHORT_TABLE, PY_CHECKSUM_TABLE) - else: - print(f"\nTable 3 — Cross-validation: {DUCKDB_PATH} not found, skipping") + cross_validate("Py", py_df, conn, conn.py_cohort_table, conn.py_checksum_table) - # ── Cohort row counts ──────────────────────────────────────────────── - if DUCKDB_PATH.exists(): print("\nTable 4 — Cohort row counts") - backend = ibis.duckdb.connect(str(DUCKDB_PATH)) if r_df is not None: - print_cohort_row_counts("R ", backend, R_COHORT_TABLE) + print_cohort_row_counts("R ", conn, conn.r_cohort_table) if py_df is not None: - print_cohort_row_counts("Py", backend, PY_COHORT_TABLE) + print_cohort_row_counts("Py", conn, conn.py_cohort_table) + + print("\nTable 6 — Row-level parity (R vs Python)") + report = compare_cohort_outputs( + conn.backend, + r_table=conn.r_cohort_table, + py_table=conn.py_cohort_table, + schema=conn.results_schema, + ) + print_comparison_report(report) # ── R vs Python shared-cohort comparison ───────────────────────────── if r_df is not None and py_df is not None: print("\nTable 5 — R vs Python shared-cohort comparison") compare_shared("=>", r_df, py_df) - # ── Row-level cohort output comparison ────────────────────────────── - if DUCKDB_PATH.exists(): - backend = ibis.duckdb.connect(str(DUCKDB_PATH)) - report = compare_cohort_outputs(backend) - print_comparison_report(report) - print(f"\n{'=' * 60}") print("Analysis complete") print(f"{'=' * 
60}\n") diff --git a/benchmarks/benchmark_db_config.yaml b/benchmarks/benchmark_db_config.yaml index 4c7d366..fd84994 100644 --- a/benchmarks/benchmark_db_config.yaml +++ b/benchmarks/benchmark_db_config.yaml @@ -36,21 +36,30 @@ duckdb: # Use environment variables for sensitive credentials. # Databricks backend (for cloud-scale validation) -# Uncomment and configure to test against Databricks -# databricks: -# driver: "databricks" -# description: "Databricks SQL warehouse" -# connection: -# server_hostname: "${DATABRICKS_HOST}" -# http_path: "${DATABRICKS_HTTP_PATH}" -# personal_access_token: "${DATABRICKS_TOKEN}" -# cdm_schema: "hive_metastore.omop_cdm" -# vocabulary_schema: "hive_metastore.omop_cdm" -# results_schema: "hive_metastore.results" -# data_source: "provided" # Assumes OMOP data already loaded -# notes: | -# Requires Databricks workspace and valid credentials via environment variables. -# Use for validating CircePy with cloud-scale OMOP implementations. +# Set the required environment variables before running: +# export DATABRICKS_HOST="..." +# export DATABRICKS_HTTP_PATH="..." +# export DATABRICKS_TOKEN="..." +databricks: + driver: "databricks" + description: "Databricks SQL warehouse" + connection: + server_hostname: "${DATABRICKS_HOST}" + http_path: "${DATABRICKS_HTTP_PATH}" + personal_access_token: "${DATABRICKS_TOKEN}" + # catalog: "main" # optional — set if using Unity Catalog + # schema: "default" # optional — default schema + cdm_schema: "hive_metastore.omop_cdm" + vocabulary_schema: "hive_metastore.omop_cdm" + results_schema: "hive_metastore.results" + r_cohort_table: "cohort_r" + py_cohort_table: "cohort_py" + r_checksum_table: "cohort_r_checksum" + py_checksum_table: "cohort_py_checksum" + notes: | + Requires Databricks workspace, valid credentials via env vars, and + OMOP CDM data already loaded into the configured schema. + Uses ibis-framework[databricks] for Python connectivity. 
# Eunomia data source configuration eunomia: diff --git a/benchmarks/benchmark_run_py.py b/benchmarks/benchmark_run_py.py index 9139842..7a8e635 100644 --- a/benchmarks/benchmark_run_py.py +++ b/benchmarks/benchmark_run_py.py @@ -1,17 +1,17 @@ #!/usr/bin/env python3 -"""Runnable Python benchmark of PhenotypeLibrary cohorts on Eunomia (DuckDB). +"""Runnable Python benchmark of PhenotypeLibrary cohorts. Usage:: # Export PhenotypeLibrary cohort JSONs (one-time setup) Rscript benchmarks/export_phenotypes.R - # Optional: create the Eunomia DuckDB (Python can also reuse R's) - Rscript benchmarks/benchmark_run_r.R - - # Run the Python benchmark + # DuckDB (default — needs Eunomia DB from R) python benchmarks/benchmark_run_py.py + # Databricks (set DATABRICKS_HOST, DATABRICKS_HTTP_PATH, DATABRICKS_TOKEN) + python benchmarks/benchmark_run_py.py --backend databricks + Output (written to *benchmark_output/*):: py_checksum_times.csv -- per-phenotype generation timing and status @@ -19,11 +19,13 @@ from __future__ import annotations +import argparse import logging +import sys from pathlib import Path -import ibis import pandas as pd +from _backend import connect_backend from circe.cohort_definition_set import CohortDefinitionSet, generate_cohort_set from circe.cohortdefinition import CohortExpression @@ -38,21 +40,32 @@ OUTPUT_DIR = REPO_ROOT / "benchmark_output" JSON_DIR = OUTPUT_DIR / "phenotype_jsons" MANIFEST_PATH = OUTPUT_DIR / "phenotype_manifest.csv" -DUCKDB_PATH = OUTPUT_DIR / "eunomia.duckdb" RESULTS_CSV = OUTPUT_DIR / "py_checksum_times.csv" -COHORT_TABLE = "cohort_py" -CHECKSUM_TABLE = "cohort_py_checksum" -CDM_SCHEMA = "main" + +def _parse_args() -> argparse.Namespace: + p = argparse.ArgumentParser(description="Python circe cohort benchmark runner") + p.add_argument( + "--backend", + default="duckdb", + choices=("duckdb", "databricks"), + help="Target database backend (default: duckdb)", + ) + return p.parse_args() def main() -> None: + args = _parse_args() + 
backend_label = args.backend + # ── 1. Load phenotype definitions ──────────────────────────────────── print("Loading phenotype definitions ...") if not MANIFEST_PATH.exists(): - raise FileNotFoundError( - f"{MANIFEST_PATH} not found. Run 'Rscript benchmarks/export_phenotypes.R' first." + print( + f" {MANIFEST_PATH} not found. Run 'Rscript benchmarks/export_phenotypes.R' first.", + file=sys.stderr, ) + sys.exit(1) manifest = pd.read_csv(MANIFEST_PATH) print(f" Manifest has {len(manifest)} cohorts") @@ -73,23 +86,23 @@ def main() -> None: print(f" Skipped {skipped} cohorts with missing JSON files") print(f" Loaded {len(cds)} cohorts into CohortDefinitionSet") - # ── 2. Connect to DuckDB ───────────────────────────────────────────── - if not DUCKDB_PATH.exists(): - raise FileNotFoundError(f"{DUCKDB_PATH} not found. Run 'Rscript benchmarks/benchmark_run_r.R' first.") - print(f"Connecting to DuckDB: {DUCKDB_PATH}") - backend = ibis.duckdb.connect(str(DUCKDB_PATH)) + # ── 2. Connect to backend ──────────────────────────────────────────── + print(f"Connecting to backend: {backend_label}") + conn = connect_backend(backend_label) # ── 3. 
Generate cohorts ────────────────────────────────────────────── - print("Generating cohorts (incremental) ...") + checksum_table = conn.py_checksum_table + cohort_table = conn.py_cohort_table + print(f"Generating cohorts (incremental) → {conn.results_schema}.{cohort_table}") results = generate_cohort_set( cds, - backend=backend, - cdm_schema=CDM_SCHEMA, - cohort_table=COHORT_TABLE, - results_schema=CDM_SCHEMA, - vocabulary_schema=CDM_SCHEMA, + backend=conn.backend, + cdm_schema=conn.cdm_schema, + cohort_table=cohort_table, + results_schema=conn.results_schema, + vocabulary_schema=conn.vocabulary_schema, incremental=True, - checksum_table=CHECKSUM_TABLE, + checksum_table=checksum_table, stop_on_error=False, ) @@ -120,7 +133,7 @@ def main() -> None: skipped_df = df[df["status"] == "SKIPPED"] print(f"\n{'=' * 55}") - print("Python benchmark complete") + print(f"Python benchmark complete (backend={backend_label})") print(f" Phenotypes loaded : {len(manifest)}") print(f" COMPLETE : {len(complete_df)}") print(f" FAILED : {len(failed_df)}") diff --git a/benchmarks/benchmark_run_r.R b/benchmarks/benchmark_run_r.R index 00942a4..08b17ad 100644 --- a/benchmarks/benchmark_run_r.R +++ b/benchmarks/benchmark_run_r.R @@ -1,10 +1,11 @@ #!/usr/bin/env Rscript # benchmark_run_r.R # -# R CohortGenerator benchmark — PhenotypeLibrary on Eunomia (DuckDB). +# R CohortGenerator benchmark — PhenotypeLibrary on Eunomia (DuckDB) or Databricks. 
# # Usage: -# Rscript benchmarks/benchmark_run_r.R +# Rscript benchmarks/benchmark_run_r.R # DuckDB (default) +# Rscript benchmarks/benchmark_run_r.R --backend databricks # # Output (in benchmark_output/): # r_checksum_times.csv -- per-phenotype generation timing from checksum table @@ -17,6 +18,16 @@ suppressPackageStartupMessages({ library(dplyr) }) +# --------------------------------------------------------------------------- +# CLI +# --------------------------------------------------------------------------- +args <- commandArgs(trailingOnly = TRUE) +backend <- "duckdb" +if ("--backend" %in% args) { + idx <- which(args == "--backend") + if (idx < length(args)) backend <- args[idx + 1] +} + # --------------------------------------------------------------------------- # Paths # --------------------------------------------------------------------------- @@ -40,24 +51,96 @@ cds <- PhenotypeLibrary::getPlCohortDefinitionSet(cohortIds = phenotype_log$coho cat(sprintf(" Loaded %d phenotype definitions\n", nrow(cds))) # --------------------------------------------------------------------------- -# 2. Set up Eunomia DuckDB database +# 2. 
Set up database connection # --------------------------------------------------------------------------- -cat("Setting up Eunomia DuckDB...\n") -if (!file.exists(DUCKDB_PATH)) { - dbPath <- Eunomia::getDatabaseFile( - datasetName = "GiBleed", +if (backend == "duckdb") { + + cat("Setting up Eunomia DuckDB...\n") + if (!file.exists(DUCKDB_PATH)) { + dbPath <- Eunomia::getDatabaseFile( + datasetName = "GiBleed", + dbms = "duckdb", + databaseFile = DUCKDB_PATH + ) + } else { + dbPath <- DUCKDB_PATH + } + cat(sprintf(" Database: %s\n", dbPath)) + + connectionDetails <- DatabaseConnector::createConnectionDetails( dbms = "duckdb", - databaseFile = DUCKDB_PATH + server = dbPath ) + CDM_SCHEMA <- "main" + RESULTS_SCHEMA <- "main" + COHORT_TABLE <- "cohort" + TEMP_EMULATION_SCHEMA <- NULL + +} else if (backend == "databricks") { + + cat("Setting up Databricks connection...\n") + + # Read YAML config + config_path <- file.path(dirname(script_path), "benchmark_db_config.yaml") + if (!file.exists(config_path)) { + stop(sprintf("Config not found: %s", config_path)) + } + + # Simple YAML reader — extracts top-level key's connection block + yaml_txt <- readLines(config_path, warn = FALSE) + yaml_txt <- yaml_txt[!grepl("^\\s*#", yaml_txt)] # strip comments + + extract_yaml <- function(key) { + pattern <- sprintf("^\\s*%s\\s*:\\s*[\"']?(.+?)[\"']?\\s*$", key) + line <- grep(pattern, yaml_txt, value = TRUE) + if (length(line) == 0) return("") + sub(pattern, "\\1", line[1]) + } + + resolve_env <- function(val) { + # Expand ${VAR} placeholders + gsub("\\$\\{(\\w+)\\}", function(m) { + v <- Sys.getenv(gsub("[${}]", "", m), unset = "") + v + }, val, perl = TRUE) + } + + server_hostname <- resolve_env(extract_yaml("server_hostname")) + http_path <- resolve_env(extract_yaml("http_path")) + databricks_token <- resolve_env(extract_yaml("personal_access_token")) + + if (server_hostname == "" || http_path == "" || databricks_token == "") { + stop("Databricks credentials not found. 
Set DATABRICKS_HOST, DATABRICKS_HTTP_PATH, and DATABRICKS_TOKEN environment variables.") + } + + CDM_SCHEMA <- resolve_env(extract_yaml("cdm_schema")) + if (CDM_SCHEMA == "") CDM_SCHEMA <- "hive_metastore.omop_cdm" + + RESULTS_SCHEMA <- resolve_env(extract_yaml("results_schema")) + if (RESULTS_SCHEMA == "") RESULTS_SCHEMA <- "hive_metastore.results" + + COHORT_TABLE <- "cohort_r" + TEMP_EMULATION_SCHEMA <- RESULTS_SCHEMA # Databricks needs a real schema for temp + + conn_string <- paste0( + "jdbc:databricks://", server_hostname, ":443/default;", + "transportMode=http;ssl=1;", + "httpPath=", http_path, ";", + "AuthMech=3;UID=token;PWD=", databricks_token + ) + connectionDetails <- DatabaseConnector::createConnectionDetails( + dbms = "spark", connectionString = conn_string + ) + + cat(sprintf("Databricks host: %s\n", server_hostname)) + } else { - dbPath <- DUCKDB_PATH + stop(sprintf("Unknown backend: %s. Use 'duckdb' or 'databricks'.", backend)) } -cat(sprintf(" Database: %s\n", dbPath)) -connectionDetails <- DatabaseConnector::createConnectionDetails( - dbms = "duckdb", - server = dbPath -) +cat(sprintf("CDM schema : %s\n", CDM_SCHEMA)) +cat(sprintf("Results schema : %s\n", RESULTS_SCHEMA)) +cat(sprintf("Cohort table : %s\n", COHORT_TABLE)) # --------------------------------------------------------------------------- # 3. 
Generate cohorts using runCohortGeneration (incremental mode) @@ -65,12 +148,13 @@ connectionDetails <- DatabaseConnector::createConnectionDetails( cat("Generating cohorts (incremental)...\n") CohortGenerator::runCohortGeneration( connectionDetails = connectionDetails, - cdmDatabaseSchema = "main", - cohortDatabaseSchema = "main", + cdmDatabaseSchema = CDM_SCHEMA, + cohortDatabaseSchema = RESULTS_SCHEMA, + tempEmulationSchema = TEMP_EMULATION_SCHEMA, cohortDefinitionSet = cds, incremental = TRUE, outputFolder = OUTPUT_DIR, - databaseId = "eunomia", + databaseId = backend, stopOnError = FALSE ) @@ -80,7 +164,7 @@ CohortGenerator::runCohortGeneration( cat("Extracting checksum timing...\n") checksums <- CohortGenerator::getLastGeneratedCohortChecksums( connectionDetails = connectionDetails, - cohortDatabaseSchema = "main" + cohortDatabaseSchema = RESULTS_SCHEMA ) times <- checksums %>% @@ -99,7 +183,7 @@ write.csv(times, out_file, row.names = FALSE) # Summary # --------------------------------------------------------------------------- cat(sprintf("\n%s\n", paste(rep("=", 55), collapse = ""))) -cat(sprintf("R benchmark complete\n")) +cat(sprintf("R benchmark complete (backend=%s)\n", backend)) cat(sprintf(" Phenotypes loaded : %d\n", nrow(cds))) cat(sprintf(" Cohorts generated : %d\n", nrow(times))) cat(sprintf(" Total time (sum) : %.4fs\n", sum(times$generation_seconds, na.rm = TRUE))) From 4d402f96fde4815b794db538fb95c6592dd59012 Mon Sep 17 00:00:00 2001 From: Jamie Gilbert Date: Thu, 14 May 2026 20:15:11 -0700 Subject: [PATCH 11/53] Added files needed by python from benchmark script --- benchmarks/benchmark_run_r.R | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/benchmarks/benchmark_run_r.R b/benchmarks/benchmark_run_r.R index 08b17ad..7e31b78 100644 --- a/benchmarks/benchmark_run_r.R +++ b/benchmarks/benchmark_run_r.R @@ -50,6 +50,27 @@ phenotype_log <- PhenotypeLibrary::getPhenotypeLog() cds <- 
PhenotypeLibrary::getPlCohortDefinitionSet(cohortIds = phenotype_log$cohortId) cat(sprintf(" Loaded %d phenotype definitions\n", nrow(cds))) +# --------------------------------------------------------------------------- +# 1b. Export phenotype JSONs and manifest for the Python benchmark +# --------------------------------------------------------------------------- +cat("Exporting phenotype JSONs and manifest ...\n") +json_dir <- file.path(OUTPUT_DIR, "phenotype_jsons") +dir.create(json_dir, showWarnings = FALSE, recursive = TRUE) + +for (i in seq_len(nrow(cds))) { + cohort_id <- cds$cohortId[i] + json_path <- file.path(json_dir, sprintf("%d.json", cohort_id)) + writeLines(cds$json[i], json_path) +} + +manifest <- data.frame( + cohortId = cds$cohortId, + cohortName = cds$cohortName, + stringsAsFactors = FALSE +) +write.csv(manifest, file.path(OUTPUT_DIR, "phenotype_manifest.csv"), row.names = FALSE) +cat(sprintf(" Wrote %d JSONs and manifest\n", nrow(cds))) + # --------------------------------------------------------------------------- # 2. 
Set up database connection # --------------------------------------------------------------------------- From ba78e1bb63e0055e3db51604603eb7d3aa2b1cae Mon Sep 17 00:00:00 2001 From: Jamie Gilbert Date: Fri, 15 May 2026 13:59:48 -0700 Subject: [PATCH 12/53] Attempt to fix test warnings --- tests/execution/test_databricks_compat.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/execution/test_databricks_compat.py b/tests/execution/test_databricks_compat.py index 448bd3d..11d8b22 100644 --- a/tests/execution/test_databricks_compat.py +++ b/tests/execution/test_databricks_compat.py @@ -16,7 +16,8 @@ class FakeDatabricksBackend: def _post_connect(self): raise RuntimeError("CREATE VOLUME IF NOT EXISTS my_catalog.my_schema.memtable") - patched = apply_databricks_post_connect_workaround(backend_cls=FakeDatabricksBackend) + with pytest.warns(DeprecationWarning): + patched = apply_databricks_post_connect_workaround(backend_cls=FakeDatabricksBackend) assert patched is True backend = FakeDatabricksBackend() @@ -29,7 +30,8 @@ def _post_connect(self): _ = "CREATE VOLUME IF NOT EXISTS my_catalog.my_schema.memtable" raise RuntimeError("different setup error") - patched = apply_databricks_post_connect_workaround(backend_cls=FakeDatabricksBackend) + with pytest.warns(DeprecationWarning): + patched = apply_databricks_post_connect_workaround(backend_cls=FakeDatabricksBackend) assert patched is True backend = FakeDatabricksBackend() @@ -76,6 +78,8 @@ class FakeDatabricksBackend: def _post_connect(self): raise RuntimeError("CREATE VOLUME IF NOT EXISTS my_catalog.my_schema.memtable") - assert apply_databricks_post_connect_workaround(backend_cls=FakeDatabricksBackend) is True + with pytest.warns(DeprecationWarning): + assert apply_databricks_post_connect_workaround(backend_cls=FakeDatabricksBackend) is True + # Second call is idempotent — no warning because the patch flag is already set assert 
apply_databricks_post_connect_workaround(backend_cls=FakeDatabricksBackend) is True assert maybe_apply_databricks_post_connect_workaround(FakeDatabricksBackend()) is True From 98606041c68a454798ee85355b1a32bb4b4c7f29 Mon Sep 17 00:00:00 2001 From: Jamie Gilbert Date: Fri, 15 May 2026 14:46:13 -0700 Subject: [PATCH 13/53] remove chat slop --- circe/chat.py | 256 -------------------------------------------------- 1 file changed, 256 deletions(-) delete mode 100644 circe/chat.py diff --git a/circe/chat.py b/circe/chat.py deleted file mode 100644 index fd58d8d..0000000 --- a/circe/chat.py +++ /dev/null @@ -1,256 +0,0 @@ -""" -Chat module for interacting with LLMs to generate cohort definitions. -""" - -import json -import os -import re -import sys -from pathlib import Path -from typing import Optional - -from circe.prompt_builder import CohortPromptBuilder, ConceptSet - - -def chat_command(args): - """ - Entry point for the chat command. - """ - start_chat( - model=args.model, - prompt_type=args.prompt_type, - output=args.output, - concept_sets_file=args.concept_sets, - input_file=args.input_file, - ) - return 0 - - -def start_chat( - model: Optional[str], - prompt_type: str, - output: Optional[str], - concept_sets_file: Optional[str], - input_file: Optional[str] = None, -): - """ - Start the interactive chat session. 
- """ - # Check dependencies - try: - import litellm - from dotenv import load_dotenv - except ImportError: - print( - "Error: 'litellm' and 'python-dotenv' are required for chat functionality.", - file=sys.stderr, - ) - print( - "Please install them with: pip install litellm python-dotenv", - file=sys.stderr, - ) - return 1 - - # Load environment variables - load_dotenv() - - # Determine model - if not model: - model = os.getenv("LLM_MODEL", "gpt-4o") - # Handle optional temperature if needed, but litellm handles it or we pass it - - print("🚀 Starting Circe Chat") - print(f" Model: {model}") - print(f" Prompt: {prompt_type}") - print("-" * 50) - - # Load concept sets if provided - concept_sets_data = [] - if concept_sets_file: - try: - with open(concept_sets_file) as f: - raw_data = json.load(f) - # Expecting list of dicts with id, name - for item in raw_data: - concept_sets_data.append( - ConceptSet( - id=item.get("id"), - name=item.get("name"), - description=item.get("description"), - ) - ) - print(f" Loaded {len(concept_sets_data)} concept sets from {concept_sets_file}") - except Exception as e: - print(f"Error loading concept sets: {e}", file=sys.stderr) - return 1 - - # Initialize builder - builder = CohortPromptBuilder() - - try: - system_prompt = builder.load_system_prompt(prompt_type) - except Exception as e: - print(f"Error loading system prompt: {e}", file=sys.stderr) - return 1 - - # Add inference instruction if no concept sets provided - if not concept_sets_data: - system_prompt += ( - "\n\nIMPORTANT: No concept sets were provided.\n" - "You MUST infer appropriate concept sets from the clinical description.\n" - "1. Define them using `circe.vocabulary.concept_set`.\n" - "2. Add them to the builder using `.with_concept_sets(...)`.\n" - "3. Use valid OMOP Concept IDs (or realistic placeholders if exact IDs are unknown)." 
- ) - - messages = [{"role": "system", "content": system_prompt}] - - print("\nPlease describe the cohort you want to build (or type 'quit' to exit):") - - first_turn = True - initial_input = None - - if input_file: - try: - initial_input = Path(input_file).read_text() - print(f" Loaded clinical description from {input_file}") - except Exception as e: - print(f"Error reading input file: {e}", file=sys.stderr) - return 1 - - while True: - try: - if first_turn and initial_input: - user_input = initial_input - print("\n> [Processing input from file...]") - else: - user_input = input("\n> ") - except (EOFError, KeyboardInterrupt): - print("\nExiting chat.") - break - - if user_input.lower() in ("quit", "exit"): - break - - if not user_input.strip(): - continue - - # Turn off first_turn flag after we have a valid input - if first_turn: - first_turn = False - - # Construct user message - if len(messages) == 1: - # First user message - format nicely - formatted_content = f"\n---\n## User Task\n**Clinical Description:**\n{user_input}\n" - if concept_sets_data: - formatted_content += builder.format_concept_sets(concept_sets_data) - else: - formatted_content += "\nNo pre-defined concept sets provided. Please infer them." - - messages.append({"role": "user", "content": formatted_content}) - else: - messages.append({"role": "user", "content": user_input}) - - # Call AI - print("Thinking...") - try: - response = litellm.completion(model=model, messages=messages) - content = response.choices[0].message.content - print("\n" + content) - - messages.append({"role": "assistant", "content": content}) - - # Extract and process code - _process_response_content(content, output) - - except Exception as e: - print(f"\nError during API call: {e}", file=sys.stderr) - - -def _process_response_content(content: str, output_base: Optional[str]): - """ - Extract logic to find Python code, save it, and attempt to run it to generate JSON. 
- """ - # Look for python code block - code_match = re.search(r"```python\n(.*?)\n```", content, re.DOTALL) - if not code_match: - return - - code = code_match.group(1) - - # Determine output filenames - if output_base: - py_file = Path(output_base + ".py") - json_file = Path(output_base + ".json") - else: - # Default name - py_file = Path("cohort_definition.py") - json_file = Path("cohort_definition.json") - - # Save Python code - try: - py_file.write_text(code) - print(f"\n✅ Saved Python code to {py_file}") - except Exception as e: - print(f"Error saving Python file: {e}") - return - - # Attempt to execute and save JSON - # This involves running the code and capturing the 'cohort' variable or 'expression' variable - print(" Attempting to generate JSON...") - - try: - # Create a local scope - local_scope = {} - # We need to make sure the CWD is in path so imports work? - # Assuming we are running from project root or installed package - - exec(code, {}, local_scope) - - # Look for a CohortExpression or CohortBuilder object - # The prompt usually produces: - # cohort = CohortBuilder(...).build() - # So we look for 'cohort' - - cohort_obj = local_scope.get("cohort") - if not cohort_obj: - # Try to find any variable that is a tuple (builder) or CohortExpression - for _k, v in local_scope.items(): - if hasattr(v, "to_json"): # CohortExpression has to_json? Check API. - cohort_obj = v - break - - if cohort_obj: - # If it's the builder (tuple in some cases?), checks if it has build() - # But the prompt says `.build()` returns CohortExpression. - - # Check if it has 'to_json' or similar. - # circe.cohortdefinition.CohortExpression uses Pydantic? - # It inherits from Serializable? 
- - json_output = None - if hasattr(cohort_obj, "json"): # Pydantic v1/v2 - json_output = ( - cohort_obj.model_dump_json(indent=2) - if hasattr(cohort_obj, "model_dump_json") - else cohort_obj.json(indent=2) - ) - elif hasattr(cohort_obj, "to_json"): - json_output = cohort_obj.to_json() - else: - # It might be a dict? - if isinstance(cohort_obj, dict): - json_output = json.dumps(cohort_obj, indent=2) - - if json_output: - json_file.write_text(json_output) - print(f"✅ Saved Cohort JSON to {json_file}") - else: - print(" Could not serialize 'cohort' object to JSON.") - else: - print(" Could not find 'cohort' variable in executed code.") - - except Exception as e: - print(f" Error executing generated code: {e}") - print(" (Ensure the generated code is valid and all dependencies are installed)") From dd20010a84cf5e76b234ab509cd98d62999b36ff Mon Sep 17 00:00:00 2001 From: Jamie Gilbert Date: Sat, 16 May 2026 16:11:17 -0700 Subject: [PATCH 14/53] fixes and improved tests; MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ccrce/execution/engine/custom_era.py: - Issue 2 ✓ — _padded_end includes gap_days + offset; era end uses max(padded_end) - gap_days matching Circe BE - Issue 3 ✓ — Filters both drug_concept_id and drug_source_concept_id (with column existence guard) - Issue 5 ✓ — compute_drug_eras accepts cohort_person_ids; apply_custom_era_strategy semi-joins drug_exposure to cohort persons --- circe/execution/engine/custom_era.py | 32 +++++- tests/execution/test_custom_era.py | 141 ++++++++++++++++++++------- 2 files changed, 131 insertions(+), 42 deletions(-) diff --git a/circe/execution/engine/custom_era.py b/circe/execution/engine/custom_era.py index bf53091..8a80523 100644 --- a/circe/execution/engine/custom_era.py +++ b/circe/execution/engine/custom_era.py @@ -26,7 +26,9 @@ def _compute_exposure_end_date(table, *, days_supply_override: int | None): def _compute_eras(exposures, *, gap_days: int, offset: int): - padded = 
exposures.mutate(_padded_end=(exposures._exposure_end + ibis.interval(days=int(gap_days)))) + padded = exposures.mutate( + _padded_end=(exposures._exposure_end + ibis.interval(days=int(gap_days + offset))) + ) ordering = [ padded.start_date, @@ -62,18 +64,24 @@ def _compute_eras(exposures, *, gap_days: int, offset: int): collapsed = era_indexed.group_by(era_indexed.person_id, era_indexed._era_id).aggregate( era_start_date=era_indexed.start_date.min(), - _max_exposure_end=era_indexed._exposure_end.max(), + _max_padded_end=era_indexed._padded_end.max(), ) return collapsed.select( collapsed.person_id.cast("int64").name(PERSON_ID), collapsed.era_start_date.cast("date").name("era_start_date"), - (collapsed._max_exposure_end + ibis.interval(days=int(offset))).cast("date").name("era_end_date"), + (collapsed._max_padded_end - ibis.interval(days=int(gap_days))).cast("date").name("era_end_date"), ) def compute_drug_eras( - ctx, *, drug_codeset_id: int, gap_days: int, offset: int, days_supply_override: int | None + ctx, + *, + drug_codeset_id: int, + gap_days: int, + offset: int, + days_supply_override: int | None, + cohort_person_ids=None, ): concept_ids = ctx.concept_ids_for_codeset(drug_codeset_id) @@ -86,7 +94,18 @@ def compute_drug_eras( ) de = ctx.table("drug_exposure") - filtered = de.filter(de.drug_concept_id.isin(concept_ids)) + if cohort_person_ids is not None: + de = de.semi_join( + cohort_person_ids.select(cohort_person_ids.person_id).distinct(), + predicates=[de.person_id == cohort_person_ids.person_id], + ) + + if "drug_source_concept_id" in de.columns: + filtered = de.filter( + de.drug_concept_id.isin(concept_ids) | de.drug_source_concept_id.isin(concept_ids) + ) + else: + filtered = de.filter(de.drug_concept_id.isin(concept_ids)) prepared = filtered.select( filtered.person_id.cast("int64").name("person_id"), @@ -108,12 +127,15 @@ def apply_custom_era_strategy(events, strategy, ctx): with_bounds = attach_observation_bounds(events, ctx) return 
_replace_end_date(events, with_bounds, with_bounds.op_end_date) + cohort_person_ids = events.select(events.person_id).distinct() + eras = compute_drug_eras( ctx, drug_codeset_id=drug_codeset_id, gap_days=gap_days, offset=offset, days_supply_override=days_supply_override, + cohort_person_ids=cohort_person_ids, ) eras_for_join = eras.select( diff --git a/tests/execution/test_custom_era.py b/tests/execution/test_custom_era.py index 6a3a85b..ab327d4 100644 --- a/tests/execution/test_custom_era.py +++ b/tests/execution/test_custom_era.py @@ -18,9 +18,7 @@ def _make_concept_set(set_id: int, concept_id: int) -> ConceptSet: return ConceptSet( id=set_id, - expression=ConceptSetExpression( - items=[ConceptSetItem(concept=Concept(conceptId=concept_id))] - ), + expression=ConceptSetExpression(items=[ConceptSetItem(concept=Concept(conceptId=concept_id))]), ) @@ -92,9 +90,7 @@ def test_custom_era_merges_drugs_within_gap(): _make_concept_set(1, 111), _make_concept_set(2, 222), ], - primary_criteria=PrimaryCriteria( - criteria_list=[ConditionOccurrence(codeset_id=1)] - ), + primary_criteria=PrimaryCriteria(criteria_list=[ConditionOccurrence(codeset_id=1)]), end_strategy=CustomEraStrategy(drug_codeset_id=2, gap_days=30, offset=0), ) @@ -149,9 +145,7 @@ def test_custom_era_no_merge_across_large_gap(): _make_concept_set(1, 111), _make_concept_set(2, 222), ], - primary_criteria=PrimaryCriteria( - criteria_list=[ConditionOccurrence(codeset_id=1)] - ), + primary_criteria=PrimaryCriteria(criteria_list=[ConditionOccurrence(codeset_id=1)]), end_strategy=CustomEraStrategy(drug_codeset_id=2, gap_days=5, offset=0), ) @@ -207,9 +201,7 @@ def test_custom_era_offset_applied(): _make_concept_set(1, 111), _make_concept_set(2, 222), ], - primary_criteria=PrimaryCriteria( - criteria_list=[ConditionOccurrence(codeset_id=1)] - ), + primary_criteria=PrimaryCriteria(criteria_list=[ConditionOccurrence(codeset_id=1)]), end_strategy=CustomEraStrategy(drug_codeset_id=2, gap_days=30, offset=7), ) @@ -264,9 
+256,7 @@ def test_custom_era_no_matching_drugs(): _make_concept_set(1, 111), _make_concept_set(2, 999), ], - primary_criteria=PrimaryCriteria( - criteria_list=[ConditionOccurrence(codeset_id=1)] - ), + primary_criteria=PrimaryCriteria(criteria_list=[ConditionOccurrence(codeset_id=1)]), end_strategy=CustomEraStrategy(drug_codeset_id=2, gap_days=30, offset=0), ) @@ -303,9 +293,7 @@ def test_custom_era_with_drug_exposure_as_primary(): expression = CohortExpression( concept_sets=[_make_concept_set(1, 222)], - primary_criteria=PrimaryCriteria( - criteria_list=[DrugExposure(codeset_id=1)] - ), + primary_criteria=PrimaryCriteria(criteria_list=[DrugExposure(codeset_id=1)]), end_strategy=CustomEraStrategy(drug_codeset_id=1, gap_days=30, offset=0), ) @@ -316,9 +304,7 @@ def test_custom_era_with_drug_exposure_as_primary(): assert len(result) == 2 start_dates = sorted(result["start_date"].astype(str).tolist()) assert start_dates == ["2020-01-01", "2020-02-01"] - assert all( - str(d)[:10] == "2020-03-03" for d in result["end_date"] - ) + assert all(str(d)[:10] == "2020-03-03" for d in result["end_date"]) def test_compute_drug_eras_matches_java_sql_logic(): @@ -436,6 +422,72 @@ def test_compute_drug_eras_matches_java_sql_logic(): ) +def test_custom_era_offset_affects_era_grouping(): + """Offset included in padded_end changes which exposures merge into eras. 
+ + With gap_days=0, offset=30: + exp1: end=2020-01-10, exp2: start=2020-01-12 (gap=2 days) + + Without offset in padded_end: padded_end1=2020-01-10 (< start 2020-01-12) + → separate eras, cohort end=2020-01-10+30=2020-02-09 + + With offset in padded_end (Circe BE: DATEADD(day, gap+offset, end)): + padded_end1=2020-01-10+30=2020-02-09 (>= start 2020-01-12) + → merged era, cohort end=2020-01-20+30=2020-02-19 + """ + ibis = pytest.importorskip("ibis") + _ = pytest.importorskip("duckdb") + + conn = ibis.duckdb.connect() + _seed_common_tables(conn, ibis) + + conn.create_table( + "drug_exposure", + obj=ibis.memtable( + { + "person_id": [1, 1], + "drug_exposure_id": [1, 2], + "drug_concept_id": [222, 222], + "drug_exposure_start_date": [date(2020, 1, 1), date(2020, 1, 12)], + "drug_exposure_end_date": [date(2020, 1, 10), date(2020, 1, 20)], + "days_supply": [0, 0], + } + ), + overwrite=True, + ) + conn.create_table( + "condition_occurrence", + obj=ibis.memtable( + { + "person_id": [1], + "condition_occurrence_id": [100], + "condition_concept_id": [111], + "condition_start_date": [date(2020, 1, 1)], + "condition_end_date": [date(2020, 1, 1)], + "visit_occurrence_id": [10], + } + ), + overwrite=True, + ) + + expression = CohortExpression( + concept_sets=[ + _make_concept_set(1, 111), + _make_concept_set(2, 222), + ], + primary_criteria=PrimaryCriteria(criteria_list=[ConditionOccurrence(codeset_id=1)]), + end_strategy=CustomEraStrategy(drug_codeset_id=2, gap_days=0, offset=30), + ) + + result = build_cohort(expression, backend=conn, cdm_schema="main").execute() + + assert len(result) == 1 + assert str(result.iloc[0]["start_date"])[:10] == "2020-01-01" + # Both exposures merge because padded_end1=2020-02-09 >= start 2020-01-12 + # era end = max(end) + offset = 2020-01-20 + 30 = 2020-02-19 + assert str(result.iloc[0]["end_date"])[:10] == "2020-02-19" + + def test_full_cohort_custom_era_matches_sql_end_dates(): """Full cohort pipeline with CustomEraStrategy produces same 
end_dates as raw SQL.""" ibis = pytest.importorskip("ibis") @@ -478,9 +530,7 @@ def test_full_cohort_custom_era_matches_sql_end_dates(): _make_concept_set(1, 111), _make_concept_set(2, 222), ], - primary_criteria=PrimaryCriteria( - criteria_list=[ConditionOccurrence(codeset_id=1)] - ), + primary_criteria=PrimaryCriteria(criteria_list=[ConditionOccurrence(codeset_id=1)]), end_strategy=CustomEraStrategy(drug_codeset_id=2, gap_days=30, offset=0), ) @@ -488,13 +538,17 @@ def test_full_cohort_custom_era_matches_sql_end_dates(): cohort_result = build_cohort(expression, backend=conn, cdm_schema="main").execute() # --- raw SQL pipeline (Java CUSTOM_ERA_STRATEGY_TEMPLATE logic, DuckDB dialect) --- - # Computes drug eras, then matches era end_dates to events via start_date overlap. + # Mirrors Circe BE's generateCohort.sql end-date selection: + # ROW_NUMBER() PARTITION BY person_id, event_id ORDER BY era_end_date ASC + # picks the earliest strategy end per event, matching Circe BE's + # MIN(end_date) across #strategy_ends union. 
+ gap = 30 sql = f""" WITH drug_eras AS ( SELECT person_id, MIN(start_date) AS era_start_date, - MAX(padded_end) - 30 AS era_end_date + MAX(padded_end) - {gap} AS era_end_date FROM ( SELECT person_id, start_date, padded_end, @@ -521,7 +575,7 @@ def test_full_cohort_custom_era_matches_sql_end_dates(): de.drug_exposure_end_date::DATE, de.drug_exposure_start_date::DATE + de.days_supply::INTEGER, de.drug_exposure_start_date::DATE + 1 - ) + 30 AS padded_end + ) + {gap} AS padded_end FROM drug_exposure de WHERE de.drug_concept_id = 222 ) raw_ends @@ -538,20 +592,33 @@ def test_full_cohort_custom_era_matches_sql_end_dates(): op.observation_period_end_date::DATE AS op_end_date FROM condition_occurrence e JOIN observation_period op ON e.person_id = op.person_id + ), + ranked_ends AS ( + SELECT + ev.person_id, + ev.event_id, + ev.start_date, + ev.op_end_date, + er.era_end_date, + ROW_NUMBER() OVER ( + PARTITION BY ev.person_id, ev.event_id + ORDER BY er.era_end_date + ) AS rn + FROM events_with_obs ev + LEFT JOIN drug_eras er + ON ev.person_id = er.person_id + AND ev.start_date BETWEEN er.era_start_date AND er.era_end_date ) SELECT - ev.person_id, - ev.start_date, + person_id, + start_date, LEAST( - COALESCE(MAX(er.era_end_date), ev.op_end_date), - ev.op_end_date + COALESCE(era_end_date, op_end_date), + op_end_date )::DATE AS end_date - FROM events_with_obs ev - LEFT JOIN drug_eras er - ON ev.person_id = er.person_id - AND ev.start_date BETWEEN er.era_start_date AND er.era_end_date - GROUP BY ev.person_id, ev.event_id, ev.start_date, ev.op_end_date - ORDER BY ev.person_id, ev.start_date + FROM ranked_ends + WHERE rn = 1 + ORDER BY person_id, start_date """ sql_result = conn.con.sql(sql).fetchdf() From 3f5b21f7e53ba6d7f17fc63778a307c452f95b61 Mon Sep 17 00:00:00 2001 From: Jamie Gilbert Date: Sat, 16 May 2026 18:46:58 -0700 Subject: [PATCH 15/53] simplification of checksum process --- .../cohort_definition_set/_checksum_store.py | 72 +++++++++++++++++++ 
circe/cohort_definition_set/_generate.py | 48 ++++++------- circe/cohortdefinition/cohort.py | 58 ++++++++------- 3 files changed, 130 insertions(+), 48 deletions(-) diff --git a/circe/cohort_definition_set/_checksum_store.py b/circe/cohort_definition_set/_checksum_store.py index 08226fb..e80a471 100644 --- a/circe/cohort_definition_set/_checksum_store.py +++ b/circe/cohort_definition_set/_checksum_store.py @@ -217,3 +217,75 @@ def save_generation_history( ) merged = filtered_existing.union(new_relation, distinct=False) create_table(backend, table_name=table_name, schema=schema, obj=merged, overwrite=True) + + +def upsert_generation_history( + backend: IbisBackendLike, + *, + schema: str | None, + table_name: str, + cohort_id: int, + checksum: str, + status: str, + start_time: datetime, + end_time: datetime, +) -> None: + """Persist the generation result for a single cohort. + + Unlike ``save_generation_history()`` this operates on one cohort at a + time so that incremental persistence is possible — a completed cohort + is recorded immediately rather than waiting for the entire batch to + finish. + + Uses DELETE + INSERT (no full-table rewrite) so it is O(1) per call. + + Args: + backend: Ibis backend connection. + schema: Schema/database where the table lives. + table_name: Name of the generation history table. + cohort_id: Cohort definition id. + checksum: Expression checksum for this generation. + status: ``"COMPLETE"`` or ``"FAILED"``. + start_time: When execution started. + end_time: When execution ended. 
+ """ + import ibis + import pandas as pd + + from ..execution.ibis.operations import ( + create_table, + delete_cohort_rows, + insert_relation, + table_exists, + ) + + new_rows_df = pd.DataFrame( + [ + { + "cohort_definition_id": int(cohort_id), + "checksum": str(checksum), + "status": str(status), + "start_time": pd.to_datetime(start_time), + "end_time": pd.to_datetime(end_time), + } + ] + ) + + new_relation = ibis.memtable(new_rows_df) + + if not table_exists(backend, table_name=table_name, schema=schema): + create_table(backend, table_name=table_name, schema=schema, obj=new_relation, overwrite=False) + return + + delete_cohort_rows( + backend, + cohort_table=table_name, + results_schema=schema, + cohort_id=cohort_id, + ) + insert_relation( + new_relation, + backend=backend, + target_table=table_name, + target_schema=schema, + ) diff --git a/circe/cohort_definition_set/_generate.py b/circe/cohort_definition_set/_generate.py index 4f8f5e0..3727818 100644 --- a/circe/cohort_definition_set/_generate.py +++ b/circe/cohort_definition_set/_generate.py @@ -9,7 +9,7 @@ from ..execution.api import build_cohort, project_to_ohdsi_cohort_table, write_cohort from ..execution.errors import ExecutionError -from ._checksum_store import load_checksums, save_generation_history +from ._checksum_store import load_checksums, upsert_generation_history from ._core import CohortDefinitionSet, CohortGenerationResult if TYPE_CHECKING: @@ -78,7 +78,6 @@ def generate_cohort_set( ... 
print(r.cohort_name, r.status) """ total = len(cohort_definition_set) - current_checksums = cohort_definition_set.checksums() # Clear the correlated-events compilation cache so that entries # referencing a previous backend (whose ``id()`` may have been reused @@ -96,12 +95,11 @@ def generate_cohort_set( ) results: list[CohortGenerationResult] = [] - generated_this_run: dict[int, tuple[str, str, datetime, datetime]] = {} logger.info("Generating %d cohort(s) (incremental=%s)", total, incremental) for i, cohort in enumerate(cohort_definition_set, start=1): - current_checksum = current_checksums[cohort.cohort_id] + current_checksum = cohort.expression.checksum() if incremental and previous_checksums.get(cohort.cohort_id) == current_checksum: logger.info( @@ -205,12 +203,17 @@ def generate_cohort_set( error=exc, ) ) - generated_this_run[cohort.cohort_id] = ( - current_checksum, - "FAILED", - start_time or datetime.now(), - end_time, - ) + if incremental: + upsert_generation_history( + backend, + schema=results_schema, + table_name=checksum_table, + cohort_id=cohort.cohort_id, + checksum=current_checksum, + status="FAILED", + start_time=start_time or datetime.now(), + end_time=end_time, + ) if stop_on_error: raise continue @@ -231,20 +234,17 @@ def generate_cohort_set( end_time=end_time or datetime.now(), ) ) - generated_this_run[cohort.cohort_id] = ( - current_checksum, - "COMPLETE", - start_time or datetime.now(), - end_time or datetime.now(), - ) - - if incremental and generated_this_run: - save_generation_history( - backend, - schema=results_schema, - table_name=checksum_table, - generated=generated_this_run, - ) + if incremental: + upsert_generation_history( + backend, + schema=results_schema, + table_name=checksum_table, + cohort_id=cohort.cohort_id, + checksum=current_checksum, + status="COMPLETE", + start_time=start_time or datetime.now(), + end_time=end_time or datetime.now(), + ) summary = summarise_generation_results(results) logger.info( diff --git 
a/circe/cohortdefinition/cohort.py b/circe/cohortdefinition/cohort.py index 6071dc8..8cd4570 100644 --- a/circe/cohortdefinition/cohort.py +++ b/circe/cohortdefinition/cohort.py @@ -9,7 +9,6 @@ """ import contextlib -import json from typing import TYPE_CHECKING, Any, Optional, Union from pydantic import ( @@ -51,6 +50,32 @@ InclusionRule = Any +def _python_serialize(obj: Any) -> bytes: + """Deterministic Python-native serialization for checksum hashing. + + Recursively serializes Python builtins (dict, list, str, int, float, + bool, None) to a stable byte representation. Dict keys are sorted to + guarantee deterministic output across Python versions and platforms. + """ + if isinstance(obj, dict): + items = b",".join(_python_serialize(k) + b":" + _python_serialize(v) for k, v in sorted(obj.items())) + return b"{" + items + b"}" + if isinstance(obj, list): + items = b",".join(_python_serialize(v) for v in obj) + return b"[" + items + b"]" + if isinstance(obj, bool): + return b"true" if obj else b"false" + if isinstance(obj, int): + return repr(obj).encode("ascii") + if isinstance(obj, float): + return repr(obj).encode("ascii") + if isinstance(obj, str): + return obj.encode("utf-8") + if obj is None: + return b"null" + return repr(obj).encode("utf-8") + + class CohortExpression(CirceBaseModel): """Main cohort expression class containing all cohort definition components. @@ -364,19 +389,12 @@ def checksum(self, algorithm: str = "sha256") -> str: Hex digest of the checksum """ import hashlib - import json - - # 1. Dump with defaults excluded to handle implicit defaults - data = self.model_dump(exclude_unset=True, exclude_defaults=True, by_alias=True) - - # 2. Normalize: remove metadata, deduplicate concept sets, etc. - normalized_data = self._normalize_for_checksum(data) - - # 3. 
Serialize to canonical JSON - canonical_json = json.dumps(normalized_data, sort_keys=True) + data = self.model_dump(by_alias=True, exclude_none=True) + normalized = self._normalize_for_checksum(data) + serialized = _python_serialize(normalized) h = hashlib.new(algorithm) - h.update(canonical_json.encode("utf-8")) + h.update(serialized) return h.hexdigest() def _normalize_for_checksum(self, data: Any) -> Any: @@ -396,20 +414,14 @@ def _normalize_for_checksum(self, data: Any) -> Any: seen_items = set() for item in data["items"]: - # Normalize the item first norm_item = self._normalize_for_checksum(item) + item_bytes = _python_serialize(norm_item) - # Create a sortable/hashable representation for deduplication - # We need to sort keys to ensure tuple order is consistent - item_json = json.dumps(norm_item, sort_keys=True) - - if item_json not in seen_items: - seen_items.add(item_json) + if item_bytes not in seen_items: + seen_items.add(item_bytes) normalized_items.append(norm_item) - # Sort items to ensure list order doesn't affect hash - # Sort by the JSON string representation - normalized_items.sort(key=lambda x: json.dumps(x, sort_keys=True)) + normalized_items.sort(key=_python_serialize) new_data = data.copy() new_data["items"] = normalized_items @@ -417,8 +429,6 @@ def _normalize_for_checksum(self, data: Any) -> Any: # Handle Concept Objects (heuristically by fields) if "CONCEPT_ID" in data: - # Keep ID, remove metadata names/codes/vocab - # Keep only structural identifier return {"CONCEPT_ID": data["CONCEPT_ID"]} # Recurse for other dicts From f151953a235f453e4e9105c26f9a3163c93fc1b6 Mon Sep 17 00:00:00 2001 From: James Gilbert Date: Sat, 16 May 2026 22:44:34 -0400 Subject: [PATCH 16/53] Error log --- benchmarks/error_repoer.txt | 79 +++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 benchmarks/error_repoer.txt diff --git a/benchmarks/error_repoer.txt b/benchmarks/error_repoer.txt new file mode 100644 index 0000000..6697390 
--- /dev/null +++ b/benchmarks/error_repoer.txt @@ -0,0 +1,79 @@ +Traceback (most recent call last): + File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\circe\execution\ibis\operations.py", line 190, in insert_relation + _call_with_optional_database( + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~^ + insert, + ^^^^^^^ + ...<3 lines>... + overwrite=False, + ^^^^^^^^^^^^^^^^ + ) + ^ + File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\circe\execution\ibis\operations.py", line 13, in _call_with_optional_database + return method(*args, database=database, **kwargs) + File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\.venv\Lib\site-packages\ibis\backends\sql\__init__.py", line 457, in insert + self._run_pre_execute_hooks(obj) + ~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^ + File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\.venv\Lib\site-packages\ibis\backends\__init__.py", line 1300, in _run_pre_execute_hooks + self._register_in_memory_tables(expr) + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^ + File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\.venv\Lib\site-packages\ibis\backends\__init__.py", line 1277, in _register_in_memory_tables + self._register_in_memory_table(memtable) + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^ + File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\.venv\Lib\site-packages\ibis\backends\databricks\__init__.py", line 476, in _register_in_memory_table + cur.execute(put_into) + ~~~~~~~~~~~^^^^^^^^^^ + File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\.venv\Lib\site-packages\databricks\sql\telemetry\latency_logger.py", line 182, in wrapper + return func(self, *args, **kwargs) + File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\.venv\Lib\site-packages\databricks\sql\client.py", line 1357, in execute + self._handle_staging_operation( + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^ + staging_allowed_local_path=self.connection.staging_allowed_local_path, + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + input_stream=input_stream, + 
^^^^^^^^^^^^^^^^^^^^^^^^^^ + ) + ^ + File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\.venv\Lib\site-packages\databricks\sql\client.py", line 1103, in _handle_staging_operation + raise ProgrammingError( + ...<3 lines>... + ) +databricks.sql.exc.ProgrammingError: Local file operations are restricted to paths within the configured staging_allowed_local_path + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\benchmarks\benchmark_run_py.py", line 160, in + main() + ~~~~^^ + File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\benchmarks\benchmark_run_py.py", line 108, in main + results = generate_cohort_set( + cds, + ...<7 lines>... + stop_on_error=False, + ) + File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\circe\cohort_definition_set\_generate.py", line 238, in generate_cohort_set + upsert_generation_history( + ~~~~~~~~~~~~~~~~~~~~~~~~~^ + backend, + ^^^^^^^^ + ...<6 lines>... + end_time=end_time or datetime.now(), + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + ) + ^ + File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\circe\cohort_definition_set\_checksum_store.py", line 286, in upsert_generation_history + insert_relation( + ~~~~~~~~~~~~~~~^ + new_relation, + ^^^^^^^^^^^^^ + ...<2 lines>... + target_schema=schema, + ^^^^^^^^^^^^^^^^^^^^^ + ) + ^ + File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\circe\execution\ibis\operations.py", line 199, in insert_relation + raise ExecutionError( + ...<2 lines>... + ) from exc +circe.execution.errors.ExecutionError: Ibis executor write error: failed inserting relation into table 'benchmark_cohort_py_checksum' in schema 'scratch.scratch_jgilber2'. 
\ No newline at end of file From 95fcc6b9003dbecf630345ca9494223cf8b43db7 Mon Sep 17 00:00:00 2001 From: Jamie Gilbert Date: Sun, 17 May 2026 09:42:38 -0700 Subject: [PATCH 17/53] added fixes to checksums using raw insert/upsert instead of memtables --- .../cohort_definition_set/_checksum_store.py | 45 ++++++++-------- circe/execution/ibis/operations.py | 52 +++++++++++++++++++ 2 files changed, 76 insertions(+), 21 deletions(-) diff --git a/circe/cohort_definition_set/_checksum_store.py b/circe/cohort_definition_set/_checksum_store.py index e80a471..baa5972 100644 --- a/circe/cohort_definition_set/_checksum_store.py +++ b/circe/cohort_definition_set/_checksum_store.py @@ -250,31 +250,33 @@ def upsert_generation_history( end_time: When execution ended. """ import ibis - import pandas as pd from ..execution.ibis.operations import ( create_table, delete_cohort_rows, - insert_relation, + insert_rows_via_raw_sql, table_exists, ) - new_rows_df = pd.DataFrame( - [ - { - "cohort_definition_id": int(cohort_id), - "checksum": str(checksum), - "status": str(status), - "start_time": pd.to_datetime(start_time), - "end_time": pd.to_datetime(end_time), - } - ] - ) - - new_relation = ibis.memtable(new_rows_df) + columns = ["cohort_definition_id", "checksum", "status", "start_time", "end_time"] + row = [int(cohort_id), str(checksum), str(status), start_time, end_time] if not table_exists(backend, table_name=table_name, schema=schema): - create_table(backend, table_name=table_name, schema=schema, obj=new_relation, overwrite=False) + create_table( + backend, + table_name=table_name, + schema=schema, + obj=ibis.memtable( + { + "cohort_definition_id": [int(cohort_id)], + "checksum": [str(checksum)], + "status": [str(status)], + "start_time": [start_time], + "end_time": [end_time], + } + ), + overwrite=False, + ) return delete_cohort_rows( @@ -283,9 +285,10 @@ def upsert_generation_history( results_schema=schema, cohort_id=cohort_id, ) - insert_relation( - new_relation, - 
backend=backend, - target_table=table_name, - target_schema=schema, + insert_rows_via_raw_sql( + backend, + table_name=table_name, + schema=schema, + columns=columns, + rows=[row], ) diff --git a/circe/execution/ibis/operations.py b/circe/execution/ibis/operations.py index c8ba111..40789e1 100644 --- a/circe/execution/ibis/operations.py +++ b/circe/execution/ibis/operations.py @@ -119,6 +119,58 @@ def delete_cohort_rows( ) from exc +def insert_rows_via_raw_sql( + backend: IbisBackendLike, + *, + table_name: str, + schema: str | None, + columns: list[str], + rows: list[list], +) -> None: + """Insert rows into a backend table using a raw SQL INSERT VALUES statement. + + Avoids ``ibis.memtable()`` so that backends like Databricks (which + restrict staging/volume paths) can write small payloads without + hitting ``staging_allowed_local_path`` constraints. + """ + from datetime import datetime + + raw_sql = getattr(backend, "raw_sql", None) + if not callable(raw_sql): + raise ExecutionError("Ibis executor write error: backend does not support raw_sql for raw inserts.") + + catalog, database = _catalog_db_tuple(backend, schema) + quoted = getattr(getattr(backend, "compiler", None), "quoted", False) + + table = sg.table(table_name, db=database, catalog=catalog, quoted=quoted) + table_sql = table.sql(dialect=getattr(backend, "name", None) or "duckdb") + + cols_sql = ", ".join( + sg.column(c, quoted=quoted).sql(dialect=getattr(backend, "name", None) or "duckdb") for c in columns + ) + + def _sql_value(v): + if v is None: + return "NULL" + if isinstance(v, (int, float)): + return repr(v) + if isinstance(v, datetime): + return "'" + v.strftime("%Y-%m-%d %H:%M:%S") + "'" + return "'" + str(v).replace("'", "''") + "'" + + values_sql = ", ".join("(" + ", ".join(_sql_value(v) for v in row) + ")" for row in rows) + + statement = f"INSERT INTO {table_sql} ({cols_sql}) VALUES {values_sql}" + + try: + raw_sql(statement) + except Exception as exc: + raise ExecutionError( + "Ibis 
executor write error: failed inserting rows into " + f"table '{table_name}' in schema '{schema}'." + ) from exc + + def supports_transactional_replace(backend: IbisBackendLike) -> bool: """Return whether cohort-scoped delete+insert can run transactionally.""" return getattr(backend, "name", None) in {"duckdb", "postgres"} From 3141f10eaf3a5337f49c444eb9936e1800741345 Mon Sep 17 00:00:00 2001 From: James Gilbert Date: Sun, 17 May 2026 17:32:57 -0400 Subject: [PATCH 18/53] Error log --- benchmarks/error_repoer.txt | 200 ++++++++++++++++++++++++------------ 1 file changed, 135 insertions(+), 65 deletions(-) diff --git a/benchmarks/error_repoer.txt b/benchmarks/error_repoer.txt index 6697390..c8f1cb7 100644 --- a/benchmarks/error_repoer.txt +++ b/benchmarks/error_repoer.txt @@ -1,47 +1,3 @@ -Traceback (most recent call last): - File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\circe\execution\ibis\operations.py", line 190, in insert_relation - _call_with_optional_database( - ~~~~~~~~~~~~~~~~~~~~~~~~~~~~^ - insert, - ^^^^^^^ - ...<3 lines>... 
- overwrite=False, - ^^^^^^^^^^^^^^^^ - ) - ^ - File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\circe\execution\ibis\operations.py", line 13, in _call_with_optional_database - return method(*args, database=database, **kwargs) - File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\.venv\Lib\site-packages\ibis\backends\sql\__init__.py", line 457, in insert - self._run_pre_execute_hooks(obj) - ~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^ - File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\.venv\Lib\site-packages\ibis\backends\__init__.py", line 1300, in _run_pre_execute_hooks - self._register_in_memory_tables(expr) - ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^ - File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\.venv\Lib\site-packages\ibis\backends\__init__.py", line 1277, in _register_in_memory_tables - self._register_in_memory_table(memtable) - ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^ - File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\.venv\Lib\site-packages\ibis\backends\databricks\__init__.py", line 476, in _register_in_memory_table - cur.execute(put_into) - ~~~~~~~~~~~^^^^^^^^^^ - File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\.venv\Lib\site-packages\databricks\sql\telemetry\latency_logger.py", line 182, in wrapper - return func(self, *args, **kwargs) - File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\.venv\Lib\site-packages\databricks\sql\client.py", line 1357, in execute - self._handle_staging_operation( - ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^ - staging_allowed_local_path=self.connection.staging_allowed_local_path, - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - input_stream=input_stream, - ^^^^^^^^^^^^^^^^^^^^^^^^^^ - ) - ^ - File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\.venv\Lib\site-packages\databricks\sql\client.py", line 1103, in _handle_staging_operation - raise ProgrammingError( - ...<3 lines>... 
- ) -databricks.sql.exc.ProgrammingError: Local file operations are restricted to paths within the configured staging_allowed_local_path - -The above exception was the direct cause of the following exception: - Traceback (most recent call last): File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\benchmarks\benchmark_run_py.py", line 160, in main() @@ -52,28 +8,142 @@ Traceback (most recent call last): ...<7 lines>... stop_on_error=False, ) - File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\circe\cohort_definition_set\_generate.py", line 238, in generate_cohort_set - upsert_generation_history( - ~~~~~~~~~~~~~~~~~~~~~~~~~^ - backend, - ^^^^^^^^ - ...<6 lines>... - end_time=end_time or datetime.now(), - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\circe\cohort_definition_set\_generate.py", line 138, in generate_cohort_set + new_rows = build_cohort( + cohort.expression, + ...<5 lines>... + cohort_id=cohort.cohort_id, ) - ^ - File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\circe\cohort_definition_set\_checksum_store.py", line 286, in upsert_generation_history - insert_relation( - ~~~~~~~~~~~~~~~^ - new_relation, - ^^^^^^^^^^^^^ - ...<2 lines>... 
- target_schema=schema, + File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\circe\execution\api.py", line 56, in build_cohort + return build_cohort_table(normalized, ctx, cohort_id=cohort_id, materialize=materialize) + File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\circe\execution\engine\cohort.py", line 99, in build_cohort_table + included_events = _materialize( + included_events, ctx=ctx, cohort_id=cohort_id, stage="included", schema=schema + ) + File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\circe\execution\engine\cohort.py", line 39, in _materialize + create_table(ctx.backend, table_name=name, schema=schema, obj=table, overwrite=True) + ~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\circe\execution\ibis\operations.py", line 64, in create_table + _call_with_optional_database( + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~^ + backend.create_table, ^^^^^^^^^^^^^^^^^^^^^ + ...<2 lines>... + **kwargs, + ^^^^^^^^^ ) ^ - File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\circe\execution\ibis\operations.py", line 199, in insert_relation - raise ExecutionError( - ...<2 lines>... - ) from exc -circe.execution.errors.ExecutionError: Ibis executor write error: failed inserting relation into table 'benchmark_cohort_py_checksum' in schema 'scratch.scratch_jgilber2'. 
\ No newline at end of file + File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\circe\execution\ibis\operations.py", line 13, in _call_with_optional_database + return method(*args, database=database, **kwargs) + File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\.venv\Lib\site-packages\ibis\backends\databricks\__init__.py", line 228, in create_table + cur.execute(insert_stmt).fetchall() + ~~~~~~~~~~~^^^^^^^^^^^^^ + File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\.venv\Lib\site-packages\databricks\sql\telemetry\latency_logger.py", line 182, in wrapper + return func(self, *args, **kwargs) + File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\.venv\Lib\site-packages\databricks\sql\client.py", line 1341, in execute + self.active_result_set = self.backend.execute_command( + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~^ + operation=prepared_operation, + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + ...<10 lines>... + query_tags=query_tags, + ^^^^^^^^^^^^^^^^^^^^^^ + ) + ^ + File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\.venv\Lib\site-packages\databricks\sql\backend\thrift_backend.py", line 1066, in execute_command + execute_response, has_more_rows = self._handle_execute_response( + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^ + resp, cursor + ^^^^^^^^^^^^ + ) + ^ + File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\.venv\Lib\site-packages\databricks\sql\backend\thrift_backend.py", line 1273, in _handle_execute_response + final_operation_state = self._wait_until_command_done( + resp.operationHandle, + resp.directResults and resp.directResults.operationStatus, + ) + File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\.venv\Lib\site-packages\databricks\sql\backend\thrift_backend.py", line 954, in _wait_until_command_done + self._check_command_not_in_error_or_closed_state(op_handle, poll_resp) + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^ + File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\.venv\Lib\site-packages\databricks\sql\backend\thrift_backend.py", line 
634, in _check_command_not_in_error_or_closed_state + raise ServerOperationError( + ...<7 lines>... + ) +databricks.sql.exc.ServerOperationError: ShuffleMapStage 9148 (mapPartitionsInternal at PhotonExec.scala:865) has failed the maximum allowable number of times: 4. Most recent failure reason: +org.apache.spark.shuffle.PrismFetchFailedException: Prism failed to read shuffle data from BlockManagerId: BlockManagerId(9, 10.128.16.236, 4048, None). shuffleId=2880, firstMapdId=158410, firstMapIndex=4, firstReduceId=0. Error reason: status: ResourceExhausted, message: "Request is throttled: Low priority request queue is full", details: [], metadata: MetadataMap { headers: {"content-type": "application/grpc", "date": "Sun, 17 May 2026 21:32:04 GMT", "content-length": "0"} }. Failed shuffle fetch from executor node: 9 at 10.128.16.236 + at org.apache.spark.errors.SparkCoreErrors$.prismFetchFailedError(SparkCoreErrors.scala:442) + at org.apache.spark.errors.SparkCoreErrors.prismFetchFailedError(SparkCoreErrors.scala) + at com.databricks.spark.prism.PrismInputStream.read(PrismInputStream.java:128) + at java.base/java.io.FilterInputStream.read(FilterInputStream.java:82) + at java.base/java.io.PushbackInputStream.read(PushbackInputStream.java:135) + at org.apache.spark.storage.ShuffleBlockFetcherIterator.prismHasNext(ShuffleBlockFetcherIterator.scala:1011) + at org.apache.spark.storage.ShuffleBlockFetcherIterator.hasNext(ShuffleBlockFetcherIterator.scala:962) + at org.apache.spark.util.CompletionIterator.hasNext(CompletionIterator.scala:30) + at scala.collection.Iterator$$anon$9.hasNext(Iterator.scala:583) + at com.databricks.photon.LazyBlockFetcherIterator.hasNext(ShuffledBlockRDD.scala:443) + at scala.collection.Iterator$$anon$9.hasNext(Iterator.scala:583) + at scala.collection.convert.JavaCollectionWrappers$IteratorWrapper.hasNext(JavaCollectionWrappers.scala:32) + at com.databricks.photon.CloseableIterator$$anon$6.hasNext(CloseableIterator.scala:71) + at 
com.databricks.photon.JniApiImpl.open(Native Method) + at com.databricks.photon.JniApi.open(JniApi.scala) + at com.databricks.photon.JniExecNode.open(JniExecNode.java:74) + at com.databricks.photon.PhotonPreShuffleResultHandler.$anonfun$getResultImpl$1(PhotonExec.scala:1413) + at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.scala:18) + at com.databricks.photon.PhotonResultHandler.timeit(PhotonResultHandler.scala:33) + at com.databricks.photon.PhotonResultHandler.timeit$(PhotonResultHandler.scala:31) + at com.databricks.photon.PhotonPreShuffleResultHandler.timeit(PhotonExec.scala:1406) + at com.databricks.photon.PhotonPreShuffleResultHandler.getResultImpl(PhotonExec.scala:1413) + at com.databricks.photon.PhotonResultHandler.$anonfun$getResult$1(PhotonResultHandler.scala:74) + at com.databricks.photon.PhotonResultHandler.com$databricks$photon$PhotonResultHandler$$convertPhotonOOMIfNeeded(PhotonResultHandler.scala:53) + at com.databricks.photon.PhotonResultHandler.getResult(PhotonResultHandler.scala:74) + at com.databricks.photon.PhotonResultHandler.getResult$(PhotonResultHandler.scala:63) + at com.databricks.photon.PhotonPreShuffleResultHandler.getResult(PhotonExec.scala:1406) + at com.databricks.photon.PhotonBasicEvaluatorFactory$PhotonBasicEvaluator$$anon$1.open(PhotonBasicEvaluatorFactory.scala:258) + at com.databricks.photon.PhotonBasicEvaluatorFactory$PhotonBasicEvaluator$$anon$1.hasNextImpl(PhotonBasicEvaluatorFactory.scala:263) + at com.databricks.photon.PhotonBasicEvaluatorFactory$PhotonBasicEvaluator$$anon$1.$anonfun$hasNext$1(PhotonBasicEvaluatorFactory.scala:283) + at scala.runtime.java8.JFunction0$mcZ$sp.apply(JFunction0$mcZ$sp.scala:17) + at org.apache.spark.TaskContext.runFuncAsBillable(TaskContext.scala:274) + at com.databricks.photon.PhotonBasicEvaluatorFactory$PhotonBasicEvaluator$$anon$1.hasNext(PhotonBasicEvaluatorFactory.scala:283) + at com.databricks.photon.CloseableIterator$$anon$10.hasNext(CloseableIterator.scala:211) + at 
scala.collection.Iterator$$anon$9.hasNext(Iterator.scala:583) + at scala.collection.Iterator$$anon$9.hasNext(Iterator.scala:583) + at com.databricks.photon.MetadataOnlyShuffleWriter.write(MetadataOnlyShuffleWriter.scala:50) + at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:58) + at org.apache.spark.scheduler.ShuffleMapTask.$anonfun$runTask$5(ShuffleMapTask.scala:98) + at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110) + at org.apache.spark.scheduler.ShuffleMapTask.$anonfun$runTask$1(ShuffleMapTask.scala:93) + at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110) + at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:58) + at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:39) + at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:233) + at org.apache.spark.scheduler.Task.doRunTask(Task.scala:223) + at org.apache.spark.scheduler.Task.$anonfun$run$6(Task.scala:179) + at com.databricks.unity.UCSEphemeralState$Handle.runWith(UCSEphemeralState.scala:51) + at com.databricks.unity.HandleImpl.runWith(UCSHandle.scala:128) + at com.databricks.unity.HandleImpl.$anonfun$runWithAndClose$1(UCSHandle.scala:133) + at scala.util.Using$.resource(Using.scala:296) + at com.databricks.unity.HandleImpl.runWithAndClose(UCSHandle.scala:132) + at org.apache.spark.scheduler.TaskExecutionUtils$.withUCHandleForTaskExecution(Task.scala:391) + at org.apache.spark.scheduler.Task.$anonfun$run$3(Task.scala:173) + at org.apache.spark.scheduler.TaskExecutionUtils$.withCredentialsForTaskExecution(Task.scala:368) + at org.apache.spark.scheduler.Task.$anonfun$run$1(Task.scala:119) + at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110) + at org.apache.spark.scheduler.Task.run(Task.scala:114) + at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$13(Executor.scala:1593) + at 
org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:86) + at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:83) + at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:111) + at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:1598) + at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.scala:18) + at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110) + at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:1425) + at com.databricks.aether.RDDTask.run(RDDTask.scala:271) + at com.databricks.aether.worker.WorkerTaskAttemptThread.$anonfun$runInternal$1(AetherWorkerImpl.scala:362) + at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.scala:18) + at com.databricks.aether.AetherInt64Gauge.scopedAdd(AetherServiceMetricImpls.scala:129) + at com.databricks.aether.worker.WorkerTaskAttemptThread.runInternal(AetherWorkerImpl.scala:351) + at com.databricks.aether.worker.WorkerTaskAttemptThread.run(AetherWorkerImpl.scala:328) + at com.databricks.aether.FairBlockingQueue$SlotCountingRunnable.run(FairBlockingQueue.java:647) + at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136) + at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635) + at java.base/java.lang.Thread.run(Thread.java:840) \ No newline at end of file From fc48d8ac7b64e0033c199b51f710a7f87f9a82ff Mon Sep 17 00:00:00 2001 From: Jamie Gilbert Date: Sun, 17 May 2026 16:30:59 -0700 Subject: [PATCH 19/53] Async cohort generation and compilation with improved error handling for any exception to carry on to other cohorts --- circe/api.py | 1 + circe/cohort_definition_set/__init__.py | 3 +- circe/cohort_definition_set/_generate.py | 249 ++++++++++++++++------- tests/test_cohort_definition_set.py | 239 ++++++++++++++++++++++ 4 files changed, 412 insertions(+), 80 deletions(-) diff 
--git a/circe/api.py b/circe/api.py index 6645907..801f8fb 100644 --- a/circe/api.py +++ b/circe/api.py @@ -15,6 +15,7 @@ CohortDefinition, CohortDefinitionSet, CohortGenerationResult, + async_generate_cohort_set, generate_cohort_set, summarise_generation_results, ) diff --git a/circe/cohort_definition_set/__init__.py b/circe/cohort_definition_set/__init__.py index 79e9b90..14bc755 100644 --- a/circe/cohort_definition_set/__init__.py +++ b/circe/cohort_definition_set/__init__.py @@ -23,12 +23,13 @@ """ from ._core import CohortDefinition, CohortDefinitionSet, CohortGenerationResult -from ._generate import generate_cohort_set, summarise_generation_results +from ._generate import async_generate_cohort_set, generate_cohort_set, summarise_generation_results __all__ = [ "CohortDefinition", "CohortDefinitionSet", "CohortGenerationResult", + "async_generate_cohort_set", "generate_cohort_set", "summarise_generation_results", ] diff --git a/circe/cohort_definition_set/_generate.py b/circe/cohort_definition_set/_generate.py index 3727818..18e89ff 100644 --- a/circe/cohort_definition_set/_generate.py +++ b/circe/cohort_definition_set/_generate.py @@ -2,23 +2,68 @@ from __future__ import annotations +import asyncio import contextlib import logging +import threading from datetime import datetime from typing import TYPE_CHECKING, Literal -from ..execution.api import build_cohort, project_to_ohdsi_cohort_table, write_cohort -from ..execution.errors import ExecutionError +from ..execution.api import build_cohort, write_cohort from ._checksum_store import load_checksums, upsert_generation_history -from ._core import CohortDefinitionSet, CohortGenerationResult +from ._core import CohortDefinition, CohortDefinitionSet, CohortGenerationResult if TYPE_CHECKING: from ..execution.typing import IbisBackendLike logger = logging.getLogger(__name__) +_backend_lock = threading.Lock() -def generate_cohort_set( + +def _process_single_cohort( + cohort: CohortDefinition, + *, + backend: 
IbisBackendLike, + cdm_schema: str | None, + results_schema: str | None, + vocabulary_schema: str | None, + cohort_table: str, +) -> tuple[datetime, datetime]: + """Build and write a single cohort. Thread-safe via ``_backend_lock``. + + Returns ``(start_time, end_time)`` of the database-materialization + phase so the caller can compute execution duration. + """ + from ..execution.ibis.materialize import project_to_ohdsi_cohort_table + + with _backend_lock: + start_time = datetime.now() + new_rows = build_cohort( + cohort.expression, + backend=backend, + cdm_schema=cdm_schema, # type: ignore[arg-type] + results_schema=results_schema, + vocabulary_schema=vocabulary_schema, + use_persistent_cache=False, + cohort_id=cohort.cohort_id, + ) + projected = project_to_ohdsi_cohort_table(new_rows, cohort_id=cohort.cohort_id) + write_cohort( + compiled_relation=projected, + backend=backend, + cdm_schema=cdm_schema, # type: ignore[arg-type] + cohort_table=cohort_table, + cohort_id=cohort.cohort_id, + results_schema=results_schema, + vocabulary_schema=vocabulary_schema, + if_exists="replace", + ) + end_time = datetime.now() + return start_time, end_time + + +async def async_generate_cohort_set( cohort_definition_set: CohortDefinitionSet, *, backend: IbisBackendLike, @@ -29,66 +74,59 @@ def generate_cohort_set( incremental: bool = False, checksum_table: str = "cohort_checksum", stop_on_error: bool = True, + compile_timeout: float | None = None, ) -> list[CohortGenerationResult]: """Generate all cohorts in a CohortDefinitionSet and write them to a shared table. - This is the Python equivalent of OHDSI/CohortGenerator's ``generateCohortSet()``. - Each cohort is written to ``cohort_table`` with its ``cohort_id`` stamped into - ``cohort_definition_id``. If the table already contains rows for a cohort, they - are replaced (``if_exists="replace"`` semantics from ``write_cohort``). + This is the async counterpart of :func:`generate_cohort_set`. 
It wraps + the synchronous build/write pipeline in :func:`asyncio.to_thread` so + that each cohort's work does not block the event loop. When + *compile_timeout* is set, cohorts taking longer than the given number + of seconds are recorded as ``FAILED`` and the next cohort proceeds + (subject to *stop_on_error*). - When ``incremental=True``, cohorts whose expression checksum matches the stored - value in ``checksum_table`` are skipped. Successfully completed cohorts have - their checksums persisted to ``checksum_table`` so future runs can detect them. + All exception types (not just ``ExecutionError``) are caught and + recorded as ``FAILED``, ensuring that transient database errors such as + Databricks ``ServerOperationError`` do not abort the entire batch. Args: cohort_definition_set: The set of cohort definitions to generate. backend: Ibis backend connection pointing at the target database. cdm_schema: Schema containing the OMOP CDM source tables. cohort_table: Name of the OHDSI cohort table to write results into. - results_schema: Optional schema for both the cohort table and checksum table. - vocabulary_schema: Optional schema for vocabulary tables (defaults to cdm_schema). - incremental: If True, skip cohorts whose expression checksum is unchanged - since the last successful generation. - checksum_table: Name of the table used to persist checksums for incremental - runs. Defaults to ``"cohort_checksum"``. - stop_on_error: If True (default), raise the first ExecutionError encountered - and stop processing remaining cohorts. If False, record the failure and - continue. + results_schema: Optional schema for both the cohort table and + checksum table. + vocabulary_schema: Optional schema for vocabulary tables (defaults + to *cdm_schema*). + incremental: If True, skip cohorts whose expression checksum is + unchanged since the last successful generation. + checksum_table: Name of the table used to persist checksums for + incremental runs. 
Defaults to ``"cohort_checksum"``. + stop_on_error: If True (default), raise on the first failure and + stop processing remaining cohorts. If False, record the + failure and continue. + compile_timeout: Maximum time in seconds to allow per-cohort before + recording a timeout failure. ``None`` means no timeout. Returns: - A list of :class:`CohortGenerationResult` — one entry per cohort in the - set, in insertion order. + A list of :class:`CohortGenerationResult` — one entry per cohort + in the set, in insertion order. Raises: - ExecutionError: If a cohort fails to generate and ``stop_on_error=True``. - - Example: - >>> cds = CohortDefinitionSet() - >>> cds.add(1, "Diabetes", expr1) - >>> cds.add(2, "Hypertension", expr2) - >>> results = generate_cohort_set( - ... cds, - ... backend=conn, - ... cdm_schema="main", - ... cohort_table="cohort", - ... incremental=True, - ... ) - >>> for r in results: - ... print(r.cohort_name, r.status) + Exception: If a cohort fails and ``stop_on_error=True`` (the + exception type matches whatever the underlying operation + raised). """ total = len(cohort_definition_set) - # Clear the correlated-events compilation cache so that entries - # referencing a previous backend (whose ``id()`` may have been reused - # by the current connection) never collide. 
from ..execution.engine.group_operators import _COMPILED_CORRELATED_EVENTS _COMPILED_CORRELATED_EVENTS.clear() previous_checksums: dict[int, str] = {} if incremental: - previous_checksums = load_checksums( + previous_checksums = await asyncio.to_thread( + load_checksums, backend, schema=results_schema, table_name=checksum_table, @@ -103,7 +141,7 @@ def generate_cohort_set( if incremental and previous_checksums.get(cohort.cohort_id) == current_checksum: logger.info( - "[%d/%d] Skipping cohort %d (%s) — checksum unchanged", + "[%d/%d] Skipping cohort %d (%s) -- checksum unchanged", i, total, cohort.cohort_id, @@ -132,54 +170,69 @@ def generate_cohort_set( start_time: datetime | None = None end_time: datetime | None = None try: - # Compile cohort expression to an ibis relation (not timed for - # benchmark parity — benchmarks measure database execution only) - compile_start = datetime.now() - new_rows = build_cohort( - cohort.expression, - backend=backend, - cdm_schema=cdm_schema, - results_schema=results_schema, - vocabulary_schema=vocabulary_schema, - use_persistent_cache=False, - cohort_id=cohort.cohort_id, + start_time, end_time = await asyncio.wait_for( + asyncio.to_thread( + _process_single_cohort, + cohort, + backend=backend, + cdm_schema=cdm_schema, + results_schema=results_schema, + vocabulary_schema=vocabulary_schema, + cohort_table=cohort_table, + ), + timeout=compile_timeout, ) - compile_end = datetime.now() - compile_duration = (compile_end - compile_start).total_seconds() - new_rows = project_to_ohdsi_cohort_table(new_rows, cohort_id=cohort.cohort_id) - - # Materialize the compiled relation — this is the DB IO we time - start_time = datetime.now() - logger.debug( - "[%d/%d] Executing cohort %d (%s) ...", + + duration = (end_time - start_time).total_seconds() + logger.info( + "[%d/%d] Completed cohort %d (%s) -- duration %.1fs", i, total, cohort.cohort_id, cohort.cohort_name, + duration, ) - write_cohort( - compiled_relation=new_rows, - backend=backend, - 
cdm_schema=cdm_schema, - cohort_table=cohort_table, - cohort_id=cohort.cohort_id, - results_schema=results_schema, - vocabulary_schema=vocabulary_schema, - if_exists="replace", - ) - end_time = datetime.now() - - duration = (end_time - start_time).total_seconds() - logger.info( - "[%d/%d] Completed cohort %d (%s) — compile %.1fs, execute %.1fs", + except asyncio.TimeoutError: + if end_time is None: + end_time = datetime.now() + duration = (end_time - (start_time or end_time)).total_seconds() + logger.error( + "[%d/%d] TIMED OUT cohort %d (%s) after %.1fs", i, total, cohort.cohort_id, cohort.cohort_name, - compile_duration, duration, ) - except ExecutionError as exc: + timeout_exc = TimeoutError( + f"Cohort {cohort.cohort_id} ({cohort.cohort_name}) exceeded timeout of {compile_timeout:.0f}s" + ) + results.append( + CohortGenerationResult( + cohort_id=cohort.cohort_id, + cohort_name=cohort.cohort_name, + status="FAILED", + checksum=current_checksum, + start_time=start_time or datetime.now(), + end_time=end_time, + error=timeout_exc, + ) + ) + if incremental: + upsert_generation_history( + backend, + schema=results_schema, + table_name=checksum_table, + cohort_id=cohort.cohort_id, + checksum=current_checksum, + status="FAILED", + start_time=start_time or datetime.now(), + end_time=end_time, + ) + if stop_on_error: + raise timeout_exc from None + continue + except Exception as exc: if end_time is None: end_time = datetime.now() duration = (end_time - (start_time or end_time)).total_seconds() @@ -257,6 +310,43 @@ def generate_cohort_set( return results +def generate_cohort_set( + cohort_definition_set: CohortDefinitionSet, + *, + backend: IbisBackendLike, + cdm_schema: str, + cohort_table: str, + results_schema: str | None = None, + vocabulary_schema: str | None = None, + incremental: bool = False, + checksum_table: str = "cohort_checksum", + stop_on_error: bool = True, +) -> list[CohortGenerationResult]: + """Generate all cohorts in a CohortDefinitionSet and write them 
to a shared table. + + This synchronous wrapper delegates to :func:`async_generate_cohort_set` + via :func:`asyncio.run`. See that function for full parameter + documentation. + + Raises: + RuntimeError: If called from within a running asyncio event loop. + Use :func:`async_generate_cohort_set` directly in that case. + """ + return asyncio.run( + async_generate_cohort_set( + cohort_definition_set, + backend=backend, + cdm_schema=cdm_schema, + cohort_table=cohort_table, + results_schema=results_schema, + vocabulary_schema=vocabulary_schema, + incremental=incremental, + checksum_table=checksum_table, + stop_on_error=stop_on_error, + ) + ) + + def summarise_generation_results( results: list[CohortGenerationResult], ) -> dict[Literal["COMPLETE", "SKIPPED", "FAILED"], int]: @@ -266,7 +356,8 @@ def summarise_generation_results( results: List of CohortGenerationResult from generate_cohort_set. Returns: - dict with counts for each status, e.g. {"COMPLETE": 2, "SKIPPED": 1, "FAILED": 0}. + dict with counts for each status, e.g. + ``{"COMPLETE": 2, "SKIPPED": 1, "FAILED": 0}``. 
""" counts: dict[Literal["COMPLETE", "SKIPPED", "FAILED"], int] = { "COMPLETE": 0, diff --git a/tests/test_cohort_definition_set.py b/tests/test_cohort_definition_set.py index 91f1b06..3c85aad 100644 --- a/tests/test_cohort_definition_set.py +++ b/tests/test_cohort_definition_set.py @@ -307,6 +307,50 @@ def _failing_write_cohort(*, compiled_relation, cohort_id, **kwargs): assert results[0].error is not None +def test_generate_cohort_set_continue_on_non_execution_error(): + """Non-ExecutionError exceptions must also be caught and recorded.""" + ibis = pytest.importorskip("ibis") + _ = pytest.importorskip("duckdb") + + from unittest.mock import patch + + conn = ibis.duckdb.connect() + _seed_tables(conn, ibis) + + call_count = 0 + + def _failing_build(expression, *, backend, cohort_id, **kwargs): + nonlocal call_count + call_count += 1 + if cohort_id == 1: + raise RuntimeError("Simulated RuntimeError") + from circe.execution.api import build_cohort as real_build + + return real_build(expression, backend=backend, cohort_id=cohort_id, **kwargs) + + cds = CohortDefinitionSet() + cds.add(cohort_id=1, cohort_name="Bad", expression=_simple_expression()) + cds.add(cohort_id=2, cohort_name="Good", expression=_simple_expression()) + + with patch( + "circe.cohort_definition_set._generate.build_cohort", + side_effect=_failing_build, + ): + results = generate_cohort_set( + cds, + backend=conn, + cdm_schema="main", + cohort_table="cohort_non_exec", + stop_on_error=False, + ) + + assert call_count == 2 + statuses = {r.cohort_id: r.status for r in results} + assert statuses[1] == "FAILED" + assert statuses[2] == "COMPLETE" + assert isinstance(results[0].error, RuntimeError) + + def test_generate_cohort_set_stop_on_error(): ibis = pytest.importorskip("ibis") _ = pytest.importorskip("duckdb") @@ -531,5 +575,200 @@ def test_api_exports_cohort_definition_set(): assert hasattr(api, "CohortDefinitionSet") assert hasattr(api, "CohortDefinition") assert hasattr(api, 
"CohortGenerationResult") + assert hasattr(api, "async_generate_cohort_set") assert hasattr(api, "generate_cohort_set") assert hasattr(api, "summarise_generation_results") + + +# --------------------------------------------------------------------------- +# async_generate_cohort_set tests +# --------------------------------------------------------------------------- + + +def test_async_generate_cohort_set_basic(): + import asyncio + + ibis = pytest.importorskip("ibis") + _ = pytest.importorskip("duckdb") + + conn = ibis.duckdb.connect() + _seed_tables(conn, ibis) + + from circe.cohort_definition_set._generate import async_generate_cohort_set + + cds = CohortDefinitionSet() + cds.add(cohort_id=10, cohort_name="Cohort 10", expression=_simple_expression()) + cds.add(cohort_id=20, cohort_name="Cohort 20", expression=_simple_expression()) + + results = asyncio.run( + async_generate_cohort_set(cds, backend=conn, cdm_schema="main", cohort_table="cohort_async") + ) + + assert len(results) == 2 + assert all(r.status == "COMPLETE" for r in results) + assert {r.cohort_id for r in results} == {10, 20} + + cohort_table = conn.table("cohort_async").execute() + assert set(cohort_table.cohort_definition_id) == {10, 20} + + +def test_async_generate_cohort_set_continue_on_non_execution_error(): + """Non-ExecutionError exceptions (e.g. 
databricks ServerOperationError) + must be caught and recorded as FAILED.""" + import asyncio + + ibis = pytest.importorskip("ibis") + _ = pytest.importorskip("duckdb") + + from unittest.mock import patch + + conn = ibis.duckdb.connect() + _seed_tables(conn, ibis) + + from circe.cohort_definition_set._generate import async_generate_cohort_set + + call_count = 0 + + def _failing_build(expression, *, backend, cohort_id, **kwargs): + nonlocal call_count + call_count += 1 + if cohort_id == 1: + raise RuntimeError("Simulated non-ExecutionError failure") + from circe.execution.api import build_cohort as real_build + + return real_build(expression, backend=backend, cohort_id=cohort_id, **kwargs) + + cds = CohortDefinitionSet() + cds.add(cohort_id=1, cohort_name="Bad", expression=_simple_expression()) + cds.add(cohort_id=2, cohort_name="Good", expression=_simple_expression()) + + with patch( + "circe.cohort_definition_set._generate.build_cohort", + side_effect=_failing_build, + ): + results = asyncio.run( + async_generate_cohort_set( + cds, + backend=conn, + cdm_schema="main", + cohort_table="cohort_non_exec", + stop_on_error=False, + ) + ) + + assert call_count == 2 + statuses = {r.cohort_id: r.status for r in results} + assert statuses[1] == "FAILED" + assert statuses[2] == "COMPLETE" + assert isinstance(results[0].error, RuntimeError) + + +def test_async_generate_cohort_set_stop_on_error(): + import asyncio + + ibis = pytest.importorskip("ibis") + _ = pytest.importorskip("duckdb") + + from unittest.mock import patch + + conn = ibis.duckdb.connect() + _seed_tables(conn, ibis) + + from circe.cohort_definition_set._generate import async_generate_cohort_set + + def _always_fail(expression, *, backend, cohort_id, **kwargs): + raise ValueError("Simulated non-ExecutionError failure") + + cds = CohortDefinitionSet() + cds.add(cohort_id=1, cohort_name="Bad", expression=_simple_expression()) + cds.add(cohort_id=2, cohort_name="Also bad", expression=_simple_expression()) + + with 
( + patch( + "circe.cohort_definition_set._generate.build_cohort", + side_effect=_always_fail, + ), + pytest.raises(ValueError, match="Simulated non-ExecutionError failure"), + ): + asyncio.run( + async_generate_cohort_set( + cds, + backend=conn, + cdm_schema="main", + cohort_table="cohort_stop", + stop_on_error=True, + ) + ) + + +def test_async_generate_cohort_set_timeout(): + import asyncio + import time + + ibis = pytest.importorskip("ibis") + _ = pytest.importorskip("duckdb") + + from unittest.mock import patch + + conn = ibis.duckdb.connect() + _seed_tables(conn, ibis) + + from circe.cohort_definition_set._generate import async_generate_cohort_set + + def _slow_build(expression, *, backend, cohort_id, **kwargs): + time.sleep(0.5) + raise RuntimeError("should have timed out") + + cds = CohortDefinitionSet() + cds.add(cohort_id=1, cohort_name="Slow", expression=_simple_expression()) + + with patch( + "circe.cohort_definition_set._generate.build_cohort", + side_effect=_slow_build, + ): + results = asyncio.run( + async_generate_cohort_set( + cds, + backend=conn, + cdm_schema="main", + cohort_table="cohort_timeout", + stop_on_error=False, + compile_timeout=0.1, + ) + ) + + assert len(results) == 1 + assert results[0].status == "FAILED" + assert "timeout" in str(results[0].error).lower() + + +def test_async_generate_cohort_set_incremental_skip(): + import asyncio + + ibis = pytest.importorskip("ibis") + _ = pytest.importorskip("duckdb") + + conn = ibis.duckdb.connect() + _seed_tables(conn, ibis) + + from circe.cohort_definition_set._generate import async_generate_cohort_set + + cds = CohortDefinitionSet() + cds.add(cohort_id=1, cohort_name="A", expression=_simple_expression()) + cds.add(cohort_id=2, cohort_name="B", expression=_simple_expression()) + + # First run -- both COMPLETE + first = asyncio.run( + async_generate_cohort_set( + cds, backend=conn, cdm_schema="main", cohort_table="cohort_inc_async", incremental=True + ) + ) + assert all(r.status == "COMPLETE" 
for r in first) + + # Second run -- both SKIPPED + second = asyncio.run( + async_generate_cohort_set( + cds, backend=conn, cdm_schema="main", cohort_table="cohort_inc_async", incremental=True + ) + ) + assert all(r.status == "SKIPPED" for r in second) From f27ddb09d77aec25dbbcfcdbdde54f19b57d6434 Mon Sep 17 00:00:00 2001 From: Jamie Gilbert Date: Mon, 18 May 2026 08:48:07 -0700 Subject: [PATCH 20/53] updated concept set resolution to remove in memory python nonsense --- circe/cohort_definition_set/_generate.py | 402 ++++++----- circe/execution/__init__.py | 2 - circe/execution/api.py | 22 +- circe/execution/engine/cohort.py | 26 +- circe/execution/engine/custom_era.py | 2 +- circe/execution/engine/group_demographics.py | 61 +- circe/execution/ibis/codesets.py | 658 +++++++++++++----- circe/execution/ibis/compile_steps.py | 128 ++-- circe/execution/ibis/context.py | 62 +- circe/execution/ibis/person_filters.py | 28 +- .../test_codesets_persistent_cache.py | 206 ++---- tests/execution/test_compile_steps_helpers.py | 13 +- tests/execution/test_context_wiring.py | 138 ++-- tests/execution/test_group_demographics.py | 22 +- 14 files changed, 1052 insertions(+), 718 deletions(-) diff --git a/circe/cohort_definition_set/_generate.py b/circe/cohort_definition_set/_generate.py index 18e89ff..653ff20 100644 --- a/circe/cohort_definition_set/_generate.py +++ b/circe/cohort_definition_set/_generate.py @@ -10,17 +10,57 @@ from typing import TYPE_CHECKING, Literal from ..execution.api import build_cohort, write_cohort +from ..execution.ibis.codesets import _CODESET_TABLE, build_batch_codeset_table, drop_codeset_table +from ..execution.ibis.context import make_execution_context +from ..execution.normalize.cohort import normalize_cohort from ._checksum_store import load_checksums, upsert_generation_history from ._core import CohortDefinition, CohortDefinitionSet, CohortGenerationResult if TYPE_CHECKING: - from ..execution.typing import IbisBackendLike + from ..execution.typing 
import IbisBackendLike, Table logger = logging.getLogger(__name__) _backend_lock = threading.Lock() +def _collect_concept_sets( + cohort_definition_set: CohortDefinitionSet, +) -> dict[int: NormalizedConceptSet]: # type: ignore + """Normalize all cohort expressions and merge concept sets.""" + from ..execution.normalize.cohort import NormalizedConceptSet # noqa: F401 + + all_sets: dict[int, NormalizedConceptSet] = {} + for cohort in cohort_definition_set: + normalized = normalize_cohort(cohort.expression) + for cid, cset in normalized.concept_sets.items(): + if cid not in all_sets: + all_sets[cid] = cset + return all_sets + + +def _build_and_return_batch_codesets( + *, + backend: IbisBackendLike, + concept_sets: dict, + results_schema: str | None, + vocabulary_schema: str | None, + results_table_name: str, +) -> Table: + """Build batch codeset table. Called inside a thread.""" + from ..execution.typing import Table as TableType + + return build_batch_codeset_table( + backend=backend, + concept_sets=concept_sets, + batch_table_name=results_table_name, + results_schema=results_schema, + vocabulary_schema=vocabulary_schema, + use_persistent_cache=False, + temporary=False, + ) + + def _process_single_cohort( cohort: CohortDefinition, *, @@ -29,8 +69,11 @@ def _process_single_cohort( results_schema: str | None, vocabulary_schema: str | None, cohort_table: str, + codeset_table: Table, ) -> tuple[datetime, datetime]: - """Build and write a single cohort. Thread-safe via ``_backend_lock``. + """Build and write a single cohort against a pre-populated codeset table. + + Thread-safe via ``_backend_lock``. Returns ``(start_time, end_time)`` of the database-materialization phase so the caller can compute execution duration. 
@@ -45,8 +88,8 @@ def _process_single_cohort( cdm_schema=cdm_schema, # type: ignore[arg-type] results_schema=results_schema, vocabulary_schema=vocabulary_schema, - use_persistent_cache=False, cohort_id=cohort.cohort_id, + codeset_table=codeset_table, ) projected = project_to_ohdsi_cohort_table(new_rows, cohort_id=cohort.cohort_id) write_cohort( @@ -89,33 +132,10 @@ async def async_generate_cohort_set( recorded as ``FAILED``, ensuring that transient database errors such as Databricks ``ServerOperationError`` do not abort the entire batch. - Args: - cohort_definition_set: The set of cohort definitions to generate. - backend: Ibis backend connection pointing at the target database. - cdm_schema: Schema containing the OMOP CDM source tables. - cohort_table: Name of the OHDSI cohort table to write results into. - results_schema: Optional schema for both the cohort table and - checksum table. - vocabulary_schema: Optional schema for vocabulary tables (defaults - to *cdm_schema*). - incremental: If True, skip cohorts whose expression checksum is - unchanged since the last successful generation. - checksum_table: Name of the table used to persist checksums for - incremental runs. Defaults to ``"cohort_checksum"``. - stop_on_error: If True (default), raise on the first failure and - stop processing remaining cohorts. If False, record the - failure and continue. - compile_timeout: Maximum time in seconds to allow per-cohort before - recording a timeout failure. ``None`` means no timeout. - - Returns: - A list of :class:`CohortGenerationResult` — one entry per cohort - in the set, in insertion order. - - Raises: - Exception: If a cohort fails and ``stop_on_error=True`` (the - exception type matches whatever the underlying operation - raised). + A single batch codeset table is populated at the start once with *all* + concept sets from *all* cohorts, so that concept-ancestor and + concept-relationship lookups happen in one bulk query rather than per + cohort. 
The table is dropped before returning. """ total = len(cohort_definition_set) @@ -132,128 +152,189 @@ async def async_generate_cohort_set( table_name=checksum_table, ) + # Collect and materialise concept sets once for the entire batch + all_concept_sets = await asyncio.to_thread(_collect_concept_sets, cohort_definition_set) + batch_codesets_table_name = _CODESET_TABLE + codeset_table: Table | None = None + if all_concept_sets: + codeset_table = await asyncio.to_thread( + _build_and_return_batch_codesets, + backend=backend, + concept_sets=all_concept_sets, + results_schema=results_schema, + vocabulary_schema=vocabulary_schema, + results_table_name=batch_codesets_table_name, + ) + else: + # No concept sets -- create empty memtable to satisfy ExecutionContext + import ibis # noqa: PLC0415 + codeset_table = ibis.memtable( + {"codeset_id": [], "concept_id": []}, + schema={"codeset_id": "int64", "concept_id": "int64"}, + ) + results: list[CohortGenerationResult] = [] - logger.info("Generating %d cohort(s) (incremental=%s)", total, incremental) + logger.info( + "Generating %d cohort(s) (incremental=%s) using batch codeset table", + total, + incremental, + ) - for i, cohort in enumerate(cohort_definition_set, start=1): - current_checksum = cohort.expression.checksum() + try: + for i, cohort in enumerate(cohort_definition_set, start=1): + current_checksum = cohort.expression.checksum() + + if incremental and previous_checksums.get(cohort.cohort_id) == current_checksum: + logger.info( + "[%d/%d] Skipping cohort %d (%s) -- checksum unchanged", + i, + total, + cohort.cohort_id, + cohort.cohort_name, + ) + results.append( + CohortGenerationResult( + cohort_id=cohort.cohort_id, + cohort_name=cohort.cohort_name, + status="SKIPPED", + checksum=current_checksum, + start_time=datetime.now(), + end_time=datetime.now(), + ) + ) + continue - if incremental and previous_checksums.get(cohort.cohort_id) == current_checksum: logger.info( - "[%d/%d] Skipping cohort %d (%s) -- checksum 
unchanged", + "[%d/%d] Building cohort %d (%s) ...", i, total, cohort.cohort_id, cohort.cohort_name, ) - results.append( - CohortGenerationResult( - cohort_id=cohort.cohort_id, - cohort_name=cohort.cohort_name, - status="SKIPPED", - checksum=current_checksum, - start_time=datetime.now(), - end_time=datetime.now(), - ) - ) - continue - - logger.info( - "[%d/%d] Building cohort %d (%s) ...", - i, - total, - cohort.cohort_id, - cohort.cohort_name, - ) - start_time: datetime | None = None - end_time: datetime | None = None - try: - start_time, end_time = await asyncio.wait_for( - asyncio.to_thread( - _process_single_cohort, - cohort, - backend=backend, - cdm_schema=cdm_schema, - results_schema=results_schema, - vocabulary_schema=vocabulary_schema, - cohort_table=cohort_table, - ), - timeout=compile_timeout, - ) + start_time: datetime | None = None + end_time: datetime | None = None + try: + start_time, end_time = await asyncio.wait_for( + asyncio.to_thread( + _process_single_cohort, + cohort, + backend=backend, + cdm_schema=cdm_schema, + results_schema=results_schema, + vocabulary_schema=vocabulary_schema, + cohort_table=cohort_table, + codeset_table=codeset_table, + ), + timeout=compile_timeout, + ) - duration = (end_time - start_time).total_seconds() - logger.info( - "[%d/%d] Completed cohort %d (%s) -- duration %.1fs", - i, - total, - cohort.cohort_id, - cohort.cohort_name, - duration, - ) - except asyncio.TimeoutError: - if end_time is None: - end_time = datetime.now() - duration = (end_time - (start_time or end_time)).total_seconds() - logger.error( - "[%d/%d] TIMED OUT cohort %d (%s) after %.1fs", - i, - total, - cohort.cohort_id, - cohort.cohort_name, - duration, - ) - timeout_exc = TimeoutError( - f"Cohort {cohort.cohort_id} ({cohort.cohort_name}) exceeded timeout of {compile_timeout:.0f}s" - ) - results.append( - CohortGenerationResult( - cohort_id=cohort.cohort_id, - cohort_name=cohort.cohort_name, - status="FAILED", - checksum=current_checksum, - 
start_time=start_time or datetime.now(), - end_time=end_time, - error=timeout_exc, + duration = (end_time - start_time).total_seconds() + logger.info( + "[%d/%d] Completed cohort %d (%s) -- duration %.1fs", + i, + total, + cohort.cohort_id, + cohort.cohort_name, + duration, ) - ) - if incremental: - upsert_generation_history( - backend, - schema=results_schema, - table_name=checksum_table, - cohort_id=cohort.cohort_id, - checksum=current_checksum, - status="FAILED", - start_time=start_time or datetime.now(), - end_time=end_time, + except asyncio.TimeoutError: + if end_time is None: + end_time = datetime.now() + duration = (end_time - (start_time or end_time)).total_seconds() + logger.error( + "[%d/%d] TIMED OUT cohort %d (%s) after %.1fs", + i, + total, + cohort.cohort_id, + cohort.cohort_name, + duration, ) - if stop_on_error: - raise timeout_exc from None - continue - except Exception as exc: - if end_time is None: - end_time = datetime.now() - duration = (end_time - (start_time or end_time)).total_seconds() - logger.error( - "[%d/%d] FAILED cohort %d (%s) after %.1fs: %s", - i, - total, - cohort.cohort_id, - cohort.cohort_name, - duration, - exc, - ) + timeout_exc = TimeoutError( + f"Cohort {cohort.cohort_id} ({cohort.cohort_name}) " + f"exceeded timeout of {compile_timeout:.0f}s" + ) + results.append( + CohortGenerationResult( + cohort_id=cohort.cohort_id, + cohort_name=cohort.cohort_name, + status="FAILED", + checksum=current_checksum, + start_time=start_time or datetime.now(), + end_time=end_time, + error=timeout_exc, + ) + ) + if incremental: + upsert_generation_history( + backend, + schema=results_schema, + table_name=checksum_table, + cohort_id=cohort.cohort_id, + checksum=current_checksum, + status="FAILED", + start_time=start_time or datetime.now(), + end_time=end_time, + ) + if stop_on_error: + raise timeout_exc from None + continue + except Exception as exc: + if end_time is None: + end_time = datetime.now() + duration = (end_time - (start_time or 
end_time)).total_seconds() + logger.error( + "[%d/%d] FAILED cohort %d (%s) after %.1fs: %s", + i, + total, + cohort.cohort_id, + cohort.cohort_name, + duration, + exc, + ) + results.append( + CohortGenerationResult( + cohort_id=cohort.cohort_id, + cohort_name=cohort.cohort_name, + status="FAILED", + checksum=current_checksum, + start_time=start_time or datetime.now(), + end_time=end_time, + error=exc, + ) + ) + if incremental: + upsert_generation_history( + backend, + schema=results_schema, + table_name=checksum_table, + cohort_id=cohort.cohort_id, + checksum=current_checksum, + status="FAILED", + start_time=start_time or datetime.now(), + end_time=end_time, + ) + if stop_on_error: + raise + continue + + # Clean up staging tables created by the materialized pipeline + schema = results_schema or cdm_schema + for stage in ("primary", "qualified", "included", "ended"): + with contextlib.suppress(Exception): + backend.drop_table( + f"__cg_{cohort.cohort_id}_{stage}", database=schema, force=True + ) + results.append( CohortGenerationResult( cohort_id=cohort.cohort_id, cohort_name=cohort.cohort_name, - status="FAILED", + status="COMPLETE", checksum=current_checksum, start_time=start_time or datetime.now(), - end_time=end_time, - error=exc, + end_time=end_time or datetime.now(), ) ) if incremental: @@ -263,52 +344,29 @@ async def async_generate_cohort_set( table_name=checksum_table, cohort_id=cohort.cohort_id, checksum=current_checksum, - status="FAILED", + status="COMPLETE", start_time=start_time or datetime.now(), - end_time=end_time, + end_time=end_time or datetime.now(), ) - if stop_on_error: - raise - continue - - # Clean up staging tables created by the materialized pipeline - schema = results_schema or cdm_schema - for stage in ("primary", "qualified", "included", "ended"): - with contextlib.suppress(Exception): - backend.drop_table(f"__cg_{cohort.cohort_id}_{stage}", database=schema, force=True) - - results.append( - CohortGenerationResult( - 
cohort_id=cohort.cohort_id, - cohort_name=cohort.cohort_name, - status="COMPLETE", - checksum=current_checksum, - start_time=start_time or datetime.now(), - end_time=end_time or datetime.now(), - ) + + summary = summarise_generation_results(results) + logger.info( + "Cohort generation complete: %d completed, %d skipped, %d failed", + summary["COMPLETE"], + summary["SKIPPED"], + summary["FAILED"], ) - if incremental: - upsert_generation_history( + + return results + finally: + if all_concept_sets: + await asyncio.to_thread( + drop_codeset_table, backend, - schema=results_schema, - table_name=checksum_table, - cohort_id=cohort.cohort_id, - checksum=current_checksum, - status="COMPLETE", - start_time=start_time or datetime.now(), - end_time=end_time or datetime.now(), + batch_table_name=batch_codesets_table_name, + results_schema=results_schema, ) - summary = summarise_generation_results(results) - logger.info( - "Cohort generation complete: %d completed, %d skipped, %d failed", - summary["COMPLETE"], - summary["SKIPPED"], - summary["FAILED"], - ) - - return results - def generate_cohort_set( cohort_definition_set: CohortDefinitionSet, diff --git a/circe/execution/__init__.py b/circe/execution/__init__.py index 180d792..ba27df0 100644 --- a/circe/execution/__init__.py +++ b/circe/execution/__init__.py @@ -13,12 +13,10 @@ UnsupportedCriterionError, UnsupportedFeatureError, ) -from .ibis.codesets import clear_codeset_cache __all__ = [ "build_cohort", "write_cohort", - "clear_codeset_cache", "apply_databricks_post_connect_workaround", "ExecutionError", "ExecutionNormalizationError", diff --git a/circe/execution/api.py b/circe/execution/api.py index 63ae5c1..6d525f8 100644 --- a/circe/execution/api.py +++ b/circe/execution/api.py @@ -6,6 +6,7 @@ from .databricks_compat import maybe_apply_databricks_post_connect_workaround from .engine.cohort import build_cohort_table from .errors import ExecutionError +from .ibis.codesets import build_single_codeset_table from 
.ibis.context import make_execution_context from .ibis.materialize import project_to_ohdsi_cohort_table from .ibis.operations import ( @@ -32,6 +33,7 @@ def build_cohort( use_persistent_cache: bool = False, cohort_id: int = 0, materialize: bool = True, + codeset_table: Table | None = None, ) -> Table: """Normalize, compile, and assemble a cohort relation. @@ -39,20 +41,34 @@ def build_cohort( and *materialize* is True, so that the ibis expression tree never grows too large to compile. Set *materialize=False* for compile-only use (e.g. unit tests that only verify the expression tree can be built). + + When *codeset_table* is provided (from a batch-generation caller), it is + used directly. Otherwise one is auto-created for this single cohort and + dropped after the pipeline runs. """ maybe_apply_databricks_post_connect_workaround(backend) normalized = normalize_cohort(expression) + if codeset_table is not None: + own_table = False + else: + codeset_table = build_single_codeset_table( + backend=backend, + concept_sets=normalized.concept_sets, + batch_table_name=f"__cg_{cohort_id}_codesets", + results_schema=results_schema, + vocabulary_schema=vocabulary_schema, + ) + own_table = True + ctx = make_execution_context( backend=backend, cdm_schema=cdm_schema, results_schema=results_schema, vocabulary_schema=vocabulary_schema, - concept_sets=normalized.concept_sets, - use_persistent_cache=use_persistent_cache, + codeset_table=codeset_table, ) - return build_cohort_table(normalized, ctx, cohort_id=cohort_id, materialize=materialize) diff --git a/circe/execution/engine/cohort.py b/circe/execution/engine/cohort.py index 74e91bf..81825de 100644 --- a/circe/execution/engine/cohort.py +++ b/circe/execution/engine/cohort.py @@ -93,12 +93,26 @@ def build_cohort_table( ) # ── Inclusion rules ───────────────────────────────────────────────── - included_events = apply_inclusion_rules(qualified_events, normalized.inclusion_rules, ctx) - included_events = 
apply_result_limit(included_events, cohort_plan.expression_limit_type) - if materialize: - included_events = _materialize( - included_events, ctx=ctx, cohort_id=cohort_id, stage="included", schema=schema - ) + # Materialise after every inclusion rule so that the ibis expression tree + # never grows deeper than one rule's worth of operations. Without this a + # cohort with N rules builds an N-level tree that, when compiled into a + # single SQL statement, produces query plans too large for some backends + # (e.g. Databricks Spark) to execute without resource exhaustion. + if materialize and normalized.inclusion_rules: + included_events = qualified_events + for rule in normalized.inclusion_rules: + included_events = apply_additional_criteria(included_events, rule.expression, ctx) + included_events = _materialize( + included_events, ctx=ctx, cohort_id=cohort_id, stage="included", schema=schema + ) + included_events = apply_result_limit(included_events, cohort_plan.expression_limit_type) + else: + included_events = apply_inclusion_rules(qualified_events, normalized.inclusion_rules, ctx) + included_events = apply_result_limit(included_events, cohort_plan.expression_limit_type) + if materialize: + included_events = _materialize( + included_events, ctx=ctx, cohort_id=cohort_id, stage="included", schema=schema + ) # ── End strategy ──────────────────────────────────────────────────── ended_events = apply_end_strategy(included_events, normalized.end_strategy, ctx) diff --git a/circe/execution/engine/custom_era.py b/circe/execution/engine/custom_era.py index 8a80523..d223b88 100644 --- a/circe/execution/engine/custom_era.py +++ b/circe/execution/engine/custom_era.py @@ -83,7 +83,7 @@ def compute_drug_eras( days_supply_override: int | None, cohort_person_ids=None, ): - concept_ids = ctx.concept_ids_for_codeset(drug_codeset_id) + concept_ids = tuple(sorted(int(i) for i in ctx.concept_set_table(drug_codeset_id).execute().iloc[:, 0])) if not concept_ids: de = 
ctx.table("drug_exposure") diff --git a/circe/execution/engine/group_demographics.py b/circe/execution/engine/group_demographics.py index bc5920a..13c479b 100644 --- a/circe/execution/engine/group_demographics.py +++ b/circe/execution/engine/group_demographics.py @@ -3,7 +3,36 @@ import ibis from ..errors import UnsupportedFeatureError +from ..normalize.groups import NormalizedDemographicCriteria +from ..plan.schema import EVENT_ID, PERSON_ID +from ..typing import Table +from .group_keys import event_keys from ..ibis.context import ExecutionContext + + +def _apply_numeric_predicate(expr, predicate): + ... + + +def _apply_date_predicate(date_expr, predicate): + ... + + +def _demographic_concept_table( + *, + explicit_ids: tuple[int, ...], + codeset_id: int | None, + ctx: ExecutionContext, +) -> Table | None: + """Return an ibis Table with a single 'concept_id' column, or None if empty.""" + if codeset_id is not None: + return ctx.concept_set_table(codeset_id) + elif explicit_ids: + return ibis.memtable( + {"concept_id": list(explicit_ids)}, + schema={"concept_id": "int64"}, + ) + return None from ..normalize.groups import NormalizedDemographicCriteria from ..plan.schema import EVENT_ID, PERSON_ID from ..typing import Table @@ -80,20 +109,6 @@ def _apply_date_predicate(expr, predicate): ) -def _demographic_concept_ids( - *, - explicit_ids: tuple[int, ...], - codeset_id: int | None, - ctx: ExecutionContext, -) -> tuple[int, ...]: - all_ids = list(explicit_ids) - if codeset_id is not None: - for concept_id in ctx.concept_ids_for_codeset(codeset_id): - if concept_id not in all_ids: - all_ids.append(concept_id) - return tuple(all_ids) - - def demographic_match_keys( index_events: Table, demographic: NormalizedDemographicCriteria, @@ -115,29 +130,29 @@ def demographic_match_keys( age_years = event_date.year() - joined.year_of_birth predicates.append(_apply_numeric_predicate(age_years, demographic.age)) - gender_ids = _demographic_concept_ids( + gender_table = 
_demographic_concept_table( explicit_ids=demographic.gender_concept_ids, codeset_id=demographic.gender_codeset_id, ctx=ctx, ) - if gender_ids: - predicates.append(joined.gender_concept_id.isin(gender_ids)) + if gender_table is not None: + joined = joined.join(gender_table, joined.gender_concept_id == gender_table.concept_id) - race_ids = _demographic_concept_ids( + race_table = _demographic_concept_table( explicit_ids=demographic.race_concept_ids, codeset_id=demographic.race_codeset_id, ctx=ctx, ) - if race_ids: - predicates.append(joined.race_concept_id.isin(race_ids)) + if race_table is not None: + joined = joined.join(race_table, joined.race_concept_id == race_table.concept_id) - ethnicity_ids = _demographic_concept_ids( + ethnicity_table = _demographic_concept_table( explicit_ids=demographic.ethnicity_concept_ids, codeset_id=demographic.ethnicity_codeset_id, ctx=ctx, ) - if ethnicity_ids: - predicates.append(joined.ethnicity_concept_id.isin(ethnicity_ids)) + if ethnicity_table is not None: + joined = joined.join(ethnicity_table, joined.ethnicity_concept_id == ethnicity_table.concept_id) if demographic.occurrence_start_date is not None: predicates.append( diff --git a/circe/execution/ibis/codesets.py b/circe/execution/ibis/codesets.py index f0df82e..dcaae87 100644 --- a/circe/execution/ibis/codesets.py +++ b/circe/execution/ibis/codesets.py @@ -5,225 +5,519 @@ from collections.abc import Callable, Mapping from typing import Any +import ibis + from ..errors import CompilationError from ..normalize.cohort import NormalizedConceptSet, NormalizedConceptSetItem from ..plan.schema import CONCEPT_ID from ..typing import IbisBackendLike, Table +_CODESET_TABLE = "__cg_codesets" _CACHE_TABLE_NAME = "_circe_codeset_cache" def _compute_cache_key(items: tuple[NormalizedConceptSetItem, ...]) -> str: """Deterministic SHA-256 hash of sorted concept set items.""" canonical = sorted( - (item.concept_id, item.is_excluded, item.include_descendants, item.include_mapped) for item in 
items + (item.concept_id, item.is_excluded, item.include_descendants, item.include_mapped) + for item in items ) payload = json.dumps(canonical, separators=(",", ":")) return hashlib.sha256(payload.encode("utf-8")).hexdigest() -def clear_codeset_cache( - backend: IbisBackendLike, - results_schema: str | None, -) -> None: - """Drop the persistent codeset cache table if it exists.""" - from .operations import create_table, table_exists - - if not table_exists(backend, table_name=_CACHE_TABLE_NAME, schema=results_schema): - return +def _vocabulary_table( + table_name: str, + *, + vocabulary_schema: str | None, + table_getter: Callable[[str, str | None], Table], +) -> Table: + try: + return table_getter(table_name, vocabulary_schema) + except Exception as exc: + raise CompilationError( + f"Ibis executor compilation error: failed to access vocabulary table '{table_name}'." + ) from exc + + +def _descendant_expression( + ancestor_ids: tuple[int, ...], + *, + table_getter: Callable[[str, str | None], Table], + vocabulary_schema: str | None, +) -> Table: + """Build lazy ibis expression for descendant concept IDs of given ancestors. + + SELECT descendant_concept_id + FROM concept c + JOIN concept_ancestor ca ON c.concept_id = ca.descendant_concept_id + WHERE ca.ancestor_concept_id IN (...) 
AND c.invalid_reason IS NULL + """ + concept = _vocabulary_table("concept", vocabulary_schema=vocabulary_schema, table_getter=table_getter) + concept_ancestor = _vocabulary_table( + "concept_ancestor", vocabulary_schema=vocabulary_schema, table_getter=table_getter + ) + return ( + concept_ancestor.join(concept, concept_ancestor.descendant_concept_id == concept.concept_id) + .filter(concept_ancestor.ancestor_concept_id.isin(ancestor_ids)) + .filter(concept.invalid_reason.isnull()) + .select(concept_ancestor.descendant_concept_id.name(CONCEPT_ID)) + .distinct() + ) - import ibis - empty = ibis.memtable( - {"cache_key": [], "concept_id": []}, - schema={"cache_key": "string", "concept_id": "int64"}, +def _mapped_expression( + concept_ids: tuple[int, ...], + *, + table_getter: Callable[[str, str | None], Table], + vocabulary_schema: str | None, +) -> Table: + """Build lazy ibis expression for mapped-to concept IDs. + + SELECT DISTINCT cr.concept_id_1 AS concept_id + FROM concept_relationship cr + WHERE cr.concept_id_2 IN (...) 
+ AND cr.relationship_id = 'Maps to' + AND cr.invalid_reason IS NULL + """ + concept_relationship = _vocabulary_table( + "concept_relationship", vocabulary_schema=vocabulary_schema, table_getter=table_getter ) - create_table(backend, table_name=_CACHE_TABLE_NAME, schema=results_schema, obj=empty, overwrite=True) - - -class CachedConceptSetResolver: - """Resolve concept sets to concrete concept IDs using vocabulary tables.""" - - def __init__( - self, - *, - table_getter: Callable[[str, str | None], Table], - vocabulary_schema: str | None, - concept_sets: Mapping[int, NormalizedConceptSet], - backend: IbisBackendLike | None = None, - results_schema: str | None = None, - use_persistent_cache: bool = False, - ) -> None: - self._table_getter = table_getter - self._vocabulary_schema = vocabulary_schema - self._concept_sets = concept_sets - self._cache: dict[int, tuple[int, ...]] = {} - self._backend = backend - self._results_schema = results_schema - self._use_persistent_cache = ( - use_persistent_cache and backend is not None and results_schema is not None - ) - self._persistent_cache_initialized: bool = False - - def resolve_codeset(self, codeset_id: int) -> tuple[int, ...]: - normalized_id = int(codeset_id) - if normalized_id in self._cache: - return self._cache[normalized_id] - - concept_set = self._concept_sets.get(normalized_id) - if concept_set is None or not concept_set.items: - return () - - # L2: persistent cache lookup - cache_key: str | None = None - if self._use_persistent_cache: - cache_key = _compute_cache_key(concept_set.items) - persistent_hit = self._read_persistent_cache(cache_key) - if persistent_hit is not None: - self._cache[normalized_id] = persistent_hit - return persistent_hit - - include_ids: set[int] = set() - exclude_ids: set[int] = set() - for item in concept_set.items: - expanded = self._expand_item(item) - if item.is_excluded: - exclude_ids.update(expanded) - else: - include_ids.update(expanded) + return ( + 
concept_relationship.filter(concept_relationship.concept_id_2.isin(concept_ids)) + .filter(concept_relationship.relationship_id == "Maps to") + .filter(concept_relationship.invalid_reason.isnull()) + .select(concept_relationship.concept_id_1.name(CONCEPT_ID)) + .distinct() + ) + + +def build_concept_set_expression( + concept_set: NormalizedConceptSet, + *, + table_getter: Callable[[str, str | None], Table], + vocabulary_schema: str | None, +) -> Table: + """Build a lazy ibis Table expression that resolves all concept IDs for one concept set. - resolved = tuple(sorted(include_ids - exclude_ids)) - self._cache[normalized_id] = resolved + The returned ibis expression is never executed at build time -- it becomes + a subquery embedded in the final cohort SQL. The database engine performs + the concept-ancestor and concept-relationship joins at execution time. + """ + include_parts: list[Table] = [] + exclude_ids: list[int] = [] - # L2: persistent cache write - if self._use_persistent_cache and cache_key is not None and resolved: - self._write_persistent_cache(cache_key, resolved) + for item in concept_set.items: + if item.concept_id is None: + continue - return resolved + direct: tuple[int, ...] 
= (int(item.concept_id),) - def _expand_item(self, item: NormalizedConceptSetItem) -> set[int]: - base_ids: set[int] = {int(item.concept_id)} if item.include_descendants: - base_ids.update(self._descendant_ids(base_ids)) + desc = _descendant_expression( + direct, table_getter=table_getter, vocabulary_schema=vocabulary_schema + ) + else: + desc = ibis.memtable({"concept_id": [int(item.concept_id)]}, schema={"concept_id": "int64"}) + # For exclude items without descendants, just track the concept id + if item.is_excluded: + exclude_ids.append(int(item.concept_id)) + continue - expanded = set(base_ids) if item.include_mapped: - expanded.update(self._mapped_ids(base_ids)) - return expanded + mapped = _mapped_expression( + direct, table_getter=table_getter, vocabulary_schema=vocabulary_schema + ) + if item.include_descendants: + # Need mapped for both direct AND descendants + desc_mapped = _mapped_expression( + direct, table_getter=table_getter, vocabulary_schema=vocabulary_schema + ) + full = _union_all_tables([desc, desc_mapped]) + else: + full = _union_all_tables([desc, mapped]) + else: + full = desc - def _vocabulary_table(self, table_name: str) -> Table: - try: - return self._table_getter(table_name, self._vocabulary_schema) - except Exception as exc: # pragma: no cover - backend specific error types - raise CompilationError( - f"Ibis executor compilation error: failed to access vocabulary table '{table_name}'." 
- ) from exc - - def _descendant_ids(self, ancestor_ids: set[int]) -> set[int]: - if not ancestor_ids: - return set() - - concept = self._vocabulary_table("concept") - concept_ancestor = self._vocabulary_table("concept_ancestor") - query = ( - concept_ancestor.join( - concept, - concept_ancestor.descendant_concept_id == concept.concept_id, + if item.is_excluded: + if item.include_descendants or item.include_mapped: + # Complex exclude with expansion needs anti-join + include_parts.append(full) + exclude_ids.append(None) # marker for complex exclude + else: + exclude_ids.append(int(item.concept_id)) + else: + include_parts.append(full) + + # Build include part + if not include_parts: + if not exclude_ids: + return ibis.memtable({"concept_id": []}, schema={"concept_id": "int64"}) + # Only simple excludes -- just exclude those IDs from everything + concept = _vocabulary_table("concept", vocabulary_schema=vocabulary_schema, table_getter=table_getter) + return concept.filter( + ~concept.concept_id.isin(ibis.literal(list(exclude_ids), type="array")) + ).select(concept.concept_id.name(CONCEPT_ID)) + + # Actually, let me reconsider the exclude handling. Simple excludes (plain IDs with no + # descendants/mapped) can be handled via anti-join after the include union. + # Complex excludes (with descendants/mapped) need to be treated as included items + # that are then excluded via anti-join. + # For simplicity and correctness, let me handle excludes uniformly via anti-join. 
+ + return _build_codeset_expression(concept_set, table_getter=table_getter, vocabulary_schema=vocabulary_schema) + + +def _build_codeset_expression( + concept_set: NormalizedConceptSet, + *, + table_getter: Callable[[str, str | None], Table], + vocabulary_schema: str | None, +) -> Table: + """Build lazy ibis expression for a concept set with include/exclude logic.""" + include_parts: list[Table] = [] + exclude_parts: list[Table] = [] + + for item in concept_set.items: + if item.concept_id is None: + continue + + direct: tuple[int, ...] = (int(item.concept_id),) + + # Build base expression for this item + if item.include_descendants: + base = _descendant_expression( + direct, table_getter=table_getter, vocabulary_schema=vocabulary_schema ) - .filter(concept_ancestor.ancestor_concept_id.isin(tuple(ancestor_ids))) - .filter(concept.invalid_reason.isnull()) - .select(concept_ancestor.descendant_concept_id.name(CONCEPT_ID)) - .distinct() - ) - return self._execute_concept_id_query(query) - - def _mapped_ids(self, input_ids: set[int]) -> set[int]: - if not input_ids: - return set() - - concept_relationship = self._vocabulary_table("concept_relationship") - query = ( - concept_relationship.filter(concept_relationship.concept_id_2.isin(tuple(input_ids))) - .filter(concept_relationship.relationship_id == "Maps to") - .filter(concept_relationship.invalid_reason.isnull()) - .select(concept_relationship.concept_id_1.name(CONCEPT_ID)) - .distinct() - ) - return self._execute_concept_id_query(query) + else: + base = ibis.memtable({"concept_id": [int(item.concept_id)]}, schema={"concept_id": "int64"}) - def _execute_concept_id_query(self, query: Table) -> set[int]: - try: - rows = query.execute() - except Exception as exc: # pragma: no cover - backend specific error types - raise CompilationError( - "Ibis executor compilation error: failed executing concept-set expansion query." 
- ) from exc - - values: list[Any] - if hasattr(rows, "columns"): # pandas DataFrame - values = rows[CONCEPT_ID].tolist() if CONCEPT_ID in rows.columns else rows.iloc[:, 0].tolist() - elif isinstance(rows, (list, tuple, set)): - values = list(rows) + if item.include_mapped: + mapped = _mapped_expression( + direct, table_getter=table_getter, vocabulary_schema=vocabulary_schema + ) + base = _union_all_tables([base, mapped]) + + if item.is_excluded: + exclude_parts.append(base) else: - values = [rows] + include_parts.append(base) - output: set[int] = set() - for value in values: - if value is None: - continue - output.add(int(value)) - return output + if not include_parts: + return ibis.memtable({"concept_id": []}, schema={"concept_id": "int64"}) - # ------------------------------------------------------------------ - # Persistent cache helpers - # ------------------------------------------------------------------ + # Union all include parts + if len(include_parts) == 1: + result = include_parts[0] + else: + result = include_parts[0] + for part in include_parts[1:]: + result = result.union(part, distinct=False) - def _read_persistent_cache(self, cache_key: str) -> tuple[int, ...] 
| None: - from .operations import read_table, table_exists + result = result.distinct() - try: - if not table_exists(self._backend, table_name=_CACHE_TABLE_NAME, schema=self._results_schema): - return None - tbl = read_table(self._backend, table_name=_CACHE_TABLE_NAME, schema=self._results_schema) - rows = tbl.filter(tbl.cache_key == cache_key).select("concept_id").execute() - if hasattr(rows, "columns"): - values = rows["concept_id"].tolist() - elif isinstance(rows, (list, tuple)): - values = list(rows) - else: - return None - if not values: - return None - return tuple(sorted(int(v) for v in values if v is not None)) - except Exception: - return None + # Anti-join excluded concepts + for e in exclude_parts: + marked = e.mutate(_cm=ibis.literal(1, type="int64")) + result = result.join(marked, result.concept_id == marked.concept_id, how="left") + result = result.filter(result._cm.isnull()).drop("_cm") + + return result.select(result.concept_id.name(CONCEPT_ID)) + + +def _union_all_tables(tables: list[Table]) -> Table: + """Union multiple single-column ibis tables.""" + if not tables: + raise ValueError("_union_all_tables requires at least one table") + if len(tables) == 1: + return tables[0] + result = tables[0] + for t in tables[1:]: + result = result.union(t, distinct=False) + return result + + +def _drop_table( + backend: IbisBackendLike, + table_name: str, + schema: str | None, +) -> None: + """Safely drop a backend table.""" + try: + backend.drop_table(table_name, database=schema, force=True) + except Exception: + pass - def _write_persistent_cache(self, cache_key: str, concept_ids: tuple[int, ...]) -> None: - import ibis - from .operations import create_table, insert_relation, table_exists +def _table_getter_from_backend( + backend: IbisBackendLike, + schema: str, +) -> Callable[[str, str | None], Table]: + """Build a table_getter callable from an ibis backend.""" + def _getter(table_name: str, table_schema: str | None) -> Table: try: - data = ibis.memtable( 
- {"cache_key": [cache_key] * len(concept_ids), "concept_id": list(concept_ids)}, - schema={"cache_key": "string", "concept_id": "int64"}, + if table_schema is not None: + return backend.table(table_name, database=table_schema) + except TypeError: + pass + return backend.table(table_name) + + return _getter + + +def build_batch_codeset_table( + *, + backend: IbisBackendLike, + concept_sets: Mapping[int, NormalizedConceptSet], + batch_table_name: str = _CODESET_TABLE, + results_schema: str | None = None, + vocabulary_schema: str | None = None, + use_persistent_cache: bool = False, + temporary: bool = False, +) -> Table: + """Populate a database table ``batch_table_name`` with all concept set IDs. + + The table has schema ``(codeset_id INT64, concept_id INT64)`` and is + overwritten each call. Each concept set in *concept_sets* is resolved + via lazy ibis expressions that join concept_ancestor and + concept_relationship -- the database engine performs the expansion. + + When *use_persistent_cache* is True, previously-resolved checksums are + loaded from ``_circe_codeset_cache`` so that already-expanded concept + sets skip the vocabulary-table queries. + + Returns an ibis Table reference to the batch table. 
+ """ + table_getter = _table_getter_from_backend(backend, vocabulary_schema or "") + + # Collect cache keys and check persistent cache + cache_hits: dict[int, list[int]] = {} + uncached: list[tuple[int, NormalizedConceptSet]] = [] + if use_persistent_cache: + for cid, cset in concept_sets.items(): + if not cset.items: + continue + key = _compute_cache_key(cset.items) + cached = _read_codeset_cache( + backend, cache_key=key, schema=results_schema, table_name=_CACHE_TABLE_NAME ) - if not self._persistent_cache_initialized: - if not table_exists(self._backend, table_name=_CACHE_TABLE_NAME, schema=self._results_schema): - create_table( - self._backend, - table_name=_CACHE_TABLE_NAME, - schema=self._results_schema, - obj=data, - ) - self._persistent_cache_initialized = True - return - self._persistent_cache_initialized = True - insert_relation( - data, - backend=self._backend, - target_table=_CACHE_TABLE_NAME, - target_schema=self._results_schema, + if cached is not None: + cache_hits[cid] = list(cached) + else: + uncached.append((cid, cset)) + else: + uncached = list(concept_sets.items()) + + # Build the full batch query + parts: list[Table] = [] + + # Cache hits: just use the cached IDs + for cid, ids in cache_hits.items(): + if ids: + tbl = ibis.memtable( + {"codeset_id": [cid] * len(ids), "concept_id": ids}, + schema={"codeset_id": "int64", "concept_id": "int64"}, ) - except Exception: - pass + parts.append(tbl) + + # Cache misses: build lazy expansion expressions + for cid, cset in uncached: + if not cset.items: + continue + expr = _build_codeset_expression( + cset, table_getter=table_getter, vocabulary_schema=vocabulary_schema + ) + labeled = expr.mutate(codeset_id=ibis.literal(cid, type="int64")).select("codeset_id", CONCEPT_ID) + parts.append(labeled) + + # Write to persistent cache after materialization (done below) + + if not parts: + # No concept sets at all -- create empty table + empty = ibis.memtable( + {"codeset_id": [], "concept_id": []}, + 
schema={"codeset_id": "int64", "concept_id": "int64"}, + ) + create_table_op( + backend, table_name=batch_table_name, schema=results_schema, obj=empty, overwrite=True, temp=temporary + ) + return _read_table(backend, table_name=batch_table_name, schema=results_schema) + + # Union all parts and materialize + if len(parts) == 1: + combined = parts[0] + else: + combined = parts[0] + for p in parts[1:]: + combined = combined.union(p, distinct=False) + + from .operations import create_table as create_table_op + + create_table_op( + backend, + table_name=batch_table_name, + schema=results_schema, + obj=combined, + overwrite=True, + temp=temporary, + ) + + # Write newly-resolved concept sets to persistent cache + if use_persistent_cache: + for cid, cset in uncached: + if not cset.items: + continue + key = _compute_cache_key(cset.items) + if key in cache_hits: + continue + # Read back from the table to get resolved IDs for this codeset + ref = _read_table(backend, table_name=batch_table_name, schema=results_schema) + resolved = ( + ref.filter(ref.codeset_id == cid) + .select(CONCEPT_ID) + .distinct() + .execute() + ) + cids = _extract_column(resolved, CONCEPT_ID) + if cids: + _write_codeset_cache( + backend, + cache_key=key, + concept_ids=cids, + schema=results_schema, + table_name=_CACHE_TABLE_NAME, + ) + + ref = _read_table(backend, table_name=batch_table_name, schema=results_schema) + return ref + + +def build_single_codeset_table( + *, + backend: IbisBackendLike, + concept_sets: Mapping[int, NormalizedConceptSet], + batch_table_name: str = _CODESET_TABLE, + results_schema: str | None = None, + vocabulary_schema: str | None = None, +) -> Table: + """Build a single-cohort codeset table as a temporary database table. + + Like :func:`build_batch_codeset_table` but without persistent cache + support and creates a temporary table. Used as the auto-creation + fallback in ``build_cohort()`` when no batch table is provided. 
+ """ + return build_batch_codeset_table( + backend=backend, + concept_sets=concept_sets, + batch_table_name=batch_table_name, + results_schema=results_schema, + vocabulary_schema=vocabulary_schema, + use_persistent_cache=False, + temporary=True, + ) + + +def _read_table( + backend: IbisBackendLike, + *, + table_name: str, + schema: str | None, +) -> Table: + """Read a backend table as an ibis relation.""" + try: + if schema is not None: + return backend.table(table_name, database=schema) + except TypeError: + pass + return backend.table(table_name) + + +def _extract_column(result: Any, col_name: str) -> tuple[int, ...]: + """Extract a column from various ibis execute() return types.""" + if hasattr(result, "columns"): # pandas + values = result[col_name].tolist() if col_name in result.columns else result.iloc[:, 0].tolist() + elif isinstance(result, (list, tuple, set)): + values = list(result) + else: + values = [result] if result is not None else [] + return tuple(int(v) for v in values if v is not None) + + +def _read_codeset_cache( + backend: IbisBackendLike, + *, + cache_key: str, + schema: str | None, + table_name: str, +) -> tuple[int, ...] 
| None: + """Read cached concept IDs for a cache key from the persistent cache table.""" + from .operations import table_exists + + try: + if not table_exists(backend, table_name=table_name, schema=schema): + return None + tbl = _read_table(backend, table_name=table_name, schema=schema) + rows = tbl.filter(tbl.cache_key == cache_key).select("concept_id").execute() + ids = _extract_column(rows, "concept_id") + return ids if ids else None + except Exception: + return None + + +def _write_codeset_cache( + backend: IbisBackendLike, + *, + cache_key: str, + concept_ids: tuple[int, ...], + schema: str | None, + table_name: str, +) -> None: + """Persist resolved concept IDs to the cache table.""" + from .operations import create_table as create_table_op, insert_relation, table_exists + + if not concept_ids: + return + + try: + data = ibis.memtable( + {"cache_key": [cache_key] * len(concept_ids), "concept_id": list(concept_ids)}, + schema={"cache_key": "string", "concept_id": "int64"}, + ) + if not table_exists(backend, table_name=table_name, schema=schema): + create_table_op(backend, table_name=table_name, schema=schema, obj=data) + return + insert_relation(data, backend=backend, target_table=table_name, target_schema=schema) + except Exception: + pass + + +def drop_codeset_table( + backend: IbisBackendLike, + *, + batch_table_name: str = _CODESET_TABLE, + results_schema: str | None = None, +) -> None: + """Drop the batch codeset table.""" + _drop_table(backend, batch_table_name, results_schema) + + +def _filter_by_concept_table( + table: Table, + concept_table: Table, + *, + column: str, + exclude: bool = False, +) -> Table: + """Filter *table* by semi-join (include) or anti-join (exclude) against *concept_table*. + + Returns a new ibis relation with only the original *table* columns. 
+ """ + if not exclude: + joined = table.join(concept_table, table[column] == concept_table.concept_id) + return joined.select(*[joined[c] for c in table.columns]) + else: + marked = concept_table.mutate(_cm=ibis.literal(1, type="int64")) + joined = table.join(marked, table[column] == marked.concept_id, how="left") + filtered = joined.filter(joined._cm.isnull()) + return filtered.select(*[filtered[c] for c in table.columns]) diff --git a/circe/execution/ibis/compile_steps.py b/circe/execution/ibis/compile_steps.py index 0c6ad84..5450055 100644 --- a/circe/execution/ibis/compile_steps.py +++ b/circe/execution/ibis/compile_steps.py @@ -102,24 +102,22 @@ def _apply_date_predicate(expr, predicate: DateRangePredicate): raise CompilationError(f"Ibis executor compilation error: unsupported date range op {predicate.op!r}.") -def _resolve_concept_ids( - *, - direct_ids: tuple[int, ...], - codeset_id: int | None, - ctx: ExecutionContext, -) -> tuple[int, ...]: - all_ids = list(direct_ids) - if codeset_id is not None: - for cid in ctx.concept_ids_for_codeset(codeset_id): - if cid not in all_ids: - all_ids.append(cid) - return tuple(all_ids) - - def _select_original_columns(table, joined): return joined.select(*[joined[c] for c in table.columns]) +def _filter_by_concept_table(table, concept_table, *, column, exclude=False): + """Semi-join (include) or anti-join (exclude) *table* against *concept_table*.""" + if not exclude: + joined = table.join(concept_table, table[column] == concept_table.concept_id) + return _select_original_columns(table, joined) + else: + marked = concept_table.mutate(_cm=ibis.literal(1, type="int64")) + joined = table.join(marked, table[column] == marked.concept_id, how="left") + filtered = joined.filter(joined._cm.isnull()) + return _select_original_columns(table, filtered) + + def _filter_visit_concepts(table, ctx: ExecutionContext, *, step: FilterByVisit): visit = ctx.table("visit_occurrence") visit_lookup = visit.select( @@ -134,14 +132,24 @@ def 
_filter_visit_concepts(table, ctx: ExecutionContext, *, step: FilterByVisit) table[PERSON_ID] == visit_lookup._visit_person_id, ], ) - concept_ids = _resolve_concept_ids( - direct_ids=step.concept_ids, - codeset_id=step.codeset_id, - ctx=ctx, - ) - predicate = joined._visit_concept_id.isin(concept_ids) - filtered = joined.filter(~predicate if step.exclude else predicate) - return _select_original_columns(table, filtered) + + if step.codeset_id is not None: + concept_table = ctx.concept_set_table(step.codeset_id) + joined = joined.join(concept_table, joined._visit_concept_id == concept_table.concept_id) + elif step.concept_ids: + concept_table = ibis.memtable( + {"concept_id": list(step.concept_ids)}, schema={"concept_id": "int64"} + ) + joined = joined.join(concept_table, joined._visit_concept_id == concept_table.concept_id) + # If neither codeset_id nor concept_ids, no filtering needed + + if step.exclude: + marked = concept_table.mutate(_cm=ibis.literal(1, type="int64")) + joined = joined.join(marked, joined._visit_concept_id == marked.concept_id, how="left") + joined = joined.filter(joined._cm.isnull()) + return _select_original_columns(table, joined) + + return _select_original_columns(table, joined) def _filter_provider_specialty( @@ -159,14 +167,23 @@ def _filter_provider_specialty( provider_lookup, predicates=[table[step.provider_id_column] == provider_lookup._provider_id], ) - concept_ids = _resolve_concept_ids( - direct_ids=step.concept_ids, - codeset_id=step.codeset_id, - ctx=ctx, - ) - predicate = joined._specialty_concept_id.isin(concept_ids) - filtered = joined.filter(~predicate if step.exclude else predicate) - return _select_original_columns(table, filtered) + + if step.codeset_id is not None: + concept_table = ctx.concept_set_table(step.codeset_id) + joined = joined.join(concept_table, joined._specialty_concept_id == concept_table.concept_id) + elif step.concept_ids: + concept_table = ibis.memtable( + {"concept_id": list(step.concept_ids)}, 
schema={"concept_id": "int64"} + ) + joined = joined.join(concept_table, joined._specialty_concept_id == concept_table.concept_id) + + if step.exclude: + marked = concept_table.mutate(_cm=ibis.literal(1, type="int64")) + joined = joined.join(marked, joined._specialty_concept_id == marked.concept_id, how="left") + joined = joined.filter(joined._cm.isnull()) + return _select_original_columns(table, joined) + + return _select_original_columns(table, joined) def _filter_care_site(table, ctx: ExecutionContext, *, step: FilterByCareSite): @@ -179,14 +196,23 @@ def _filter_care_site(table, ctx: ExecutionContext, *, step: FilterByCareSite): care_site_lookup, predicates=[table[step.care_site_id_column] == care_site_lookup._care_site_id], ) - concept_ids = _resolve_concept_ids( - direct_ids=step.concept_ids, - codeset_id=step.codeset_id, - ctx=ctx, - ) - predicate = joined._place_of_service_concept_id.isin(concept_ids) - filtered = joined.filter(~predicate if step.exclude else predicate) - return _select_original_columns(table, filtered) + + if step.codeset_id is not None: + concept_table = ctx.concept_set_table(step.codeset_id) + joined = joined.join(concept_table, joined._place_of_service_concept_id == concept_table.concept_id) + elif step.concept_ids: + concept_table = ibis.memtable( + {"concept_id": list(step.concept_ids)}, schema={"concept_id": "int64"} + ) + joined = joined.join(concept_table, joined._place_of_service_concept_id == concept_table.concept_id) + + if step.exclude: + marked = concept_table.mutate(_cm=ibis.literal(1, type="int64")) + joined = joined.join(marked, joined._place_of_service_concept_id == marked.concept_id, how="left") + joined = joined.filter(joined._cm.isnull()) + return _select_original_columns(table, joined) + + return _select_original_columns(table, joined) def _filter_care_site_location_region( @@ -195,9 +221,7 @@ def _filter_care_site_location_region( *, step: FilterByCareSiteLocationRegion, ): - region_ids = 
ctx.concept_ids_for_codeset(step.codeset_id) - if not region_ids: - return table.limit(0) + concept_table = ctx.concept_set_table(step.codeset_id) location_history = ctx.table("location_history") history_lookup = location_history.select( @@ -233,8 +257,9 @@ def _filter_care_site_location_region( location_lookup, predicates=[joined_history._history_location_id == location_lookup._location_id], ) - filtered = joined.filter(joined._region_concept_id.isin(region_ids)) - return _select_original_columns(table, filtered) + + joined = joined.join(concept_table, joined._region_concept_id == concept_table.concept_id) + return _select_original_columns(table, joined) def apply_step(step, *, table, source, ctx: ExecutionContext): @@ -253,17 +278,20 @@ def apply_step(step, *, table, source, ctx: ExecutionContext): ) if isinstance(step, FilterByCodeset): - concept_ids = ctx.concept_ids_for_codeset(step.codeset_id) - if not concept_ids: - return table if step.exclude else table.limit(0) - predicate = table[step.column].isin(concept_ids) - return table.filter(~predicate if step.exclude else predicate) + concept_table = ctx.concept_set_table(step.codeset_id) + return _filter_by_concept_table( + table, concept_table, column=step.column, exclude=step.exclude + ) if isinstance(step, FilterByConceptSet): if not step.concept_ids: return table if step.exclude else table.limit(0) - predicate = table[step.column].isin(step.concept_ids) - return table.filter(~predicate if step.exclude else predicate) + concept_table = ibis.memtable( + {"concept_id": list(step.concept_ids)}, schema={"concept_id": "int64"} + ) + return _filter_by_concept_table( + table, concept_table, column=step.column, exclude=step.exclude + ) if isinstance(step, FilterByVisit): return _filter_visit_concepts(table, ctx, step=step) diff --git a/circe/execution/ibis/context.py b/circe/execution/ibis/context.py index b7b05ce..c0a98ce 100644 --- a/circe/execution/ibis/context.py +++ b/circe/execution/ibis/context.py @@ -1,11 
+1,13 @@ from __future__ import annotations from collections.abc import Mapping +from typing import Any + +import ibis from .._dataclass import frozen_slots_dataclass from ..normalize.cohort import NormalizedConceptSet from ..typing import IbisBackendLike, Table -from .codesets import CachedConceptSetResolver def _table_with_schema_fallback( @@ -27,7 +29,7 @@ class ExecutionContext: cdm_schema: str results_schema: str | None vocabulary_schema: str | None - codeset_resolver: CachedConceptSetResolver + codeset_table: Table def table(self, table_name: str) -> Table: return self._table_from_schema(table_name, self.cdm_schema) @@ -41,37 +43,61 @@ def vocabulary_table(self, table_name: str) -> Table: def _table_from_schema(self, table_name: str, schema: str | None) -> Table: return _table_with_schema_fallback(self.backend, table_name, schema) - def concept_ids_for_codeset(self, codeset_id: int) -> tuple[int, ...]: - return self.codeset_resolver.resolve_codeset(codeset_id) + def concept_set_table(self, codeset_id: int) -> Table: + """Return an ibis Table with a single 'concept_id' column for this codeset. + + References the database-resident batch codeset table. No Python memory + is used for concept IDs -- filtering happens via SQL joins at execution time. + """ + return self.codeset_table.filter( + self.codeset_table.codeset_id == codeset_id + ).select("concept_id").distinct() + + +def _build_codeset_memtable( + concept_sets: Mapping[int, NormalizedConceptSet], +) -> Table: + """Build a simple memtable for concept sets with known concept IDs. + + Only handles simple includes (no descendant/mapped expansion needed). + This is a fallback for backward-compatible test usage. 
+ """ + rows: list[dict[str, Any]] = [] + for cid, cset in concept_sets.items(): + for item in cset.items: + if not item.is_excluded and item.concept_id is not None: + rows.append({"codeset_id": int(cid), "concept_id": int(item.concept_id)}) + if rows: + return ibis.memtable(rows, schema={"codeset_id": "int64", "concept_id": "int64"}) + return ibis.memtable( + {"codeset_id": [], "concept_id": []}, + schema={"codeset_id": "int64", "concept_id": "int64"}, + ) def make_execution_context( *, backend: IbisBackendLike, cdm_schema: str, - concept_sets: Mapping[int, NormalizedConceptSet], + codeset_table: Table | None = None, + concept_sets: Mapping[int, NormalizedConceptSet] | None = None, results_schema: str | None = None, vocabulary_schema: str | None = None, - use_persistent_cache: bool = False, ) -> ExecutionContext: - """Construct an executor context from API-level wiring arguments.""" + """Construct an executor context from API-level wiring arguments. + + Provide *codeset_table* (preferred) for a database-resident codeset + table, or *concept_sets* for backward-compatible single-cohort use. 
+ """ vocabulary_schema = vocabulary_schema or cdm_schema - def _table_getter(table_name: str, schema: str | None) -> Table: - return _table_with_schema_fallback(backend, table_name, schema) + if codeset_table is None: + codeset_table = _build_codeset_memtable(concept_sets or {}) - resolver = CachedConceptSetResolver( - table_getter=_table_getter, - vocabulary_schema=vocabulary_schema, - concept_sets=concept_sets, - backend=backend if use_persistent_cache else None, - results_schema=results_schema if use_persistent_cache else None, - use_persistent_cache=use_persistent_cache, - ) return ExecutionContext( backend=backend, cdm_schema=cdm_schema, results_schema=results_schema, vocabulary_schema=vocabulary_schema, - codeset_resolver=resolver, + codeset_table=codeset_table, ) diff --git a/circe/execution/ibis/person_filters.py b/circe/execution/ibis/person_filters.py index b46bd99..fd5a241 100644 --- a/circe/execution/ibis/person_filters.py +++ b/circe/execution/ibis/person_filters.py @@ -61,18 +61,18 @@ def apply_person_gender_filter( concept_ids: tuple[int, ...], codeset_id: int | None, ): - all_ids = list(concept_ids) if codeset_id is not None: - for cid in ctx.concept_ids_for_codeset(codeset_id): - if cid not in all_ids: - all_ids.append(cid) - - if not all_ids: + concept_table = ctx.concept_set_table(codeset_id) + elif concept_ids: + concept_table = ibis.memtable( + {"concept_id": list(concept_ids)}, schema={"concept_id": "int64"} + ) + else: return table person = ctx.table("person").select(PERSON_ID, "gender_concept_id") joined = table.join(person, table[PERSON_ID] == person[PERSON_ID]) - filtered = joined.filter(joined.gender_concept_id.isin(all_ids)) + filtered = joined.join(concept_table, joined.gender_concept_id == concept_table.concept_id) return filtered.select(*[filtered[c] for c in table.columns]) @@ -84,18 +84,18 @@ def _apply_person_concept_filter( concept_ids: tuple[int, ...], codeset_id: int | None, ): - all_ids = list(concept_ids) if codeset_id is not 
None: - for cid in ctx.concept_ids_for_codeset(codeset_id): - if cid not in all_ids: - all_ids.append(cid) - - if not all_ids: + concept_table = ctx.concept_set_table(codeset_id) + elif concept_ids: + concept_table = ibis.memtable( + {"concept_id": list(concept_ids)}, schema={"concept_id": "int64"} + ) + else: return table person = ctx.table("person").select(PERSON_ID, person_column) joined = table.join(person, table[PERSON_ID] == person[PERSON_ID]) - filtered = joined.filter(joined[person_column].isin(all_ids)) + filtered = joined.join(concept_table, joined[person_column] == concept_table.concept_id) return filtered.select(*[filtered[c] for c in table.columns]) diff --git a/tests/execution/test_codesets_persistent_cache.py b/tests/execution/test_codesets_persistent_cache.py index 087acf8..0f44aaf 100644 --- a/tests/execution/test_codesets_persistent_cache.py +++ b/tests/execution/test_codesets_persistent_cache.py @@ -4,17 +4,13 @@ from circe.execution.ibis.codesets import ( _CACHE_TABLE_NAME, - CachedConceptSetResolver, _compute_cache_key, - clear_codeset_cache, + build_batch_codeset_table, + _read_codeset_cache, + _write_codeset_cache, ) -from circe.execution.ibis.context import make_execution_context from circe.execution.normalize.cohort import NormalizedConceptSet, NormalizedConceptSetItem -# ------------------------------------------------------------------ -# _compute_cache_key tests -# ------------------------------------------------------------------ - def _make_items(*specs: tuple[int, bool, bool, bool]) -> tuple[NormalizedConceptSetItem, ...]: return tuple( @@ -42,181 +38,65 @@ def test_compute_cache_key_different_items_different_hash(): assert _compute_cache_key(items_a) != _compute_cache_key(items_b) -# ------------------------------------------------------------------ -# Persistent cache integration tests using DuckDB -# ------------------------------------------------------------------ - - -@pytest.fixture -def duckdb_backend(): +def 
test_build_batch_codeset_table_round_trip(): ibis = pytest.importorskip("ibis") - backend = ibis.duckdb.connect() - backend.raw_sql("CREATE SCHEMA results") - return backend - + _ = pytest.importorskip("duckdb") + + conn = ibis.duckdb.connect() + conn.create_table( + "concept", + obj=ibis.memtable( + { + "concept_id": [111, 222, 333], + "invalid_reason": [None, None, None], + } + ), + overwrite=True, + ) -def _concept_set_fixture(): - return { + concept_sets = { 1: NormalizedConceptSet( set_id=1, - items=( - NormalizedConceptSetItem( - concept_id=100, - is_excluded=False, - include_descendants=False, - include_mapped=False, - ), - ), - ) + items=(NormalizedConceptSetItem(concept_id=111, is_excluded=False),), + ), } - -def test_persistent_cache_write_and_read(duckdb_backend, monkeypatch): - """First resolve writes to persistent cache; second resolver instance reads from it.""" - concept_sets = _concept_set_fixture() - - resolver1 = CachedConceptSetResolver( - table_getter=lambda name, schema: duckdb_backend.table(name, database=schema), - vocabulary_schema=None, - concept_sets=concept_sets, - backend=duckdb_backend, - results_schema="results", - use_persistent_cache=True, - ) - - # Bypass vocabulary expansion — just return the concept_id directly - monkeypatch.setattr(resolver1, "_expand_item", lambda item: {item.concept_id}) - - result = resolver1.resolve_codeset(1) - assert result == (100,) - - # Verify the cache table was created with data - cache_tbl = duckdb_backend.table(_CACHE_TABLE_NAME, database="results") - rows = cache_tbl.execute() - assert len(rows) == 1 - - # Second resolver — _expand_item should NOT be called (persistent cache hit) - expand_calls = [] - - resolver2 = CachedConceptSetResolver( - table_getter=lambda name, schema: duckdb_backend.table(name, database=schema), - vocabulary_schema=None, - concept_sets=concept_sets, - backend=duckdb_backend, - results_schema="results", - use_persistent_cache=True, - ) - - def 
_expand_should_not_be_called(item): - expand_calls.append(item.concept_id) - return {item.concept_id} - - monkeypatch.setattr(resolver2, "_expand_item", _expand_should_not_be_called) - - result2 = resolver2.resolve_codeset(1) - assert result2 == (100,) - assert expand_calls == [], "Expected persistent cache hit — _expand_item should not be called" - - -def test_persistent_cache_disabled_by_default(monkeypatch): - """Without use_persistent_cache=True, no persistent ops happen.""" - concept_sets = _concept_set_fixture() - - resolver = CachedConceptSetResolver( - table_getter=lambda name, schema: None, - vocabulary_schema=None, + tbl = build_batch_codeset_table( + backend=conn, concept_sets=concept_sets, - ) - - monkeypatch.setattr(resolver, "_expand_item", lambda item: {item.concept_id}) - - result = resolver.resolve_codeset(1) - assert result == (100,) - assert not resolver._use_persistent_cache - - -def test_persistent_cache_read_failure_falls_back_silently(duckdb_backend, monkeypatch): - """If cache read raises, expansion still works.""" - concept_sets = _concept_set_fixture() - - resolver = CachedConceptSetResolver( - table_getter=lambda name, schema: duckdb_backend.table(name, database=schema), + batch_table_name="__test_codesets", vocabulary_schema=None, - concept_sets=concept_sets, - backend=duckdb_backend, - results_schema="results", - use_persistent_cache=True, ) + rows = tbl.execute() + assert set(rows["codeset_id"]) == {1} + assert set(rows["concept_id"]) == {111} - monkeypatch.setattr(resolver, "_expand_item", lambda item: {item.concept_id}) + conn.drop_table("__test_codesets", force=True) - # Force _read_persistent_cache to encounter an error internally by making - # table_exists raise. The method catches all exceptions and returns None. 
- from circe.execution.ibis import operations as ops - - def _broken_table_exists(*args, **kwargs): - raise RuntimeError("simulated db failure") - - monkeypatch.setattr(ops, "table_exists", _broken_table_exists) - - result = resolver.resolve_codeset(1) - assert result == (100,) +def test_persistent_cache_write_and_read(): + ibis = pytest.importorskip("ibis") + _ = pytest.importorskip("duckdb") -def test_clear_codeset_cache(duckdb_backend, monkeypatch): - """clear_codeset_cache empties the cache table.""" - concept_sets = _concept_set_fixture() + conn = ibis.duckdb.connect() - resolver = CachedConceptSetResolver( - table_getter=lambda name, schema: duckdb_backend.table(name, database=schema), - vocabulary_schema=None, - concept_sets=concept_sets, - backend=duckdb_backend, - results_schema="results", - use_persistent_cache=True, + _write_codeset_cache( + conn, cache_key="testkey", concept_ids=(111, 222), schema=None, table_name=_CACHE_TABLE_NAME ) - monkeypatch.setattr(resolver, "_expand_item", lambda item: {item.concept_id}) - resolver.resolve_codeset(1) - # Verify rows exist - cache_tbl = duckdb_backend.table(_CACHE_TABLE_NAME, database="results") - assert len(cache_tbl.execute()) > 0 + result = _read_codeset_cache(conn, cache_key="testkey", schema=None, table_name=_CACHE_TABLE_NAME) + assert result == (111, 222) - # Clear and verify empty - clear_codeset_cache(duckdb_backend, "results") - cache_tbl = duckdb_backend.table(_CACHE_TABLE_NAME, database="results") - assert len(cache_tbl.execute()) == 0 + conn.drop_table(_CACHE_TABLE_NAME, force=True) -def test_make_execution_context_threads_persistent_cache(): - """make_execution_context passes persistent cache params to resolver.""" +def test_persistent_cache_miss_returns_none(): ibis = pytest.importorskip("ibis") - backend = ibis.duckdb.connect() - - ctx = make_execution_context( - backend=backend, - cdm_schema="main", - concept_sets={}, - results_schema="main", - use_persistent_cache=True, - ) - - assert 
ctx.codeset_resolver._use_persistent_cache is True - assert ctx.codeset_resolver._backend is backend - assert ctx.codeset_resolver._results_schema == "main" - + _ = pytest.importorskip("duckdb") -def test_make_execution_context_persistent_cache_disabled_without_results_schema(): - """Persistent cache gracefully disabled when results_schema is None.""" - ibis = pytest.importorskip("ibis") - backend = ibis.duckdb.connect() - - ctx = make_execution_context( - backend=backend, - cdm_schema="main", - concept_sets={}, - results_schema=None, - use_persistent_cache=True, - ) + conn = ibis.duckdb.connect() - assert ctx.codeset_resolver._use_persistent_cache is False + result = _read_codeset_cache(conn, cache_key="nonexistent", schema=None, table_name=_CACHE_TABLE_NAME) + assert result is None + if conn.exists_table(_CACHE_TABLE_NAME): + conn.drop_table(_CACHE_TABLE_NAME, force=True) diff --git a/tests/execution/test_compile_steps_helpers.py b/tests/execution/test_compile_steps_helpers.py index 74a3d87..f7bb00b 100644 --- a/tests/execution/test_compile_steps_helpers.py +++ b/tests/execution/test_compile_steps_helpers.py @@ -11,7 +11,6 @@ from circe.execution.ibis.compile_steps import ( _apply_date_predicate, _apply_numeric_predicate, - _resolve_concept_ids, apply_step, ) from circe.execution.normalize.windows import NormalizedWindow, NormalizedWindowBound @@ -34,8 +33,11 @@ def __init__(self, conn=None, *, codesets: dict[int, tuple[int, ...]] | None = N self.conn = conn self.codesets = codesets or {} - def concept_ids_for_codeset(self, codeset_id: int) -> tuple[int, ...]: - return self.codesets.get(codeset_id, ()) + def concept_set_table(self, codeset_id: int) -> ibis.Table: + ids = self.codesets.get(codeset_id, ()) + return ibis.memtable( + {"concept_id": list(ids)}, schema={"concept_id": "int64"} + ) def table(self, name: str): if self.conn is None: @@ -131,11 +133,6 @@ def test_apply_date_predicate_rejects_invalid_ranges(): _apply_date_predicate(expr, 
DateRangePredicate(op="weird", value="2020-01-01", extent=None)) -def test_resolve_concept_ids_deduplicates_codeset_ids(): - ctx = _Context(codesets={1: (2, 3, 4)}) - assert _resolve_concept_ids(direct_ids=(1, 2), codeset_id=1, ctx=ctx) == (1, 2, 3, 4) - - def test_apply_step_covers_text_codeset_concept_and_adjustment_paths(): ibis_mod = pytest.importorskip("ibis") _ = pytest.importorskip("duckdb") diff --git a/tests/execution/test_context_wiring.py b/tests/execution/test_context_wiring.py index 1231f12..bc06881 100644 --- a/tests/execution/test_context_wiring.py +++ b/tests/execution/test_context_wiring.py @@ -1,100 +1,96 @@ from __future__ import annotations -from types import SimpleNamespace +import ibis +import pytest -from circe.execution.ibis.codesets import CachedConceptSetResolver +from circe.execution.ibis.codesets import build_single_codeset_table from circe.execution.ibis.context import ExecutionContext, make_execution_context -from circe.execution.normalize.cohort import NormalizedConceptSet, NormalizedConceptSetItem -class _BackendWithSchemaSupport: - def __init__(self): - self.calls: list[tuple[str, str | None]] = [] - - def table(self, name: str, database: str | None = None): - self.calls.append((name, database)) - return (name, database) - - -class _BackendWithoutSchemaSupport: - def __init__(self): - self.calls: list[tuple[str, str | None]] = [] - - def table(self, name: str, database: str | None = None): - self.calls.append((name, database)) - if database is not None: - raise TypeError("database kwarg not supported") - return (name, None) +def _make_codeset_table(backend): + return build_single_codeset_table( + backend=backend, + concept_sets={}, + batch_table_name="__test_codesets", + ) def test_make_execution_context_uses_cdm_schema_as_vocabulary_fallback(): - backend = _BackendWithSchemaSupport() + ibis_mod = pytest.importorskip("ibis") + conn = ibis_mod.duckdb.connect() + codeset_table = _make_codeset_table(conn) + ctx = 
make_execution_context( - backend=backend, - cdm_schema="cdm", - concept_sets={}, + backend=conn, + cdm_schema="main", + codeset_table=codeset_table, ) assert isinstance(ctx, ExecutionContext) - assert ctx.vocabulary_schema == "cdm" - assert isinstance(ctx.codeset_resolver, CachedConceptSetResolver) - assert ctx.table("person") == ("person", "cdm") - assert ctx.concept_ids_for_codeset(999) == () + assert ctx.vocabulary_schema == "main" + + conn.drop_table("__test_codesets", force=True) + +def test_make_execution_context_honors_vocabulary_schema_option(): + ibis_mod = pytest.importorskip("ibis") + conn = ibis_mod.duckdb.connect() + codeset_table = _make_codeset_table(conn) -def test_make_execution_context_honors_vocabulary_schema_option_and_backend_fallback(): - backend = _BackendWithoutSchemaSupport() ctx = make_execution_context( - backend=backend, + backend=conn, cdm_schema="cdm", - concept_sets={}, + codeset_table=codeset_table, vocabulary_schema="vocab", ) assert ctx.vocabulary_schema == "vocab" - assert ctx.vocabulary_table("concept") == ("concept", None) - assert backend.calls == [("concept", "vocab"), ("concept", None)] + conn.drop_table("__test_codesets", force=True) -def test_codeset_resolver_caches_expanded_results(monkeypatch): - resolver = CachedConceptSetResolver( - table_getter=lambda name, schema: (name, schema), - vocabulary_schema="vocab", - concept_sets={ - 1: NormalizedConceptSet( - set_id=1, - items=( - NormalizedConceptSetItem( - concept_id=123, - is_excluded=False, - include_descendants=False, - include_mapped=False, - ), - ), - ) - }, - ) - calls: list[int] = [] - - def _expand(item): - calls.append(item.concept_id) - return {item.concept_id} - monkeypatch.setattr(resolver, "_expand_item", _expand) +def test_codeset_table_returns_filtered_view(): + ibis_mod = pytest.importorskip("ibis") + _ = pytest.importorskip("duckdb") - assert resolver.resolve_codeset(1) == (123,) - assert resolver.resolve_codeset(1) == (123,) - assert calls == [123] + 
conn = ibis_mod.duckdb.connect() + conn.create_table( + "concept", + obj=ibis.memtable( + { + "concept_id": [111, 222], + "invalid_reason": [None, None], + } + ), + overwrite=True, + ) + concept_sets = { + 1: ibis_mod.execution.normalize.cohort.NormalizedConceptSet( + set_id=1, + items=( + ibis_mod.execution.normalize.cohort.NormalizedConceptSetItem( + concept_id=111, is_excluded=False + ), + ), + ), + } + + codeset_table = build_single_codeset_table( + backend=conn, + concept_sets=concept_sets, + batch_table_name="__test_codesets2", + ) -def test_codeset_resolver_handles_empty_and_non_dataframe_query_results(): - resolver = CachedConceptSetResolver( - table_getter=lambda name, schema: (name, schema), - vocabulary_schema="vocab", - concept_sets={}, + ctx = make_execution_context( + backend=conn, + cdm_schema="main", + codeset_table=codeset_table, ) - assert resolver._descendant_ids(set()) == set() - assert resolver._mapped_ids(set()) == set() - assert resolver._execute_concept_id_query(SimpleNamespace(execute=lambda: [1, None, 2])) == {1, 2} - assert resolver._execute_concept_id_query(SimpleNamespace(execute=lambda: 3)) == {3} + # concept_set_table should filter by codeset_id + filtered = ctx.concept_set_table(1).execute() + assert len(filtered) == 1 + assert list(filtered["concept_id"]) == [111] + + conn.drop_table("__test_codesets2", force=True) diff --git a/tests/execution/test_group_demographics.py b/tests/execution/test_group_demographics.py index d11cc73..f5f04eb 100644 --- a/tests/execution/test_group_demographics.py +++ b/tests/execution/test_group_demographics.py @@ -1,15 +1,17 @@ from __future__ import annotations +import ibis import pytest from circe.execution.engine.group_demographics import ( _apply_date_predicate, - _demographic_concept_ids, + _demographic_concept_table, demographic_match_keys, ) from circe.execution.errors import UnsupportedFeatureError from circe.execution.normalize.groups import NormalizedDemographicCriteria from 
circe.execution.normalize.windows import NormalizedDateRange, NormalizedNumericRange +from circe.execution.ibis.context import ExecutionContext class _DemographicContext: @@ -20,8 +22,11 @@ def __init__(self, conn, *, codesets: dict[int, tuple[int, ...]] | None = None): def table(self, name: str): return self.conn.table(name) - def concept_ids_for_codeset(self, codeset_id: int) -> tuple[int, ...]: - return self.codesets.get(codeset_id, ()) + def concept_set_table(self, codeset_id: int) -> ibis.Table: + ids = self.codesets.get(codeset_id, ()) + return ibis.memtable( + {"concept_id": list(ids)}, schema={"concept_id": "int64"} + ) def _seed_demographic_tables(conn, ibis): @@ -68,10 +73,17 @@ def test_apply_date_predicate_rejects_invalid_between_and_op(): ) -def test_demographic_concept_ids_merge_codesets_without_duplicates(): +def test_demographic_concept_table_returns_table(): ctx = _DemographicContext(None, codesets={1: (8507, 8532)}) + tbl = _demographic_concept_table(explicit_ids=(8507,), codeset_id=1, ctx=ctx) + assert tbl is not None + assert list(tbl.execute()["concept_id"]) == [8507, 8532] + - assert _demographic_concept_ids(explicit_ids=(8507,), codeset_id=1, ctx=ctx) == (8507, 8532) +def test_demographic_concept_table_returns_none_when_empty(): + ctx = _DemographicContext(None) + tbl = _demographic_concept_table(explicit_ids=(), codeset_id=None, ctx=ctx) + assert tbl is None def test_demographic_match_keys_applies_all_supported_filters(): From a837d7ae0ae12d9df315c55b227de15027b6259e Mon Sep 17 00:00:00 2001 From: Jamie Gilbert Date: Mon, 18 May 2026 09:32:07 -0700 Subject: [PATCH 21/53] concept set optimizations --- circe/cohort_definition_set/_generate.py | 9 +- circe/execution/api.py | 17 ++- circe/execution/engine/group_demographics.py | 86 +++++------ circe/execution/ibis/codesets.py | 133 ++++++++++++------ circe/execution/ibis/compile_steps.py | 24 +--- circe/execution/ibis/context.py | 8 +- circe/execution/ibis/person_filters.py | 14 +- 
.../test_codesets_persistent_cache.py | 15 +- tests/execution/test_compile_steps_helpers.py | 31 +++- tests/execution/test_context_wiring.py | 18 +-- tests/execution/test_custom_era.py | 4 +- tests/execution/test_group_demographics.py | 18 +-- tests/execution/test_person_filters.py | 6 +- 13 files changed, 218 insertions(+), 165 deletions(-) diff --git a/circe/cohort_definition_set/_generate.py b/circe/cohort_definition_set/_generate.py index 653ff20..5549a1b 100644 --- a/circe/cohort_definition_set/_generate.py +++ b/circe/cohort_definition_set/_generate.py @@ -11,7 +11,6 @@ from ..execution.api import build_cohort, write_cohort from ..execution.ibis.codesets import _CODESET_TABLE, build_batch_codeset_table, drop_codeset_table -from ..execution.ibis.context import make_execution_context from ..execution.normalize.cohort import normalize_cohort from ._checksum_store import load_checksums, upsert_generation_history from ._core import CohortDefinition, CohortDefinitionSet, CohortGenerationResult @@ -26,7 +25,7 @@ def _collect_concept_sets( cohort_definition_set: CohortDefinitionSet, -) -> dict[int: NormalizedConceptSet]: # type: ignore +) -> dict: """Normalize all cohort expressions and merge concept sets.""" from ..execution.normalize.cohort import NormalizedConceptSet # noqa: F401 @@ -48,7 +47,6 @@ def _build_and_return_batch_codesets( results_table_name: str, ) -> Table: """Build batch codeset table. 
Called inside a thread.""" - from ..execution.typing import Table as TableType return build_batch_codeset_table( backend=backend, @@ -168,6 +166,7 @@ async def async_generate_cohort_set( else: # No concept sets -- create empty memtable to satisfy ExecutionContext import ibis # noqa: PLC0415 + codeset_table = ibis.memtable( {"codeset_id": [], "concept_id": []}, schema={"codeset_id": "int64", "concept_id": "int64"}, @@ -323,9 +322,7 @@ async def async_generate_cohort_set( schema = results_schema or cdm_schema for stage in ("primary", "qualified", "included", "ended"): with contextlib.suppress(Exception): - backend.drop_table( - f"__cg_{cohort.cohort_id}_{stage}", database=schema, force=True - ) + backend.drop_table(f"__cg_{cohort.cohort_id}_{stage}", database=schema, force=True) results.append( CohortGenerationResult( diff --git a/circe/execution/api.py b/circe/execution/api.py index 6d525f8..19b23dc 100644 --- a/circe/execution/api.py +++ b/circe/execution/api.py @@ -1,5 +1,6 @@ from __future__ import annotations +import sys from typing import Literal from ..cohortdefinition import CohortExpression @@ -43,15 +44,14 @@ def build_cohort( (e.g. unit tests that only verify the expression tree can be built). When *codeset_table* is provided (from a batch-generation caller), it is - used directly. Otherwise one is auto-created for this single cohort and - dropped after the pipeline runs. + used directly. Otherwise one is auto-created for this single cohort. 
""" maybe_apply_databricks_post_connect_workaround(backend) normalized = normalize_cohort(expression) if codeset_table is not None: - own_table = False + pass else: codeset_table = build_single_codeset_table( backend=backend, @@ -60,7 +60,6 @@ def build_cohort( results_schema=results_schema, vocabulary_schema=vocabulary_schema, ) - own_table = True ctx = make_execution_context( backend=backend, @@ -69,7 +68,15 @@ def build_cohort( vocabulary_schema=vocabulary_schema, codeset_table=codeset_table, ) - return build_cohort_table(normalized, ctx, cohort_id=cohort_id, materialize=materialize) + + # Ibis SQL compilation for large cohorts may exceed the default recursion + # limit when walking deeply nested expression trees (e.g. 100-way UNION). + prev_limit = sys.getrecursionlimit() + sys.setrecursionlimit(max(prev_limit, 5000)) + try: + return build_cohort_table(normalized, ctx, cohort_id=cohort_id, materialize=materialize) + finally: + sys.setrecursionlimit(prev_limit) def write_relation( diff --git a/circe/execution/engine/group_demographics.py b/circe/execution/engine/group_demographics.py index 13c479b..45f9eee 100644 --- a/circe/execution/engine/group_demographics.py +++ b/circe/execution/engine/group_demographics.py @@ -3,36 +3,7 @@ import ibis from ..errors import UnsupportedFeatureError -from ..normalize.groups import NormalizedDemographicCriteria -from ..plan.schema import EVENT_ID, PERSON_ID -from ..typing import Table -from .group_keys import event_keys from ..ibis.context import ExecutionContext - - -def _apply_numeric_predicate(expr, predicate): - ... - - -def _apply_date_predicate(date_expr, predicate): - ... 
- - -def _demographic_concept_table( - *, - explicit_ids: tuple[int, ...], - codeset_id: int | None, - ctx: ExecutionContext, -) -> Table | None: - """Return an ibis Table with a single 'concept_id' column, or None if empty.""" - if codeset_id is not None: - return ctx.concept_set_table(codeset_id) - elif explicit_ids: - return ibis.memtable( - {"concept_id": list(explicit_ids)}, - schema={"concept_id": "int64"}, - ) - return None from ..normalize.groups import NormalizedDemographicCriteria from ..plan.schema import EVENT_ID, PERSON_ID from ..typing import Table @@ -72,7 +43,7 @@ def _apply_numeric_predicate(expr, predicate): ) -def _apply_date_predicate(expr, predicate): +def _apply_date_predicate(date_expr, predicate): op = (predicate.op or "eq").lower() value = predicate.value extent = predicate.extent @@ -81,19 +52,19 @@ def _apply_date_predicate(expr, predicate): return ibis.literal(True) value_expr = ibis.literal(value).cast("date") - date_expr = expr.cast("date") + if op in {"eq", "="}: - return date_expr == value_expr + return date_expr.cast("date") == value_expr if op in {"neq", "!=", "ne"}: - return date_expr != value_expr + return date_expr.cast("date") != value_expr if op in {"gt", ">"}: - return date_expr > value_expr + return date_expr.cast("date") > value_expr if op in {"gte", ">="}: - return date_expr >= value_expr + return date_expr.cast("date") >= value_expr if op in {"lt", "<"}: - return date_expr < value_expr + return date_expr.cast("date") < value_expr if op in {"lte", "<="}: - return date_expr <= value_expr + return date_expr.cast("date") <= value_expr if op in {"bt", "between"}: if extent is None: raise UnsupportedFeatureError( @@ -103,12 +74,33 @@ def _apply_date_predicate(expr, predicate): extent_expr = ibis.literal(extent).cast("date") lower = ibis.least(value_expr, extent_expr) upper = ibis.greatest(value_expr, extent_expr) - return (date_expr >= lower) & (date_expr <= upper) + return (date_expr.cast("date") >= lower) & 
(date_expr.cast("date") <= upper) raise UnsupportedFeatureError( f"Ibis executor group evaluation error: unsupported demographic date range op {predicate.op!r}." ) +def _demographic_concept_ids( + *, + explicit_ids: tuple[int, ...], + codeset_id: int | None, + ctx: ExecutionContext, +) -> tuple[int, ...]: + """Resolve concept IDs for a demographic filter. + + Returns a Python tuple of concept IDs. Demographic sets are tiny (1-5 + IDs) so this is cheap and avoids join complexity. + """ + all_ids = list(explicit_ids) + if codeset_id is not None: + t = ctx.concept_set_table(codeset_id) + for row in t.select("concept_id").distinct().to_pandas().itertuples(): + cid = row.concept_id + if cid not in all_ids: + all_ids.append(cid) + return tuple(all_ids) + + def demographic_match_keys( index_events: Table, demographic: NormalizedDemographicCriteria, @@ -130,29 +122,29 @@ def demographic_match_keys( age_years = event_date.year() - joined.year_of_birth predicates.append(_apply_numeric_predicate(age_years, demographic.age)) - gender_table = _demographic_concept_table( + gender_ids = _demographic_concept_ids( explicit_ids=demographic.gender_concept_ids, codeset_id=demographic.gender_codeset_id, ctx=ctx, ) - if gender_table is not None: - joined = joined.join(gender_table, joined.gender_concept_id == gender_table.concept_id) + if gender_ids: + predicates.append(joined.gender_concept_id.isin(gender_ids)) - race_table = _demographic_concept_table( + race_ids = _demographic_concept_ids( explicit_ids=demographic.race_concept_ids, codeset_id=demographic.race_codeset_id, ctx=ctx, ) - if race_table is not None: - joined = joined.join(race_table, joined.race_concept_id == race_table.concept_id) + if race_ids: + predicates.append(joined.race_concept_id.isin(race_ids)) - ethnicity_table = _demographic_concept_table( + ethnicity_ids = _demographic_concept_ids( explicit_ids=demographic.ethnicity_concept_ids, codeset_id=demographic.ethnicity_codeset_id, ctx=ctx, ) - if ethnicity_table is 
not None: - joined = joined.join(ethnicity_table, joined.ethnicity_concept_id == ethnicity_table.concept_id) + if ethnicity_ids: + predicates.append(joined.ethnicity_concept_id.isin(ethnicity_ids)) if demographic.occurrence_start_date is not None: predicates.append( diff --git a/circe/execution/ibis/codesets.py b/circe/execution/ibis/codesets.py index dcaae87..587fef3 100644 --- a/circe/execution/ibis/codesets.py +++ b/circe/execution/ibis/codesets.py @@ -1,5 +1,6 @@ from __future__ import annotations +import contextlib import hashlib import json from collections.abc import Callable, Mapping @@ -11,6 +12,7 @@ from ..normalize.cohort import NormalizedConceptSet, NormalizedConceptSetItem from ..plan.schema import CONCEPT_ID from ..typing import IbisBackendLike, Table +from .operations import create_table as _create_table_impl _CODESET_TABLE = "__cg_codesets" _CACHE_TABLE_NAME = "_circe_codeset_cache" @@ -19,8 +21,7 @@ def _compute_cache_key(items: tuple[NormalizedConceptSetItem, ...]) -> str: """Deterministic SHA-256 hash of sorted concept set items.""" canonical = sorted( - (item.concept_id, item.is_excluded, item.include_descendants, item.include_mapped) - for item in items + (item.concept_id, item.is_excluded, item.include_descendants, item.include_mapped) for item in items ) payload = json.dumps(canonical, separators=(",", ":")) return hashlib.sha256(payload.encode("utf-8")).hexdigest() @@ -165,7 +166,9 @@ def build_concept_set_expression( # that are then excluded via anti-join. # For simplicity and correctness, let me handle excludes uniformly via anti-join. 
- return _build_codeset_expression(concept_set, table_getter=table_getter, vocabulary_schema=vocabulary_schema) + return _build_codeset_expression( + concept_set, table_getter=table_getter, vocabulary_schema=vocabulary_schema + ) def _build_codeset_expression( @@ -186,9 +189,15 @@ def _build_codeset_expression( # Build base expression for this item if item.include_descendants: - base = _descendant_expression( + desc = _descendant_expression( direct, table_getter=table_getter, vocabulary_schema=vocabulary_schema ) + base = _union_all_tables( + [ + ibis.memtable({"concept_id": [int(item.concept_id)]}, schema={"concept_id": "int64"}), + desc, + ] + ) else: base = ibis.memtable({"concept_id": [int(item.concept_id)]}, schema={"concept_id": "int64"}) @@ -226,15 +235,20 @@ def _build_codeset_expression( def _union_all_tables(tables: list[Table]) -> Table: - """Union multiple single-column ibis tables.""" + """Union multiple single-column ibis tables using binary-tree merge. + + Binary-tree merge caps expression-tree depth at O(log n) instead of + O(n), avoiding deeply nested UNION ALL chains for large numbers of + tables (e.g. 100+ concept sets). 
+ """ if not tables: raise ValueError("_union_all_tables requires at least one table") if len(tables) == 1: return tables[0] - result = tables[0] - for t in tables[1:]: - result = result.union(t, distinct=False) - return result + mid = len(tables) // 2 + left = _union_all_tables(tables[:mid]) + right = _union_all_tables(tables[mid:]) + return left.union(right, distinct=False) def _drop_table( @@ -243,10 +257,8 @@ def _drop_table( schema: str | None, ) -> None: """Safely drop a backend table.""" - try: + with contextlib.suppress(Exception): backend.drop_table(table_name, database=schema, force=True) - except Exception: - pass def _table_getter_from_backend( @@ -321,40 +333,54 @@ def build_batch_codeset_table( ) parts.append(tbl) - # Cache misses: build lazy expansion expressions + # Split uncached into simple (direct IDs only) and complex (needs expansion) + simple_rows: list[dict[str, Any]] = [] + complex_csets: list[tuple[int, NormalizedConceptSet]] = [] for cid, cset in uncached: if not cset.items: continue - expr = _build_codeset_expression( - cset, table_getter=table_getter, vocabulary_schema=vocabulary_schema + if _needs_vocabulary_expansion({cid: cset}): + complex_csets.append((cid, cset)) + else: + for item in cset.items: + if not item.is_excluded and item.concept_id is not None: + simple_rows.append({"codeset_id": int(cid), "concept_id": int(item.concept_id)}) + + # Batch all simple IDs into one memtable + if simple_rows: + parts.append( + ibis.memtable( + simple_rows, + schema={"codeset_id": "int64", "concept_id": "int64"}, + ) ) + + # Complex concept sets: build lazy expansion expressions + for cid, cset in complex_csets: + expr = _build_codeset_expression(cset, table_getter=table_getter, vocabulary_schema=vocabulary_schema) labeled = expr.mutate(codeset_id=ibis.literal(cid, type="int64")).select("codeset_id", CONCEPT_ID) parts.append(labeled) - # Write to persistent cache after materialization (done below) - if not parts: # No concept sets at all -- 
create empty table empty = ibis.memtable( {"codeset_id": [], "concept_id": []}, schema={"codeset_id": "int64", "concept_id": "int64"}, ) - create_table_op( - backend, table_name=batch_table_name, schema=results_schema, obj=empty, overwrite=True, temp=temporary + _create_table_impl( + backend, + table_name=batch_table_name, + schema=results_schema, + obj=empty, + overwrite=True, + temp=temporary, ) return _read_table(backend, table_name=batch_table_name, schema=results_schema) # Union all parts and materialize - if len(parts) == 1: - combined = parts[0] - else: - combined = parts[0] - for p in parts[1:]: - combined = combined.union(p, distinct=False) - - from .operations import create_table as create_table_op + combined = _union_all_tables(parts) - create_table_op( + _create_table_impl( backend, table_name=batch_table_name, schema=results_schema, @@ -373,12 +399,7 @@ def build_batch_codeset_table( continue # Read back from the table to get resolved IDs for this codeset ref = _read_table(backend, table_name=batch_table_name, schema=results_schema) - resolved = ( - ref.filter(ref.codeset_id == cid) - .select(CONCEPT_ID) - .distinct() - .execute() - ) + resolved = ref.filter(ref.codeset_id == cid).select(CONCEPT_ID).distinct().execute() cids = _extract_column(resolved, CONCEPT_ID) if cids: _write_codeset_cache( @@ -393,6 +414,15 @@ def build_batch_codeset_table( return ref +def _needs_vocabulary_expansion(concept_sets: Mapping[int, NormalizedConceptSet]) -> bool: + """Return True if any concept set requires vocabulary-table queries.""" + for cset in concept_sets.values(): + for item in cset.items: + if item.include_descendants or item.include_mapped: + return True + return False + + def build_single_codeset_table( *, backend: IbisBackendLike, @@ -401,12 +431,35 @@ def build_single_codeset_table( results_schema: str | None = None, vocabulary_schema: str | None = None, ) -> Table: - """Build a single-cohort codeset table as a temporary database table. 
+ """Build a codeset table for a single cohort. - Like :func:`build_batch_codeset_table` but without persistent cache - support and creates a temporary table. Used as the auto-creation - fallback in ``build_cohort()`` when no batch table is provided. + When all concept sets use only direct concept IDs (no descendant or + mapped expansion needed), builds a simple memtable -- no vocabulary + tables required. Falls back to full expansion otherwise. """ + if not _needs_vocabulary_expansion(concept_sets): + rows: list[dict[str, Any]] = [] + for cid, cset in concept_sets.items(): + for item in cset.items: + if not item.is_excluded and item.concept_id is not None: + rows.append({"codeset_id": int(cid), "concept_id": int(item.concept_id)}) + if rows: + data = ibis.memtable(rows, schema={"codeset_id": "int64", "concept_id": "int64"}) + else: + data = ibis.memtable( + {"codeset_id": [], "concept_id": []}, + schema={"codeset_id": "int64", "concept_id": "int64"}, + ) + _create_table_impl( + backend, + table_name=batch_table_name, + schema=results_schema, + obj=data, + overwrite=True, + temp=True, + ) + return _read_table(backend, table_name=batch_table_name, schema=results_schema) + return build_batch_codeset_table( backend=backend, concept_sets=concept_sets, @@ -474,7 +527,7 @@ def _write_codeset_cache( table_name: str, ) -> None: """Persist resolved concept IDs to the cache table.""" - from .operations import create_table as create_table_op, insert_relation, table_exists + from .operations import insert_relation, table_exists if not concept_ids: return @@ -485,7 +538,7 @@ def _write_codeset_cache( schema={"cache_key": "string", "concept_id": "int64"}, ) if not table_exists(backend, table_name=table_name, schema=schema): - create_table_op(backend, table_name=table_name, schema=schema, obj=data) + _create_table_impl(backend, table_name=table_name, schema=schema, obj=data) return insert_relation(data, backend=backend, target_table=table_name, target_schema=schema) except 
Exception: diff --git a/circe/execution/ibis/compile_steps.py b/circe/execution/ibis/compile_steps.py index 5450055..58398eb 100644 --- a/circe/execution/ibis/compile_steps.py +++ b/circe/execution/ibis/compile_steps.py @@ -137,9 +137,7 @@ def _filter_visit_concepts(table, ctx: ExecutionContext, *, step: FilterByVisit) concept_table = ctx.concept_set_table(step.codeset_id) joined = joined.join(concept_table, joined._visit_concept_id == concept_table.concept_id) elif step.concept_ids: - concept_table = ibis.memtable( - {"concept_id": list(step.concept_ids)}, schema={"concept_id": "int64"} - ) + concept_table = ibis.memtable({"concept_id": list(step.concept_ids)}, schema={"concept_id": "int64"}) joined = joined.join(concept_table, joined._visit_concept_id == concept_table.concept_id) # If neither codeset_id nor concept_ids, no filtering needed @@ -172,9 +170,7 @@ def _filter_provider_specialty( concept_table = ctx.concept_set_table(step.codeset_id) joined = joined.join(concept_table, joined._specialty_concept_id == concept_table.concept_id) elif step.concept_ids: - concept_table = ibis.memtable( - {"concept_id": list(step.concept_ids)}, schema={"concept_id": "int64"} - ) + concept_table = ibis.memtable({"concept_id": list(step.concept_ids)}, schema={"concept_id": "int64"}) joined = joined.join(concept_table, joined._specialty_concept_id == concept_table.concept_id) if step.exclude: @@ -201,9 +197,7 @@ def _filter_care_site(table, ctx: ExecutionContext, *, step: FilterByCareSite): concept_table = ctx.concept_set_table(step.codeset_id) joined = joined.join(concept_table, joined._place_of_service_concept_id == concept_table.concept_id) elif step.concept_ids: - concept_table = ibis.memtable( - {"concept_id": list(step.concept_ids)}, schema={"concept_id": "int64"} - ) + concept_table = ibis.memtable({"concept_id": list(step.concept_ids)}, schema={"concept_id": "int64"}) joined = joined.join(concept_table, joined._place_of_service_concept_id == concept_table.concept_id) if 
step.exclude: @@ -279,19 +273,13 @@ def apply_step(step, *, table, source, ctx: ExecutionContext): if isinstance(step, FilterByCodeset): concept_table = ctx.concept_set_table(step.codeset_id) - return _filter_by_concept_table( - table, concept_table, column=step.column, exclude=step.exclude - ) + return _filter_by_concept_table(table, concept_table, column=step.column, exclude=step.exclude) if isinstance(step, FilterByConceptSet): if not step.concept_ids: return table if step.exclude else table.limit(0) - concept_table = ibis.memtable( - {"concept_id": list(step.concept_ids)}, schema={"concept_id": "int64"} - ) - return _filter_by_concept_table( - table, concept_table, column=step.column, exclude=step.exclude - ) + concept_table = ibis.memtable({"concept_id": list(step.concept_ids)}, schema={"concept_id": "int64"}) + return _filter_by_concept_table(table, concept_table, column=step.column, exclude=step.exclude) if isinstance(step, FilterByVisit): return _filter_visit_concepts(table, ctx, step=step) diff --git a/circe/execution/ibis/context.py b/circe/execution/ibis/context.py index c0a98ce..dcbd9c8 100644 --- a/circe/execution/ibis/context.py +++ b/circe/execution/ibis/context.py @@ -49,9 +49,11 @@ def concept_set_table(self, codeset_id: int) -> Table: References the database-resident batch codeset table. No Python memory is used for concept IDs -- filtering happens via SQL joins at execution time. 
""" - return self.codeset_table.filter( - self.codeset_table.codeset_id == codeset_id - ).select("concept_id").distinct() + return ( + self.codeset_table.filter(self.codeset_table.codeset_id == codeset_id) + .select("concept_id") + .distinct() + ) def _build_codeset_memtable( diff --git a/circe/execution/ibis/person_filters.py b/circe/execution/ibis/person_filters.py index fd5a241..ff88851 100644 --- a/circe/execution/ibis/person_filters.py +++ b/circe/execution/ibis/person_filters.py @@ -63,16 +63,15 @@ def apply_person_gender_filter( ): if codeset_id is not None: concept_table = ctx.concept_set_table(codeset_id) + concept_table = concept_table.select(concept_table.concept_id.name("_pconcept_id")) elif concept_ids: - concept_table = ibis.memtable( - {"concept_id": list(concept_ids)}, schema={"concept_id": "int64"} - ) + concept_table = ibis.memtable({"_pconcept_id": list(concept_ids)}, schema={"_pconcept_id": "int64"}) else: return table person = ctx.table("person").select(PERSON_ID, "gender_concept_id") joined = table.join(person, table[PERSON_ID] == person[PERSON_ID]) - filtered = joined.join(concept_table, joined.gender_concept_id == concept_table.concept_id) + filtered = joined.join(concept_table, joined.gender_concept_id == concept_table._pconcept_id) return filtered.select(*[filtered[c] for c in table.columns]) @@ -86,16 +85,15 @@ def _apply_person_concept_filter( ): if codeset_id is not None: concept_table = ctx.concept_set_table(codeset_id) + concept_table = concept_table.select(concept_table.concept_id.name("_pconcept_id")) elif concept_ids: - concept_table = ibis.memtable( - {"concept_id": list(concept_ids)}, schema={"concept_id": "int64"} - ) + concept_table = ibis.memtable({"_pconcept_id": list(concept_ids)}, schema={"_pconcept_id": "int64"}) else: return table person = ctx.table("person").select(PERSON_ID, person_column) joined = table.join(person, table[PERSON_ID] == person[PERSON_ID]) - filtered = joined.join(concept_table, joined[person_column] == 
concept_table.concept_id) + filtered = joined.join(concept_table, joined[person_column] == concept_table._pconcept_id) return filtered.select(*[filtered[c] for c in table.columns]) diff --git a/tests/execution/test_codesets_persistent_cache.py b/tests/execution/test_codesets_persistent_cache.py index 0f44aaf..b8514f8 100644 --- a/tests/execution/test_codesets_persistent_cache.py +++ b/tests/execution/test_codesets_persistent_cache.py @@ -5,9 +5,9 @@ from circe.execution.ibis.codesets import ( _CACHE_TABLE_NAME, _compute_cache_key, - build_batch_codeset_table, _read_codeset_cache, _write_codeset_cache, + build_batch_codeset_table, ) from circe.execution.normalize.cohort import NormalizedConceptSet, NormalizedConceptSetItem @@ -48,8 +48,9 @@ def test_build_batch_codeset_table_round_trip(): obj=ibis.memtable( { "concept_id": [111, 222, 333], - "invalid_reason": [None, None, None], - } + "invalid_reason": ["X", None, None], + }, + schema={"concept_id": "int64", "invalid_reason": "string"}, ), overwrite=True, ) @@ -57,7 +58,11 @@ def test_build_batch_codeset_table_round_trip(): concept_sets = { 1: NormalizedConceptSet( set_id=1, - items=(NormalizedConceptSetItem(concept_id=111, is_excluded=False),), + items=( + NormalizedConceptSetItem( + concept_id=111, is_excluded=False, include_descendants=False, include_mapped=False + ), + ), ), } @@ -98,5 +103,3 @@ def test_persistent_cache_miss_returns_none(): result = _read_codeset_cache(conn, cache_key="nonexistent", schema=None, table_name=_CACHE_TABLE_NAME) assert result is None - if conn.exists_table(_CACHE_TABLE_NAME): - conn.drop_table(_CACHE_TABLE_NAME, force=True) diff --git a/tests/execution/test_compile_steps_helpers.py b/tests/execution/test_compile_steps_helpers.py index f7bb00b..54413fe 100644 --- a/tests/execution/test_compile_steps_helpers.py +++ b/tests/execution/test_compile_steps_helpers.py @@ -35,9 +35,7 @@ def __init__(self, conn=None, *, codesets: dict[int, tuple[int, ...]] | None = N def 
concept_set_table(self, codeset_id: int) -> ibis.Table: ids = self.codesets.get(codeset_id, ()) - return ibis.memtable( - {"concept_id": list(ids)}, schema={"concept_id": "int64"} - ) + return ibis.memtable({"concept_id": list(ids)}, schema={"concept_id": "int64"}) def table(self, name: str): if self.conn is None: @@ -64,6 +62,7 @@ def _events_table(conn): ], VISIT_OCCURRENCE_ID: [100, 101, 200], "concept_id": [1, 2, 3], + "care_site_id": [10, 20, 30], "text_value": ["alpha", "beta", "gamma"], } ), @@ -215,7 +214,7 @@ def test_apply_step_covers_keep_first_person_filter_and_error_paths(): table = _events_table(conn) conn.create_table( "person", - obj=ibis_mod.memtable( + obj=ibis.memtable( { PERSON_ID: [1, 2], "gender_concept_id": [8507, 8532], @@ -223,6 +222,30 @@ def test_apply_step_covers_keep_first_person_filter_and_error_paths(): ), overwrite=True, ) + # location_history is needed by FilterByCareSiteLocationRegion + conn.create_table( + "location_history", + obj=ibis.memtable( + { + "entity_id": [1], + "location_id": [1], + "domain_id": ["CARE_SITE"], + "start_date": ["2020-01-01"], + "end_date": ["2020-12-31"], + } + ), + overwrite=True, + ) + conn.create_table( + "location", + obj=ibis.memtable( + { + "location_id": [1], + "region_concept_id": [123], + } + ), + overwrite=True, + ) ctx = _Context(conn, codesets={9: ()}) first = apply_step( diff --git a/tests/execution/test_context_wiring.py b/tests/execution/test_context_wiring.py index bc06881..2105aa7 100644 --- a/tests/execution/test_context_wiring.py +++ b/tests/execution/test_context_wiring.py @@ -1,10 +1,10 @@ from __future__ import annotations -import ibis import pytest from circe.execution.ibis.codesets import build_single_codeset_table from circe.execution.ibis.context import ExecutionContext, make_execution_context +from circe.execution.normalize.cohort import NormalizedConceptSet, NormalizedConceptSetItem def _make_codeset_table(backend): @@ -54,23 +54,13 @@ def 
test_codeset_table_returns_filtered_view(): _ = pytest.importorskip("duckdb") conn = ibis_mod.duckdb.connect() - conn.create_table( - "concept", - obj=ibis.memtable( - { - "concept_id": [111, 222], - "invalid_reason": [None, None], - } - ), - overwrite=True, - ) concept_sets = { - 1: ibis_mod.execution.normalize.cohort.NormalizedConceptSet( + 1: NormalizedConceptSet( set_id=1, items=( - ibis_mod.execution.normalize.cohort.NormalizedConceptSetItem( - concept_id=111, is_excluded=False + NormalizedConceptSetItem( + concept_id=111, is_excluded=False, include_descendants=False, include_mapped=False ), ), ), diff --git a/tests/execution/test_custom_era.py b/tests/execution/test_custom_era.py index ab327d4..90e087c 100644 --- a/tests/execution/test_custom_era.py +++ b/tests/execution/test_custom_era.py @@ -350,7 +350,9 @@ def test_compute_drug_eras_matches_java_sql_logic(): ctx = SimpleNamespace( table=lambda name: conn.table(name), - concept_ids_for_codeset=lambda cid: (222,) if cid == 2 else (), + concept_set_table=lambda cid: ibis.memtable( + {"concept_id": [222] if cid == 2 else []}, schema={"concept_id": "int64"} + ), ) # --- ibis path --- diff --git a/tests/execution/test_group_demographics.py b/tests/execution/test_group_demographics.py index f5f04eb..b4bdbc3 100644 --- a/tests/execution/test_group_demographics.py +++ b/tests/execution/test_group_demographics.py @@ -5,13 +5,12 @@ from circe.execution.engine.group_demographics import ( _apply_date_predicate, - _demographic_concept_table, + _demographic_concept_ids, demographic_match_keys, ) from circe.execution.errors import UnsupportedFeatureError from circe.execution.normalize.groups import NormalizedDemographicCriteria from circe.execution.normalize.windows import NormalizedDateRange, NormalizedNumericRange -from circe.execution.ibis.context import ExecutionContext class _DemographicContext: @@ -24,9 +23,7 @@ def table(self, name: str): def concept_set_table(self, codeset_id: int) -> ibis.Table: ids = 
self.codesets.get(codeset_id, ()) - return ibis.memtable( - {"concept_id": list(ids)}, schema={"concept_id": "int64"} - ) + return ibis.memtable({"concept_id": list(ids)}, schema={"concept_id": "int64"}) def _seed_demographic_tables(conn, ibis): @@ -75,15 +72,14 @@ def test_apply_date_predicate_rejects_invalid_between_and_op(): def test_demographic_concept_table_returns_table(): ctx = _DemographicContext(None, codesets={1: (8507, 8532)}) - tbl = _demographic_concept_table(explicit_ids=(8507,), codeset_id=1, ctx=ctx) - assert tbl is not None - assert list(tbl.execute()["concept_id"]) == [8507, 8532] + result = _demographic_concept_ids(explicit_ids=(8507,), codeset_id=1, ctx=ctx) + assert result == (8507, 8532) -def test_demographic_concept_table_returns_none_when_empty(): +def test_demographic_concept_table_returns_empty_when_empty(): ctx = _DemographicContext(None) - tbl = _demographic_concept_table(explicit_ids=(), codeset_id=None, ctx=ctx) - assert tbl is None + result = _demographic_concept_ids(explicit_ids=(), codeset_id=None, ctx=ctx) + assert result == () def test_demographic_match_keys_applies_all_supported_filters(): diff --git a/tests/execution/test_person_filters.py b/tests/execution/test_person_filters.py index 2bb40cd..0fe018a 100644 --- a/tests/execution/test_person_filters.py +++ b/tests/execution/test_person_filters.py @@ -1,5 +1,6 @@ from __future__ import annotations +import ibis import pytest from circe.execution.errors import CompilationError @@ -21,8 +22,9 @@ def __init__(self, conn, *, codesets: dict[int, tuple[int, ...]] | None = None): def table(self, name: str): return self.conn.table(name) - def concept_ids_for_codeset(self, codeset_id: int) -> tuple[int, ...]: - return self.codesets.get(codeset_id, ()) + def concept_set_table(self, codeset_id: int): + ids = self.codesets.get(codeset_id, ()) + return ibis.memtable({"concept_id": list(ids)}, schema={"concept_id": "int64"}) def _seed_person_tables(conn, ibis): From 
6a825dd6efb73a577c89c247bfe113022bdf6b09 Mon Sep 17 00:00:00 2001 From: Jamie Gilbert Date: Mon, 18 May 2026 11:48:06 -0700 Subject: [PATCH 22/53] concept set fixes --- circe/cohort_definition_set/_generate.py | 422 ++++++++++------------- circe/execution/api.py | 20 +- circe/execution/ibis/codesets.py | 184 +++++++++- 3 files changed, 351 insertions(+), 275 deletions(-) diff --git a/circe/cohort_definition_set/_generate.py b/circe/cohort_definition_set/_generate.py index 5549a1b..99a3a5f 100644 --- a/circe/cohort_definition_set/_generate.py +++ b/circe/cohort_definition_set/_generate.py @@ -10,55 +10,18 @@ from typing import TYPE_CHECKING, Literal from ..execution.api import build_cohort, write_cohort -from ..execution.ibis.codesets import _CODESET_TABLE, build_batch_codeset_table, drop_codeset_table -from ..execution.normalize.cohort import normalize_cohort +from ..execution.ibis.materialize import project_to_ohdsi_cohort_table from ._checksum_store import load_checksums, upsert_generation_history from ._core import CohortDefinition, CohortDefinitionSet, CohortGenerationResult if TYPE_CHECKING: - from ..execution.typing import IbisBackendLike, Table + from ..execution.typing import IbisBackendLike logger = logging.getLogger(__name__) _backend_lock = threading.Lock() -def _collect_concept_sets( - cohort_definition_set: CohortDefinitionSet, -) -> dict: - """Normalize all cohort expressions and merge concept sets.""" - from ..execution.normalize.cohort import NormalizedConceptSet # noqa: F401 - - all_sets: dict[int, NormalizedConceptSet] = {} - for cohort in cohort_definition_set: - normalized = normalize_cohort(cohort.expression) - for cid, cset in normalized.concept_sets.items(): - if cid not in all_sets: - all_sets[cid] = cset - return all_sets - - -def _build_and_return_batch_codesets( - *, - backend: IbisBackendLike, - concept_sets: dict, - results_schema: str | None, - vocabulary_schema: str | None, - results_table_name: str, -) -> Table: - """Build batch 
codeset table. Called inside a thread.""" - - return build_batch_codeset_table( - backend=backend, - concept_sets=concept_sets, - batch_table_name=results_table_name, - results_schema=results_schema, - vocabulary_schema=vocabulary_schema, - use_persistent_cache=False, - temporary=False, - ) - - def _process_single_cohort( cohort: CohortDefinition, *, @@ -67,33 +30,33 @@ def _process_single_cohort( results_schema: str | None, vocabulary_schema: str | None, cohort_table: str, - codeset_table: Table, + use_persistent_cache: bool, ) -> tuple[datetime, datetime]: - """Build and write a single cohort against a pre-populated codeset table. + """Build and write a single cohort. Thread-safe via ``_backend_lock``. - Thread-safe via ``_backend_lock``. + Each cohort uses its own per-cohort codeset table built from the + ``_circe_codeset_cache`` when *use_persistent_cache* is True, allowing + checksum-keyed concept set reuse across cohorts and runs. Returns ``(start_time, end_time)`` of the database-materialization phase so the caller can compute execution duration. """ - from ..execution.ibis.materialize import project_to_ohdsi_cohort_table - with _backend_lock: start_time = datetime.now() new_rows = build_cohort( cohort.expression, backend=backend, - cdm_schema=cdm_schema, # type: ignore[arg-type] + cdm_schema=cdm_schema, results_schema=results_schema, vocabulary_schema=vocabulary_schema, + use_persistent_cache=use_persistent_cache, cohort_id=cohort.cohort_id, - codeset_table=codeset_table, ) projected = project_to_ohdsi_cohort_table(new_rows, cohort_id=cohort.cohort_id) write_cohort( compiled_relation=projected, backend=backend, - cdm_schema=cdm_schema, # type: ignore[arg-type] + cdm_schema=cdm_schema, cohort_table=cohort_table, cohort_id=cohort.cohort_id, results_schema=results_schema, @@ -119,21 +82,31 @@ async def async_generate_cohort_set( ) -> list[CohortGenerationResult]: """Generate all cohorts in a CohortDefinitionSet and write them to a shared table. 
- This is the async counterpart of :func:`generate_cohort_set`. It wraps - the synchronous build/write pipeline in :func:`asyncio.to_thread` so - that each cohort's work does not block the event loop. When - *compile_timeout* is set, cohorts taking longer than the given number - of seconds are recorded as ``FAILED`` and the next cohort proceeds - (subject to *stop_on_error*). - - All exception types (not just ``ExecutionError``) are caught and - recorded as ``FAILED``, ensuring that transient database errors such as - Databricks ``ServerOperationError`` do not abort the entire batch. - - A single batch codeset table is populated at the start once with *all* - concept sets from *all* cohorts, so that concept-ancestor and - concept-relationship lookups happen in one bulk query rather than per - cohort. The table is dropped before returning. + Each cohort builds its own per-cohort codeset table. When *incremental* + is True, concept sets are stored in the persistent + ``_circe_codeset_cache`` keyed by SHA-256 checksum so that identical + concept set definitions across cohorts are resolved only once per + batch. + + All exception types are caught and recorded as ``FAILED``. + + Args: + cohort_definition_set: The set of cohort definitions to generate. + backend: Ibis backend connection pointing at the target database. + cdm_schema: Schema containing the OMOP CDM source tables. + cohort_table: Name of the OHDSI cohort table to write results into. + results_schema: Optional schema for both the cohort table and + checksum table. + vocabulary_schema: Optional schema for vocabulary tables. + incremental: If True, skip cohorts whose expression checksum is + unchanged since the last successful generation. + checksum_table: Name of the table used to persist generation + history for incremental runs. + stop_on_error: If True, raise on the first failure. + compile_timeout: Maximum seconds per-cohort before timeout. + + Returns: + A list of :class:`CohortGenerationResult`. 
""" total = len(cohort_definition_set) @@ -150,188 +123,129 @@ async def async_generate_cohort_set( table_name=checksum_table, ) - # Collect and materialise concept sets once for the entire batch - all_concept_sets = await asyncio.to_thread(_collect_concept_sets, cohort_definition_set) - batch_codesets_table_name = _CODESET_TABLE - codeset_table: Table | None = None - if all_concept_sets: - codeset_table = await asyncio.to_thread( - _build_and_return_batch_codesets, - backend=backend, - concept_sets=all_concept_sets, - results_schema=results_schema, - vocabulary_schema=vocabulary_schema, - results_table_name=batch_codesets_table_name, - ) - else: - # No concept sets -- create empty memtable to satisfy ExecutionContext - import ibis # noqa: PLC0415 - - codeset_table = ibis.memtable( - {"codeset_id": [], "concept_id": []}, - schema={"codeset_id": "int64", "concept_id": "int64"}, - ) - results: list[CohortGenerationResult] = [] - logger.info( - "Generating %d cohort(s) (incremental=%s) using batch codeset table", - total, - incremental, - ) + logger.info("Generating %d cohort(s) (incremental=%s)", total, incremental) - try: - for i, cohort in enumerate(cohort_definition_set, start=1): - current_checksum = cohort.expression.checksum() - - if incremental and previous_checksums.get(cohort.cohort_id) == current_checksum: - logger.info( - "[%d/%d] Skipping cohort %d (%s) -- checksum unchanged", - i, - total, - cohort.cohort_id, - cohort.cohort_name, - ) - results.append( - CohortGenerationResult( - cohort_id=cohort.cohort_id, - cohort_name=cohort.cohort_name, - status="SKIPPED", - checksum=current_checksum, - start_time=datetime.now(), - end_time=datetime.now(), - ) - ) - continue + for i, cohort in enumerate(cohort_definition_set, start=1): + current_checksum = cohort.expression.checksum() + if incremental and previous_checksums.get(cohort.cohort_id) == current_checksum: logger.info( - "[%d/%d] Building cohort %d (%s) ...", + "[%d/%d] Skipping cohort %d (%s) -- checksum 
unchanged", i, total, cohort.cohort_id, cohort.cohort_name, ) - - start_time: datetime | None = None - end_time: datetime | None = None - try: - start_time, end_time = await asyncio.wait_for( - asyncio.to_thread( - _process_single_cohort, - cohort, - backend=backend, - cdm_schema=cdm_schema, - results_schema=results_schema, - vocabulary_schema=vocabulary_schema, - cohort_table=cohort_table, - codeset_table=codeset_table, - ), - timeout=compile_timeout, + results.append( + CohortGenerationResult( + cohort_id=cohort.cohort_id, + cohort_name=cohort.cohort_name, + status="SKIPPED", + checksum=current_checksum, + start_time=datetime.now(), + end_time=datetime.now(), ) + ) + continue - duration = (end_time - start_time).total_seconds() - logger.info( - "[%d/%d] Completed cohort %d (%s) -- duration %.1fs", - i, - total, - cohort.cohort_id, - cohort.cohort_name, - duration, - ) - except asyncio.TimeoutError: - if end_time is None: - end_time = datetime.now() - duration = (end_time - (start_time or end_time)).total_seconds() - logger.error( - "[%d/%d] TIMED OUT cohort %d (%s) after %.1fs", - i, - total, - cohort.cohort_id, - cohort.cohort_name, - duration, - ) - timeout_exc = TimeoutError( - f"Cohort {cohort.cohort_id} ({cohort.cohort_name}) " - f"exceeded timeout of {compile_timeout:.0f}s" - ) - results.append( - CohortGenerationResult( - cohort_id=cohort.cohort_id, - cohort_name=cohort.cohort_name, - status="FAILED", - checksum=current_checksum, - start_time=start_time or datetime.now(), - end_time=end_time, - error=timeout_exc, - ) - ) - if incremental: - upsert_generation_history( - backend, - schema=results_schema, - table_name=checksum_table, - cohort_id=cohort.cohort_id, - checksum=current_checksum, - status="FAILED", - start_time=start_time or datetime.now(), - end_time=end_time, - ) - if stop_on_error: - raise timeout_exc from None - continue - except Exception as exc: - if end_time is None: - end_time = datetime.now() - duration = (end_time - (start_time or 
end_time)).total_seconds() - logger.error( - "[%d/%d] FAILED cohort %d (%s) after %.1fs: %s", - i, - total, - cohort.cohort_id, - cohort.cohort_name, - duration, - exc, + logger.info( + "[%d/%d] Building cohort %d (%s) ...", + i, + total, + cohort.cohort_id, + cohort.cohort_name, + ) + + start_time: datetime | None = None + end_time: datetime | None = None + try: + start_time, end_time = await asyncio.wait_for( + asyncio.to_thread( + _process_single_cohort, + cohort, + backend=backend, + cdm_schema=cdm_schema, + results_schema=results_schema, + vocabulary_schema=vocabulary_schema, + cohort_table=cohort_table, + use_persistent_cache=incremental, + ), + timeout=compile_timeout, + ) + + duration = (end_time - start_time).total_seconds() + logger.info( + "[%d/%d] Completed cohort %d (%s) -- duration %.1fs", + i, + total, + cohort.cohort_id, + cohort.cohort_name, + duration, + ) + except asyncio.TimeoutError: + if end_time is None: + end_time = datetime.now() + duration = (end_time - (start_time or end_time)).total_seconds() + logger.error( + "[%d/%d] TIMED OUT cohort %d (%s) after %.1fs", + i, + total, + cohort.cohort_id, + cohort.cohort_name, + duration, + ) + timeout_exc = TimeoutError( + f"Cohort {cohort.cohort_id} ({cohort.cohort_name}) exceeded timeout of {compile_timeout:.0f}s" + ) + results.append( + CohortGenerationResult( + cohort_id=cohort.cohort_id, + cohort_name=cohort.cohort_name, + status="FAILED", + checksum=current_checksum, + start_time=start_time or datetime.now(), + end_time=end_time, + error=timeout_exc, ) - results.append( - CohortGenerationResult( - cohort_id=cohort.cohort_id, - cohort_name=cohort.cohort_name, - status="FAILED", - checksum=current_checksum, - start_time=start_time or datetime.now(), - end_time=end_time, - error=exc, - ) + ) + if incremental: + upsert_generation_history( + backend, + schema=results_schema, + table_name=checksum_table, + cohort_id=cohort.cohort_id, + checksum=current_checksum, + status="FAILED", + 
start_time=start_time or datetime.now(), + end_time=end_time, ) - if incremental: - upsert_generation_history( - backend, - schema=results_schema, - table_name=checksum_table, - cohort_id=cohort.cohort_id, - checksum=current_checksum, - status="FAILED", - start_time=start_time or datetime.now(), - end_time=end_time, - ) - if stop_on_error: - raise - continue - - # Clean up staging tables created by the materialized pipeline - schema = results_schema or cdm_schema - for stage in ("primary", "qualified", "included", "ended"): - with contextlib.suppress(Exception): - backend.drop_table(f"__cg_{cohort.cohort_id}_{stage}", database=schema, force=True) - + if stop_on_error: + raise timeout_exc from None + continue + except Exception as exc: + if end_time is None: + end_time = datetime.now() + duration = (end_time - (start_time or end_time)).total_seconds() + logger.error( + "[%d/%d] FAILED cohort %d (%s) after %.1fs: %s", + i, + total, + cohort.cohort_id, + cohort.cohort_name, + duration, + exc, + ) results.append( CohortGenerationResult( cohort_id=cohort.cohort_id, cohort_name=cohort.cohort_name, - status="COMPLETE", + status="FAILED", checksum=current_checksum, start_time=start_time or datetime.now(), - end_time=end_time or datetime.now(), + end_time=end_time, + error=exc, ) ) if incremental: @@ -341,29 +255,52 @@ async def async_generate_cohort_set( table_name=checksum_table, cohort_id=cohort.cohort_id, checksum=current_checksum, - status="COMPLETE", + status="FAILED", start_time=start_time or datetime.now(), - end_time=end_time or datetime.now(), + end_time=end_time, ) - - summary = summarise_generation_results(results) - logger.info( - "Cohort generation complete: %d completed, %d skipped, %d failed", - summary["COMPLETE"], - summary["SKIPPED"], - summary["FAILED"], + if stop_on_error: + raise + continue + + # Clean up staging tables created by the materialized pipeline + schema = results_schema or cdm_schema + for stage in ("primary", "qualified", "included", 
"ended"): + with contextlib.suppress(Exception): + backend.drop_table(f"__cg_{cohort.cohort_id}_{stage}", database=schema, force=True) + + results.append( + CohortGenerationResult( + cohort_id=cohort.cohort_id, + cohort_name=cohort.cohort_name, + status="COMPLETE", + checksum=current_checksum, + start_time=start_time or datetime.now(), + end_time=end_time or datetime.now(), + ) ) - - return results - finally: - if all_concept_sets: - await asyncio.to_thread( - drop_codeset_table, + if incremental: + upsert_generation_history( backend, - batch_table_name=batch_codesets_table_name, - results_schema=results_schema, + schema=results_schema, + table_name=checksum_table, + cohort_id=cohort.cohort_id, + checksum=current_checksum, + status="COMPLETE", + start_time=start_time or datetime.now(), + end_time=end_time or datetime.now(), ) + summary = summarise_generation_results(results) + logger.info( + "Cohort generation complete: %d completed, %d skipped, %d failed", + summary["COMPLETE"], + summary["SKIPPED"], + summary["FAILED"], + ) + + return results + def generate_cohort_set( cohort_definition_set: CohortDefinitionSet, @@ -411,8 +348,7 @@ def summarise_generation_results( results: List of CohortGenerationResult from generate_cohort_set. Returns: - dict with counts for each status, e.g. - ``{"COMPLETE": 2, "SKIPPED": 1, "FAILED": 0}``. + dict with counts for each status. """ counts: dict[Literal["COMPLETE", "SKIPPED", "FAILED"], int] = { "COMPLETE": 0, diff --git a/circe/execution/api.py b/circe/execution/api.py index 19b23dc..2cb867b 100644 --- a/circe/execution/api.py +++ b/circe/execution/api.py @@ -1,6 +1,5 @@ from __future__ import annotations -import sys from typing import Literal from ..cohortdefinition import CohortExpression @@ -44,21 +43,23 @@ def build_cohort( (e.g. unit tests that only verify the expression tree can be built). When *codeset_table* is provided (from a batch-generation caller), it is - used directly. 
Otherwise one is auto-created for this single cohort. + used directly. Otherwise a per-cohort codeset table is auto-created. + With *use_persistent_cache=True*, concept sets are stored in + ``_circe_codeset_cache`` keyed by SHA-256 checksum, enabling reuse + across cohorts and runs. """ maybe_apply_databricks_post_connect_workaround(backend) normalized = normalize_cohort(expression) - if codeset_table is not None: - pass - else: + if codeset_table is None: codeset_table = build_single_codeset_table( backend=backend, concept_sets=normalized.concept_sets, batch_table_name=f"__cg_{cohort_id}_codesets", results_schema=results_schema, vocabulary_schema=vocabulary_schema, + use_persistent_cache=use_persistent_cache, ) ctx = make_execution_context( @@ -69,14 +70,7 @@ def build_cohort( codeset_table=codeset_table, ) - # Ibis SQL compilation for large cohorts may exceed the default recursion - # limit when walking deeply nested expression trees (e.g. 100-way UNION). - prev_limit = sys.getrecursionlimit() - sys.setrecursionlimit(max(prev_limit, 5000)) - try: - return build_cohort_table(normalized, ctx, cohort_id=cohort_id, materialize=materialize) - finally: - sys.setrecursionlimit(prev_limit) + return build_cohort_table(normalized, ctx, cohort_id=cohort_id, materialize=materialize) def write_relation( diff --git a/circe/execution/ibis/codesets.py b/circe/execution/ibis/codesets.py index 587fef3..042dfec 100644 --- a/circe/execution/ibis/codesets.py +++ b/circe/execution/ibis/codesets.py @@ -414,13 +414,55 @@ def build_batch_codeset_table( return ref -def _needs_vocabulary_expansion(concept_sets: Mapping[int, NormalizedConceptSet]) -> bool: - """Return True if any concept set requires vocabulary-table queries.""" - for cset in concept_sets.values(): - for item in cset.items: - if item.include_descendants or item.include_mapped: - return True - return False +def _find_existing_checksums( + backend: IbisBackendLike, + checksums: set[str], + schema: str | None, +) -> set[str]: 
+ """Return the subset of *checksums* that already exist in ``_circe_codeset_cache``.""" + if not checksums: + return set() + from .operations import table_exists + + if not table_exists(backend, table_name=_CACHE_TABLE_NAME, schema=schema): + return set() + tbl = _read_table(backend, table_name=_CACHE_TABLE_NAME, schema=schema) + existing = tbl.filter(tbl.cache_key.isin(tuple(checksums))).select("cache_key").distinct().execute() + if hasattr(existing, "columns"): + return {str(v) for v in existing["cache_key"].tolist() if v is not None} + return set() + + +def _resolve_codeset_ids( + cset: NormalizedConceptSet, + *, + table_getter: Callable[[str, str | None], Table], + vocabulary_schema: str | None, +) -> tuple[int, ...]: + """Resolve a concept set to concrete concept IDs. + + Uses ``_build_codeset_expression`` which handles descendants, mapped + codes, and exclusions. The result is a tuple of concrete int IDs. + """ + expr = _build_codeset_expression(cset, table_getter=table_getter, vocabulary_schema=vocabulary_schema) + try: + result = expr.execute() + return _extract_column(result, CONCEPT_ID) + except Exception as exc: + raise CompilationError( + "Ibis executor compilation error: failed executing concept-set resolution query." + ) from exc + + +def _compute_checksum_map( + concept_sets: Mapping[int, NormalizedConceptSet], +) -> dict[int, str]: + """Return a dict mapping ``codeset_id -> cache_key`` for each concept set.""" + result: dict[int, str] = {} + for cid, cset in concept_sets.items(): + if cset.items: + result[int(cid)] = _compute_cache_key(cset.items) + return result def build_single_codeset_table( @@ -430,14 +472,25 @@ def build_single_codeset_table( batch_table_name: str = _CODESET_TABLE, results_schema: str | None = None, vocabulary_schema: str | None = None, + use_persistent_cache: bool = False, ) -> Table: """Build a codeset table for a single cohort. 
- When all concept sets use only direct concept IDs (no descendant or - mapped expansion needed), builds a simple memtable -- no vocabulary - tables required. Falls back to full expansion otherwise. + When *use_persistent_cache* is True, concept sets are stored in + ``_circe_codeset_cache`` keyed by SHA-256 checksum of the concept set + items. Cache misses are resolved and inserted. The per-cohort table + is then built by selecting from the cache -- identical concept sets + across cohorts share the same cache entry and need only one resolution. + + When *use_persistent_cache* is False and all concept sets are simple + (no descendant or mapped expansion), builds a lightweight memtable + directly without any vocabulary-table queries. """ - if not _needs_vocabulary_expansion(concept_sets): + checksum_map = _compute_checksum_map(concept_sets) + needs_vocab = _needs_vocabulary_expansion(concept_sets) + + # Fast path: all concept sets are simple, no persistent cache needed + if not needs_vocab and not use_persistent_cache: rows: list[dict[str, Any]] = [] for cid, cset in concept_sets.items(): for item in cset.items: @@ -460,15 +513,108 @@ def build_single_codeset_table( ) return _read_table(backend, table_name=batch_table_name, schema=results_schema) - return build_batch_codeset_table( - backend=backend, - concept_sets=concept_sets, - batch_table_name=batch_table_name, - results_schema=results_schema, - vocabulary_schema=vocabulary_schema, - use_persistent_cache=False, - temporary=True, + if use_persistent_cache: + # Persistent-cache path: resolve concept sets once into the cache, + # then build per-cohort table from the cache entries. 
+ + existing = _find_existing_checksums(backend, set(checksum_map.values()), results_schema) + + table_getter = _table_getter_from_backend(backend, vocabulary_schema or "") + + for cid, key in checksum_map.items(): + if key not in existing: + cset = concept_sets[int(cid)] + resolved = _resolve_codeset_ids( + cset, table_getter=table_getter, vocabulary_schema=vocabulary_schema + ) + if resolved: + _write_codeset_cache( + backend, + cache_key=key, + concept_ids=resolved, + schema=results_schema, + table_name=_CACHE_TABLE_NAME, + ) + + # Build per-cohort table from cache + cache_ref = _read_table(backend, table_name=_CACHE_TABLE_NAME, schema=results_schema) + + parts: list[Table] = [] + for cid, key in checksum_map.items(): + part = ( + cache_ref.filter(cache_ref.cache_key == key) + .select(cache_ref.concept_id.name(CONCEPT_ID)) + .mutate(codeset_id=ibis.literal(int(cid), type="int64")) + .select("codeset_id", CONCEPT_ID) + ) + parts.append(part) + + combined = _union_all_tables(parts) + _create_table_impl( + backend, + table_name=batch_table_name, + schema=results_schema, + obj=combined, + overwrite=True, + ) + return _read_table(backend, table_name=batch_table_name, schema=results_schema) + + # No cache, but some concept sets need vocabulary expansion + table_getter = _table_getter_from_backend(backend, vocabulary_schema or "") + + parts = [] + for cid, cset in concept_sets.items(): + if not cset.items: + continue + needs_vocab = any(item.include_descendants or item.include_mapped for item in cset.items) + if needs_vocab: + expr = _build_codeset_expression( + cset, table_getter=table_getter, vocabulary_schema=vocabulary_schema + ) + labeled = expr.mutate(codeset_id=ibis.literal(int(cid), type="int64")).select( + "codeset_id", CONCEPT_ID + ) + parts.append(labeled) + else: + for item in cset.items: + if not item.is_excluded and item.concept_id is not None: + rows = [{"codeset_id": int(cid), "concept_id": int(item.concept_id)}] + parts.append(ibis.memtable(rows, 
schema={"codeset_id": "int64", "concept_id": "int64"})) + + if not parts: + empty = ibis.memtable( + {"codeset_id": [], "concept_id": []}, + schema={"codeset_id": "int64", "concept_id": "int64"}, + ) + _create_table_impl( + backend, + table_name=batch_table_name, + schema=results_schema, + obj=empty, + overwrite=True, + temp=True, + ) + return _read_table(backend, table_name=batch_table_name, schema=results_schema) + + combined = _union_all_tables(parts) + _create_table_impl( + backend, + table_name=batch_table_name, + schema=results_schema, + obj=combined, + overwrite=True, + temp=True, ) + return _read_table(backend, table_name=batch_table_name, schema=results_schema) + + +def _needs_vocabulary_expansion(concept_sets: Mapping[int, NormalizedConceptSet]) -> bool: + """Return True if any concept set requires vocabulary-table queries.""" + for cset in concept_sets.values(): + for item in cset.items: + if item.include_descendants or item.include_mapped: + return True + return False def _read_table( From 072e5a99836600e2623801b21a2b8ded9c18a0fa Mon Sep 17 00:00:00 2001 From: Jamie Gilbert Date: Mon, 18 May 2026 12:50:55 -0700 Subject: [PATCH 23/53] concept set fixes --- circe/api.py | 35 +++++++ circe/cohort_definition_set/_generate.py | 2 +- circe/execution/__init__.py | 2 + circe/execution/ibis/codesets.py | 124 ++++++++++++++++------- 4 files changed, 127 insertions(+), 36 deletions(-) diff --git a/circe/api.py b/circe/api.py index 801f8fb..6208a1a 100644 --- a/circe/api.py +++ b/circe/api.py @@ -266,6 +266,41 @@ def write_cohort( ) +def resolve_concept_sets( + concept_sets: Any, + *, + backend: IbisBackendLike, + results_schema: Optional[str] = None, + vocabulary_schema: Optional[str] = None, +) -> set[str]: + """Resolve concept sets into the persistent ``_circe_codeset_cache`` table. + + Concept sets are keyed by SHA-256 checksum of their items. 
Cache hits + are skipped; cache misses are resolved via a single batch query per + vocabulary table and stored permanently. + + This is independent of cohort generation and can be called at any time + to pre-populate the cache. + + Args: + concept_sets: Mapping of ``codeset_id -> NormalizedConceptSet``. + backend: Ibis backend connection. + results_schema: Schema for the cache table. + vocabulary_schema: Schema for vocabulary tables. + + Returns: + Set of checksums that were resolved (cache hits excluded). + """ + from .execution import resolve_concept_sets as _resolve + + return _resolve( + concept_sets, + backend=backend, + results_schema=results_schema, + vocabulary_schema=vocabulary_schema, + ) + + def cohort_print_friendly( expression: CohortExpression, concept_sets: Optional[list[ConceptSet]] = None, diff --git a/circe/cohort_definition_set/_generate.py b/circe/cohort_definition_set/_generate.py index 99a3a5f..872fbe9 100644 --- a/circe/cohort_definition_set/_generate.py +++ b/circe/cohort_definition_set/_generate.py @@ -265,7 +265,7 @@ async def async_generate_cohort_set( # Clean up staging tables created by the materialized pipeline schema = results_schema or cdm_schema - for stage in ("primary", "qualified", "included", "ended"): + for stage in ("codesets", "primary", "qualified", "included", "ended"): with contextlib.suppress(Exception): backend.drop_table(f"__cg_{cohort.cohort_id}_{stage}", database=schema, force=True) diff --git a/circe/execution/__init__.py b/circe/execution/__init__.py index ba27df0..f5a97ba 100644 --- a/circe/execution/__init__.py +++ b/circe/execution/__init__.py @@ -13,10 +13,12 @@ UnsupportedCriterionError, UnsupportedFeatureError, ) +from .ibis.codesets import resolve_concept_sets __all__ = [ "build_cohort", "write_cohort", + "resolve_concept_sets", "apply_databricks_post_connect_workaround", "ExecutionError", "ExecutionNormalizationError", diff --git a/circe/execution/ibis/codesets.py b/circe/execution/ibis/codesets.py index 
042dfec..f259df7 100644 --- a/circe/execution/ibis/codesets.py +++ b/circe/execution/ibis/codesets.py @@ -3,6 +3,7 @@ import contextlib import hashlib import json +import logging from collections.abc import Callable, Mapping from typing import Any @@ -13,6 +14,9 @@ from ..plan.schema import CONCEPT_ID from ..typing import IbisBackendLike, Table from .operations import create_table as _create_table_impl +from .operations import insert_relation, table_exists + +logger = logging.getLogger(__name__) _CODESET_TABLE = "__cg_codesets" _CACHE_TABLE_NAME = "_circe_codeset_cache" @@ -433,25 +437,87 @@ def _find_existing_checksums( return set() -def _resolve_codeset_ids( - cset: NormalizedConceptSet, +def _populate_cache_batch( + backend: IbisBackendLike, + expression: Table, + schema: str | None, +) -> None: + """Insert a batch of concept set expansions into ``_circe_codeset_cache``. + + The expression must have ``(cache_key TEXT, concept_id INT64)`` columns. + If the cache table does not exist it is created; otherwise rows are + appended. An empty expression is silently skipped. + """ + if not table_exists(backend, table_name=_CACHE_TABLE_NAME, schema=schema): + _create_table_impl( + backend, + table_name=_CACHE_TABLE_NAME, + schema=schema, + obj=expression, + overwrite=False, + ) + else: + insert_relation( + expression, + backend=backend, + target_table=_CACHE_TABLE_NAME, + target_schema=schema, + ) + + +def resolve_concept_sets( + concept_sets: Mapping[int, NormalizedConceptSet], *, - table_getter: Callable[[str, str | None], Table], - vocabulary_schema: str | None, -) -> tuple[int, ...]: - """Resolve a concept set to concrete concept IDs. + backend: IbisBackendLike, + results_schema: str | None = None, + vocabulary_schema: str | None = None, +) -> set[str]: + """Resolve concept sets into ``_circe_codeset_cache``. 
+ + For each unique concept set (identified by SHA-256 checksum): + - Cache hit: skipped (already in ``_circe_codeset_cache``) + - Cache miss: resolved via a single bulk query (vocabulary-table joins + for descendants/mapped codes) and inserted into the cache. + + Returns the set of checksums that were resolved (cache misses). - Uses ``_build_codeset_expression`` which handles descendants, mapped - codes, and exclusions. The result is a tuple of concrete int IDs. + This function is idempotent and safe to call at any time, independent + of cohort generation. It never uses Python memory for resolved IDs -- + the resolution query runs directly on the backend. """ - expr = _build_codeset_expression(cset, table_getter=table_getter, vocabulary_schema=vocabulary_schema) + if not concept_sets: + return set() + + checksum_map = _compute_checksum_map(concept_sets) + existing = _find_existing_checksums(backend, set(checksum_map.values()), results_schema) + table_getter = _table_getter_from_backend(backend, vocabulary_schema or "") + + miss_parts: list[Table] = [] + resolved_keys: set[str] = set() + + for cid, key in checksum_map.items(): + if key in existing: + continue + try: + expr = _build_codeset_expression( + concept_sets[int(cid)], table_getter=table_getter, vocabulary_schema=vocabulary_schema + ) + labeled = expr.mutate(cache_key=ibis.literal(key, type="string")).select("cache_key", CONCEPT_ID) + miss_parts.append(labeled) + resolved_keys.add(key) + except Exception as exc: + logger.warning("Failed to build concept set resolution query for key %s: %s", key, exc) + + if not miss_parts: + return set() + + combined = _union_all_tables(miss_parts) try: - result = expr.execute() - return _extract_column(result, CONCEPT_ID) + _populate_cache_batch(backend, combined, results_schema) except Exception as exc: - raise CompilationError( - "Ibis executor compilation error: failed executing concept-set resolution query." 
- ) from exc + logger.warning("Failed to populate codeset cache: %s", exc) + + return resolved_keys def _compute_checksum_map( @@ -514,27 +580,15 @@ def build_single_codeset_table( return _read_table(backend, table_name=batch_table_name, schema=results_schema) if use_persistent_cache: - # Persistent-cache path: resolve concept sets once into the cache, - # then build per-cohort table from the cache entries. - - existing = _find_existing_checksums(backend, set(checksum_map.values()), results_schema) - - table_getter = _table_getter_from_backend(backend, vocabulary_schema or "") - - for cid, key in checksum_map.items(): - if key not in existing: - cset = concept_sets[int(cid)] - resolved = _resolve_codeset_ids( - cset, table_getter=table_getter, vocabulary_schema=vocabulary_schema - ) - if resolved: - _write_codeset_cache( - backend, - cache_key=key, - concept_ids=resolved, - schema=results_schema, - table_name=_CACHE_TABLE_NAME, - ) + # Resolve cache misses as a single bulk INSERT into _circe_codeset_cache. + # Cache hits are skipped. No per-concept-set round trips, no Python + # memory for resolved IDs. 
+ resolve_concept_sets( + concept_sets, + backend=backend, + results_schema=results_schema, + vocabulary_schema=vocabulary_schema, + ) # Build per-cohort table from cache cache_ref = _read_table(backend, table_name=_CACHE_TABLE_NAME, schema=results_schema) From e99f27f11791f7f9811b72b195a6592baa250c5a Mon Sep 17 00:00:00 2001 From: Jamie Gilbert Date: Mon, 18 May 2026 13:14:15 -0700 Subject: [PATCH 24/53] concept set cohort table names --- circe/api.py | 22 ++----- circe/cohort_definition_set/_generate.py | 16 +++-- circe/execution/api.py | 13 +++-- circe/execution/engine/cohort.py | 46 ++++++++++++--- circe/execution/ibis/codesets.py | 74 ++++++++++++++++-------- 5 files changed, 112 insertions(+), 59 deletions(-) diff --git a/circe/api.py b/circe/api.py index 6208a1a..42ff2fa 100644 --- a/circe/api.py +++ b/circe/api.py @@ -272,24 +272,13 @@ def resolve_concept_sets( backend: IbisBackendLike, results_schema: Optional[str] = None, vocabulary_schema: Optional[str] = None, + cohort_table: str = "cohort", ) -> set[str]: - """Resolve concept sets into the persistent ``_circe_codeset_cache`` table. + """Resolve concept sets into the persistent codeset cache table. - Concept sets are keyed by SHA-256 checksum of their items. Cache hits - are skipped; cache misses are resolved via a single batch query per - vocabulary table and stored permanently. - - This is independent of cohort generation and can be called at any time - to pre-populate the cache. - - Args: - concept_sets: Mapping of ``codeset_id -> NormalizedConceptSet``. - backend: Ibis backend connection. - results_schema: Schema for the cache table. - vocabulary_schema: Schema for vocabulary tables. - - Returns: - Set of checksums that were resolved (cache hits excluded). + The cache table name is derived from *cohort_table*. Cache hits are + skipped; cache misses are resolved via a single batch query and stored + permanently. Independent of cohort generation. 
""" from .execution import resolve_concept_sets as _resolve @@ -298,6 +287,7 @@ def resolve_concept_sets( backend=backend, results_schema=results_schema, vocabulary_schema=vocabulary_schema, + cohort_table=cohort_table, ) diff --git a/circe/cohort_definition_set/_generate.py b/circe/cohort_definition_set/_generate.py index 872fbe9..696d4ca 100644 --- a/circe/cohort_definition_set/_generate.py +++ b/circe/cohort_definition_set/_generate.py @@ -35,8 +35,8 @@ def _process_single_cohort( """Build and write a single cohort. Thread-safe via ``_backend_lock``. Each cohort uses its own per-cohort codeset table built from the - ``_circe_codeset_cache`` when *use_persistent_cache* is True, allowing - checksum-keyed concept set reuse across cohorts and runs. + codeset cache (named from *cohort_table*) when *use_persistent_cache* + is True, allowing checksum-keyed concept set reuse. Returns ``(start_time, end_time)`` of the database-materialization phase so the caller can compute execution duration. 
@@ -51,6 +51,7 @@ def _process_single_cohort( vocabulary_schema=vocabulary_schema, use_persistent_cache=use_persistent_cache, cohort_id=cohort.cohort_id, + cohort_table=cohort_table, ) projected = project_to_ohdsi_cohort_table(new_rows, cohort_id=cohort.cohort_id) write_cohort( @@ -76,7 +77,7 @@ async def async_generate_cohort_set( results_schema: str | None = None, vocabulary_schema: str | None = None, incremental: bool = False, - checksum_table: str = "cohort_checksum", + checksum_table: str | None = None, stop_on_error: bool = True, compile_timeout: float | None = None, ) -> list[CohortGenerationResult]: @@ -114,6 +115,9 @@ async def async_generate_cohort_set( _COMPILED_CORRELATED_EVENTS.clear() + if checksum_table is None: + checksum_table = f"{cohort_table}_checksum" + previous_checksums: dict[int, str] = {} if incremental: previous_checksums = await asyncio.to_thread( @@ -267,7 +271,9 @@ async def async_generate_cohort_set( schema = results_schema or cdm_schema for stage in ("codesets", "primary", "qualified", "included", "ended"): with contextlib.suppress(Exception): - backend.drop_table(f"__cg_{cohort.cohort_id}_{stage}", database=schema, force=True) + backend.drop_table( + f"__{cohort_table}_{cohort.cohort_id}_{stage}", database=schema, force=True + ) results.append( CohortGenerationResult( @@ -311,7 +317,7 @@ def generate_cohort_set( results_schema: str | None = None, vocabulary_schema: str | None = None, incremental: bool = False, - checksum_table: str = "cohort_checksum", + checksum_table: str | None = None, stop_on_error: bool = True, ) -> list[CohortGenerationResult]: """Generate all cohorts in a CohortDefinitionSet and write them to a shared table. 
diff --git a/circe/execution/api.py b/circe/execution/api.py index 2cb867b..a3c3e34 100644 --- a/circe/execution/api.py +++ b/circe/execution/api.py @@ -34,6 +34,7 @@ def build_cohort( cohort_id: int = 0, materialize: bool = True, codeset_table: Table | None = None, + cohort_table: str = "cohort", ) -> Table: """Normalize, compile, and assemble a cohort relation. @@ -44,9 +45,8 @@ def build_cohort( When *codeset_table* is provided (from a batch-generation caller), it is used directly. Otherwise a per-cohort codeset table is auto-created. - With *use_persistent_cache=True*, concept sets are stored in - ``_circe_codeset_cache`` keyed by SHA-256 checksum, enabling reuse - across cohorts and runs. + With *use_persistent_cache=True*, concept sets are cached in a table + named from *cohort_table* via ``_codeset_cache_table()``. """ maybe_apply_databricks_post_connect_workaround(backend) @@ -56,10 +56,11 @@ def build_cohort( codeset_table = build_single_codeset_table( backend=backend, concept_sets=normalized.concept_sets, - batch_table_name=f"__cg_{cohort_id}_codesets", + batch_table_name=f"__{cohort_table}_{cohort_id}_codesets", results_schema=results_schema, vocabulary_schema=vocabulary_schema, use_persistent_cache=use_persistent_cache, + cohort_table=cohort_table, ) ctx = make_execution_context( @@ -70,7 +71,9 @@ def build_cohort( codeset_table=codeset_table, ) - return build_cohort_table(normalized, ctx, cohort_id=cohort_id, materialize=materialize) + return build_cohort_table( + normalized, ctx, cohort_id=cohort_id, materialize=materialize, cohort_table=cohort_table + ) def write_relation( diff --git a/circe/execution/engine/cohort.py b/circe/execution/engine/cohort.py index 81825de..52a643b 100644 --- a/circe/execution/engine/cohort.py +++ b/circe/execution/engine/cohort.py @@ -24,6 +24,7 @@ def _materialize( cohort_id: int, stage: str, schema: str | None, + cohort_table: str = "cohort", ) -> Table: """Write *table* to a backend staging table and return a fresh 
reference. @@ -35,7 +36,7 @@ def _materialize( to the compiler shallow — each stage only builds on a simple ``DatabaseTable`` reference. """ - name = f"__cg_{cohort_id}_{stage}" + name = f"__{cohort_table}_{cohort_id}_{stage}" create_table(ctx.backend, table_name=name, schema=schema, obj=table, overwrite=True) return read_table(ctx.backend, table_name=name, schema=schema) @@ -44,10 +45,11 @@ def _drop_staging_tables( ctx: ExecutionContext, cohort_id: int, schema: str | None, + cohort_table: str = "cohort", ) -> None: """Remove all staging tables for *cohort_id* from the database.""" - for stage in ("primary", "qualified", "included", "ended"): - name = f"__cg_{cohort_id}_{stage}" + for stage in ("codesets", "primary", "qualified", "included", "ended"): + name = f"__{cohort_table}_{cohort_id}_{stage}" with contextlib.suppress(Exception): ctx.backend.drop_table(name, database=schema, force=True) @@ -58,6 +60,7 @@ def build_cohort_table( *, cohort_id: int = 0, materialize: bool = True, + cohort_table: str = "cohort", ) -> Table: primary_plans = tuple( PrimaryEventInput( @@ -80,7 +83,12 @@ def build_cohort_table( primary_events = build_primary_events(cohort_plan, ctx) if materialize: primary_events = _materialize( - primary_events, ctx=ctx, cohort_id=cohort_id, stage="primary", schema=schema + primary_events, + ctx=ctx, + cohort_id=cohort_id, + stage="primary", + schema=schema, + cohort_table=cohort_table, ) # ── Additional (correlated) criteria ──────────────────────────────── @@ -89,7 +97,12 @@ def build_cohort_table( qualified_events = apply_result_limit(qualified_events, cohort_plan.qualified_limit_type) if materialize: qualified_events = _materialize( - qualified_events, ctx=ctx, cohort_id=cohort_id, stage="qualified", schema=schema + qualified_events, + ctx=ctx, + cohort_id=cohort_id, + stage="qualified", + schema=schema, + cohort_table=cohort_table, ) # ── Inclusion rules ───────────────────────────────────────────────── @@ -103,7 +116,12 @@ def 
build_cohort_table( for rule in normalized.inclusion_rules: included_events = apply_additional_criteria(included_events, rule.expression, ctx) included_events = _materialize( - included_events, ctx=ctx, cohort_id=cohort_id, stage="included", schema=schema + included_events, + ctx=ctx, + cohort_id=cohort_id, + stage="included", + schema=schema, + cohort_table=cohort_table, ) included_events = apply_result_limit(included_events, cohort_plan.expression_limit_type) else: @@ -111,13 +129,25 @@ def build_cohort_table( included_events = apply_result_limit(included_events, cohort_plan.expression_limit_type) if materialize: included_events = _materialize( - included_events, ctx=ctx, cohort_id=cohort_id, stage="included", schema=schema + included_events, + ctx=ctx, + cohort_id=cohort_id, + stage="included", + schema=schema, + cohort_table=cohort_table, ) # ── End strategy ──────────────────────────────────────────────────── ended_events = apply_end_strategy(included_events, normalized.end_strategy, ctx) if materialize: - ended_events = _materialize(ended_events, ctx=ctx, cohort_id=cohort_id, stage="ended", schema=schema) + ended_events = _materialize( + ended_events, + ctx=ctx, + cohort_id=cohort_id, + stage="ended", + schema=schema, + cohort_table=cohort_table, + ) # ── Censoring + collapse (final stage — no materialize after) ────── censored_events = apply_censoring( diff --git a/circe/execution/ibis/codesets.py b/circe/execution/ibis/codesets.py index f259df7..99bf818 100644 --- a/circe/execution/ibis/codesets.py +++ b/circe/execution/ibis/codesets.py @@ -22,6 +22,20 @@ _CACHE_TABLE_NAME = "_circe_codeset_cache" +def _codeset_cache_table(cohort_table: str) -> str: + """Return the codeset cache table name derived from the cohort table name. + + Two cohort tables in the same schema get separate caches, avoiding + collisions when multiple CDMs share a results schema. 
+ """ + return f"_{cohort_table}_codeset_cache" + + +def _staging_table(cohort_table: str, cohort_id: int, stage: str) -> str: + """Return a staging table name derived from the cohort table and id.""" + return f"__{cohort_table}_{cohort_id}_{stage}" + + def _compute_cache_key(items: tuple[NormalizedConceptSetItem, ...]) -> str: """Deterministic SHA-256 hash of sorted concept set items.""" canonical = sorted( @@ -291,6 +305,7 @@ def build_batch_codeset_table( vocabulary_schema: str | None = None, use_persistent_cache: bool = False, temporary: bool = False, + cohort_table: str = "cohort", ) -> Table: """Populate a database table ``batch_table_name`` with all concept set IDs. @@ -300,11 +315,10 @@ def build_batch_codeset_table( concept_relationship -- the database engine performs the expansion. When *use_persistent_cache* is True, previously-resolved checksums are - loaded from ``_circe_codeset_cache`` so that already-expanded concept - sets skip the vocabulary-table queries. - - Returns an ibis Table reference to the batch table. + loaded from the codeset cache (named from *cohort_table*) so that + already-expanded concept sets skip the vocabulary-table queries. 
""" + cache_table_name = _codeset_cache_table(cohort_table) table_getter = _table_getter_from_backend(backend, vocabulary_schema or "") # Collect cache keys and check persistent cache @@ -316,7 +330,7 @@ def build_batch_codeset_table( continue key = _compute_cache_key(cset.items) cached = _read_codeset_cache( - backend, cache_key=key, schema=results_schema, table_name=_CACHE_TABLE_NAME + backend, cache_key=key, schema=results_schema, table_name=cache_table_name ) if cached is not None: cache_hits[cid] = list(cached) @@ -411,7 +425,7 @@ def build_batch_codeset_table( cache_key=key, concept_ids=cids, schema=results_schema, - table_name=_CACHE_TABLE_NAME, + table_name=cache_table_name, ) ref = _read_table(backend, table_name=batch_table_name, schema=results_schema) @@ -422,15 +436,16 @@ def _find_existing_checksums( backend: IbisBackendLike, checksums: set[str], schema: str | None, + cache_table_name: str = _CACHE_TABLE_NAME, ) -> set[str]: - """Return the subset of *checksums* that already exist in ``_circe_codeset_cache``.""" + """Return the subset of *checksums* that already exist in the cache table.""" if not checksums: return set() from .operations import table_exists - if not table_exists(backend, table_name=_CACHE_TABLE_NAME, schema=schema): + if not table_exists(backend, table_name=cache_table_name, schema=schema): return set() - tbl = _read_table(backend, table_name=_CACHE_TABLE_NAME, schema=schema) + tbl = _read_table(backend, table_name=cache_table_name, schema=schema) existing = tbl.filter(tbl.cache_key.isin(tuple(checksums))).select("cache_key").distinct().execute() if hasattr(existing, "columns"): return {str(v) for v in existing["cache_key"].tolist() if v is not None} @@ -441,17 +456,18 @@ def _populate_cache_batch( backend: IbisBackendLike, expression: Table, schema: str | None, + cache_table_name: str = _CACHE_TABLE_NAME, ) -> None: - """Insert a batch of concept set expansions into ``_circe_codeset_cache``. 
+ """Insert a batch of concept set expansions into the codeset cache table. The expression must have ``(cache_key TEXT, concept_id INT64)`` columns. If the cache table does not exist it is created; otherwise rows are appended. An empty expression is silently skipped. """ - if not table_exists(backend, table_name=_CACHE_TABLE_NAME, schema=schema): + if not table_exists(backend, table_name=cache_table_name, schema=schema): _create_table_impl( backend, - table_name=_CACHE_TABLE_NAME, + table_name=cache_table_name, schema=schema, obj=expression, overwrite=False, @@ -460,7 +476,7 @@ def _populate_cache_batch( insert_relation( expression, backend=backend, - target_table=_CACHE_TABLE_NAME, + target_table=cache_table_name, target_schema=schema, ) @@ -471,11 +487,16 @@ def resolve_concept_sets( backend: IbisBackendLike, results_schema: str | None = None, vocabulary_schema: str | None = None, + cohort_table: str = "cohort", ) -> set[str]: - """Resolve concept sets into ``_circe_codeset_cache``. + """Resolve concept sets into the persistent codeset cache table. + + The cache table name is derived from *cohort_table* via + ``_codeset_cache_table()`` so that separate cohort tables in the same + schema get separate caches. For each unique concept set (identified by SHA-256 checksum): - - Cache hit: skipped (already in ``_circe_codeset_cache``) + - Cache hit: skipped (already in the cache) - Cache miss: resolved via a single bulk query (vocabulary-table joins for descendants/mapped codes) and inserted into the cache. 
@@ -487,9 +508,9 @@ def resolve_concept_sets( """ if not concept_sets: return set() - + cache_table_name = _codeset_cache_table(cohort_table) checksum_map = _compute_checksum_map(concept_sets) - existing = _find_existing_checksums(backend, set(checksum_map.values()), results_schema) + existing = _find_existing_checksums(backend, set(checksum_map.values()), results_schema, cache_table_name) table_getter = _table_getter_from_backend(backend, vocabulary_schema or "") miss_parts: list[Table] = [] @@ -513,7 +534,7 @@ def resolve_concept_sets( combined = _union_all_tables(miss_parts) try: - _populate_cache_batch(backend, combined, results_schema) + _populate_cache_batch(backend, combined, results_schema, cache_table_name) except Exception as exc: logger.warning("Failed to populate codeset cache: %s", exc) @@ -539,14 +560,15 @@ def build_single_codeset_table( results_schema: str | None = None, vocabulary_schema: str | None = None, use_persistent_cache: bool = False, + cohort_table: str = "cohort", ) -> Table: """Build a codeset table for a single cohort. - When *use_persistent_cache* is True, concept sets are stored in - ``_circe_codeset_cache`` keyed by SHA-256 checksum of the concept set - items. Cache misses are resolved and inserted. The per-cohort table - is then built by selecting from the cache -- identical concept sets - across cohorts share the same cache entry and need only one resolution. + When *use_persistent_cache* is True, concept sets are stored in a + persistent cache table keyed by SHA-256 checksum. The cache table + name is derived from *cohort_table* via ``_codeset_cache_table()``. + Cache misses are resolved and inserted. The per-cohort table is + then built by selecting from the cache. 
When *use_persistent_cache* is False and all concept sets are simple (no descendant or mapped expansion), builds a lightweight memtable @@ -580,7 +602,7 @@ def build_single_codeset_table( return _read_table(backend, table_name=batch_table_name, schema=results_schema) if use_persistent_cache: - # Resolve cache misses as a single bulk INSERT into _circe_codeset_cache. + # Resolve cache misses as a single bulk INSERT into the codeset cache. # Cache hits are skipped. No per-concept-set round trips, no Python # memory for resolved IDs. resolve_concept_sets( @@ -588,10 +610,12 @@ def build_single_codeset_table( backend=backend, results_schema=results_schema, vocabulary_schema=vocabulary_schema, + cohort_table=cohort_table, ) # Build per-cohort table from cache - cache_ref = _read_table(backend, table_name=_CACHE_TABLE_NAME, schema=results_schema) + cache_name = _codeset_cache_table(cohort_table) + cache_ref = _read_table(backend, table_name=cache_name, schema=results_schema) parts: list[Table] = [] for cid, key in checksum_map.items(): From d00c78e1ab5570c6279a91bd6af07dffb915b939 Mon Sep 17 00:00:00 2001 From: Jamie Gilbert Date: Mon, 18 May 2026 13:35:56 -0700 Subject: [PATCH 25/53] concept set cache table creation up front --- circe/cohort_definition_set/_generate.py | 12 ++++ circe/execution/ibis/codesets.py | 71 +++++++++++++++++++++++- 2 files changed, 80 insertions(+), 3 deletions(-) diff --git a/circe/cohort_definition_set/_generate.py b/circe/cohort_definition_set/_generate.py index 696d4ca..aed7b40 100644 --- a/circe/cohort_definition_set/_generate.py +++ b/circe/cohort_definition_set/_generate.py @@ -10,6 +10,7 @@ from typing import TYPE_CHECKING, Literal from ..execution.api import build_cohort, write_cohort +from ..execution.ibis.codesets import ensure_codeset_cache from ..execution.ibis.materialize import project_to_ohdsi_cohort_table from ._checksum_store import load_checksums, upsert_generation_history from ._core import CohortDefinition, 
CohortDefinitionSet, CohortGenerationResult @@ -118,6 +119,17 @@ async def async_generate_cohort_set( if checksum_table is None: checksum_table = f"{cohort_table}_checksum" + # Ensure the persistent codeset cache table exists before any cohort + # processing, so that every cohort can INSERT/read from it without + # checking for existence on each call. + if incremental: + await asyncio.to_thread( + ensure_codeset_cache, + backend, + cohort_table=cohort_table, + results_schema=results_schema, + ) + previous_checksums: dict[int, str] = {} if incremental: previous_checksums = await asyncio.to_thread( diff --git a/circe/execution/ibis/codesets.py b/circe/execution/ibis/codesets.py index 99bf818..31f7c77 100644 --- a/circe/execution/ibis/codesets.py +++ b/circe/execution/ibis/codesets.py @@ -36,6 +36,39 @@ def _staging_table(cohort_table: str, cohort_id: int, stage: str) -> str: return f"__{cohort_table}_{cohort_id}_{stage}" +def ensure_codeset_cache(backend: IbisBackendLike, *, cohort_table: str, results_schema: str | None = None) -> None: + """Create the codeset cache table if it doesn't exist. + + The cache table name is derived from *cohort_table*. Creating it + up front avoids per-cohort ``table_exists`` checks and ensures the + table is always available for INSERT by ``resolve_concept_sets``. 
+ """ + cache_name = _codeset_cache_table(cohort_table) + has_schema = results_schema is not None + try: + if has_schema: + tables = backend.list_tables(database=results_schema) + else: + tables = backend.list_tables() + except Exception: + tables = None + + if tables is not None and cache_name in tables: + return + + empty = ibis.memtable( + {"cache_key": [], CONCEPT_ID: []}, + schema={"cache_key": "string", CONCEPT_ID: "int64"}, + ) + _create_table_impl( + backend, + table_name=cache_name, + schema=results_schema, + obj=empty, + overwrite=False, + ) + + def _compute_cache_key(items: tuple[NormalizedConceptSetItem, ...]) -> str: """Deterministic SHA-256 hash of sorted concept set items.""" canonical = sorted( @@ -462,7 +495,9 @@ def _populate_cache_batch( The expression must have ``(cache_key TEXT, concept_id INT64)`` columns. If the cache table does not exist it is created; otherwise rows are - appended. An empty expression is silently skipped. + appended. When the backend does not support ``insert`` (e.g. + Databricks), the table is recreated by UNION-ing existing cache data + with the new expression. 
""" if not table_exists(backend, table_name=cache_table_name, schema=schema): _create_table_impl( @@ -472,13 +507,29 @@ def _populate_cache_batch( obj=expression, overwrite=False, ) - else: + return + + try: insert_relation( expression, backend=backend, target_table=cache_table_name, target_schema=schema, ) + except Exception: + logger.info( + "Backend does not support insert for %s — recreating table via UNION", + cache_table_name, + ) + existing = _read_table(backend, table_name=cache_table_name, schema=schema) + merged = existing.union(expression, distinct=False) + _create_table_impl( + backend, + table_name=cache_table_name, + schema=schema, + obj=merged, + overwrite=True, + ) def resolve_concept_sets( @@ -613,8 +664,22 @@ def build_single_codeset_table( cohort_table=cohort_table, ) - # Build per-cohort table from cache + # Ensure cache table exists — if ALL concept sets were already cached + # and the cache table was somehow missing, create a placeholder. cache_name = _codeset_cache_table(cohort_table) + if not table_exists(backend, table_name=cache_name, schema=results_schema): + empty = ibis.memtable( + {"cache_key": [], CONCEPT_ID: []}, + schema={"cache_key": "string", CONCEPT_ID: "int64"}, + ) + _create_table_impl( + backend, + table_name=cache_name, + schema=results_schema, + obj=empty, + overwrite=False, + ) + cache_ref = _read_table(backend, table_name=cache_name, schema=results_schema) parts: list[Table] = [] From 817727d5729534de116924fc6e9214a069abc167 Mon Sep 17 00:00:00 2001 From: Jamie Gilbert Date: Mon, 18 May 2026 13:38:56 -0700 Subject: [PATCH 26/53] concept set cache table creation up front --- circe/execution/ibis/codesets.py | 62 +++++++++++--------------------- 1 file changed, 20 insertions(+), 42 deletions(-) diff --git a/circe/execution/ibis/codesets.py b/circe/execution/ibis/codesets.py index 31f7c77..83f817a 100644 --- a/circe/execution/ibis/codesets.py +++ b/circe/execution/ibis/codesets.py @@ -36,37 +36,29 @@ def 
_staging_table(cohort_table: str, cohort_id: int, stage: str) -> str: return f"__{cohort_table}_{cohort_id}_{stage}" -def ensure_codeset_cache(backend: IbisBackendLike, *, cohort_table: str, results_schema: str | None = None) -> None: +def ensure_codeset_cache( + backend: IbisBackendLike, *, cohort_table: str, results_schema: str | None = None +) -> None: """Create the codeset cache table if it doesn't exist. - The cache table name is derived from *cohort_table*. Creating it - up front avoids per-cohort ``table_exists`` checks and ensures the - table is always available for INSERT by ``resolve_concept_sets``. + The cache table name is derived from *cohort_table*. Created up front + so that every cohort can INSERT into it without checking existence. """ - cache_name = _codeset_cache_table(cohort_table) - has_schema = results_schema is not None - try: - if has_schema: - tables = backend.list_tables(database=results_schema) - else: - tables = backend.list_tables() - except Exception: - tables = None - - if tables is not None and cache_name in tables: - return + from contextlib import suppress - empty = ibis.memtable( - {"cache_key": [], CONCEPT_ID: []}, - schema={"cache_key": "string", CONCEPT_ID: "int64"}, - ) - _create_table_impl( - backend, - table_name=cache_name, - schema=results_schema, - obj=empty, - overwrite=False, - ) + cache_name = _codeset_cache_table(cohort_table) + with suppress(Exception): + empty = ibis.memtable( + {"cache_key": [], CONCEPT_ID: []}, + schema={"cache_key": "string", CONCEPT_ID: "int64"}, + ) + _create_table_impl( + backend, + table_name=cache_name, + schema=results_schema, + obj=empty, + overwrite=False, + ) def _compute_cache_key(items: tuple[NormalizedConceptSetItem, ...]) -> str: @@ -664,22 +656,8 @@ def build_single_codeset_table( cohort_table=cohort_table, ) - # Ensure cache table exists — if ALL concept sets were already cached - # and the cache table was somehow missing, create a placeholder. 
+ # Build per-cohort table from cache cache_name = _codeset_cache_table(cohort_table) - if not table_exists(backend, table_name=cache_name, schema=results_schema): - empty = ibis.memtable( - {"cache_key": [], CONCEPT_ID: []}, - schema={"cache_key": "string", CONCEPT_ID: "int64"}, - ) - _create_table_impl( - backend, - table_name=cache_name, - schema=results_schema, - obj=empty, - overwrite=False, - ) - cache_ref = _read_table(backend, table_name=cache_name, schema=results_schema) parts: list[Table] = [] From 8dcd3d2aadf675159fa26883272b75c70c87f2a1 Mon Sep 17 00:00:00 2001 From: Jamie Gilbert Date: Mon, 18 May 2026 13:43:10 -0700 Subject: [PATCH 27/53] concept set cache table creation up front --- circe/execution/ibis/codesets.py | 56 +++++++++++++------------------- 1 file changed, 22 insertions(+), 34 deletions(-) diff --git a/circe/execution/ibis/codesets.py b/circe/execution/ibis/codesets.py index 83f817a..8ffdc45 100644 --- a/circe/execution/ibis/codesets.py +++ b/circe/execution/ibis/codesets.py @@ -39,26 +39,28 @@ def _staging_table(cohort_table: str, cohort_id: int, stage: str) -> str: def ensure_codeset_cache( backend: IbisBackendLike, *, cohort_table: str, results_schema: str | None = None ) -> None: - """Create the codeset cache table if it doesn't exist. + """Create the codeset cache table with an empty schema if it doesn't exist. - The cache table name is derived from *cohort_table*. Created up front - so that every cohort can INSERT into it without checking existence. + Matches the pattern used by ``upsert_generation_history`` in the checksum + store: create with a simple ibis memtable, then insert data separately. 
""" - from contextlib import suppress + from ..ibis.operations import table_exists cache_name = _codeset_cache_table(cohort_table) - with suppress(Exception): - empty = ibis.memtable( - {"cache_key": [], CONCEPT_ID: []}, - schema={"cache_key": "string", CONCEPT_ID: "int64"}, - ) - _create_table_impl( - backend, - table_name=cache_name, - schema=results_schema, - obj=empty, - overwrite=False, - ) + if table_exists(backend, table_name=cache_name, schema=results_schema): + return + + empty = ibis.memtable( + {"cache_key": [], CONCEPT_ID: []}, + schema={"cache_key": "string", CONCEPT_ID: "int64"}, + ) + _create_table_impl( + backend, + table_name=cache_name, + schema=results_schema, + obj=empty, + overwrite=False, + ) def _compute_cache_key(items: tuple[NormalizedConceptSetItem, ...]) -> str: @@ -466,7 +468,6 @@ def _find_existing_checksums( """Return the subset of *checksums* that already exist in the cache table.""" if not checksums: return set() - from .operations import table_exists if not table_exists(backend, table_name=cache_table_name, schema=schema): return set() @@ -485,22 +486,10 @@ def _populate_cache_batch( ) -> None: """Insert a batch of concept set expansions into the codeset cache table. - The expression must have ``(cache_key TEXT, concept_id INT64)`` columns. - If the cache table does not exist it is created; otherwise rows are - appended. When the backend does not support ``insert`` (e.g. - Databricks), the table is recreated by UNION-ing existing cache data - with the new expression. + The table must already exist (created by :func:`ensure_codeset_cache`). + When the backend does not support ``insert``, the table is recreated + by UNION-ing existing cache data with the new expression. 
""" - if not table_exists(backend, table_name=cache_table_name, schema=schema): - _create_table_impl( - backend, - table_name=cache_table_name, - schema=schema, - obj=expression, - overwrite=False, - ) - return - try: insert_relation( expression, @@ -772,7 +761,6 @@ def _read_codeset_cache( table_name: str, ) -> tuple[int, ...] | None: """Read cached concept IDs for a cache key from the persistent cache table.""" - from .operations import table_exists try: if not table_exists(backend, table_name=table_name, schema=schema): @@ -794,7 +782,7 @@ def _write_codeset_cache( table_name: str, ) -> None: """Persist resolved concept IDs to the cache table.""" - from .operations import insert_relation, table_exists + from .operations import insert_relation if not concept_ids: return From ba4825b787e4e76a7ff6ca48d4764269bbea91ba Mon Sep 17 00:00:00 2001 From: Jamie Gilbert Date: Mon, 18 May 2026 13:45:45 -0700 Subject: [PATCH 28/53] concept set cache table creation --- circe/execution/ibis/codesets.py | 34 ++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/circe/execution/ibis/codesets.py b/circe/execution/ibis/codesets.py index 8ffdc45..8b2c16a 100644 --- a/circe/execution/ibis/codesets.py +++ b/circe/execution/ibis/codesets.py @@ -39,28 +39,32 @@ def _staging_table(cohort_table: str, cohort_id: int, stage: str) -> str: def ensure_codeset_cache( backend: IbisBackendLike, *, cohort_table: str, results_schema: str | None = None ) -> None: - """Create the codeset cache table with an empty schema if it doesn't exist. - - Matches the pattern used by ``upsert_generation_history`` in the checksum - store: create with a simple ibis memtable, then insert data separately. + """Create the codeset cache table with columns defined by schema. + + Uses a plain ``CREATE TABLE`` with an ibis schema -- no memtable or + local file operations, so it works on Databricks without + ``staging_allowed_local_path`` constraints. 
""" from ..ibis.operations import table_exists + import ibis.expr.datatypes as dt cache_name = _codeset_cache_table(cohort_table) if table_exists(backend, table_name=cache_name, schema=results_schema): return - empty = ibis.memtable( - {"cache_key": [], CONCEPT_ID: []}, - schema={"cache_key": "string", CONCEPT_ID: "int64"}, - ) - _create_table_impl( - backend, - table_name=cache_name, - schema=results_schema, - obj=empty, - overwrite=False, - ) + schema = ibis.schema({"cache_key": dt.string, "concept_id": dt.int64}) + try: + if results_schema is not None: + backend.create_table(cache_name, schema=schema, database=results_schema, overwrite=False) + else: + backend.create_table(cache_name, schema=schema, overwrite=False) + except Exception: + # Fallback for backends that require an ``obj`` parameter + empty = ibis.memtable( + {"cache_key": [], "concept_id": []}, + schema={"cache_key": "string", "concept_id": "int64"}, + ) + backend.create_table(cache_name, obj=empty, database=results_schema, overwrite=False) def _compute_cache_key(items: tuple[NormalizedConceptSetItem, ...]) -> str: From e7cf48775d1a4c8fe328c5e40a9d273184ab1eb5 Mon Sep 17 00:00:00 2001 From: Jamie Gilbert Date: Mon, 18 May 2026 14:53:04 -0700 Subject: [PATCH 29/53] Removed ill-faited concept set cache - need to refine design better --- circe/api.py | 27 - circe/cohort_definition_set/_generate.py | 20 +- circe/execution/__init__.py | 2 - circe/execution/api.py | 9 +- circe/execution/ibis/codesets.py | 681 +++--------------- .../test_codesets_persistent_cache.py | 60 +- 6 files changed, 90 insertions(+), 709 deletions(-) diff --git a/circe/api.py b/circe/api.py index 42ff2fa..1279fc7 100644 --- a/circe/api.py +++ b/circe/api.py @@ -266,31 +266,6 @@ def write_cohort( ) -def resolve_concept_sets( - concept_sets: Any, - *, - backend: IbisBackendLike, - results_schema: Optional[str] = None, - vocabulary_schema: Optional[str] = None, - cohort_table: str = "cohort", -) -> set[str]: - """Resolve concept sets 
into the persistent codeset cache table. - - The cache table name is derived from *cohort_table*. Cache hits are - skipped; cache misses are resolved via a single batch query and stored - permanently. Independent of cohort generation. - """ - from .execution import resolve_concept_sets as _resolve - - return _resolve( - concept_sets, - backend=backend, - results_schema=results_schema, - vocabulary_schema=vocabulary_schema, - cohort_table=cohort_table, - ) - - def cohort_print_friendly( expression: CohortExpression, concept_sets: Optional[list[ConceptSet]] = None, @@ -299,8 +274,6 @@ def cohort_print_friendly( ) -> str: """Generate human-readable Markdown from a cohort expression. - This is equivalent to R CirceR's `cohortPrintFriendly()` function. - Args: expression: CohortExpression instance concept_sets: Optional list of concept sets (uses expression.concept_sets if None) diff --git a/circe/cohort_definition_set/_generate.py b/circe/cohort_definition_set/_generate.py index aed7b40..0c2f502 100644 --- a/circe/cohort_definition_set/_generate.py +++ b/circe/cohort_definition_set/_generate.py @@ -10,7 +10,6 @@ from typing import TYPE_CHECKING, Literal from ..execution.api import build_cohort, write_cohort -from ..execution.ibis.codesets import ensure_codeset_cache from ..execution.ibis.materialize import project_to_ohdsi_cohort_table from ._checksum_store import load_checksums, upsert_generation_history from ._core import CohortDefinition, CohortDefinitionSet, CohortGenerationResult @@ -31,13 +30,11 @@ def _process_single_cohort( results_schema: str | None, vocabulary_schema: str | None, cohort_table: str, - use_persistent_cache: bool, ) -> tuple[datetime, datetime]: """Build and write a single cohort. Thread-safe via ``_backend_lock``. - Each cohort uses its own per-cohort codeset table built from the - codeset cache (named from *cohort_table*) when *use_persistent_cache* - is True, allowing checksum-keyed concept set reuse. 
+ Each cohort gets its own per-cohort codeset temp table populated and + dropped as it runs, mirroring the Java ``#Codesets`` pattern. Returns ``(start_time, end_time)`` of the database-materialization phase so the caller can compute execution duration. @@ -50,7 +47,6 @@ def _process_single_cohort( cdm_schema=cdm_schema, results_schema=results_schema, vocabulary_schema=vocabulary_schema, - use_persistent_cache=use_persistent_cache, cohort_id=cohort.cohort_id, cohort_table=cohort_table, ) @@ -119,17 +115,6 @@ async def async_generate_cohort_set( if checksum_table is None: checksum_table = f"{cohort_table}_checksum" - # Ensure the persistent codeset cache table exists before any cohort - # processing, so that every cohort can INSERT/read from it without - # checking for existence on each call. - if incremental: - await asyncio.to_thread( - ensure_codeset_cache, - backend, - cohort_table=cohort_table, - results_schema=results_schema, - ) - previous_checksums: dict[int, str] = {} if incremental: previous_checksums = await asyncio.to_thread( @@ -186,7 +171,6 @@ async def async_generate_cohort_set( results_schema=results_schema, vocabulary_schema=vocabulary_schema, cohort_table=cohort_table, - use_persistent_cache=incremental, ), timeout=compile_timeout, ) diff --git a/circe/execution/__init__.py b/circe/execution/__init__.py index f5a97ba..ba27df0 100644 --- a/circe/execution/__init__.py +++ b/circe/execution/__init__.py @@ -13,12 +13,10 @@ UnsupportedCriterionError, UnsupportedFeatureError, ) -from .ibis.codesets import resolve_concept_sets __all__ = [ "build_cohort", "write_cohort", - "resolve_concept_sets", "apply_databricks_post_connect_workaround", "ExecutionError", "ExecutionNormalizationError", diff --git a/circe/execution/api.py b/circe/execution/api.py index a3c3e34..6f95f5b 100644 --- a/circe/execution/api.py +++ b/circe/execution/api.py @@ -30,7 +30,6 @@ def build_cohort( cdm_schema: str, results_schema: str | None = None, vocabulary_schema: str | None = 
None, - use_persistent_cache: bool = False, cohort_id: int = 0, materialize: bool = True, codeset_table: Table | None = None, @@ -44,9 +43,7 @@ def build_cohort( (e.g. unit tests that only verify the expression tree can be built). When *codeset_table* is provided (from a batch-generation caller), it is - used directly. Otherwise a per-cohort codeset table is auto-created. - With *use_persistent_cache=True*, concept sets are cached in a table - named from *cohort_table* via ``_codeset_cache_table()``. + used directly. Otherwise a per-cohort codeset temp table is auto-created. """ maybe_apply_databricks_post_connect_workaround(backend) @@ -59,8 +56,6 @@ def build_cohort( batch_table_name=f"__{cohort_table}_{cohort_id}_codesets", results_schema=results_schema, vocabulary_schema=vocabulary_schema, - use_persistent_cache=use_persistent_cache, - cohort_table=cohort_table, ) ctx = make_execution_context( @@ -125,7 +120,6 @@ def write_cohort( results_schema: str | None = None, vocabulary_schema: str | None = None, if_exists: Literal["fail", "replace"] = "fail", - use_persistent_cache: bool = False, ) -> None: """Build cohort rows and materialize them with cohort-scoped semantics. 
@@ -167,7 +161,6 @@ def write_cohort( cdm_schema=cdm_schema, results_schema=results_schema, vocabulary_schema=vocabulary_schema, - use_persistent_cache=use_persistent_cache, cohort_id=cohort_id, ) new_rows = project_to_ohdsi_cohort_table(new_rows, cohort_id=cohort_id) diff --git a/circe/execution/ibis/codesets.py b/circe/execution/ibis/codesets.py index 8b2c16a..0c3a44c 100644 --- a/circe/execution/ibis/codesets.py +++ b/circe/execution/ibis/codesets.py @@ -1,79 +1,16 @@ from __future__ import annotations import contextlib -import hashlib -import json -import logging from collections.abc import Callable, Mapping from typing import Any import ibis from ..errors import CompilationError -from ..normalize.cohort import NormalizedConceptSet, NormalizedConceptSetItem +from ..normalize.cohort import NormalizedConceptSet from ..plan.schema import CONCEPT_ID from ..typing import IbisBackendLike, Table from .operations import create_table as _create_table_impl -from .operations import insert_relation, table_exists - -logger = logging.getLogger(__name__) - -_CODESET_TABLE = "__cg_codesets" -_CACHE_TABLE_NAME = "_circe_codeset_cache" - - -def _codeset_cache_table(cohort_table: str) -> str: - """Return the codeset cache table name derived from the cohort table name. - - Two cohort tables in the same schema get separate caches, avoiding - collisions when multiple CDMs share a results schema. - """ - return f"_{cohort_table}_codeset_cache" - - -def _staging_table(cohort_table: str, cohort_id: int, stage: str) -> str: - """Return a staging table name derived from the cohort table and id.""" - return f"__{cohort_table}_{cohort_id}_{stage}" - - -def ensure_codeset_cache( - backend: IbisBackendLike, *, cohort_table: str, results_schema: str | None = None -) -> None: - """Create the codeset cache table with columns defined by schema. 
- - Uses a plain ``CREATE TABLE`` with an ibis schema -- no memtable or - local file operations, so it works on Databricks without - ``staging_allowed_local_path`` constraints. - """ - from ..ibis.operations import table_exists - import ibis.expr.datatypes as dt - - cache_name = _codeset_cache_table(cohort_table) - if table_exists(backend, table_name=cache_name, schema=results_schema): - return - - schema = ibis.schema({"cache_key": dt.string, "concept_id": dt.int64}) - try: - if results_schema is not None: - backend.create_table(cache_name, schema=schema, database=results_schema, overwrite=False) - else: - backend.create_table(cache_name, schema=schema, overwrite=False) - except Exception: - # Fallback for backends that require an ``obj`` parameter - empty = ibis.memtable( - {"cache_key": [], "concept_id": []}, - schema={"cache_key": "string", "concept_id": "int64"}, - ) - backend.create_table(cache_name, obj=empty, database=results_schema, overwrite=False) - - -def _compute_cache_key(items: tuple[NormalizedConceptSetItem, ...]) -> str: - """Deterministic SHA-256 hash of sorted concept set items.""" - canonical = sorted( - (item.concept_id, item.is_excluded, item.include_descendants, item.include_mapped) for item in items - ) - payload = json.dumps(canonical, separators=(",", ":")) - return hashlib.sha256(payload.encode("utf-8")).hexdigest() def _vocabulary_table( @@ -96,13 +33,6 @@ def _descendant_expression( table_getter: Callable[[str, str | None], Table], vocabulary_schema: str | None, ) -> Table: - """Build lazy ibis expression for descendant concept IDs of given ancestors. - - SELECT descendant_concept_id - FROM concept c - JOIN concept_ancestor ca ON c.concept_id = ca.descendant_concept_id - WHERE ca.ancestor_concept_id IN (...) 
AND c.invalid_reason IS NULL - """ concept = _vocabulary_table("concept", vocabulary_schema=vocabulary_schema, table_getter=table_getter) concept_ancestor = _vocabulary_table( "concept_ancestor", vocabulary_schema=vocabulary_schema, table_getter=table_getter @@ -122,14 +52,6 @@ def _mapped_expression( table_getter: Callable[[str, str | None], Table], vocabulary_schema: str | None, ) -> Table: - """Build lazy ibis expression for mapped-to concept IDs. - - SELECT DISTINCT cr.concept_id_1 AS concept_id - FROM concept_relationship cr - WHERE cr.concept_id_2 IN (...) - AND cr.relationship_id = 'Maps to' - AND cr.invalid_reason IS NULL - """ concept_relationship = _vocabulary_table( "concept_relationship", vocabulary_schema=vocabulary_schema, table_getter=table_getter ) @@ -142,92 +64,18 @@ def _mapped_expression( ) -def build_concept_set_expression( +def _build_codeset_expression( concept_set: NormalizedConceptSet, *, table_getter: Callable[[str, str | None], Table], vocabulary_schema: str | None, ) -> Table: - """Build a lazy ibis Table expression that resolves all concept IDs for one concept set. + """Build a lazy ibis expression for a concept set with include/exclude logic. - The returned ibis expression is never executed at build time -- it becomes - a subquery embedded in the final cohort SQL. The database engine performs - the concept-ancestor and concept-relationship joins at execution time. + Handles descendants, mapped codes, and exclusions via ibis JOINs. + The database engine performs the expansion at execution time. """ include_parts: list[Table] = [] - exclude_ids: list[int] = [] - - for item in concept_set.items: - if item.concept_id is None: - continue - - direct: tuple[int, ...] 
= (int(item.concept_id),) - - if item.include_descendants: - desc = _descendant_expression( - direct, table_getter=table_getter, vocabulary_schema=vocabulary_schema - ) - else: - desc = ibis.memtable({"concept_id": [int(item.concept_id)]}, schema={"concept_id": "int64"}) - # For exclude items without descendants, just track the concept id - if item.is_excluded: - exclude_ids.append(int(item.concept_id)) - continue - - if item.include_mapped: - mapped = _mapped_expression( - direct, table_getter=table_getter, vocabulary_schema=vocabulary_schema - ) - if item.include_descendants: - # Need mapped for both direct AND descendants - desc_mapped = _mapped_expression( - direct, table_getter=table_getter, vocabulary_schema=vocabulary_schema - ) - full = _union_all_tables([desc, desc_mapped]) - else: - full = _union_all_tables([desc, mapped]) - else: - full = desc - - if item.is_excluded: - if item.include_descendants or item.include_mapped: - # Complex exclude with expansion needs anti-join - include_parts.append(full) - exclude_ids.append(None) # marker for complex exclude - else: - exclude_ids.append(int(item.concept_id)) - else: - include_parts.append(full) - - # Build include part - if not include_parts: - if not exclude_ids: - return ibis.memtable({"concept_id": []}, schema={"concept_id": "int64"}) - # Only simple excludes -- just exclude those IDs from everything - concept = _vocabulary_table("concept", vocabulary_schema=vocabulary_schema, table_getter=table_getter) - return concept.filter( - ~concept.concept_id.isin(ibis.literal(list(exclude_ids), type="array")) - ).select(concept.concept_id.name(CONCEPT_ID)) - - # Actually, let me reconsider the exclude handling. Simple excludes (plain IDs with no - # descendants/mapped) can be handled via anti-join after the include union. - # Complex excludes (with descendants/mapped) need to be treated as included items - # that are then excluded via anti-join. 
- # For simplicity and correctness, let me handle excludes uniformly via anti-join. - - return _build_codeset_expression( - concept_set, table_getter=table_getter, vocabulary_schema=vocabulary_schema - ) - - -def _build_codeset_expression( - concept_set: NormalizedConceptSet, - *, - table_getter: Callable[[str, str | None], Table], - vocabulary_schema: str | None, -) -> Table: - """Build lazy ibis expression for a concept set with include/exclude logic.""" - include_parts: list[Table] = [] exclude_parts: list[Table] = [] for item in concept_set.items: @@ -236,7 +84,6 @@ def _build_codeset_expression( direct: tuple[int, ...] = (int(item.concept_id),) - # Build base expression for this item if item.include_descendants: desc = _descendant_expression( direct, table_getter=table_getter, vocabulary_schema=vocabulary_schema @@ -264,7 +111,6 @@ def _build_codeset_expression( if not include_parts: return ibis.memtable({"concept_id": []}, schema={"concept_id": "int64"}) - # Union all include parts if len(include_parts) == 1: result = include_parts[0] else: @@ -274,7 +120,6 @@ def _build_codeset_expression( result = result.distinct() - # Anti-join excluded concepts for e in exclude_parts: marked = e.mutate(_cm=ibis.literal(1, type="int64")) result = result.join(marked, result.concept_id == marked.concept_id, how="left") @@ -284,12 +129,7 @@ def _build_codeset_expression( def _union_all_tables(tables: list[Table]) -> Table: - """Union multiple single-column ibis tables using binary-tree merge. - - Binary-tree merge caps expression-tree depth at O(log n) instead of - O(n), avoiding deeply nested UNION ALL chains for large numbers of - tables (e.g. 100+ concept sets). 
- """ + """Union multiple single-column ibis tables using binary-tree merge.""" if not tables: raise ValueError("_union_all_tables requires at least one table") if len(tables) == 1: @@ -300,22 +140,37 @@ def _union_all_tables(tables: list[Table]) -> Table: return left.union(right, distinct=False) -def _drop_table( +def _needs_vocabulary_expansion(concept_sets: Mapping[int, NormalizedConceptSet]) -> bool: + for cset in concept_sets.values(): + for item in cset.items: + if item.include_descendants or item.include_mapped: + return True + return False + + +def build_batch_codeset_table( + *, backend: IbisBackendLike, - table_name: str, - schema: str | None, -) -> None: - """Safely drop a backend table.""" - with contextlib.suppress(Exception): - backend.drop_table(table_name, database=schema, force=True) + concept_sets: Mapping[int, NormalizedConceptSet], + batch_table_name: str, + results_schema: str | None = None, + vocabulary_schema: str | None = None, + temporary: bool = False, +) -> Table: + """Build a ``(codeset_id, concept_id)`` table from multiple concept sets.""" + return build_single_codeset_table( + backend=backend, + concept_sets=concept_sets, + batch_table_name=batch_table_name, + results_schema=results_schema, + vocabulary_schema=vocabulary_schema, + ) def _table_getter_from_backend( backend: IbisBackendLike, schema: str, ) -> Callable[[str, str | None], Table]: - """Build a table_getter callable from an ibis backend.""" - def _getter(table_name: str, table_schema: str | None) -> Table: try: if table_schema is not None: @@ -327,361 +182,90 @@ def _getter(table_name: str, table_schema: str | None) -> Table: return _getter -def build_batch_codeset_table( - *, +def _read_table( backend: IbisBackendLike, - concept_sets: Mapping[int, NormalizedConceptSet], - batch_table_name: str = _CODESET_TABLE, - results_schema: str | None = None, - vocabulary_schema: str | None = None, - use_persistent_cache: bool = False, - temporary: bool = False, - cohort_table: str = 
"cohort", + *, + table_name: str, + schema: str | None, ) -> Table: - """Populate a database table ``batch_table_name`` with all concept set IDs. - - The table has schema ``(codeset_id INT64, concept_id INT64)`` and is - overwritten each call. Each concept set in *concept_sets* is resolved - via lazy ibis expressions that join concept_ancestor and - concept_relationship -- the database engine performs the expansion. + try: + if schema is not None: + return backend.table(table_name, database=schema) + except TypeError: + pass + return backend.table(table_name) - When *use_persistent_cache* is True, previously-resolved checksums are - loaded from the codeset cache (named from *cohort_table*) so that - already-expanded concept sets skip the vocabulary-table queries. - """ - cache_table_name = _codeset_cache_table(cohort_table) - table_getter = _table_getter_from_backend(backend, vocabulary_schema or "") - # Collect cache keys and check persistent cache - cache_hits: dict[int, list[int]] = {} - uncached: list[tuple[int, NormalizedConceptSet]] = [] - if use_persistent_cache: - for cid, cset in concept_sets.items(): - if not cset.items: - continue - key = _compute_cache_key(cset.items) - cached = _read_codeset_cache( - backend, cache_key=key, schema=results_schema, table_name=cache_table_name - ) - if cached is not None: - cache_hits[cid] = list(cached) - else: - uncached.append((cid, cset)) +def _extract_column(result: Any, col_name: str) -> tuple[int, ...]: + if hasattr(result, "columns"): + values = result[col_name].tolist() if col_name in result.columns else result.iloc[:, 0].tolist() + elif isinstance(result, (list, tuple, set)): + values = list(result) else: - uncached = list(concept_sets.items()) - - # Build the full batch query - parts: list[Table] = [] - - # Cache hits: just use the cached IDs - for cid, ids in cache_hits.items(): - if ids: - tbl = ibis.memtable( - {"codeset_id": [cid] * len(ids), "concept_id": ids}, - schema={"codeset_id": "int64", 
"concept_id": "int64"}, - ) - parts.append(tbl) - - # Split uncached into simple (direct IDs only) and complex (needs expansion) - simple_rows: list[dict[str, Any]] = [] - complex_csets: list[tuple[int, NormalizedConceptSet]] = [] - for cid, cset in uncached: - if not cset.items: - continue - if _needs_vocabulary_expansion({cid: cset}): - complex_csets.append((cid, cset)) - else: - for item in cset.items: - if not item.is_excluded and item.concept_id is not None: - simple_rows.append({"codeset_id": int(cid), "concept_id": int(item.concept_id)}) - - # Batch all simple IDs into one memtable - if simple_rows: - parts.append( - ibis.memtable( - simple_rows, - schema={"codeset_id": "int64", "concept_id": "int64"}, - ) - ) - - # Complex concept sets: build lazy expansion expressions - for cid, cset in complex_csets: - expr = _build_codeset_expression(cset, table_getter=table_getter, vocabulary_schema=vocabulary_schema) - labeled = expr.mutate(codeset_id=ibis.literal(cid, type="int64")).select("codeset_id", CONCEPT_ID) - parts.append(labeled) - - if not parts: - # No concept sets at all -- create empty table - empty = ibis.memtable( - {"codeset_id": [], "concept_id": []}, - schema={"codeset_id": "int64", "concept_id": "int64"}, - ) - _create_table_impl( - backend, - table_name=batch_table_name, - schema=results_schema, - obj=empty, - overwrite=True, - temp=temporary, - ) - return _read_table(backend, table_name=batch_table_name, schema=results_schema) - - # Union all parts and materialize - combined = _union_all_tables(parts) + values = [result] if result is not None else [] + return tuple(int(v) for v in values if v is not None) - _create_table_impl( - backend, - table_name=batch_table_name, - schema=results_schema, - obj=combined, - overwrite=True, - temp=temporary, - ) - # Write newly-resolved concept sets to persistent cache - if use_persistent_cache: - for cid, cset in uncached: - if not cset.items: - continue - key = _compute_cache_key(cset.items) - if key in 
cache_hits: - continue - # Read back from the table to get resolved IDs for this codeset - ref = _read_table(backend, table_name=batch_table_name, schema=results_schema) - resolved = ref.filter(ref.codeset_id == cid).select(CONCEPT_ID).distinct().execute() - cids = _extract_column(resolved, CONCEPT_ID) - if cids: - _write_codeset_cache( - backend, - cache_key=key, - concept_ids=cids, - schema=results_schema, - table_name=cache_table_name, - ) - - ref = _read_table(backend, table_name=batch_table_name, schema=results_schema) - return ref - - -def _find_existing_checksums( - backend: IbisBackendLike, - checksums: set[str], - schema: str | None, - cache_table_name: str = _CACHE_TABLE_NAME, -) -> set[str]: - """Return the subset of *checksums* that already exist in the cache table.""" - if not checksums: - return set() - - if not table_exists(backend, table_name=cache_table_name, schema=schema): - return set() - tbl = _read_table(backend, table_name=cache_table_name, schema=schema) - existing = tbl.filter(tbl.cache_key.isin(tuple(checksums))).select("cache_key").distinct().execute() - if hasattr(existing, "columns"): - return {str(v) for v in existing["cache_key"].tolist() if v is not None} - return set() - - -def _populate_cache_batch( +def _drop_table( backend: IbisBackendLike, - expression: Table, + table_name: str, schema: str | None, - cache_table_name: str = _CACHE_TABLE_NAME, ) -> None: - """Insert a batch of concept set expansions into the codeset cache table. - - The table must already exist (created by :func:`ensure_codeset_cache`). - When the backend does not support ``insert``, the table is recreated - by UNION-ing existing cache data with the new expression. 
- """ - try: - insert_relation( - expression, - backend=backend, - target_table=cache_table_name, - target_schema=schema, - ) - except Exception: - logger.info( - "Backend does not support insert for %s — recreating table via UNION", - cache_table_name, - ) - existing = _read_table(backend, table_name=cache_table_name, schema=schema) - merged = existing.union(expression, distinct=False) - _create_table_impl( - backend, - table_name=cache_table_name, - schema=schema, - obj=merged, - overwrite=True, - ) - - -def resolve_concept_sets( - concept_sets: Mapping[int, NormalizedConceptSet], - *, - backend: IbisBackendLike, - results_schema: str | None = None, - vocabulary_schema: str | None = None, - cohort_table: str = "cohort", -) -> set[str]: - """Resolve concept sets into the persistent codeset cache table. - - The cache table name is derived from *cohort_table* via - ``_codeset_cache_table()`` so that separate cohort tables in the same - schema get separate caches. - - For each unique concept set (identified by SHA-256 checksum): - - Cache hit: skipped (already in the cache) - - Cache miss: resolved via a single bulk query (vocabulary-table joins - for descendants/mapped codes) and inserted into the cache. - - Returns the set of checksums that were resolved (cache misses). - - This function is idempotent and safe to call at any time, independent - of cohort generation. It never uses Python memory for resolved IDs -- - the resolution query runs directly on the backend. 
- """ - if not concept_sets: - return set() - cache_table_name = _codeset_cache_table(cohort_table) - checksum_map = _compute_checksum_map(concept_sets) - existing = _find_existing_checksums(backend, set(checksum_map.values()), results_schema, cache_table_name) - table_getter = _table_getter_from_backend(backend, vocabulary_schema or "") - - miss_parts: list[Table] = [] - resolved_keys: set[str] = set() - - for cid, key in checksum_map.items(): - if key in existing: - continue - try: - expr = _build_codeset_expression( - concept_sets[int(cid)], table_getter=table_getter, vocabulary_schema=vocabulary_schema - ) - labeled = expr.mutate(cache_key=ibis.literal(key, type="string")).select("cache_key", CONCEPT_ID) - miss_parts.append(labeled) - resolved_keys.add(key) - except Exception as exc: - logger.warning("Failed to build concept set resolution query for key %s: %s", key, exc) - - if not miss_parts: - return set() - - combined = _union_all_tables(miss_parts) - try: - _populate_cache_batch(backend, combined, results_schema, cache_table_name) - except Exception as exc: - logger.warning("Failed to populate codeset cache: %s", exc) - - return resolved_keys - - -def _compute_checksum_map( - concept_sets: Mapping[int, NormalizedConceptSet], -) -> dict[int, str]: - """Return a dict mapping ``codeset_id -> cache_key`` for each concept set.""" - result: dict[int, str] = {} - for cid, cset in concept_sets.items(): - if cset.items: - result[int(cid)] = _compute_cache_key(cset.items) - return result + with contextlib.suppress(Exception): + backend.drop_table(table_name, database=schema, force=True) def build_single_codeset_table( *, backend: IbisBackendLike, concept_sets: Mapping[int, NormalizedConceptSet], - batch_table_name: str = _CODESET_TABLE, + batch_table_name: str, results_schema: str | None = None, vocabulary_schema: str | None = None, - use_persistent_cache: bool = False, - cohort_table: str = "cohort", ) -> Table: - """Build a codeset table for a single cohort. 
+ """Build a per-cohort codeset temp table ``(codeset_id, concept_id)``. - When *use_persistent_cache* is True, concept sets are stored in a - persistent cache table keyed by SHA-256 checksum. The cache table - name is derived from *cohort_table* via ``_codeset_cache_table()``. - Cache misses are resolved and inserted. The per-cohort table is - then built by selecting from the cache. - - When *use_persistent_cache* is False and all concept sets are simple - (no descendant or mapped expansion), builds a lightweight memtable - directly without any vocabulary-table queries. + Like the Java ``#Codesets`` table: a per-cohort temp table populated + with concept set IDs, used by criteria via JOIN, then dropped. """ - checksum_map = _compute_checksum_map(concept_sets) + if not concept_sets: + empty = ibis.memtable( + {"codeset_id": [], "concept_id": []}, + schema={"codeset_id": "int64", "concept_id": "int64"}, + ) + _create_table_impl( + backend, table_name=batch_table_name, schema=results_schema, obj=empty, overwrite=True, temp=True + ) + return _read_table(backend, table_name=batch_table_name, schema=results_schema) + needs_vocab = _needs_vocabulary_expansion(concept_sets) - # Fast path: all concept sets are simple, no persistent cache needed - if not needs_vocab and not use_persistent_cache: + if not needs_vocab: rows: list[dict[str, Any]] = [] for cid, cset in concept_sets.items(): for item in cset.items: if not item.is_excluded and item.concept_id is not None: rows.append({"codeset_id": int(cid), "concept_id": int(item.concept_id)}) - if rows: - data = ibis.memtable(rows, schema={"codeset_id": "int64", "concept_id": "int64"}) - else: - data = ibis.memtable( - {"codeset_id": [], "concept_id": []}, - schema={"codeset_id": "int64", "concept_id": "int64"}, + data = ( + ibis.memtable(rows, schema={"codeset_id": "int64", "concept_id": "int64"}) + if rows + else ibis.memtable( + {"codeset_id": [], "concept_id": []}, schema={"codeset_id": "int64", "concept_id": "int64"} ) - 
_create_table_impl( - backend, - table_name=batch_table_name, - schema=results_schema, - obj=data, - overwrite=True, - temp=True, ) - return _read_table(backend, table_name=batch_table_name, schema=results_schema) - - if use_persistent_cache: - # Resolve cache misses as a single bulk INSERT into the codeset cache. - # Cache hits are skipped. No per-concept-set round trips, no Python - # memory for resolved IDs. - resolve_concept_sets( - concept_sets, - backend=backend, - results_schema=results_schema, - vocabulary_schema=vocabulary_schema, - cohort_table=cohort_table, - ) - - # Build per-cohort table from cache - cache_name = _codeset_cache_table(cohort_table) - cache_ref = _read_table(backend, table_name=cache_name, schema=results_schema) - - parts: list[Table] = [] - for cid, key in checksum_map.items(): - part = ( - cache_ref.filter(cache_ref.cache_key == key) - .select(cache_ref.concept_id.name(CONCEPT_ID)) - .mutate(codeset_id=ibis.literal(int(cid), type="int64")) - .select("codeset_id", CONCEPT_ID) - ) - parts.append(part) - - combined = _union_all_tables(parts) _create_table_impl( - backend, - table_name=batch_table_name, - schema=results_schema, - obj=combined, - overwrite=True, + backend, table_name=batch_table_name, schema=results_schema, obj=data, overwrite=True, temp=True ) return _read_table(backend, table_name=batch_table_name, schema=results_schema) - # No cache, but some concept sets need vocabulary expansion table_getter = _table_getter_from_backend(backend, vocabulary_schema or "") - parts = [] + parts: list[Table] = [] for cid, cset in concept_sets.items(): if not cset.items: continue - needs_vocab = any(item.include_descendants or item.include_mapped for item in cset.items) - if needs_vocab: + has_vocab = any(item.include_descendants or item.include_mapped for item in cset.items) + if has_vocab: expr = _build_codeset_expression( cset, table_getter=table_getter, vocabulary_schema=vocabulary_schema ) @@ -692,125 +276,35 @@ def 
build_single_codeset_table( else: for item in cset.items: if not item.is_excluded and item.concept_id is not None: - rows = [{"codeset_id": int(cid), "concept_id": int(item.concept_id)}] - parts.append(ibis.memtable(rows, schema={"codeset_id": "int64", "concept_id": "int64"})) + parts.append( + ibis.memtable( + [{"codeset_id": int(cid), "concept_id": int(item.concept_id)}], + schema={"codeset_id": "int64", "concept_id": "int64"}, + ) + ) if not parts: empty = ibis.memtable( - {"codeset_id": [], "concept_id": []}, - schema={"codeset_id": "int64", "concept_id": "int64"}, + {"codeset_id": [], "concept_id": []}, schema={"codeset_id": "int64", "concept_id": "int64"} ) _create_table_impl( - backend, - table_name=batch_table_name, - schema=results_schema, - obj=empty, - overwrite=True, - temp=True, + backend, table_name=batch_table_name, schema=results_schema, obj=empty, overwrite=True, temp=True ) return _read_table(backend, table_name=batch_table_name, schema=results_schema) combined = _union_all_tables(parts) _create_table_impl( - backend, - table_name=batch_table_name, - schema=results_schema, - obj=combined, - overwrite=True, - temp=True, + backend, table_name=batch_table_name, schema=results_schema, obj=combined, overwrite=True, temp=True ) return _read_table(backend, table_name=batch_table_name, schema=results_schema) -def _needs_vocabulary_expansion(concept_sets: Mapping[int, NormalizedConceptSet]) -> bool: - """Return True if any concept set requires vocabulary-table queries.""" - for cset in concept_sets.values(): - for item in cset.items: - if item.include_descendants or item.include_mapped: - return True - return False - - -def _read_table( - backend: IbisBackendLike, - *, - table_name: str, - schema: str | None, -) -> Table: - """Read a backend table as an ibis relation.""" - try: - if schema is not None: - return backend.table(table_name, database=schema) - except TypeError: - pass - return backend.table(table_name) - - -def _extract_column(result: Any, 
col_name: str) -> tuple[int, ...]: - """Extract a column from various ibis execute() return types.""" - if hasattr(result, "columns"): # pandas - values = result[col_name].tolist() if col_name in result.columns else result.iloc[:, 0].tolist() - elif isinstance(result, (list, tuple, set)): - values = list(result) - else: - values = [result] if result is not None else [] - return tuple(int(v) for v in values if v is not None) - - -def _read_codeset_cache( - backend: IbisBackendLike, - *, - cache_key: str, - schema: str | None, - table_name: str, -) -> tuple[int, ...] | None: - """Read cached concept IDs for a cache key from the persistent cache table.""" - - try: - if not table_exists(backend, table_name=table_name, schema=schema): - return None - tbl = _read_table(backend, table_name=table_name, schema=schema) - rows = tbl.filter(tbl.cache_key == cache_key).select("concept_id").execute() - ids = _extract_column(rows, "concept_id") - return ids if ids else None - except Exception: - return None - - -def _write_codeset_cache( - backend: IbisBackendLike, - *, - cache_key: str, - concept_ids: tuple[int, ...], - schema: str | None, - table_name: str, -) -> None: - """Persist resolved concept IDs to the cache table.""" - from .operations import insert_relation - - if not concept_ids: - return - - try: - data = ibis.memtable( - {"cache_key": [cache_key] * len(concept_ids), "concept_id": list(concept_ids)}, - schema={"cache_key": "string", "concept_id": "int64"}, - ) - if not table_exists(backend, table_name=table_name, schema=schema): - _create_table_impl(backend, table_name=table_name, schema=schema, obj=data) - return - insert_relation(data, backend=backend, target_table=table_name, target_schema=schema) - except Exception: - pass - - def drop_codeset_table( backend: IbisBackendLike, *, - batch_table_name: str = _CODESET_TABLE, + batch_table_name: str, results_schema: str | None = None, ) -> None: - """Drop the batch codeset table.""" _drop_table(backend, 
batch_table_name, results_schema) @@ -821,10 +315,7 @@ def _filter_by_concept_table( column: str, exclude: bool = False, ) -> Table: - """Filter *table* by semi-join (include) or anti-join (exclude) against *concept_table*. - - Returns a new ibis relation with only the original *table* columns. - """ + """Semi-join (include) or anti-join (exclude) *table* against *concept_table*.""" if not exclude: joined = table.join(concept_table, table[column] == concept_table.concept_id) return joined.select(*[joined[c] for c in table.columns]) diff --git a/tests/execution/test_codesets_persistent_cache.py b/tests/execution/test_codesets_persistent_cache.py index b8514f8..c6d19b9 100644 --- a/tests/execution/test_codesets_persistent_cache.py +++ b/tests/execution/test_codesets_persistent_cache.py @@ -2,42 +2,10 @@ import pytest -from circe.execution.ibis.codesets import ( - _CACHE_TABLE_NAME, - _compute_cache_key, - _read_codeset_cache, - _write_codeset_cache, - build_batch_codeset_table, -) +from circe.execution.ibis.codesets import build_batch_codeset_table from circe.execution.normalize.cohort import NormalizedConceptSet, NormalizedConceptSetItem -def _make_items(*specs: tuple[int, bool, bool, bool]) -> tuple[NormalizedConceptSetItem, ...]: - return tuple( - NormalizedConceptSetItem( - concept_id=s[0], is_excluded=s[1], include_descendants=s[2], include_mapped=s[3] - ) - for s in specs - ) - - -def test_compute_cache_key_deterministic(): - items = _make_items((1, False, True, False), (2, True, False, True)) - assert _compute_cache_key(items) == _compute_cache_key(items) - - -def test_compute_cache_key_order_independent(): - items_a = _make_items((1, False, True, False), (2, True, False, True)) - items_b = _make_items((2, True, False, True), (1, False, True, False)) - assert _compute_cache_key(items_a) == _compute_cache_key(items_b) - - -def test_compute_cache_key_different_items_different_hash(): - items_a = _make_items((1, False, True, False)) - items_b = _make_items((1, 
False, False, False)) - assert _compute_cache_key(items_a) != _compute_cache_key(items_b) - - def test_build_batch_codeset_table_round_trip(): ibis = pytest.importorskip("ibis") _ = pytest.importorskip("duckdb") @@ -77,29 +45,3 @@ def test_build_batch_codeset_table_round_trip(): assert set(rows["concept_id"]) == {111} conn.drop_table("__test_codesets", force=True) - - -def test_persistent_cache_write_and_read(): - ibis = pytest.importorskip("ibis") - _ = pytest.importorskip("duckdb") - - conn = ibis.duckdb.connect() - - _write_codeset_cache( - conn, cache_key="testkey", concept_ids=(111, 222), schema=None, table_name=_CACHE_TABLE_NAME - ) - - result = _read_codeset_cache(conn, cache_key="testkey", schema=None, table_name=_CACHE_TABLE_NAME) - assert result == (111, 222) - - conn.drop_table(_CACHE_TABLE_NAME, force=True) - - -def test_persistent_cache_miss_returns_none(): - ibis = pytest.importorskip("ibis") - _ = pytest.importorskip("duckdb") - - conn = ibis.duckdb.connect() - - result = _read_codeset_cache(conn, cache_key="nonexistent", schema=None, table_name=_CACHE_TABLE_NAME) - assert result is None From bdcacbf4839acf8ba670d7e2a36e0b4fb3e90197 Mon Sep 17 00:00:00 2001 From: Jamie Gilbert Date: Mon, 18 May 2026 15:33:26 -0700 Subject: [PATCH 30/53] Refactor table usage for concept sets --- CLAUDE.md | 29 +++++++ circe/cohort_definition_set/_generate.py | 39 ++++++--- circe/execution/api.py | 16 ++-- circe/execution/engine/cohort.py | 23 +++--- circe/execution/ibis/codesets.py | 100 +++++++++++++---------- 5 files changed, 134 insertions(+), 73 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 50a1d49..c60dada 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -32,6 +32,35 @@ git pre-commit run --all-files If pre-commit checks fail, fix the issues and re-run until they pass. +## Ibis Execution Layer: NEVER use Python in-memory operations + +The datasets this software processes are large (often 100M+ rows). 
Operations that pull data into Python memory will crash the process. All data processing MUST remain as lazy ibis expressions executed on the database backend. + +### Forbidden patterns in production code (`circe/execution/` and `circe/cohort_definition_set/`): + +| Pattern | Example (NEVER do this) | Instead | +|---|---|---| +| `.execute()` | `table.execute()` loads entire table into a pandas DataFrame in memory | Compose ibis expressions; let the backend execute the full query | +| `.to_pandas()` | `table.to_pandas()` pulls result set into Python | Use ibis expressions; only call `.execute()` for small scalars (e.g., `table.limit(1).count().execute()`) | +| Python iteration over results | `for row in table.select(...).distinct().to_pandas().itertuples()` | Push aggregation/distinct into ibis; use window functions or joins | +| `ibis.memtable()` with large DataFrames | Constructing a large `pd.DataFrame` and passing to `ibis.memtable()` | Read directly from the database table (passed tables already exist in the backend) | +| Loading files into Python | `pd.read_csv(...)`, reading Parquet into memory | Use ibis to read files: `ibis.read_csv()`, `ibis.read_parquet()` | + +### Existing violations in production code (DO NOT FIX — examples for reference): + +1. **`circe/cohort_definition_set/_checksum_store.py`** — uses `pandas`, `.execute()`, `pd.DataFrame()`, row iteration — should use ibis expressions end-to-end +2. **`circe/execution/engine/custom_era.py:86`** — `.execute().iloc[:, 0]` to pull concept IDs into a Python tuple +3. **`circe/execution/engine/group_demographics.py:97`** — `.to_pandas().itertuples()` to iterate over distinct concept IDs +4. **`circe/execution/ibis/operations.py:86`** — `.execute()` to check if rows exist (use `table.limit(1).count()` instead) +5. 
**`benchmarks/compare_cohort_outputs.py`** — full table `.execute()`, pandas row iteration, set comparison in memory + +### Allowed uses of `.execute()`: + +- **Tests only** — tests run against small in-memory DuckDB databases with tiny fixtures. Assertions on small result sets are fine. +- **Scalar values** — getting a single count or checking existence: `table.count().execute()`, `table.limit(1).execute()` (only returns 1 row) + +When writing new production code, if you find yourself reaching for `.execute()`, `.to_pandas()`, or Python iteration over ibis results, **stop** — the query can be rewritten as a lazy ibis expression. + ## Git Workflow - Do not run `git commit` — the user will handle commits - Run pre-commit checks to validate code quality before marking tasks complete diff --git a/circe/cohort_definition_set/_generate.py b/circe/cohort_definition_set/_generate.py index 0c2f502..ce981dd 100644 --- a/circe/cohort_definition_set/_generate.py +++ b/circe/cohort_definition_set/_generate.py @@ -6,6 +6,7 @@ import contextlib import logging import threading +import uuid from datetime import datetime from typing import TYPE_CHECKING, Literal @@ -22,6 +23,18 @@ _backend_lock = threading.Lock() +def _drop_tables_by_prefix(backend: IbisBackendLike, prefix: str, schema: str | None) -> None: + """Drop all tables in *schema* whose name starts with *prefix*.""" + try: + tables = backend.list_tables(database=schema) + except Exception: + tables = backend.list_tables() + for table_name in tables: + if table_name.startswith(prefix): + with contextlib.suppress(Exception): + backend.drop_table(table_name, database=schema, force=True) + + def _process_single_cohort( cohort: CohortDefinition, *, @@ -30,14 +43,12 @@ def _process_single_cohort( results_schema: str | None, vocabulary_schema: str | None, cohort_table: str, + session_prefix: str, ) -> tuple[datetime, datetime]: """Build and write a single cohort. Thread-safe via ``_backend_lock``. 
- Each cohort gets its own per-cohort codeset temp table populated and + Each cohort gets its own per-cohort codeset table populated and dropped as it runs, mirroring the Java ``#Codesets`` pattern. - - Returns ``(start_time, end_time)`` of the database-materialization - phase so the caller can compute execution duration. """ with _backend_lock: start_time = datetime.now() @@ -49,6 +60,7 @@ def _process_single_cohort( vocabulary_schema=vocabulary_schema, cohort_id=cohort.cohort_id, cohort_table=cohort_table, + session_prefix=session_prefix, ) projected = project_to_ohdsi_cohort_table(new_rows, cohort_id=cohort.cohort_id) write_cohort( @@ -112,6 +124,8 @@ async def async_generate_cohort_set( _COMPILED_CORRELATED_EVENTS.clear() + session_prefix = f"__s_{uuid.uuid4().hex[:8]}_" + if checksum_table is None: checksum_table = f"{cohort_table}_checksum" @@ -171,6 +185,7 @@ async def async_generate_cohort_set( results_schema=results_schema, vocabulary_schema=vocabulary_schema, cohort_table=cohort_table, + session_prefix=session_prefix, ), timeout=compile_timeout, ) @@ -263,13 +278,7 @@ async def async_generate_cohort_set( raise continue - # Clean up staging tables created by the materialized pipeline - schema = results_schema or cdm_schema - for stage in ("codesets", "primary", "qualified", "included", "ended"): - with contextlib.suppress(Exception): - backend.drop_table( - f"__{cohort_table}_{cohort.cohort_id}_{stage}", database=schema, force=True - ) + # Individual staging tables are cleaned by prefix at batch end results.append( CohortGenerationResult( @@ -301,6 +310,14 @@ async def async_generate_cohort_set( summary["FAILED"], ) + # Drop all staging tables from this batch run + await asyncio.to_thread( + _drop_tables_by_prefix, + backend, + session_prefix, + results_schema or cdm_schema, + ) + return results diff --git a/circe/execution/api.py b/circe/execution/api.py index 6f95f5b..6589700 100644 --- a/circe/execution/api.py +++ b/circe/execution/api.py @@ -34,16 
+34,16 @@ def build_cohort( materialize: bool = True, codeset_table: Table | None = None, cohort_table: str = "cohort", + session_prefix: str = "", ) -> Table: """Normalize, compile, and assemble a cohort relation. Paths through stage-by-stage temp tables when *cohort_id* is provided and *materialize* is True, so that the ibis expression tree never grows - too large to compile. Set *materialize=False* for compile-only use - (e.g. unit tests that only verify the expression tree can be built). + too large to compile. Set *materialize=False* for compile-only use. - When *codeset_table* is provided (from a batch-generation caller), it is - used directly. Otherwise a per-cohort codeset temp table is auto-created. + When *codeset_table* is provided it is used directly. Otherwise a + per-cohort codeset table is auto-created. """ maybe_apply_databricks_post_connect_workaround(backend) @@ -56,6 +56,7 @@ def build_cohort( batch_table_name=f"__{cohort_table}_{cohort_id}_codesets", results_schema=results_schema, vocabulary_schema=vocabulary_schema, + session_prefix=session_prefix, ) ctx = make_execution_context( @@ -67,7 +68,12 @@ def build_cohort( ) return build_cohort_table( - normalized, ctx, cohort_id=cohort_id, materialize=materialize, cohort_table=cohort_table + normalized, + ctx, + cohort_id=cohort_id, + materialize=materialize, + cohort_table=cohort_table, + session_prefix=session_prefix, ) diff --git a/circe/execution/engine/cohort.py b/circe/execution/engine/cohort.py index 52a643b..5754535 100644 --- a/circe/execution/engine/cohort.py +++ b/circe/execution/engine/cohort.py @@ -25,18 +25,10 @@ def _materialize( stage: str, schema: str | None, cohort_table: str = "cohort", + session_prefix: str = "", ) -> Table: - """Write *table* to a backend staging table and return a fresh reference. - - Without this step every pipeline stage accumulates on top of the previous - ibis expression tree. 
For cohorts with many primary criteria the tree - grows too large for the ibis SQL compiler to traverse in reasonable time. - - Materialising at each pipeline boundary keeps the expression tree sent - to the compiler shallow — each stage only builds on a simple - ``DatabaseTable`` reference. - """ - name = f"__{cohort_table}_{cohort_id}_{stage}" + """Write *table* to a backend staging table and return a fresh reference.""" + name = f"{session_prefix}__{cohort_table}_{cohort_id}_{stage}" create_table(ctx.backend, table_name=name, schema=schema, obj=table, overwrite=True) return read_table(ctx.backend, table_name=name, schema=schema) @@ -46,10 +38,11 @@ def _drop_staging_tables( cohort_id: int, schema: str | None, cohort_table: str = "cohort", + session_prefix: str = "", ) -> None: """Remove all staging tables for *cohort_id* from the database.""" for stage in ("codesets", "primary", "qualified", "included", "ended"): - name = f"__{cohort_table}_{cohort_id}_{stage}" + name = f"{session_prefix}__{cohort_table}_{cohort_id}_{stage}" with contextlib.suppress(Exception): ctx.backend.drop_table(name, database=schema, force=True) @@ -61,6 +54,7 @@ def build_cohort_table( cohort_id: int = 0, materialize: bool = True, cohort_table: str = "cohort", + session_prefix: str = "", ) -> Table: primary_plans = tuple( PrimaryEventInput( @@ -88,6 +82,7 @@ def build_cohort_table( cohort_id=cohort_id, stage="primary", schema=schema, + session_prefix=session_prefix, cohort_table=cohort_table, ) @@ -102,6 +97,7 @@ def build_cohort_table( cohort_id=cohort_id, stage="qualified", schema=schema, + session_prefix=session_prefix, cohort_table=cohort_table, ) @@ -121,6 +117,7 @@ def build_cohort_table( cohort_id=cohort_id, stage="included", schema=schema, + session_prefix=session_prefix, cohort_table=cohort_table, ) included_events = apply_result_limit(included_events, cohort_plan.expression_limit_type) @@ -134,6 +131,7 @@ def build_cohort_table( cohort_id=cohort_id, stage="included", 
schema=schema, + session_prefix=session_prefix, cohort_table=cohort_table, ) @@ -146,6 +144,7 @@ def build_cohort_table( cohort_id=cohort_id, stage="ended", schema=schema, + session_prefix=session_prefix, cohort_table=cohort_table, ) diff --git a/circe/execution/ibis/codesets.py b/circe/execution/ibis/codesets.py index 0c3a44c..8eb19d1 100644 --- a/circe/execution/ibis/codesets.py +++ b/circe/execution/ibis/codesets.py @@ -13,6 +13,30 @@ from .operations import create_table as _create_table_impl +def _literal_select(**columns: int) -> Table: + """Return an ibis table expression that selects literal values. + + Builds ``SELECT val1 AS col1, val2 AS col2`` using ``as_table()`` and + ``mutate()`` so no memtable or local-file staging is required. + """ + items = list(columns.items()) + t = ibis.literal(items[0][1], type="int64").name(items[0][0]).as_table() + for name, val in items[1:]: + t = t.mutate(**{name: ibis.literal(val, type="int64")}) + return t + + +def _empty_table(*, columns: tuple[tuple[str, int], ...]) -> Table: + """Return an ibis expression for an empty table with the given column types. + + Produces ``SELECT ... WHERE FALSE`` -- no local files. + """ + if not columns: + raise ValueError("_empty_table requires at least one column") + t = _literal_select(**dict(columns)) + return t.filter(ibis.literal(False)) + + def _vocabulary_table( table_name: str, *, @@ -74,6 +98,8 @@ def _build_codeset_expression( Handles descendants, mapped codes, and exclusions via ibis JOINs. The database engine performs the expansion at execution time. + Never uses ``ibis.memtable`` -- all leaf values use ``as_table().mutate()`` + to avoid local-file staging on Databricks. 
""" include_parts: list[Table] = [] exclude_parts: list[Table] = [] @@ -90,12 +116,12 @@ def _build_codeset_expression( ) base = _union_all_tables( [ - ibis.memtable({"concept_id": [int(item.concept_id)]}, schema={"concept_id": "int64"}), + _literal_select(concept_id=int(item.concept_id)), desc, ] ) else: - base = ibis.memtable({"concept_id": [int(item.concept_id)]}, schema={"concept_id": "int64"}) + base = _literal_select(concept_id=int(item.concept_id)) if item.include_mapped: mapped = _mapped_expression( @@ -109,7 +135,7 @@ def _build_codeset_expression( include_parts.append(base) if not include_parts: - return ibis.memtable({"concept_id": []}, schema={"concept_id": "int64"}) + return _empty_table(columns=(("concept_id", 0),)) if len(include_parts) == 1: result = include_parts[0] @@ -222,45 +248,40 @@ def build_single_codeset_table( batch_table_name: str, results_schema: str | None = None, vocabulary_schema: str | None = None, + session_prefix: str = "", ) -> Table: - """Build a per-cohort codeset temp table ``(codeset_id, concept_id)``. + """Build a per-cohort codeset table ``(codeset_id, concept_id)``. - Like the Java ``#Codesets`` table: a per-cohort temp table populated - with concept set IDs, used by criteria via JOIN, then dropped. + Like the Java ``#Codesets`` table. All work stays in the database -- + no ``ibis.memtable``, no local-file staging, no ``temp`` tables + (which Databricks / Oracle / BigQuery do not support). 
""" + name = f"{session_prefix}{batch_table_name}" + if not concept_sets: - empty = ibis.memtable( - {"codeset_id": [], "concept_id": []}, - schema={"codeset_id": "int64", "concept_id": "int64"}, - ) - _create_table_impl( - backend, table_name=batch_table_name, schema=results_schema, obj=empty, overwrite=True, temp=True - ) - return _read_table(backend, table_name=batch_table_name, schema=results_schema) + empty = _empty_table(columns=(("codeset_id", 0), ("concept_id", 0))) + _create_table_impl(backend, table_name=name, schema=results_schema, obj=empty, overwrite=True) + return _read_table(backend, table_name=name, schema=results_schema) needs_vocab = _needs_vocabulary_expansion(concept_sets) if not needs_vocab: - rows: list[dict[str, Any]] = [] + parts: list[Table] = [] for cid, cset in concept_sets.items(): for item in cset.items: if not item.is_excluded and item.concept_id is not None: - rows.append({"codeset_id": int(cid), "concept_id": int(item.concept_id)}) - data = ( - ibis.memtable(rows, schema={"codeset_id": "int64", "concept_id": "int64"}) - if rows - else ibis.memtable( - {"codeset_id": [], "concept_id": []}, schema={"codeset_id": "int64", "concept_id": "int64"} - ) - ) - _create_table_impl( - backend, table_name=batch_table_name, schema=results_schema, obj=data, overwrite=True, temp=True - ) - return _read_table(backend, table_name=batch_table_name, schema=results_schema) + parts.append(_literal_select(codeset_id=int(cid), concept_id=int(item.concept_id))) + if not parts: + empty = _empty_table(columns=(("codeset_id", 0), ("concept_id", 0))) + _create_table_impl(backend, table_name=name, schema=results_schema, obj=empty, overwrite=True) + return _read_table(backend, table_name=name, schema=results_schema) + combined = _union_all_tables(parts) + _create_table_impl(backend, table_name=name, schema=results_schema, obj=combined, overwrite=True) + return _read_table(backend, table_name=name, schema=results_schema) table_getter = 
_table_getter_from_backend(backend, vocabulary_schema or "") - parts: list[Table] = [] + parts = [] for cid, cset in concept_sets.items(): if not cset.items: continue @@ -276,27 +297,16 @@ def build_single_codeset_table( else: for item in cset.items: if not item.is_excluded and item.concept_id is not None: - parts.append( - ibis.memtable( - [{"codeset_id": int(cid), "concept_id": int(item.concept_id)}], - schema={"codeset_id": "int64", "concept_id": "int64"}, - ) - ) + parts.append(_literal_select(codeset_id=int(cid), concept_id=int(item.concept_id))) if not parts: - empty = ibis.memtable( - {"codeset_id": [], "concept_id": []}, schema={"codeset_id": "int64", "concept_id": "int64"} - ) - _create_table_impl( - backend, table_name=batch_table_name, schema=results_schema, obj=empty, overwrite=True, temp=True - ) - return _read_table(backend, table_name=batch_table_name, schema=results_schema) + empty = _empty_table(columns=(("codeset_id", 0), ("concept_id", 0))) + _create_table_impl(backend, table_name=name, schema=results_schema, obj=empty, overwrite=True) + return _read_table(backend, table_name=name, schema=results_schema) combined = _union_all_tables(parts) - _create_table_impl( - backend, table_name=batch_table_name, schema=results_schema, obj=combined, overwrite=True, temp=True - ) - return _read_table(backend, table_name=batch_table_name, schema=results_schema) + _create_table_impl(backend, table_name=name, schema=results_schema, obj=combined, overwrite=True) + return _read_table(backend, table_name=name, schema=results_schema) def drop_codeset_table( From 15802d6b4d1a1701edd8eb961156f6f8eb44a77f Mon Sep 17 00:00:00 2001 From: James Gilbert Date: Mon, 18 May 2026 18:37:18 -0400 Subject: [PATCH 31/53] Error log --- benchmarks/error_repoer.txt | 181 ++++++++++++------------------------ 1 file changed, 61 insertions(+), 120 deletions(-) diff --git a/benchmarks/error_repoer.txt b/benchmarks/error_repoer.txt index c8f1cb7..71aa831 100644 --- 
a/benchmarks/error_repoer.txt +++ b/benchmarks/error_repoer.txt @@ -1,3 +1,4 @@ +gh or Sputum) -- duration 105.8s Traceback (most recent call last): File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\benchmarks\benchmark_run_py.py", line 160, in main() @@ -8,21 +9,45 @@ Traceback (most recent call last): ...<7 lines>... stop_on_error=False, ) - File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\circe\cohort_definition_set\_generate.py", line 138, in generate_cohort_set - new_rows = build_cohort( - cohort.expression, - ...<5 lines>... - cohort_id=cohort.cohort_id, + File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\circe\cohort_definition_set\_generate.py", line 346, in generate_cohort_set + return asyncio.run( + ~~~~~~~~~~~^ + async_generate_cohort_set( + ^^^^^^^^^^^^^^^^^^^^^^^^^^ + ...<9 lines>... + ) + ^ ) - File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\circe\execution\api.py", line 56, in build_cohort - return build_cohort_table(normalized, ctx, cohort_id=cohort_id, materialize=materialize) - File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\circe\execution\engine\cohort.py", line 99, in build_cohort_table - included_events = _materialize( - included_events, ctx=ctx, cohort_id=cohort_id, stage="included", schema=schema + ^ + File "C:\Users\admin_jgilber2\AppData\Local\Python\pythoncore-3.14-64\Lib\asyncio\runners.py", line 204, in run + return runner.run(main) + ~~~~~~~~~~^^^^^^ + File "C:\Users\admin_jgilber2\AppData\Local\Python\pythoncore-3.14-64\Lib\asyncio\runners.py", line 127, in run + return self._loop.run_until_complete(task) + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^ + File "C:\Users\admin_jgilber2\AppData\Local\Python\pythoncore-3.14-64\Lib\asyncio\base_events.py", line 719, in run_until_complete + return future.result() + ~~~~~~~~~~~~~^^ + File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\circe\cohort_definition_set\_generate.py", line 294, in async_generate_cohort_set + upsert_generation_history( + ~~~~~~~~~~~~~~~~~~~~~~~~~^ + 
backend, + ^^^^^^^^ + ...<6 lines>... + end_time=end_time or datetime.now(), + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + ) + ^ + File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\circe\cohort_definition_set\_checksum_store.py", line 265, in upsert_generation_history + create_table( + ~~~~~~~~~~~~^ + backend, + ^^^^^^^^ + ...<11 lines>... + overwrite=False, + ^^^^^^^^^^^^^^^^ ) - File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\circe\execution\engine\cohort.py", line 39, in _materialize - create_table(ctx.backend, table_name=name, schema=schema, obj=table, overwrite=True) - ~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + ^ File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\circe\execution\ibis\operations.py", line 64, in create_table _call_with_optional_database( ~~~~~~~~~~~~~~~~~~~~~~~~~~~~^ @@ -35,115 +60,31 @@ Traceback (most recent call last): ^ File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\circe\execution\ibis\operations.py", line 13, in _call_with_optional_database return method(*args, database=database, **kwargs) - File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\.venv\Lib\site-packages\ibis\backends\databricks\__init__.py", line 228, in create_table - cur.execute(insert_stmt).fetchall() - ~~~~~~~~~~~^^^^^^^^^^^^^ + File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\.venv\Lib\site-packages\ibis\backends\databricks\__init__.py", line 200, in create_table + self._run_pre_execute_hooks(table) + ~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^ + File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\.venv\Lib\site-packages\ibis\backends\__init__.py", line 1300, in _run_pre_execute_hooks + self._register_in_memory_tables(expr) + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^ + File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\.venv\Lib\site-packages\ibis\backends\__init__.py", line 1277, in _register_in_memory_tables + self._register_in_memory_table(memtable) + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^ + File 
"C:\Users\admin_jgilber2\PycharmProjects\Circepy\.venv\Lib\site-packages\ibis\backends\databricks\__init__.py", line 476, in _register_in_memory_table + cur.execute(put_into) + ~~~~~~~~~~~^^^^^^^^^^ File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\.venv\Lib\site-packages\databricks\sql\telemetry\latency_logger.py", line 182, in wrapper return func(self, *args, **kwargs) - File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\.venv\Lib\site-packages\databricks\sql\client.py", line 1341, in execute - self.active_result_set = self.backend.execute_command( - ~~~~~~~~~~~~~~~~~~~~~~~~~~~~^ - operation=prepared_operation, - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - ...<10 lines>... - query_tags=query_tags, - ^^^^^^^^^^^^^^^^^^^^^^ + File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\.venv\Lib\site-packages\databricks\sql\client.py", line 1357, in execute + self._handle_staging_operation( + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^ + staging_allowed_local_path=self.connection.staging_allowed_local_path, + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + input_stream=input_stream, + ^^^^^^^^^^^^^^^^^^^^^^^^^^ ) ^ - File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\.venv\Lib\site-packages\databricks\sql\backend\thrift_backend.py", line 1066, in execute_command - execute_response, has_more_rows = self._handle_execute_response( - ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^ - resp, cursor - ^^^^^^^^^^^^ - ) - ^ - File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\.venv\Lib\site-packages\databricks\sql\backend\thrift_backend.py", line 1273, in _handle_execute_response - final_operation_state = self._wait_until_command_done( - resp.operationHandle, - resp.directResults and resp.directResults.operationStatus, - ) - File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\.venv\Lib\site-packages\databricks\sql\backend\thrift_backend.py", line 954, in _wait_until_command_done - self._check_command_not_in_error_or_closed_state(op_handle, poll_resp) - 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^ - File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\.venv\Lib\site-packages\databricks\sql\backend\thrift_backend.py", line 634, in _check_command_not_in_error_or_closed_state - raise ServerOperationError( - ...<7 lines>... + File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\.venv\Lib\site-packages\databricks\sql\client.py", line 1103, in _handle_staging_operation + raise ProgrammingError( + ...<3 lines>... ) -databricks.sql.exc.ServerOperationError: ShuffleMapStage 9148 (mapPartitionsInternal at PhotonExec.scala:865) has failed the maximum allowable number of times: 4. Most recent failure reason: -org.apache.spark.shuffle.PrismFetchFailedException: Prism failed to read shuffle data from BlockManagerId: BlockManagerId(9, 10.128.16.236, 4048, None). shuffleId=2880, firstMapdId=158410, firstMapIndex=4, firstReduceId=0. Error reason: status: ResourceExhausted, message: "Request is throttled: Low priority request queue is full", details: [], metadata: MetadataMap { headers: {"content-type": "application/grpc", "date": "Sun, 17 May 2026 21:32:04 GMT", "content-length": "0"} }. 
Failed shuffle fetch from executor node: 9 at 10.128.16.236 - at org.apache.spark.errors.SparkCoreErrors$.prismFetchFailedError(SparkCoreErrors.scala:442) - at org.apache.spark.errors.SparkCoreErrors.prismFetchFailedError(SparkCoreErrors.scala) - at com.databricks.spark.prism.PrismInputStream.read(PrismInputStream.java:128) - at java.base/java.io.FilterInputStream.read(FilterInputStream.java:82) - at java.base/java.io.PushbackInputStream.read(PushbackInputStream.java:135) - at org.apache.spark.storage.ShuffleBlockFetcherIterator.prismHasNext(ShuffleBlockFetcherIterator.scala:1011) - at org.apache.spark.storage.ShuffleBlockFetcherIterator.hasNext(ShuffleBlockFetcherIterator.scala:962) - at org.apache.spark.util.CompletionIterator.hasNext(CompletionIterator.scala:30) - at scala.collection.Iterator$$anon$9.hasNext(Iterator.scala:583) - at com.databricks.photon.LazyBlockFetcherIterator.hasNext(ShuffledBlockRDD.scala:443) - at scala.collection.Iterator$$anon$9.hasNext(Iterator.scala:583) - at scala.collection.convert.JavaCollectionWrappers$IteratorWrapper.hasNext(JavaCollectionWrappers.scala:32) - at com.databricks.photon.CloseableIterator$$anon$6.hasNext(CloseableIterator.scala:71) - at com.databricks.photon.JniApiImpl.open(Native Method) - at com.databricks.photon.JniApi.open(JniApi.scala) - at com.databricks.photon.JniExecNode.open(JniExecNode.java:74) - at com.databricks.photon.PhotonPreShuffleResultHandler.$anonfun$getResultImpl$1(PhotonExec.scala:1413) - at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.scala:18) - at com.databricks.photon.PhotonResultHandler.timeit(PhotonResultHandler.scala:33) - at com.databricks.photon.PhotonResultHandler.timeit$(PhotonResultHandler.scala:31) - at com.databricks.photon.PhotonPreShuffleResultHandler.timeit(PhotonExec.scala:1406) - at com.databricks.photon.PhotonPreShuffleResultHandler.getResultImpl(PhotonExec.scala:1413) - at 
com.databricks.photon.PhotonResultHandler.$anonfun$getResult$1(PhotonResultHandler.scala:74) - at com.databricks.photon.PhotonResultHandler.com$databricks$photon$PhotonResultHandler$$convertPhotonOOMIfNeeded(PhotonResultHandler.scala:53) - at com.databricks.photon.PhotonResultHandler.getResult(PhotonResultHandler.scala:74) - at com.databricks.photon.PhotonResultHandler.getResult$(PhotonResultHandler.scala:63) - at com.databricks.photon.PhotonPreShuffleResultHandler.getResult(PhotonExec.scala:1406) - at com.databricks.photon.PhotonBasicEvaluatorFactory$PhotonBasicEvaluator$$anon$1.open(PhotonBasicEvaluatorFactory.scala:258) - at com.databricks.photon.PhotonBasicEvaluatorFactory$PhotonBasicEvaluator$$anon$1.hasNextImpl(PhotonBasicEvaluatorFactory.scala:263) - at com.databricks.photon.PhotonBasicEvaluatorFactory$PhotonBasicEvaluator$$anon$1.$anonfun$hasNext$1(PhotonBasicEvaluatorFactory.scala:283) - at scala.runtime.java8.JFunction0$mcZ$sp.apply(JFunction0$mcZ$sp.scala:17) - at org.apache.spark.TaskContext.runFuncAsBillable(TaskContext.scala:274) - at com.databricks.photon.PhotonBasicEvaluatorFactory$PhotonBasicEvaluator$$anon$1.hasNext(PhotonBasicEvaluatorFactory.scala:283) - at com.databricks.photon.CloseableIterator$$anon$10.hasNext(CloseableIterator.scala:211) - at scala.collection.Iterator$$anon$9.hasNext(Iterator.scala:583) - at scala.collection.Iterator$$anon$9.hasNext(Iterator.scala:583) - at com.databricks.photon.MetadataOnlyShuffleWriter.write(MetadataOnlyShuffleWriter.scala:50) - at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:58) - at org.apache.spark.scheduler.ShuffleMapTask.$anonfun$runTask$5(ShuffleMapTask.scala:98) - at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110) - at org.apache.spark.scheduler.ShuffleMapTask.$anonfun$runTask$1(ShuffleMapTask.scala:93) - at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110) - at 
org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:58) - at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:39) - at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:233) - at org.apache.spark.scheduler.Task.doRunTask(Task.scala:223) - at org.apache.spark.scheduler.Task.$anonfun$run$6(Task.scala:179) - at com.databricks.unity.UCSEphemeralState$Handle.runWith(UCSEphemeralState.scala:51) - at com.databricks.unity.HandleImpl.runWith(UCSHandle.scala:128) - at com.databricks.unity.HandleImpl.$anonfun$runWithAndClose$1(UCSHandle.scala:133) - at scala.util.Using$.resource(Using.scala:296) - at com.databricks.unity.HandleImpl.runWithAndClose(UCSHandle.scala:132) - at org.apache.spark.scheduler.TaskExecutionUtils$.withUCHandleForTaskExecution(Task.scala:391) - at org.apache.spark.scheduler.Task.$anonfun$run$3(Task.scala:173) - at org.apache.spark.scheduler.TaskExecutionUtils$.withCredentialsForTaskExecution(Task.scala:368) - at org.apache.spark.scheduler.Task.$anonfun$run$1(Task.scala:119) - at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110) - at org.apache.spark.scheduler.Task.run(Task.scala:114) - at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$13(Executor.scala:1593) - at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:86) - at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:83) - at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:111) - at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:1598) - at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.scala:18) - at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110) - at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:1425) - at com.databricks.aether.RDDTask.run(RDDTask.scala:271) - at 
com.databricks.aether.worker.WorkerTaskAttemptThread.$anonfun$runInternal$1(AetherWorkerImpl.scala:362) - at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.scala:18) - at com.databricks.aether.AetherInt64Gauge.scopedAdd(AetherServiceMetricImpls.scala:129) - at com.databricks.aether.worker.WorkerTaskAttemptThread.runInternal(AetherWorkerImpl.scala:351) - at com.databricks.aether.worker.WorkerTaskAttemptThread.run(AetherWorkerImpl.scala:328) - at com.databricks.aether.FairBlockingQueue$SlotCountingRunnable.run(FairBlockingQueue.java:647) - at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136) - at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635) - at java.base/java.lang.Thread.run(Thread.java:840) \ No newline at end of file +databricks.sql.exc.ProgrammingError: Local file operations are restricted to paths within the configured staging_allowed_local_path \ No newline at end of file From d60b35bfd390d3328f8fe81aaeb12025573da1be Mon Sep 17 00:00:00 2001 From: Jamie Gilbert Date: Mon, 18 May 2026 15:56:57 -0700 Subject: [PATCH 32/53] Removal of inefficient python memory usage and memtables --- .../cohort_definition_set/_checksum_store.py | 117 +++++++++--------- circe/execution/engine/custom_era.py | 21 +--- circe/execution/engine/group_demographics.py | 48 ++++--- circe/execution/ibis/compile_steps.py | 9 +- circe/execution/ibis/context.py | 11 +- circe/execution/ibis/operations.py | 2 +- circe/execution/ibis/person_filters.py | 5 +- tests/execution/test_group_demographics.py | 12 +- tests/execution/test_operations.py | 11 ++ tests/test_cohort_definition_set.py | 11 +- 10 files changed, 133 insertions(+), 114 deletions(-) diff --git a/circe/cohort_definition_set/_checksum_store.py b/circe/cohort_definition_set/_checksum_store.py index baa5972..21b0d55 100644 --- a/circe/cohort_definition_set/_checksum_store.py +++ b/circe/cohort_definition_set/_checksum_store.py @@ -27,9 
+27,7 @@ from ..execution.ibis.operations import create_table, read_table, table_exists if TYPE_CHECKING: - import pandas as pd - - from ..execution.typing import IbisBackendLike + from ..execution.typing import IbisBackendLike, Table def load_checksums( @@ -56,23 +54,29 @@ def load_checksums( if not table_exists(backend, table_name=table_name, schema=schema): return {} - table = read_table(backend, table_name=table_name, schema=schema) - rows = table.execute() - if rows.empty: - return {} + import ibis - time_col = "end_time" if "end_time" in rows.columns else "generation_end_time" - has_status = "status" in rows.columns + table = read_table(backend, table_name=table_name, schema=schema) + column_names = table.schema().names + has_status = "status" in column_names + has_end_time = "end_time" in column_names + has_gen_end_time = "generation_end_time" in column_names + time_col = "end_time" if has_end_time else ("generation_end_time" if has_gen_end_time else None) if has_status: - rows = rows[rows["status"] == "COMPLETE"] - if rows.empty: - return {} + table = table.filter(table.status == ibis.literal("COMPLETE", type="str")) - if time_col in rows.columns: - rows = rows.sort_values(time_col, ascending=False) - rows = rows.drop_duplicates(subset=["cohort_definition_id"], keep="first") + if time_col is not None: + w = ibis.window( + group_by=table.cohort_definition_id, + order_by=ibis.desc(table[time_col]), + ) + ranked = table.mutate(_rn=ibis.row_number().over(w)) + table = ranked.filter(ranked._rn == 0) + rows = table.select("cohort_definition_id", "checksum").execute() + if rows.empty: + return {} return {int(row["cohort_definition_id"]): str(row["checksum"]) for _, row in rows.iterrows()} @@ -81,13 +85,13 @@ def load_generation_history( *, schema: str | None, table_name: str, -) -> pd.DataFrame | None: +) -> Table | None: """Load the full generation history from the history table. 
- Returns all columns (cohort_definition_id, checksum, status, start_time, - end_time) for every recorded generation. Returns ``None`` if the table - does not exist or was created with the v1 schema that lacks timing - columns. + Returns an ibis Table expression with all columns (cohort_definition_id, + checksum, status, start_time, end_time) for every recorded generation. + Returns ``None`` if the table does not exist or was created with the v1 + schema that lacks timing columns. Args: backend: Ibis backend connection. @@ -95,20 +99,17 @@ def load_generation_history( table_name: Name of the generation history table. Returns: - DataFrame with per-cohort history, or ``None`` if unavailable. + ibis Table with per-cohort history, or ``None`` if unavailable. """ if not table_exists(backend, table_name=table_name, schema=schema): return None table = read_table(backend, table_name=table_name, schema=schema) - rows = table.execute() - if rows.empty: - return None - - if "start_time" not in rows.columns or "status" not in rows.columns: + column_names = table.schema().names + if "start_time" not in column_names or "status" not in column_names: return None - return rows + return table def save_checksums( @@ -176,27 +177,24 @@ def save_generation_history( return import ibis - import pandas as pd - - new_rows_df = pd.DataFrame( - [ - { - "cohort_definition_id": cohort_id, - "checksum": checksum, - "status": status, - "start_time": start_time, - "end_time": end_time, - } - for cohort_id, (checksum, status, start_time, end_time) in generated.items() - ] - ) - new_rows_df["cohort_definition_id"] = new_rows_df["cohort_definition_id"].astype("int64") - new_rows_df["checksum"] = new_rows_df["checksum"].astype(str) - new_rows_df["status"] = new_rows_df["status"].astype(str) - new_rows_df["start_time"] = pd.to_datetime(new_rows_df["start_time"]) - new_rows_df["end_time"] = pd.to_datetime(new_rows_df["end_time"]) - new_relation = ibis.memtable(new_rows_df) + def _checksum_row(cid, 
checksum, status, start_time, end_time): + return ( + ibis.literal(int(cid), type="int64") + .name("cohort_definition_id") + .as_table() + .mutate( + checksum=ibis.literal(str(checksum), type="str"), + status=ibis.literal(str(status), type="str"), + start_time=ibis.literal(start_time, type="timestamp"), + end_time=ibis.literal(end_time, type="timestamp"), + ) + ) + + items = list(generated.items()) + new_relation = _checksum_row(items[0][0], *items[0][1]) + for cid, vals in items[1:]: + new_relation = new_relation.union(_checksum_row(cid, *vals), distinct=False) if not table_exists(backend, table_name=table_name, schema=schema): create_table(backend, table_name=table_name, schema=schema, obj=new_relation, overwrite=False) @@ -262,21 +260,18 @@ def upsert_generation_history( row = [int(cohort_id), str(checksum), str(status), start_time, end_time] if not table_exists(backend, table_name=table_name, schema=schema): - create_table( - backend, - table_name=table_name, - schema=schema, - obj=ibis.memtable( - { - "cohort_definition_id": [int(cohort_id)], - "checksum": [str(checksum)], - "status": [str(status)], - "start_time": [start_time], - "end_time": [end_time], - } - ), - overwrite=False, + new_row = ( + ibis.literal(int(cohort_id), type="int64") + .name("cohort_definition_id") + .as_table() + .mutate( + checksum=ibis.literal(str(checksum), type="str"), + status=ibis.literal(str(status), type="str"), + start_time=ibis.literal(start_time, type="timestamp"), + end_time=ibis.literal(end_time, type="timestamp"), + ) ) + create_table(backend, table_name=table_name, schema=schema, obj=new_row, overwrite=False) return delete_cohort_rows( diff --git a/circe/execution/engine/custom_era.py b/circe/execution/engine/custom_era.py index d223b88..2320480 100644 --- a/circe/execution/engine/custom_era.py +++ b/circe/execution/engine/custom_era.py @@ -83,15 +83,7 @@ def compute_drug_eras( days_supply_override: int | None, cohort_person_ids=None, ): - concept_ids = 
tuple(sorted(int(i) for i in ctx.concept_set_table(drug_codeset_id).execute().iloc[:, 0])) - - if not concept_ids: - de = ctx.table("drug_exposure") - return de.filter(ibis.literal(False)).select( - de.person_id.cast("int64").name(PERSON_ID), - ibis.null().cast("date").name("era_start_date"), - ibis.null().cast("date").name("era_end_date"), - ) + concept_table = ctx.concept_set_table(drug_codeset_id) de = ctx.table("drug_exposure") if cohort_person_ids is not None: @@ -100,12 +92,11 @@ def compute_drug_eras( predicates=[de.person_id == cohort_person_ids.person_id], ) - if "drug_source_concept_id" in de.columns: - filtered = de.filter( - de.drug_concept_id.isin(concept_ids) | de.drug_source_concept_id.isin(concept_ids) - ) - else: - filtered = de.filter(de.drug_concept_id.isin(concept_ids)) + has_source = "drug_source_concept_id" in de.columns + filtered = de.semi_join(concept_table, de.drug_concept_id == concept_table.concept_id) + if has_source: + source_matches = de.semi_join(concept_table, de.drug_source_concept_id == concept_table.concept_id) + filtered = filtered.union(source_matches, distinct=True) prepared = filtered.select( filtered.person_id.cast("int64").name("person_id"), diff --git a/circe/execution/engine/group_demographics.py b/circe/execution/engine/group_demographics.py index 45f9eee..0b6ae38 100644 --- a/circe/execution/engine/group_demographics.py +++ b/circe/execution/engine/group_demographics.py @@ -80,25 +80,41 @@ def _apply_date_predicate(date_expr, predicate): ) +def _explicit_ids_table(concept_ids: tuple[int, ...]) -> Table: + """Build a single-column ibis Table from explicit concept IDs, no memtable. + + Uses ``ibis.literal().name().as_table()`` with ``union()`` — generates + ``SELECT id1 AS concept_id UNION ALL SELECT id2 AS concept_id ...``. 
+ """ + first = ibis.literal(int(concept_ids[0]), type="int64").name("concept_id").as_table() + for cid in concept_ids[1:]: + t = ibis.literal(int(cid), type="int64").name("concept_id").as_table() + first = first.union(t, distinct=False) + return first + + def _demographic_concept_ids( *, explicit_ids: tuple[int, ...], codeset_id: int | None, ctx: ExecutionContext, -) -> tuple[int, ...]: +) -> Table | None: """Resolve concept IDs for a demographic filter. - Returns a Python tuple of concept IDs. Demographic sets are tiny (1-5 - IDs) so this is cheap and avoids join complexity. + Returns an ibis Table with a single ``concept_id`` column, or ``None`` + if neither explicit IDs nor a codeset is provided (meaning no filter). """ - all_ids = list(explicit_ids) + if not explicit_ids and codeset_id is None: + return None + parts: list[Table] = [] + if explicit_ids: + parts.append(_explicit_ids_table(explicit_ids)) if codeset_id is not None: - t = ctx.concept_set_table(codeset_id) - for row in t.select("concept_id").distinct().to_pandas().itertuples(): - cid = row.concept_id - if cid not in all_ids: - all_ids.append(cid) - return tuple(all_ids) + parts.append(ctx.concept_set_table(codeset_id).select("concept_id").distinct()) + result = parts[0] + for part in parts[1:]: + result = result.union(part, distinct=True) + return result def demographic_match_keys( @@ -127,24 +143,24 @@ def demographic_match_keys( codeset_id=demographic.gender_codeset_id, ctx=ctx, ) - if gender_ids: - predicates.append(joined.gender_concept_id.isin(gender_ids)) + if gender_ids is not None: + predicates.append(joined.gender_concept_id.isin(gender_ids.concept_id)) race_ids = _demographic_concept_ids( explicit_ids=demographic.race_concept_ids, codeset_id=demographic.race_codeset_id, ctx=ctx, ) - if race_ids: - predicates.append(joined.race_concept_id.isin(race_ids)) + if race_ids is not None: + predicates.append(joined.race_concept_id.isin(race_ids.concept_id)) ethnicity_ids = _demographic_concept_ids( 
explicit_ids=demographic.ethnicity_concept_ids, codeset_id=demographic.ethnicity_codeset_id, ctx=ctx, ) - if ethnicity_ids: - predicates.append(joined.ethnicity_concept_id.isin(ethnicity_ids)) + if ethnicity_ids is not None: + predicates.append(joined.ethnicity_concept_id.isin(ethnicity_ids.concept_id)) if demographic.occurrence_start_date is not None: predicates.append( diff --git a/circe/execution/ibis/compile_steps.py b/circe/execution/ibis/compile_steps.py index 58398eb..cfc5626 100644 --- a/circe/execution/ibis/compile_steps.py +++ b/circe/execution/ibis/compile_steps.py @@ -26,6 +26,7 @@ from ..plan.predicates import DateRangePredicate, NumericRangePredicate from ..plan.schema import END_DATE, PERSON_ID, START_DATE from .context import ExecutionContext +from ..ibis_compat import literal_column_relation from .person_filters import ( apply_person_age_filter, apply_person_ethnicity_filter, @@ -137,7 +138,7 @@ def _filter_visit_concepts(table, ctx: ExecutionContext, *, step: FilterByVisit) concept_table = ctx.concept_set_table(step.codeset_id) joined = joined.join(concept_table, joined._visit_concept_id == concept_table.concept_id) elif step.concept_ids: - concept_table = ibis.memtable({"concept_id": list(step.concept_ids)}, schema={"concept_id": "int64"}) + concept_table = literal_column_relation(step.concept_ids, column_name="concept_id", dtype="int64") joined = joined.join(concept_table, joined._visit_concept_id == concept_table.concept_id) # If neither codeset_id nor concept_ids, no filtering needed @@ -170,7 +171,7 @@ def _filter_provider_specialty( concept_table = ctx.concept_set_table(step.codeset_id) joined = joined.join(concept_table, joined._specialty_concept_id == concept_table.concept_id) elif step.concept_ids: - concept_table = ibis.memtable({"concept_id": list(step.concept_ids)}, schema={"concept_id": "int64"}) + concept_table = literal_column_relation(step.concept_ids, column_name="concept_id", dtype="int64") joined = joined.join(concept_table, 
joined._specialty_concept_id == concept_table.concept_id) if step.exclude: @@ -197,7 +198,7 @@ def _filter_care_site(table, ctx: ExecutionContext, *, step: FilterByCareSite): concept_table = ctx.concept_set_table(step.codeset_id) joined = joined.join(concept_table, joined._place_of_service_concept_id == concept_table.concept_id) elif step.concept_ids: - concept_table = ibis.memtable({"concept_id": list(step.concept_ids)}, schema={"concept_id": "int64"}) + concept_table = literal_column_relation(step.concept_ids, column_name="concept_id", dtype="int64") joined = joined.join(concept_table, joined._place_of_service_concept_id == concept_table.concept_id) if step.exclude: @@ -278,7 +279,7 @@ def apply_step(step, *, table, source, ctx: ExecutionContext): if isinstance(step, FilterByConceptSet): if not step.concept_ids: return table if step.exclude else table.limit(0) - concept_table = ibis.memtable({"concept_id": list(step.concept_ids)}, schema={"concept_id": "int64"}) + concept_table = literal_column_relation(step.concept_ids, column_name="concept_id", dtype="int64") return _filter_by_concept_table(table, concept_table, column=step.column, exclude=step.exclude) if isinstance(step, FilterByVisit): diff --git a/circe/execution/ibis/context.py b/circe/execution/ibis/context.py index dcbd9c8..77fb7aa 100644 --- a/circe/execution/ibis/context.py +++ b/circe/execution/ibis/context.py @@ -8,6 +8,7 @@ from .._dataclass import frozen_slots_dataclass from ..normalize.cohort import NormalizedConceptSet from ..typing import IbisBackendLike, Table +from ..ibis_compat import literal_rows_relation def _table_with_schema_fallback( @@ -59,22 +60,18 @@ def concept_set_table(self, codeset_id: int) -> Table: def _build_codeset_memtable( concept_sets: Mapping[int, NormalizedConceptSet], ) -> Table: - """Build a simple memtable for concept sets with known concept IDs. + """Build a simple table for concept sets with known concept IDs. 
Only handles simple includes (no descendant/mapped expansion needed). This is a fallback for backward-compatible test usage. + Uses ``literal_rows_relation`` to avoid ``ibis.memtable()``. """ rows: list[dict[str, Any]] = [] for cid, cset in concept_sets.items(): for item in cset.items: if not item.is_excluded and item.concept_id is not None: rows.append({"codeset_id": int(cid), "concept_id": int(item.concept_id)}) - if rows: - return ibis.memtable(rows, schema={"codeset_id": "int64", "concept_id": "int64"}) - return ibis.memtable( - {"codeset_id": [], "concept_id": []}, - schema={"codeset_id": "int64", "concept_id": "int64"}, - ) + return literal_rows_relation(rows, schema={"codeset_id": "int64", "concept_id": "int64"}) def make_execution_context( diff --git a/circe/execution/ibis/operations.py b/circe/execution/ibis/operations.py index 40789e1..698c64c 100644 --- a/circe/execution/ibis/operations.py +++ b/circe/execution/ibis/operations.py @@ -83,7 +83,7 @@ def cohort_rows_exist( table = read_table(backend, table_name=cohort_table, schema=results_schema) cohort_id_expr = ibis.literal(int(cohort_id), type="int64") matching = table.filter(table.cohort_definition_id.cast("int64") == cohort_id_expr) - return len(matching.limit(1).execute()) > 0 + return matching.limit(1).count().execute() > 0 except Exception as exc: raise ExecutionError( f"Ibis executor write error: failed checking existing rows for cohort_id={cohort_id}." 
diff --git a/circe/execution/ibis/person_filters.py b/circe/execution/ibis/person_filters.py index ff88851..f511a1f 100644 --- a/circe/execution/ibis/person_filters.py +++ b/circe/execution/ibis/person_filters.py @@ -6,6 +6,7 @@ from ..plan.predicates import NumericRangePredicate from ..plan.schema import PERSON_ID from .context import ExecutionContext +from ..ibis_compat import literal_column_relation def _apply_numeric_predicate(expr, predicate: NumericRangePredicate): @@ -65,7 +66,7 @@ def apply_person_gender_filter( concept_table = ctx.concept_set_table(codeset_id) concept_table = concept_table.select(concept_table.concept_id.name("_pconcept_id")) elif concept_ids: - concept_table = ibis.memtable({"_pconcept_id": list(concept_ids)}, schema={"_pconcept_id": "int64"}) + concept_table = literal_column_relation(concept_ids, column_name="_pconcept_id", dtype="int64") else: return table @@ -87,7 +88,7 @@ def _apply_person_concept_filter( concept_table = ctx.concept_set_table(codeset_id) concept_table = concept_table.select(concept_table.concept_id.name("_pconcept_id")) elif concept_ids: - concept_table = ibis.memtable({"_pconcept_id": list(concept_ids)}, schema={"_pconcept_id": "int64"}) + concept_table = literal_column_relation(concept_ids, column_name="_pconcept_id", dtype="int64") else: return table diff --git a/tests/execution/test_group_demographics.py b/tests/execution/test_group_demographics.py index b4bdbc3..0fbcb0f 100644 --- a/tests/execution/test_group_demographics.py +++ b/tests/execution/test_group_demographics.py @@ -71,15 +71,21 @@ def test_apply_date_predicate_rejects_invalid_between_and_op(): def test_demographic_concept_table_returns_table(): - ctx = _DemographicContext(None, codesets={1: (8507, 8532)}) + ibis = pytest.importorskip("ibis") + _ = pytest.importorskip("duckdb") + conn = ibis.duckdb.connect() + ctx = _DemographicContext(conn, codesets={1: (8507, 8532)}) result = _demographic_concept_ids(explicit_ids=(8507,), codeset_id=1, ctx=ctx) - 
assert result == (8507, 8532) + assert result is not None + conn.create_table("_test_demo_concepts", result, temp=True, overwrite=True) + rows = conn.table("_test_demo_concepts").execute() + assert sorted(rows["concept_id"].tolist()) == [8507, 8532] def test_demographic_concept_table_returns_empty_when_empty(): ctx = _DemographicContext(None) result = _demographic_concept_ids(explicit_ids=(), codeset_id=None, ctx=ctx) - assert result == () + assert result is None def test_demographic_match_keys_applies_all_supported_filters(): diff --git a/tests/execution/test_operations.py b/tests/execution/test_operations.py index ce42448..2265265 100644 --- a/tests/execution/test_operations.py +++ b/tests/execution/test_operations.py @@ -96,6 +96,14 @@ def __ne__(self, other): return ("ne", other) +class _CountExpr: + def __init__(self, rows): + self._rows = rows + + def execute(self): + return len(self._rows) + + class _CohortRelation: cohort_definition_id = _CohortColumn() @@ -111,6 +119,9 @@ def filter(self, _predicate): def limit(self, _count): return self + def count(self): + return _CountExpr(self.rows) + def execute(self): return self.rows diff --git a/tests/test_cohort_definition_set.py b/tests/test_cohort_definition_set.py index 3c85aad..037e0fe 100644 --- a/tests/test_cohort_definition_set.py +++ b/tests/test_cohort_definition_set.py @@ -558,11 +558,12 @@ def test_load_generation_history(): history = load_generation_history(conn, schema="main", table_name=CHECKSUM_TABLE) assert history is not None - assert not history.empty - assert "start_time" in history.columns - assert "end_time" in history.columns - assert "status" in history.columns - assert history.iloc[0]["status"] == "COMPLETE" + rows = history.execute() + assert not rows.empty + assert "start_time" in rows.columns + assert "end_time" in rows.columns + assert "status" in rows.columns + assert rows.iloc[0]["status"] == "COMPLETE" # Non-existent table returns None none_result = load_generation_history(conn, 
schema="main", table_name="nonexistent_table") From 440e6e9e77ac7f96733fa0f190fcdd3ce441e908 Mon Sep 17 00:00:00 2001 From: Jamie Gilbert Date: Mon, 18 May 2026 16:34:55 -0700 Subject: [PATCH 33/53] more fixes from concept set changes --- circe/execution/ibis/compile_steps.py | 36 ++++++++++++++------------ circe/execution/ibis/context.py | 4 +-- circe/execution/ibis/person_filters.py | 2 +- 3 files changed, 21 insertions(+), 21 deletions(-) diff --git a/circe/execution/ibis/compile_steps.py b/circe/execution/ibis/compile_steps.py index cfc5626..4d9d908 100644 --- a/circe/execution/ibis/compile_steps.py +++ b/circe/execution/ibis/compile_steps.py @@ -3,6 +3,7 @@ import ibis from ..errors import CompilationError, UnsupportedFeatureError +from ..ibis_compat import literal_column_relation from ..plan.events import ( ApplyDateAdjustment, FilterByCareSite, @@ -26,7 +27,6 @@ from ..plan.predicates import DateRangePredicate, NumericRangePredicate from ..plan.schema import END_DATE, PERSON_ID, START_DATE from .context import ExecutionContext -from ..ibis_compat import literal_column_relation from .person_filters import ( apply_person_age_filter, apply_person_ethnicity_filter, @@ -136,20 +136,20 @@ def _filter_visit_concepts(table, ctx: ExecutionContext, *, step: FilterByVisit) if step.codeset_id is not None: concept_table = ctx.concept_set_table(step.codeset_id) - joined = joined.join(concept_table, joined._visit_concept_id == concept_table.concept_id) elif step.concept_ids: concept_table = literal_column_relation(step.concept_ids, column_name="concept_id", dtype="int64") - joined = joined.join(concept_table, joined._visit_concept_id == concept_table.concept_id) - # If neither codeset_id nor concept_ids, no filtering needed + else: + return _select_original_columns(table, joined) - if step.exclude: + if not step.exclude: + joined = joined.join(concept_table, joined._visit_concept_id == concept_table.concept_id) + return _select_original_columns(table, joined) + else: 
marked = concept_table.mutate(_cm=ibis.literal(1, type="int64")) joined = joined.join(marked, joined._visit_concept_id == marked.concept_id, how="left") joined = joined.filter(joined._cm.isnull()) return _select_original_columns(table, joined) - return _select_original_columns(table, joined) - def _filter_provider_specialty( table, @@ -169,19 +169,20 @@ def _filter_provider_specialty( if step.codeset_id is not None: concept_table = ctx.concept_set_table(step.codeset_id) - joined = joined.join(concept_table, joined._specialty_concept_id == concept_table.concept_id) elif step.concept_ids: concept_table = literal_column_relation(step.concept_ids, column_name="concept_id", dtype="int64") - joined = joined.join(concept_table, joined._specialty_concept_id == concept_table.concept_id) + else: + return _select_original_columns(table, joined) - if step.exclude: + if not step.exclude: + joined = joined.join(concept_table, joined._specialty_concept_id == concept_table.concept_id) + return _select_original_columns(table, joined) + else: marked = concept_table.mutate(_cm=ibis.literal(1, type="int64")) joined = joined.join(marked, joined._specialty_concept_id == marked.concept_id, how="left") joined = joined.filter(joined._cm.isnull()) return _select_original_columns(table, joined) - return _select_original_columns(table, joined) - def _filter_care_site(table, ctx: ExecutionContext, *, step: FilterByCareSite): care_site = ctx.table("care_site") @@ -196,19 +197,20 @@ def _filter_care_site(table, ctx: ExecutionContext, *, step: FilterByCareSite): if step.codeset_id is not None: concept_table = ctx.concept_set_table(step.codeset_id) - joined = joined.join(concept_table, joined._place_of_service_concept_id == concept_table.concept_id) elif step.concept_ids: concept_table = literal_column_relation(step.concept_ids, column_name="concept_id", dtype="int64") - joined = joined.join(concept_table, joined._place_of_service_concept_id == concept_table.concept_id) + else: + return 
_select_original_columns(table, joined) - if step.exclude: + if not step.exclude: + joined = joined.join(concept_table, joined._place_of_service_concept_id == concept_table.concept_id) + return _select_original_columns(table, joined) + else: marked = concept_table.mutate(_cm=ibis.literal(1, type="int64")) joined = joined.join(marked, joined._place_of_service_concept_id == marked.concept_id, how="left") joined = joined.filter(joined._cm.isnull()) return _select_original_columns(table, joined) - return _select_original_columns(table, joined) - def _filter_care_site_location_region( table, diff --git a/circe/execution/ibis/context.py b/circe/execution/ibis/context.py index 77fb7aa..1f09cfc 100644 --- a/circe/execution/ibis/context.py +++ b/circe/execution/ibis/context.py @@ -3,12 +3,10 @@ from collections.abc import Mapping from typing import Any -import ibis - from .._dataclass import frozen_slots_dataclass +from ..ibis_compat import literal_rows_relation from ..normalize.cohort import NormalizedConceptSet from ..typing import IbisBackendLike, Table -from ..ibis_compat import literal_rows_relation def _table_with_schema_fallback( diff --git a/circe/execution/ibis/person_filters.py b/circe/execution/ibis/person_filters.py index f511a1f..f8809d2 100644 --- a/circe/execution/ibis/person_filters.py +++ b/circe/execution/ibis/person_filters.py @@ -3,10 +3,10 @@ import ibis from ..errors import CompilationError +from ..ibis_compat import literal_column_relation from ..plan.predicates import NumericRangePredicate from ..plan.schema import PERSON_ID from .context import ExecutionContext -from ..ibis_compat import literal_column_relation def _apply_numeric_predicate(expr, predicate: NumericRangePredicate): From 1c3a3d7e82bdfe95ccdce1c7fbd172ed48b7633d Mon Sep 17 00:00:00 2001 From: Jamie Gilbert Date: Mon, 18 May 2026 17:23:52 -0700 Subject: [PATCH 34/53] More crazy codeset join fixes --- circe/execution/ibis/codesets.py | 1 + tests/execution/test_codeset_resolution.py | 112 
+++++++++++++++++++++ 2 files changed, 113 insertions(+) create mode 100644 tests/execution/test_codeset_resolution.py diff --git a/circe/execution/ibis/codesets.py b/circe/execution/ibis/codesets.py index 8eb19d1..b0125ff 100644 --- a/circe/execution/ibis/codesets.py +++ b/circe/execution/ibis/codesets.py @@ -150,6 +150,7 @@ def _build_codeset_expression( marked = e.mutate(_cm=ibis.literal(1, type="int64")) result = result.join(marked, result.concept_id == marked.concept_id, how="left") result = result.filter(result._cm.isnull()).drop("_cm") + result = result.select(result.concept_id.name(CONCEPT_ID)) return result.select(result.concept_id.name(CONCEPT_ID)) diff --git a/tests/execution/test_codeset_resolution.py b/tests/execution/test_codeset_resolution.py new file mode 100644 index 0000000..25fadb6 --- /dev/null +++ b/tests/execution/test_codeset_resolution.py @@ -0,0 +1,112 @@ +from __future__ import annotations + +import pytest + +from circe.execution.ibis.codesets import build_single_codeset_table +from circe.execution.normalize.cohort import NormalizedConceptSet, NormalizedConceptSetItem + + +def _make_item(cid: int, *, excluded: bool = False) -> NormalizedConceptSetItem: + return NormalizedConceptSetItem( + concept_id=cid, + is_excluded=excluded, + include_descendants=False, + include_mapped=False, + ) + + +def test_multiple_excludes_no_collision(): + """Multiple excluded items trigger sequential anti-joins in + _build_codeset_expression. Without the per-iteration reselect, the + second anti-join would hit a concept_id_right collision. 
+ """ + ibis = pytest.importorskip("ibis") + _ = pytest.importorskip("duckdb") + + conn = ibis.duckdb.connect() + + concept_sets = { + 1: NormalizedConceptSet( + set_id=1, + items=( + _make_item(111), # include + _make_item(222), # include + _make_item(333, excluded=True), # exclude 1 + _make_item(444, excluded=True), # exclude 2 + _make_item(555, excluded=True), # exclude 3 → triggers collision without fix + ), + ), + } + + tbl = build_single_codeset_table( + backend=conn, + concept_sets=concept_sets, + batch_table_name="__test_exclude_codesets", + ) + rows = tbl.execute() + + included_ids = set(rows["concept_id"].tolist()) + assert included_ids == {111, 222}, f"Expected {{111, 222}} got {included_ids}" + + conn.drop_table("__test_exclude_codesets", force=True) + + +def test_single_exclude_works(): + """Single exclude should also work correctly.""" + ibis = pytest.importorskip("ibis") + _ = pytest.importorskip("duckdb") + + conn = ibis.duckdb.connect() + + concept_sets = { + 1: NormalizedConceptSet( + set_id=1, + items=( + _make_item(111), + _make_item(222), + _make_item(333, excluded=True), + ), + ), + } + + tbl = build_single_codeset_table( + backend=conn, + concept_sets=concept_sets, + batch_table_name="__test_exclude_codesets2", + ) + rows = tbl.execute() + + included_ids = set(rows["concept_id"].tolist()) + assert included_ids == {111, 222}, f"Expected {{111, 222}} got {included_ids}" + + conn.drop_table("__test_exclude_codesets2", force=True) + + +def test_all_excluded_returns_empty(): + """All items excluded should return empty codeset table.""" + ibis = pytest.importorskip("ibis") + _ = pytest.importorskip("duckdb") + + conn = ibis.duckdb.connect() + + concept_sets = { + 1: NormalizedConceptSet( + set_id=1, + items=( + _make_item(111, excluded=True), + _make_item(222, excluded=True), + _make_item(333, excluded=True), + ), + ), + } + + tbl = build_single_codeset_table( + backend=conn, + concept_sets=concept_sets, + 
batch_table_name="__test_exclude_codesets3", + ) + rows = tbl.execute() + + assert len(rows) == 0, f"Expected empty, got {rows}" + + conn.drop_table("__test_exclude_codesets3", force=True) From 2604f9dbd0f4b807dfbb834dcfed66fcac47d516 Mon Sep 17 00:00:00 2001 From: Jamie Gilbert Date: Mon, 18 May 2026 18:22:58 -0700 Subject: [PATCH 35/53] Removal of heavy recursion for large unions --- circe/execution/engine/censoring.py | 10 +- circe/execution/engine/group_demographics.py | 24 +- circe/execution/engine/group_keys.py | 14 +- circe/execution/ibis/codesets.py | 8 +- tests/execution/phenotype_fixtures/33.json | 406 +++++++++++++++++++ tests/execution/phenotype_fixtures/54.json | 163 ++++++++ tests/execution/test_codeset_resolution.py | 197 +++++++-- 7 files changed, 757 insertions(+), 65 deletions(-) create mode 100644 tests/execution/phenotype_fixtures/33.json create mode 100644 tests/execution/phenotype_fixtures/54.json diff --git a/circe/execution/engine/censoring.py b/circe/execution/engine/censoring.py index ce21f62..264234d 100644 --- a/circe/execution/engine/censoring.py +++ b/circe/execution/engine/censoring.py @@ -9,10 +9,12 @@ def _union_all(tables): - current = tables[0] - for table in tables[1:]: - current = current.union(table, distinct=False) - return current + if len(tables) == 1: + return tables[0] + mid = len(tables) // 2 + left = _union_all(tables[:mid]) + right = _union_all(tables[mid:]) + return left.union(right, distinct=False) def _compile_censor_events(criteria, ctx): diff --git a/circe/execution/engine/group_demographics.py b/circe/execution/engine/group_demographics.py index 0b6ae38..7716ca4 100644 --- a/circe/execution/engine/group_demographics.py +++ b/circe/execution/engine/group_demographics.py @@ -80,19 +80,6 @@ def _apply_date_predicate(date_expr, predicate): ) -def _explicit_ids_table(concept_ids: tuple[int, ...]) -> Table: - """Build a single-column ibis Table from explicit concept IDs, no memtable. 
- - Uses ``ibis.literal().name().as_table()`` with ``union()`` — generates - ``SELECT id1 AS concept_id UNION ALL SELECT id2 AS concept_id ...``. - """ - first = ibis.literal(int(concept_ids[0]), type="int64").name("concept_id").as_table() - for cid in concept_ids[1:]: - t = ibis.literal(int(cid), type="int64").name("concept_id").as_table() - first = first.union(t, distinct=False) - return first - - def _demographic_concept_ids( *, explicit_ids: tuple[int, ...], @@ -108,13 +95,14 @@ def _demographic_concept_ids( return None parts: list[Table] = [] if explicit_ids: - parts.append(_explicit_ids_table(explicit_ids)) + from ..ibis_compat import literal_column_relation + + parts.append(literal_column_relation(explicit_ids, column_name="concept_id", dtype="int64")) if codeset_id is not None: parts.append(ctx.concept_set_table(codeset_id).select("concept_id").distinct()) - result = parts[0] - for part in parts[1:]: - result = result.union(part, distinct=True) - return result + if len(parts) == 1: + return parts[0] + return parts[0].union(parts[1], distinct=True) def demographic_match_keys( diff --git a/circe/execution/engine/group_keys.py b/circe/execution/engine/group_keys.py index 4de323d..8510227 100644 --- a/circe/execution/engine/group_keys.py +++ b/circe/execution/engine/group_keys.py @@ -4,11 +4,17 @@ from ..typing import Table +def _binary_union(tables: list[Table]) -> Table: + if len(tables) == 1: + return tables[0] + mid = len(tables) // 2 + left = _binary_union(tables[:mid]) + right = _binary_union(tables[mid:]) + return left.union(right, distinct=False) + + def union_all(tables: list[Table]) -> Table: - current = tables[0] - for table in tables[1:]: - current = current.union(table, distinct=False) - return current + return _binary_union(tables) def event_keys(events: Table) -> Table: diff --git a/circe/execution/ibis/codesets.py b/circe/execution/ibis/codesets.py index b0125ff..9403720 100644 --- a/circe/execution/ibis/codesets.py +++ 
b/circe/execution/ibis/codesets.py @@ -137,13 +137,7 @@ def _build_codeset_expression( if not include_parts: return _empty_table(columns=(("concept_id", 0),)) - if len(include_parts) == 1: - result = include_parts[0] - else: - result = include_parts[0] - for part in include_parts[1:]: - result = result.union(part, distinct=False) - + result = _union_all_tables(include_parts) result = result.distinct() for e in exclude_parts: diff --git a/tests/execution/phenotype_fixtures/33.json b/tests/execution/phenotype_fixtures/33.json new file mode 100644 index 0000000..aff8643 --- /dev/null +++ b/tests/execution/phenotype_fixtures/33.json @@ -0,0 +1,406 @@ +{ + "cdmVersionRange" : ">=5.0.0", + "PrimaryCriteria" : { + "CriteriaList" : [ + { + "ConditionOccurrence" : { + "CodesetId" : 0, + "ConditionTypeExclude" : false + } + } + ], + "ObservationWindow" : { + "PriorDays" : 0, + "PostDays" : 0 + }, + "PrimaryCriteriaLimit" : { + "Type" : "All" + } + }, + "ConceptSets" : [ + { + "id" : 0, + "name" : "Dementia", + "expression" : { + "items" : [ + { + "concept" : { + "CONCEPT_ID" : 37312036, + "CONCEPT_NAME" : "Aggression due to dementia", + "STANDARD_CONCEPT" : "S", + "STANDARD_CONCEPT_CAPTION" : "Standard", + "INVALID_REASON" : "V", + "INVALID_REASON_CAPTION" : "Valid", + "CONCEPT_CODE" : "788861009", + "DOMAIN_ID" : "Condition", + "VOCABULARY_ID" : "SNOMED", + "CONCEPT_CLASS_ID" : "Clinical Finding" + }, + "isExcluded" : false, + "includeDescendants" : true, + "includeMapped" : false + }, + { + "concept" : { + "CONCEPT_ID" : 37312035, + "CONCEPT_NAME" : "Agitation due to dementia", + "STANDARD_CONCEPT" : "S", + "STANDARD_CONCEPT_CAPTION" : "Standard", + "INVALID_REASON" : "V", + "INVALID_REASON_CAPTION" : "Valid", + "CONCEPT_CODE" : "788862002", + "DOMAIN_ID" : "Condition", + "VOCABULARY_ID" : "SNOMED", + "CONCEPT_CLASS_ID" : "Clinical Finding" + }, + "isExcluded" : false, + "includeDescendants" : true, + "includeMapped" : false + }, + { + "concept" : { + "CONCEPT_ID" : 
4041685, + "CONCEPT_NAME" : "Amyotrophic lateral sclerosis with dementia", + "STANDARD_CONCEPT" : "S", + "STANDARD_CONCEPT_CAPTION" : "Standard", + "INVALID_REASON" : "V", + "INVALID_REASON_CAPTION" : "Valid", + "CONCEPT_CODE" : "230258005", + "DOMAIN_ID" : "Condition", + "VOCABULARY_ID" : "SNOMED", + "CONCEPT_CLASS_ID" : "Clinical Finding" + }, + "isExcluded" : false, + "includeDescendants" : true, + "includeMapped" : false + }, + { + "concept" : { + "CONCEPT_ID" : 37312031, + "CONCEPT_NAME" : "Anxiety due to dementia", + "STANDARD_CONCEPT" : "S", + "STANDARD_CONCEPT_CAPTION" : "Standard", + "INVALID_REASON" : "V", + "INVALID_REASON_CAPTION" : "Valid", + "CONCEPT_CODE" : "788866004", + "DOMAIN_ID" : "Condition", + "VOCABULARY_ID" : "SNOMED", + "CONCEPT_CLASS_ID" : "Clinical Finding" + }, + "isExcluded" : false, + "includeDescendants" : true, + "includeMapped" : false + }, + { + "concept" : { + "CONCEPT_ID" : 37312030, + "CONCEPT_NAME" : "Apathetic behaviour due to dementia", + "STANDARD_CONCEPT" : "S", + "STANDARD_CONCEPT_CAPTION" : "Standard", + "INVALID_REASON" : "V", + "INVALID_REASON_CAPTION" : "Valid", + "CONCEPT_CODE" : "788867008", + "DOMAIN_ID" : "Condition", + "VOCABULARY_ID" : "SNOMED", + "CONCEPT_CLASS_ID" : "Clinical Finding" + }, + "isExcluded" : false, + "includeDescendants" : true, + "includeMapped" : false + }, + { + "concept" : { + "CONCEPT_ID" : 35608576, + "CONCEPT_NAME" : "Behavioral and psychological symptoms of dementia", + "STANDARD_CONCEPT" : "S", + "STANDARD_CONCEPT_CAPTION" : "Standard", + "INVALID_REASON" : "V", + "INVALID_REASON_CAPTION" : "Valid", + "CONCEPT_CODE" : "10171000132106", + "DOMAIN_ID" : "Condition", + "VOCABULARY_ID" : "SNOMED", + "CONCEPT_CLASS_ID" : "Clinical Finding" + }, + "isExcluded" : false, + "includeDescendants" : true, + "includeMapped" : false + }, + { + "concept" : { + "CONCEPT_ID" : 4092747, + "CONCEPT_NAME" : "Cerebral degeneration presenting primarily with dementia", + "STANDARD_CONCEPT" : "S", + 
"STANDARD_CONCEPT_CAPTION" : "Standard", + "INVALID_REASON" : "V", + "INVALID_REASON_CAPTION" : "Valid", + "CONCEPT_CODE" : "279982005", + "DOMAIN_ID" : "Condition", + "VOCABULARY_ID" : "SNOMED", + "CONCEPT_CLASS_ID" : "Clinical Finding" + }, + "isExcluded" : false, + "includeDescendants" : true, + "includeMapped" : false + }, + { + "concept" : { + "CONCEPT_ID" : 4182210, + "CONCEPT_NAME" : "Dementia", + "STANDARD_CONCEPT" : "S", + "STANDARD_CONCEPT_CAPTION" : "Standard", + "INVALID_REASON" : "V", + "INVALID_REASON_CAPTION" : "Valid", + "CONCEPT_CODE" : "52448006", + "DOMAIN_ID" : "Condition", + "VOCABULARY_ID" : "SNOMED", + "CONCEPT_CLASS_ID" : "Clinical Finding" + }, + "isExcluded" : false, + "includeDescendants" : true, + "includeMapped" : false + }, + { + "concept" : { + "CONCEPT_ID" : 37116464, + "CONCEPT_NAME" : "Dementia caused by heavy metal exposure", + "STANDARD_CONCEPT" : "S", + "STANDARD_CONCEPT_CAPTION" : "Standard", + "INVALID_REASON" : "V", + "INVALID_REASON_CAPTION" : "Valid", + "CONCEPT_CODE" : "733184002", + "DOMAIN_ID" : "Condition", + "VOCABULARY_ID" : "SNOMED", + "CONCEPT_CLASS_ID" : "Clinical Finding" + }, + "isExcluded" : true, + "includeDescendants" : false, + "includeMapped" : false + }, + { + "concept" : { + "CONCEPT_ID" : 37017549, + "CONCEPT_NAME" : "Dementia co-occurrent with human immunodeficiency virus infection", + "STANDARD_CONCEPT" : "S", + "STANDARD_CONCEPT_CAPTION" : "Standard", + "INVALID_REASON" : "V", + "INVALID_REASON_CAPTION" : "Valid", + "CONCEPT_CODE" : "713844000", + "DOMAIN_ID" : "Condition", + "VOCABULARY_ID" : "SNOMED", + "CONCEPT_CLASS_ID" : "Clinical Finding" + }, + "isExcluded" : true, + "includeDescendants" : false, + "includeMapped" : false + }, + { + "concept" : { + "CONCEPT_ID" : 4244346, + "CONCEPT_NAME" : "Dialysis dementia", + "STANDARD_CONCEPT" : "S", + "STANDARD_CONCEPT_CAPTION" : "Standard", + "INVALID_REASON" : "V", + "INVALID_REASON_CAPTION" : "Valid", + "CONCEPT_CODE" : "9345005", + "DOMAIN_ID" : 
"Condition", + "VOCABULARY_ID" : "SNOMED", + "CONCEPT_CLASS_ID" : "Clinical Finding" + }, + "isExcluded" : true, + "includeDescendants" : false, + "includeMapped" : false + }, + { + "concept" : { + "CONCEPT_ID" : 37311665, + "CONCEPT_NAME" : "Disinhibited behaviour due to dementia", + "STANDARD_CONCEPT" : "S", + "STANDARD_CONCEPT_CAPTION" : "Standard", + "INVALID_REASON" : "V", + "INVALID_REASON_CAPTION" : "Valid", + "CONCEPT_CODE" : "789170003", + "DOMAIN_ID" : "Condition", + "VOCABULARY_ID" : "SNOMED", + "CONCEPT_CLASS_ID" : "Clinical Finding" + }, + "isExcluded" : false, + "includeDescendants" : true, + "includeMapped" : false + }, + { + "concept" : { + "CONCEPT_ID" : 4043378, + "CONCEPT_NAME" : "Frontotemporal dementia", + "STANDARD_CONCEPT" : "S", + "STANDARD_CONCEPT_CAPTION" : "Standard", + "INVALID_REASON" : "V", + "INVALID_REASON_CAPTION" : "Valid", + "CONCEPT_CODE" : "230270009", + "DOMAIN_ID" : "Condition", + "VOCABULARY_ID" : "SNOMED", + "CONCEPT_CLASS_ID" : "Clinical Finding" + }, + "isExcluded" : false, + "includeDescendants" : true, + "includeMapped" : false + }, + { + "concept" : { + "CONCEPT_ID" : 45765480, + "CONCEPT_NAME" : "Frontotemporal dementia with parkinsonism-17", + "STANDARD_CONCEPT" : "S", + "STANDARD_CONCEPT_CAPTION" : "Standard", + "INVALID_REASON" : "V", + "INVALID_REASON_CAPTION" : "Valid", + "CONCEPT_CODE" : "702429008", + "DOMAIN_ID" : "Condition", + "VOCABULARY_ID" : "SNOMED", + "CONCEPT_CLASS_ID" : "Clinical Finding" + }, + "isExcluded" : false, + "includeDescendants" : true, + "includeMapped" : false + }, + { + "concept" : { + "CONCEPT_ID" : 45765477, + "CONCEPT_NAME" : "GRN-related frontotemporal dementia", + "STANDARD_CONCEPT" : "S", + "STANDARD_CONCEPT_CAPTION" : "Standard", + "INVALID_REASON" : "V", + "INVALID_REASON_CAPTION" : "Valid", + "CONCEPT_CODE" : "702426001", + "DOMAIN_ID" : "Condition", + "VOCABULARY_ID" : "SNOMED", + "CONCEPT_CLASS_ID" : "Clinical Finding" + }, + "isExcluded" : false, + "includeDescendants" : true, 
+ "includeMapped" : false + }, + { + "concept" : { + "CONCEPT_ID" : 377788, + "CONCEPT_NAME" : "General paresis - neurosyphilis", + "STANDARD_CONCEPT" : "S", + "STANDARD_CONCEPT_CAPTION" : "Standard", + "INVALID_REASON" : "V", + "INVALID_REASON_CAPTION" : "Valid", + "CONCEPT_CODE" : "51928006", + "DOMAIN_ID" : "Condition", + "VOCABULARY_ID" : "SNOMED", + "CONCEPT_CLASS_ID" : "Clinical Finding" + }, + "isExcluded" : true, + "includeDescendants" : false, + "includeMapped" : false + }, + { + "concept" : { + "CONCEPT_ID" : 372610, + "CONCEPT_NAME" : "Postconcussion syndrome", + "STANDARD_CONCEPT" : "S", + "STANDARD_CONCEPT_CAPTION" : "Standard", + "INVALID_REASON" : "V", + "INVALID_REASON_CAPTION" : "Valid", + "CONCEPT_CODE" : "40425004", + "DOMAIN_ID" : "Condition", + "VOCABULARY_ID" : "SNOMED", + "CONCEPT_CLASS_ID" : "Clinical Finding" + }, + "isExcluded" : true, + "includeDescendants" : false, + "includeMapped" : false + }, + { + "concept" : { + "CONCEPT_ID" : 37017247, + "CONCEPT_NAME" : "Presenile dementia co-occurrent with human immunodeficiency virus infection", + "STANDARD_CONCEPT" : "S", + "STANDARD_CONCEPT_CAPTION" : "Standard", + "INVALID_REASON" : "V", + "INVALID_REASON_CAPTION" : "Valid", + "CONCEPT_CODE" : "713488003", + "DOMAIN_ID" : "Condition", + "VOCABULARY_ID" : "SNOMED", + "CONCEPT_CLASS_ID" : "Clinical Finding" + }, + "isExcluded" : true, + "includeDescendants" : false, + "includeMapped" : false + }, + { + "concept" : { + "CONCEPT_ID" : 37311890, + "CONCEPT_NAME" : "Psychological symptom due to dementia", + "STANDARD_CONCEPT" : "S", + "STANDARD_CONCEPT_CAPTION" : "Standard", + "INVALID_REASON" : "V", + "INVALID_REASON_CAPTION" : "Valid", + "CONCEPT_CODE" : "789011007", + "DOMAIN_ID" : "Condition", + "VOCABULARY_ID" : "SNOMED", + "CONCEPT_CLASS_ID" : "Clinical Finding" + }, + "isExcluded" : false, + "includeDescendants" : true, + "includeMapped" : false + }, + { + "concept" : { + "CONCEPT_ID" : 37312577, + "CONCEPT_NAME" : "Wandering due to 
dementia", + "STANDARD_CONCEPT" : "S", + "STANDARD_CONCEPT_CAPTION" : "Standard", + "INVALID_REASON" : "V", + "INVALID_REASON_CAPTION" : "Valid", + "CONCEPT_CODE" : "789062005", + "DOMAIN_ID" : "Condition", + "VOCABULARY_ID" : "SNOMED", + "CONCEPT_CLASS_ID" : "Clinical Finding" + }, + "isExcluded" : false, + "includeDescendants" : true, + "includeMapped" : false + }, + { + "concept" : { + "CONCEPT_ID" : 4059191, + "CONCEPT_NAME" : "H/O: dementia", + "STANDARD_CONCEPT" : "S", + "STANDARD_CONCEPT_CAPTION" : "Standard", + "INVALID_REASON" : "V", + "INVALID_REASON_CAPTION" : "Valid", + "CONCEPT_CODE" : "161465002", + "DOMAIN_ID" : "Observation", + "VOCABULARY_ID" : "SNOMED", + "CONCEPT_CLASS_ID" : "Context-dependent" + }, + "isExcluded" : false, + "includeDescendants" : true, + "includeMapped" : false + } + ] + } + } + ], + "QualifiedLimit" : { + "Type" : "First" + }, + "ExpressionLimit" : { + "Type" : "All" + }, + "InclusionRules" : [], + "EndStrategy" : { + "DateOffset" : { + "DateField" : "EndDate", + "Offset" : 365 + } + }, + "CensoringCriteria" : [], + "CollapseSettings" : { + "CollapseType" : "ERA", + "EraPad" : 0 + }, + "CensorWindow" : {} +} diff --git a/tests/execution/phenotype_fixtures/54.json b/tests/execution/phenotype_fixtures/54.json new file mode 100644 index 0000000..b74720a --- /dev/null +++ b/tests/execution/phenotype_fixtures/54.json @@ -0,0 +1,163 @@ +{ + "cdmVersionRange" : ">=5.0.0", + "PrimaryCriteria" : { + "CriteriaList" : [ + { + "ConditionOccurrence" : { + "CodesetId" : 4, + "ConditionTypeExclude" : false + } + } + ], + "ObservationWindow" : { + "PriorDays" : 0, + "PostDays" : 0 + }, + "PrimaryCriteriaLimit" : { + "Type" : "All" + } + }, + "ConceptSets" : [ + { + "id" : 4, + "name" : "Febrile seizure and unspecified seizure ", + "expression" : { + "items" : [ + { + "concept" : { + "CONCEPT_ID" : 444413, + "CONCEPT_NAME" : "Febrile convulsion", + "STANDARD_CONCEPT" : "S", + "STANDARD_CONCEPT_CAPTION" : "Standard", + "INVALID_REASON" : "V", + 
"INVALID_REASON_CAPTION" : "Valid", + "CONCEPT_CODE" : "41497008", + "DOMAIN_ID" : "Condition", + "VOCABULARY_ID" : "SNOMED", + "CONCEPT_CLASS_ID" : "Clinical Finding" + }, + "isExcluded" : false, + "includeDescendants" : true, + "includeMapped" : false + }, + { + "concept" : { + "CONCEPT_ID" : 377091, + "CONCEPT_NAME" : "Seizure", + "STANDARD_CONCEPT" : "S", + "STANDARD_CONCEPT_CAPTION" : "Standard", + "INVALID_REASON" : "V", + "INVALID_REASON_CAPTION" : "Valid", + "CONCEPT_CODE" : "91175000", + "DOMAIN_ID" : "Condition", + "VOCABULARY_ID" : "SNOMED", + "CONCEPT_CLASS_ID" : "Clinical Finding" + }, + "isExcluded" : false, + "includeDescendants" : false, + "includeMapped" : false + }, + { + "concept" : { + "CONCEPT_ID" : 4196708, + "CONCEPT_NAME" : "Seizure related finding", + "STANDARD_CONCEPT" : "S", + "STANDARD_CONCEPT_CAPTION" : "Standard", + "INVALID_REASON" : "V", + "INVALID_REASON_CAPTION" : "Valid", + "CONCEPT_CODE" : "313287004", + "DOMAIN_ID" : "Condition", + "VOCABULARY_ID" : "SNOMED", + "CONCEPT_CLASS_ID" : "Clinical Finding" + }, + "isExcluded" : false, + "includeDescendants" : true, + "includeMapped" : false + } + ] + } + }, + { + "id" : 6, + "name" : "Febrile seizure", + "expression" : { + "items" : [ + { + "concept" : { + "CONCEPT_ID" : 444413, + "CONCEPT_NAME" : "Febrile convulsion", + "STANDARD_CONCEPT" : "S", + "STANDARD_CONCEPT_CAPTION" : "Standard", + "INVALID_REASON" : "V", + "INVALID_REASON_CAPTION" : "Valid", + "CONCEPT_CODE" : "41497008", + "DOMAIN_ID" : "Condition", + "VOCABULARY_ID" : "SNOMED", + "CONCEPT_CLASS_ID" : "Clinical Finding" + }, + "isExcluded" : false, + "includeDescendants" : true, + "includeMapped" : false + } + ] + } + } + ], + "QualifiedLimit" : { + "Type" : "First" + }, + "ExpressionLimit" : { + "Type" : "All" + }, + "InclusionRules" : [ + { + "name" : "Has febrile seizure diagnosis ", + "expression" : { + "Type" : "ALL", + "CriteriaList" : [ + { + "Criteria" : { + "ConditionOccurrence" : { + "CodesetId" : 6, + 
"ConditionTypeExclude" : false + } + }, + "StartWindow" : { + "Start" : { + "Days" : 0, + "Coeff" : -1 + }, + "End" : { + "Days" : 42, + "Coeff" : 1 + }, + "UseIndexEnd" : false, + "UseEventEnd" : false + }, + "RestrictVisit" : false, + "IgnoreObservationPeriod" : false, + "Occurrence" : { + "Type" : 2, + "Count" : 1, + "IsDistinct" : false + } + } + ], + "DemographicCriteriaList" : [], + "Groups" : [] + } + } + ], + "EndStrategy" : { + "DateOffset" : { + "DateField" : "EndDate", + "Offset" : 14 + } + }, + "CensoringCriteria" : [], + "CollapseSettings" : { + "CollapseType" : "ERA", + "EraPad" : 0 + }, + "CensorWindow" : {} +} diff --git a/tests/execution/test_codeset_resolution.py b/tests/execution/test_codeset_resolution.py index 25fadb6..b754554 100644 --- a/tests/execution/test_codeset_resolution.py +++ b/tests/execution/test_codeset_resolution.py @@ -1,10 +1,134 @@ from __future__ import annotations +from pathlib import Path + import pytest +from circe.cohortdefinition import CohortExpression +from circe.execution.api import build_cohort from circe.execution.ibis.codesets import build_single_codeset_table from circe.execution.normalize.cohort import NormalizedConceptSet, NormalizedConceptSetItem +BENCHMARK_OUTPUT = Path(__file__).resolve().parent.parent.parent / "benchmark_output" +JSON_DIR = Path(__file__).resolve().parent / "phenotype_fixtures" + + +def _seed_minimal_cdm(conn, ibis): + import datetime + + S = datetime.date(2000, 1, 1) + tables = { + "person": {"person_id": [999], "year_of_birth": [1900], "gender_concept_id": [0]}, + "observation_period": { + "person_id": [999], + "observation_period_id": [999], + "observation_period_start_date": [S], + "observation_period_end_date": [S], + }, + "condition_occurrence": { + "person_id": [999], + "condition_occurrence_id": [999], + "condition_concept_id": [0], + "condition_start_date": [S], + "condition_end_date": [S], + }, + "procedure_occurrence": { + "person_id": [999], + "procedure_occurrence_id": [999], + 
"procedure_concept_id": [0], + "procedure_date": [S], + }, + "measurement": { + "person_id": [999], + "measurement_id": [999], + "measurement_concept_id": [0], + "measurement_date": [S], + }, + "observation": { + "person_id": [999], + "observation_id": [999], + "observation_concept_id": [0], + "observation_date": [S], + }, + "drug_exposure": { + "person_id": [999], + "drug_exposure_id": [999], + "drug_concept_id": [0], + "drug_exposure_start_date": [S], + "drug_exposure_end_date": [S], + }, + "death": {"person_id": [999], "death_date": [S]}, + "visit_occurrence": { + "person_id": [999], + "visit_occurrence_id": [999], + "visit_concept_id": [0], + "visit_start_date": [S], + "visit_end_date": [S], + }, + "specimen": { + "person_id": [999], + "specimen_id": [999], + "specimen_concept_id": [0], + "specimen_date": [S], + }, + "device_exposure": { + "person_id": [999], + "device_exposure_id": [999], + "device_concept_id": [0], + "device_exposure_start_date": [S], + "device_exposure_end_date": [S], + }, + "dose_era": { + "person_id": [999], + "dose_era_id": [999], + "drug_concept_id": [0], + "unit_concept_id": [0], + "dose_value": [0.0], + "dose_era_start_date": [S], + "dose_era_end_date": [S], + }, + "payer_plan_period": { + "person_id": [999], + "payer_plan_period_id": [999], + "payer_plan_period_start_date": [S], + "payer_plan_period_end_date": [S], + }, + "visit_detail": { + "person_id": [999], + "visit_detail_id": [999], + "visit_detail_concept_id": [0], + "visit_detail_start_date": [S], + "visit_detail_end_date": [S], + }, + "condition_era": { + "person_id": [999], + "condition_era_id": [999], + "condition_concept_id": [0], + "condition_era_start_date": [S], + "condition_era_end_date": [S], + "condition_occurrence_count": [1], + }, + "drug_era": { + "person_id": [999], + "drug_era_id": [999], + "drug_concept_id": [0], + "drug_era_start_date": [S], + "drug_era_end_date": [S], + "drug_exposure_count": [1], + "gap_days": [0], + }, + "concept": {"concept_id": [0, 999], 
"invalid_reason": ["X", None]}, + "concept_ancestor": {"ancestor_concept_id": [999], "descendant_concept_id": [999]}, + "concept_relationship": { + "concept_id_1": [999], + "concept_id_2": [999], + "relationship_id": ["X"], + "invalid_reason": ["X"], + }, + } + for name, obj in tables.items(): + conn.create_table(name, obj=ibis.memtable(obj), overwrite=True) + def _make_item(cid: int, *, excluded: bool = False) -> NormalizedConceptSetItem: return NormalizedConceptSetItem( @@ -15,11 +139,12 @@ def _make_item(cid: int, *, excluded: bool = False) -> NormalizedConceptSetItem: ) +# ------------------------------------------------------------------ +# Multi-exclude collision tests +# ------------------------------------------------------------------ + + def test_multiple_excludes_no_collision(): - """Multiple excluded items trigger sequential anti-joins in - _build_codeset_expression. Without the per-iteration reselect, the - second anti-join would hit a concept_id_right collision. - """ ibis = pytest.importorskip("ibis") _ = pytest.importorskip("duckdb") @@ -29,11 +154,11 @@ def test_multiple_excludes_no_collision(): 1: NormalizedConceptSet( set_id=1, items=( - _make_item(111), # include - _make_item(222), # include - _make_item(333, excluded=True), # exclude 1 - _make_item(444, excluded=True), # exclude 2 - _make_item(555, excluded=True), # exclude 3 → triggers collision without fix + _make_item(111), + _make_item(222), + _make_item(333, excluded=True), + _make_item(444, excluded=True), + _make_item(555, excluded=True), ), ), } @@ -44,15 +169,11 @@ def test_multiple_excludes_no_collision(): batch_table_name="__test_exclude_codesets", ) rows = tbl.execute() - - included_ids = set(rows["concept_id"].tolist()) - assert included_ids == {111, 222}, f"Expected {{111, 222}} got {included_ids}" - + assert set(rows["concept_id"].tolist()) == {111, 222} conn.drop_table("__test_exclude_codesets", force=True) def test_single_exclude_works(): - """Single exclude should also work 
correctly.""" ibis = pytest.importorskip("ibis") _ = pytest.importorskip("duckdb") @@ -61,29 +182,19 @@ def test_single_exclude_works(): concept_sets = { 1: NormalizedConceptSet( set_id=1, - items=( - _make_item(111), - _make_item(222), - _make_item(333, excluded=True), - ), + items=(_make_item(111), _make_item(222), _make_item(333, excluded=True)), ), } tbl = build_single_codeset_table( - backend=conn, - concept_sets=concept_sets, - batch_table_name="__test_exclude_codesets2", + backend=conn, concept_sets=concept_sets, batch_table_name="__test_exclude_codesets2" ) rows = tbl.execute() - - included_ids = set(rows["concept_id"].tolist()) - assert included_ids == {111, 222}, f"Expected {{111, 222}} got {included_ids}" - + assert set(rows["concept_id"].tolist()) == {111, 222} conn.drop_table("__test_exclude_codesets2", force=True) def test_all_excluded_returns_empty(): - """All items excluded should return empty codeset table.""" ibis = pytest.importorskip("ibis") _ = pytest.importorskip("duckdb") @@ -101,12 +212,34 @@ def test_all_excluded_returns_empty(): } tbl = build_single_codeset_table( - backend=conn, - concept_sets=concept_sets, - batch_table_name="__test_exclude_codesets3", + backend=conn, concept_sets=concept_sets, batch_table_name="__test_exclude_codesets3" ) rows = tbl.execute() + assert len(rows) == 0 + conn.drop_table("__test_exclude_codesets3", force=True) - assert len(rows) == 0, f"Expected empty, got {rows}" - conn.drop_table("__test_exclude_codesets3", force=True) +# ------------------------------------------------------------------ +# Phenotype cohort regression tests (recursion / collision fixes) +# ------------------------------------------------------------------ + + +@pytest.mark.parametrize("cohort_id", [33, 54]) +def test_phenotype_cohort_with_exclusions_compiles(cohort_id: int): + """Cohorts 33 and 54 previously failed with recursion errors.""" + ibis = pytest.importorskip("ibis") + _ = pytest.importorskip("duckdb") + + json_path = JSON_DIR / 
f"{cohort_id}.json" + if not json_path.exists(): + pytest.skip(f"Phenotype JSON not found: {json_path}") + + expression = CohortExpression.model_validate_json(json_path.read_text()) + + conn = ibis.duckdb.connect() + _seed_minimal_cdm(conn, ibis) + + try: + build_cohort(expression, backend=conn, cdm_schema="main", materialize=False) + except Exception as exc: + pytest.fail(f"Cohort {cohort_id} compilation failed: {exc}") From 2078f00f986a7df4675905df85edfe1d39ed754d Mon Sep 17 00:00:00 2001 From: jgilber2 Date: Tue, 19 May 2026 08:37:57 -0700 Subject: [PATCH 36/53] Replaced concept set temp tables per cohort with per execution --- circe/execution/api.py | 2 +- circe/execution/engine/cohort.py | 20 +++++++++++++------- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/circe/execution/api.py b/circe/execution/api.py index 6589700..5401c1d 100644 --- a/circe/execution/api.py +++ b/circe/execution/api.py @@ -53,7 +53,7 @@ def build_cohort( codeset_table = build_single_codeset_table( backend=backend, concept_sets=normalized.concept_sets, - batch_table_name=f"__{cohort_table}_{cohort_id}_codesets", + batch_table_name="__codesets", results_schema=results_schema, vocabulary_schema=vocabulary_schema, session_prefix=session_prefix, diff --git a/circe/execution/engine/cohort.py b/circe/execution/engine/cohort.py index 5754535..1f78eed 100644 --- a/circe/execution/engine/cohort.py +++ b/circe/execution/engine/cohort.py @@ -27,24 +27,30 @@ def _materialize( cohort_table: str = "cohort", session_prefix: str = "", ) -> Table: - """Write *table* to a backend staging table and return a fresh reference.""" - name = f"{session_prefix}__{cohort_table}_{cohort_id}_{stage}" + """Write *table* to a backend staging table and return a fresh reference. + + Uses a single session-scoped table per stage (not per cohort) since cohorts + are processed sequentially. The table is overwritten for each cohort. 
+ """ + name = f"{session_prefix}__staging_{stage}" create_table(ctx.backend, table_name=name, schema=schema, obj=table, overwrite=True) return read_table(ctx.backend, table_name=name, schema=schema) def _drop_staging_tables( ctx: ExecutionContext, - cohort_id: int, schema: str | None, - cohort_table: str = "cohort", session_prefix: str = "", ) -> None: - """Remove all staging tables for *cohort_id* from the database.""" - for stage in ("codesets", "primary", "qualified", "included", "ended"): - name = f"{session_prefix}__{cohort_table}_{cohort_id}_{stage}" + """Remove all session-scoped staging tables from the database.""" + for stage in ("primary", "qualified", "included", "ended"): + name = f"{session_prefix}__staging_{stage}" with contextlib.suppress(Exception): ctx.backend.drop_table(name, database=schema, force=True) + # Also drop the session-scoped codeset table + codeset_name = f"{session_prefix}__codesets" + with contextlib.suppress(Exception): + ctx.backend.drop_table(codeset_name, database=schema, force=True) def build_cohort_table( From 2af353c8a6b70726936b1f726862f4327a6eb872 Mon Sep 17 00:00:00 2001 From: jgilber2 Date: Tue, 19 May 2026 08:57:56 -0700 Subject: [PATCH 37/53] Fixed bad recursion for excluded concept sets --- circe/execution/ibis/codesets.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/circe/execution/ibis/codesets.py b/circe/execution/ibis/codesets.py index 9403720..5c99fa3 100644 --- a/circe/execution/ibis/codesets.py +++ b/circe/execution/ibis/codesets.py @@ -140,8 +140,9 @@ def _build_codeset_expression( result = _union_all_tables(include_parts) result = result.distinct() - for e in exclude_parts: - marked = e.mutate(_cm=ibis.literal(1, type="int64")) + if exclude_parts: + exclude_relation = _union_all_tables(exclude_parts).distinct() + marked = exclude_relation.mutate(_cm=ibis.literal(1, type="int64")) result = result.join(marked, result.concept_id == marked.concept_id, how="left") result = 
result.filter(result._cm.isnull()).drop("_cm") result = result.select(result.concept_id.name(CONCEPT_ID)) From 739eaac21d81869431b5f8cfbe13240cfcd1f445 Mon Sep 17 00:00:00 2001 From: jgilber2 Date: Tue, 19 May 2026 10:24:04 -0700 Subject: [PATCH 38/53] Modification to inefficient joins in correlated criteria --- circe/execution/engine/groups.py | 47 ++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/circe/execution/engine/groups.py b/circe/execution/engine/groups.py index d630df2..553307c 100644 --- a/circe/execution/engine/groups.py +++ b/circe/execution/engine/groups.py @@ -22,32 +22,45 @@ def _evaluate_group( if group.is_empty(): return keys - child_results: list[Table] = [] - index_id = 0 - - for correlated in group.criteria: - correlated_matches = correlated_match_keys( - index_events, - correlated, - criterion_index=index_id, - ctx=ctx, + child_matches: list[Table] = [] + + for index_id, correlated in enumerate(group.criteria): + child_matches.append( + correlated_match_keys( + index_events, + correlated, + criterion_index=index_id, + ctx=ctx, + ) ) - child_results.append(correlated_matches.mutate(index_id=ibis.literal(index_id, type="int64"))) - index_id += 1 + + index_id = len(child_matches) for demographic in group.demographics: - demographic_matches = demographic_match_keys(index_events, demographic, ctx) - child_results.append(demographic_matches.mutate(index_id=ibis.literal(index_id, type="int64"))) + child_matches.append(demographic_match_keys(index_events, demographic, ctx)) index_id += 1 for child_group in group.groups: - child_group_matches = _evaluate_group(index_events, child_group, ctx) - child_results.append(child_group_matches.mutate(index_id=ibis.literal(index_id, type="int64"))) + child_matches.append(_evaluate_group(index_events, child_group, ctx)) index_id += 1 - if not child_results: + if not child_matches: return keys + normalized_mode = (group.mode or "ALL").upper() + if normalized_mode == 
"ANY": + return union_all(child_matches).distinct() + + if normalized_mode == "AT_LEAST" and group.count is not None and int(group.count) <= 1: + return union_all(child_matches).distinct() + + if len(child_matches) == 1 and normalized_mode == "ALL": + return child_matches[0] + + child_results: list[Table] = [] + for index_id, child_match in enumerate(child_matches): + child_results.append(child_match.mutate(index_id=ibis.literal(index_id, type="int64"))) + unioned = union_all(child_results) group_counts = unioned.group_by(unioned.person_id, unioned.event_id).aggregate( matched_children=unioned.index_id.nunique() @@ -65,7 +78,7 @@ def _evaluate_group( counted.matched_children, group.mode, group.count, - index_id, + len(child_matches), ) return counted.filter(predicate).select( counted.person_id.name(PERSON_ID), From 90a374f27c22c3912ca7cbb8e752ce6841fa4c7f Mon Sep 17 00:00:00 2001 From: jgilber2 Date: Tue, 19 May 2026 23:14:18 -0700 Subject: [PATCH 39/53] Optimization to codeset resolution and Additional criteria --- circe/execution/engine/cohort.py | 7 +- circe/execution/engine/inclusion.py | 37 +- circe/execution/ibis/codesets.py | 78 +++-- .../test_codeset_batching_verification.py | 326 ++++++++++++++++++ tests/execution/test_inclusion.py | 110 ++++++ 5 files changed, 529 insertions(+), 29 deletions(-) create mode 100644 tests/execution/test_codeset_batching_verification.py diff --git a/circe/execution/engine/cohort.py b/circe/execution/engine/cohort.py index 1f78eed..4335bb3 100644 --- a/circe/execution/engine/cohort.py +++ b/circe/execution/engine/cohort.py @@ -81,7 +81,10 @@ def build_cohort_table( # ── Primary events ────────────────────────────────────────────────── primary_events = build_primary_events(cohort_plan, ctx) - if materialize: + has_additional_criteria = ( + normalized.additional_criteria is not None and not normalized.additional_criteria.is_empty() + ) + if materialize and has_additional_criteria: primary_events = _materialize( primary_events, 
ctx=ctx, @@ -94,7 +97,7 @@ def build_cohort_table( # ── Additional (correlated) criteria ──────────────────────────────── qualified_events = apply_additional_criteria(primary_events, normalized.additional_criteria, ctx) - if normalized.additional_criteria is not None and not normalized.additional_criteria.is_empty(): + if has_additional_criteria: qualified_events = apply_result_limit(qualified_events, cohort_plan.qualified_limit_type) if materialize: qualified_events = _materialize( diff --git a/circe/execution/engine/inclusion.py b/circe/execution/engine/inclusion.py index 7e95784..0e89eac 100644 --- a/circe/execution/engine/inclusion.py +++ b/circe/execution/engine/inclusion.py @@ -1,6 +1,9 @@ from __future__ import annotations +import ibis + from ..normalize.groups import NormalizedInclusionRule +from .group_keys import event_keys, union_all from .groups import apply_additional_criteria @@ -12,7 +15,33 @@ def apply_inclusion_rules( if not inclusion_rules: return events - included = events - for rule in inclusion_rules: - included = apply_additional_criteria(included, rule.expression, ctx) - return included + active_rule_keys = [] + + for rule_index, rule in enumerate(inclusion_rules): + if rule.expression is None or rule.expression.is_empty(): + continue + + matched = apply_additional_criteria(events, rule.expression, ctx) + active_rule_keys.append(event_keys(matched).mutate(rule_id=ibis.literal(rule_index, type="int64"))) + + if not active_rule_keys: + return events + + if len(active_rule_keys) == 1: + matched_keys = active_rule_keys[0].select("person_id", "event_id") + else: + matched_rules = union_all(active_rule_keys) + matched_counts = matched_rules.group_by("person_id", "event_id").aggregate( + matched_rule_count=matched_rules.rule_id.nunique() + ) + matched_keys = matched_counts.filter( + matched_counts.matched_rule_count == len(active_rule_keys) + ).select("person_id", "event_id") + + included = events.join( + matched_keys, + predicates=[ + 
(events.person_id == matched_keys.person_id) & (events.event_id == matched_keys.event_id) + ], + ) + return included.select(*[included[c] for c in events.columns]) diff --git a/circe/execution/ibis/codesets.py b/circe/execution/ibis/codesets.py index 5c99fa3..ceaa7b6 100644 --- a/circe/execution/ibis/codesets.py +++ b/circe/execution/ibis/codesets.py @@ -100,39 +100,71 @@ def _build_codeset_expression( The database engine performs the expansion at execution time. Never uses ``ibis.memtable`` -- all leaf values use ``as_table().mutate()`` to avoid local-file staging on Databricks. + + Batches all ancestor lookups within one concept set into a single + ``concept_ancestor`` JOIN (mirrors Java's ``IN (id1, ..., idN)`` pattern) + rather than issuing one JOIN per item. """ - include_parts: list[Table] = [] - exclude_parts: list[Table] = [] + # Separate items by (is_excluded) and collect IDs for batched lookups. + include_direct: list[int] = [] + include_desc: list[int] = [] + include_mapped: list[int] = [] + exclude_direct: list[int] = [] + exclude_desc: list[int] = [] + exclude_mapped: list[int] = [] for item in concept_set.items: if item.concept_id is None: continue + cid = int(item.concept_id) + if item.is_excluded: + exclude_direct.append(cid) + if item.include_descendants: + exclude_desc.append(cid) + if item.include_mapped: + exclude_mapped.append(cid) + else: + include_direct.append(cid) + if item.include_descendants: + include_desc.append(cid) + if item.include_mapped: + include_mapped.append(cid) - direct: tuple[int, ...] = (int(item.concept_id),) - - if item.include_descendants: - desc = _descendant_expression( - direct, table_getter=table_getter, vocabulary_schema=vocabulary_schema + # Build include expression parts with batched vocabulary lookups. 
+ include_parts: list[Table] = [] + if include_direct: + for cid in include_direct: + include_parts.append(_literal_select(concept_id=cid)) + if include_desc: + include_parts.append( + _descendant_expression( + tuple(include_desc), table_getter=table_getter, vocabulary_schema=vocabulary_schema ) - base = _union_all_tables( - [ - _literal_select(concept_id=int(item.concept_id)), - desc, - ] + ) + if include_mapped: + include_parts.append( + _mapped_expression( + tuple(include_mapped), table_getter=table_getter, vocabulary_schema=vocabulary_schema ) - else: - base = _literal_select(concept_id=int(item.concept_id)) + ) - if item.include_mapped: - mapped = _mapped_expression( - direct, table_getter=table_getter, vocabulary_schema=vocabulary_schema + # Build exclude expression parts with batched vocabulary lookups. + exclude_parts: list[Table] = [] + if exclude_direct: + for cid in exclude_direct: + exclude_parts.append(_literal_select(concept_id=cid)) + if exclude_desc: + exclude_parts.append( + _descendant_expression( + tuple(exclude_desc), table_getter=table_getter, vocabulary_schema=vocabulary_schema ) - base = _union_all_tables([base, mapped]) - - if item.is_excluded: - exclude_parts.append(base) - else: - include_parts.append(base) + ) + if exclude_mapped: + exclude_parts.append( + _mapped_expression( + tuple(exclude_mapped), table_getter=table_getter, vocabulary_schema=vocabulary_schema + ) + ) if not include_parts: return _empty_table(columns=(("concept_id", 0),)) diff --git a/tests/execution/test_codeset_batching_verification.py b/tests/execution/test_codeset_batching_verification.py new file mode 100644 index 0000000..9ea5516 --- /dev/null +++ b/tests/execution/test_codeset_batching_verification.py @@ -0,0 +1,326 @@ +"""Verify that the batched codeset optimization produces identical results. 
+ +This test exercises multiple items with include_descendants=True within a single +concept set -- the exact pattern that OPT-1 batches into a single +concept_ancestor JOIN instead of N separate JOINs. +""" + +from __future__ import annotations + +import pytest + +from circe.execution.ibis.codesets import _build_codeset_expression, build_single_codeset_table +from circe.execution.normalize.cohort import NormalizedConceptSet, NormalizedConceptSetItem + + +@pytest.fixture +def vocab_conn(): + ibis = pytest.importorskip("ibis") + _ = pytest.importorskip("duckdb") + + conn = ibis.duckdb.connect() + conn.create_table( + "concept", + obj=ibis.memtable( + { + "concept_id": [10, 11, 12, 20, 21, 22, 30, 31, 32, 40, 41, 50], + "invalid_reason": [None, None, None, None, None, None, None, None, None, None, None, "D"], + } + ), + overwrite=True, + ) + conn.create_table( + "concept_ancestor", + obj=ibis.memtable( + { + "ancestor_concept_id": [10, 10, 20, 20, 30, 30, 40], + "descendant_concept_id": [11, 12, 21, 22, 31, 32, 50], + } + ), + overwrite=True, + ) + conn.create_table( + "concept_relationship", + obj=ibis.memtable( + { + "concept_id_1": [40, 41, 99], + "concept_id_2": [10, 20, 99], + "relationship_id": ["Maps to", "Maps to", "Maps to"], + "invalid_reason": [None, None, "D"], + } + ), + overwrite=True, + ) + return conn + + +def _table_getter(conn): + def getter(name, schema): + return conn.table(name) + + return getter + + +class TestBatchedDescendantExpansion: + """Test that batching multiple include_descendants items gives correct results.""" + + def test_multiple_descendants_batched(self, vocab_conn): + """Three items with include_descendants=True should resolve to all their descendants.""" + concept_set = NormalizedConceptSet( + set_id=1, + items=( + NormalizedConceptSetItem( + concept_id=10, is_excluded=False, include_descendants=True, include_mapped=False + ), + NormalizedConceptSetItem( + concept_id=20, is_excluded=False, include_descendants=True, 
include_mapped=False + ), + NormalizedConceptSetItem( + concept_id=30, is_excluded=False, include_descendants=True, include_mapped=False + ), + ), + ) + result = _build_codeset_expression( + concept_set, table_getter=_table_getter(vocab_conn), vocabulary_schema=None + ) + rows = set(result.execute()["concept_id"].tolist()) + # Direct {10,20,30} + descendants: 10->{11,12}, 20->{21,22}, 30->{31,32} + # Note: 40->50 but 50 is invalid so not included + assert rows == {10, 11, 12, 20, 21, 22, 30, 31, 32} + + def test_descendants_with_direct_exclusion(self, vocab_conn): + """Excluded item (direct, no descendants) removes it from the final set.""" + concept_set = NormalizedConceptSet( + set_id=2, + items=( + NormalizedConceptSetItem( + concept_id=10, is_excluded=False, include_descendants=True, include_mapped=False + ), + NormalizedConceptSetItem( + concept_id=20, is_excluded=False, include_descendants=True, include_mapped=False + ), + NormalizedConceptSetItem( + concept_id=11, is_excluded=True, include_descendants=False, include_mapped=False + ), + ), + ) + result = _build_codeset_expression( + concept_set, table_getter=_table_getter(vocab_conn), vocabulary_schema=None + ) + rows = set(result.execute()["concept_id"].tolist()) + # Include: {10, 11, 12, 20, 21, 22}, Exclude: {11} -> {10, 12, 20, 21, 22} + assert rows == {10, 12, 20, 21, 22} + + def test_excluded_with_descendants_batched(self, vocab_conn): + """Excluded items with include_descendants should batch their ancestor lookup too.""" + concept_set = NormalizedConceptSet( + set_id=3, + items=( + NormalizedConceptSetItem( + concept_id=10, is_excluded=False, include_descendants=True, include_mapped=False + ), + NormalizedConceptSetItem( + concept_id=20, is_excluded=False, include_descendants=True, include_mapped=False + ), + NormalizedConceptSetItem( + concept_id=30, is_excluded=True, include_descendants=True, include_mapped=False + ), + ), + ) + result = _build_codeset_expression( + concept_set, 
table_getter=_table_getter(vocab_conn), vocabulary_schema=None + ) + rows = set(result.execute()["concept_id"].tolist()) + # Include: {10, 11, 12, 20, 21, 22}. Exclude: {30, 31, 32}. No overlap. + assert rows == {10, 11, 12, 20, 21, 22} + + def test_mapped_batched(self, vocab_conn): + """Multiple items with include_mapped should batch the relationship lookup.""" + concept_set = NormalizedConceptSet( + set_id=4, + items=( + NormalizedConceptSetItem( + concept_id=10, is_excluded=False, include_descendants=False, include_mapped=True + ), + NormalizedConceptSetItem( + concept_id=20, is_excluded=False, include_descendants=False, include_mapped=True + ), + ), + ) + result = _build_codeset_expression( + concept_set, table_getter=_table_getter(vocab_conn), vocabulary_schema=None + ) + rows = set(result.execute()["concept_id"].tolist()) + # Direct: {10, 20}. Mapped: concept_relationship where concept_id_2 IN (10,20) + # -> concept_id_1=40 (maps to 10), concept_id_1=41 (maps to 20) + assert rows == {10, 20, 40, 41} + + def test_descendants_and_mapped_combined(self, vocab_conn): + """Items with both include_descendants and include_mapped batch both lookups.""" + concept_set = NormalizedConceptSet( + set_id=5, + items=( + NormalizedConceptSetItem( + concept_id=10, is_excluded=False, include_descendants=True, include_mapped=True + ), + NormalizedConceptSetItem( + concept_id=20, is_excluded=False, include_descendants=True, include_mapped=True + ), + ), + ) + result = _build_codeset_expression( + concept_set, table_getter=_table_getter(vocab_conn), vocabulary_schema=None + ) + rows = set(result.execute()["concept_id"].tolist()) + # Direct: {10, 20}. Desc: {11, 12, 21, 22}. 
Mapped: {40, 41} + assert rows == {10, 11, 12, 20, 21, 22, 40, 41} + + def test_build_single_codeset_table_multiple_sets(self, vocab_conn): + """build_single_codeset_table correctly separates concept sets with batched expansion.""" + concept_sets = { + 1: NormalizedConceptSet( + set_id=1, + items=( + NormalizedConceptSetItem( + concept_id=10, is_excluded=False, include_descendants=True, include_mapped=False + ), + NormalizedConceptSetItem( + concept_id=20, is_excluded=False, include_descendants=True, include_mapped=False + ), + ), + ), + 2: NormalizedConceptSet( + set_id=2, + items=( + NormalizedConceptSetItem( + concept_id=30, is_excluded=False, include_descendants=True, include_mapped=False + ), + ), + ), + } + tbl = build_single_codeset_table( + backend=vocab_conn, + concept_sets=concept_sets, + batch_table_name="__test_batch_verify", + ) + df = tbl.execute() + cs1_ids = set(df[df["codeset_id"] == 1]["concept_id"].tolist()) + cs2_ids = set(df[df["codeset_id"] == 2]["concept_id"].tolist()) + assert cs1_ids == {10, 11, 12, 20, 21, 22} + assert cs2_ids == {30, 31, 32} + vocab_conn.drop_table("__test_batch_verify", force=True) + + +class TestBatchedEndToEnd: + """End-to-end cohort build with batched codeset resolution.""" + + def test_cohort_with_descendants_and_exclusion(self): + """Reproduce the existing test_build_cohort_concept_set_resolves_descendants_and_mapped.""" + import datetime + + ibis = pytest.importorskip("ibis") + _ = pytest.importorskip("duckdb") + + from circe.cohortdefinition import ( + CohortExpression, + ConditionOccurrence, + PrimaryCriteria, + ) + from circe.execution.api import build_cohort + from circe.vocabulary import Concept, ConceptSet, ConceptSetExpression, ConceptSetItem + + conn = ibis.duckdb.connect() + S = datetime.date(2020, 1, 1) + E = datetime.date(2020, 12, 31) + conn.create_table( + "person", + obj=ibis.memtable( + {"person_id": [1, 2], "year_of_birth": [1980, 1980], "gender_concept_id": [0, 0]} + ), + overwrite=True, + ) + 
conn.create_table( + "observation_period", + obj=ibis.memtable( + { + "person_id": [1, 2], + "observation_period_id": [1, 2], + "observation_period_start_date": [S, S], + "observation_period_end_date": [E, E], + } + ), + overwrite=True, + ) + conn.create_table( + "concept", + obj=ibis.memtable( + { + "concept_id": [100, 101, 102, 200, 201], + "invalid_reason": [None, None, "D", None, None], + } + ), + overwrite=True, + ) + conn.create_table( + "concept_ancestor", + obj=ibis.memtable({"ancestor_concept_id": [100, 100], "descendant_concept_id": [101, 102]}), + overwrite=True, + ) + conn.create_table( + "concept_relationship", + obj=ibis.memtable( + { + "concept_id_1": [200, 201], + "concept_id_2": [100, 101], + "relationship_id": ["Maps to", "Maps to"], + "invalid_reason": [None, "D"], + } + ), + overwrite=True, + ) + conn.create_table( + "condition_occurrence", + obj=ibis.memtable( + { + "person_id": [1, 1, 1, 1, 1, 2], + "condition_occurrence_id": [1000, 1001, 1002, 1003, 1004, 1005], + "condition_concept_id": [100, 101, 102, 200, 201, 999], + "condition_start_date": [S, S, S, S, S, S], + "condition_end_date": [S, S, S, S, S, S], + } + ), + overwrite=True, + ) + + expression = CohortExpression( + concept_sets=[ + ConceptSet( + id=1, + expression=ConceptSetExpression( + items=[ + ConceptSetItem( + concept=Concept(conceptId=100), + includeDescendants=True, + includeMapped=True, + ), + ConceptSetItem( + concept=Concept(conceptId=101), + isExcluded=True, + includeMapped=True, + ), + ] + ), + ) + ], + primary_criteria=PrimaryCriteria(criteria_list=[ConditionOccurrence(codeset_id=1)]), + ) + + cohort_result = build_cohort(expression, backend=conn, cdm_schema="main").execute() + assert set(cohort_result.person_id) == {1} + # codeset 1: include 100 (desc=True, mapped=True), exclude 101 (mapped=True) + # Include side: direct 100 + desc of 100: {101, 102(invalid)} + mapped of 100: {200} + # -> valid include = {100, 101, 200} + # Exclude side: direct 101 + mapped of 101: 
{201(invalid_reason='D')} + # -> valid exclude = {101} + # Final: {100, 101, 200} - {101} = {100, 200} + assert set(cohort_result.concept_id) == {100, 200} diff --git a/tests/execution/test_inclusion.py b/tests/execution/test_inclusion.py index 46ce20e..4a63bcf 100644 --- a/tests/execution/test_inclusion.py +++ b/tests/execution/test_inclusion.py @@ -12,6 +12,7 @@ Occurrence, PrimaryCriteria, ) +from circe.execution.api import build_cohort as build_execution_cohort from circe.vocabulary import Concept, ConceptSet, ConceptSetExpression, ConceptSetItem @@ -153,3 +154,112 @@ def test_inclusion_rule_without_expression_is_noop(): result = build_cohort(expression, backend=conn, cdm_schema="main").execute() assert set(result.person_id) == {1, 2} + + +def test_inclusion_rules_materialize_false_matches_materialized_result(): + ibis = pytest.importorskip("ibis") + _ = pytest.importorskip("duckdb") + + conn = ibis.duckdb.connect() + _seed_common_tables(conn, ibis, persons=(1, 2, 3, 4)) + conn.create_table( + "condition_occurrence", + obj=ibis.memtable( + { + "person_id": [1, 1, 1, 2, 2, 3, 3, 3, 4, 4], + "condition_occurrence_id": [100, 101, 102, 200, 201, 300, 301, 302, 400, 401], + "condition_concept_id": [111, 222, 333, 111, 222, 111, 222, 333, 111, 333], + "condition_start_date": [ + "2020-01-01", + "2020-01-02", + "2020-01-03", + "2020-01-01", + "2020-01-02", + "2020-01-01", + "2020-01-02", + "2020-01-03", + "2020-01-01", + "2020-01-02", + ], + "condition_end_date": [ + "2020-01-01", + "2020-01-02", + "2020-01-03", + "2020-01-01", + "2020-01-02", + "2020-01-01", + "2020-01-02", + "2020-01-03", + "2020-01-01", + "2020-01-02", + ], + "visit_occurrence_id": [10, 10, 10, 20, 20, 30, 30, 30, 40, 40], + } + ), + overwrite=True, + ) + + expression = CohortExpression( + concept_sets=[ + _make_concept_set(1, 111), + _make_concept_set(2, 222), + _make_concept_set(3, 333), + ], + primary_criteria=PrimaryCriteria(criteria_list=[ConditionOccurrence(codeset_id=1)]), + 
inclusion_rules=[ + InclusionRule( + name="rule-1", + expression=CriteriaGroup( + type="ALL", + criteria_list=[ + CorelatedCriteria( + criteria=ConditionOccurrence(codeset_id=2), + occurrence=Occurrence(type=Occurrence._AT_LEAST, count=1), + ) + ], + ), + ), + InclusionRule( + name="rule-2", + expression=CriteriaGroup( + type="ALL", + criteria_list=[ + CorelatedCriteria( + criteria=ConditionOccurrence(codeset_id=3), + occurrence=Occurrence(type=Occurrence._AT_LEAST, count=1), + ) + ], + ), + ), + InclusionRule(name="noop", expression=None), + ], + ) + + materialized = build_execution_cohort( + expression, + backend=conn, + cdm_schema="main", + materialize=True, + ).execute() + non_materialized = build_execution_cohort( + expression, + backend=conn, + cdm_schema="main", + materialize=False, + ).execute() + + materialized_rows = { + tuple(row) + for row in materialized[["person_id", "event_id", "start_date", "end_date"]].itertuples( + index=False, name=None + ) + } + non_materialized_rows = { + tuple(row) + for row in non_materialized[["person_id", "event_id", "start_date", "end_date"]].itertuples( + index=False, name=None + ) + } + + assert non_materialized_rows == materialized_rows + assert materialized_rows From 8f194df6941194e614e13f84f4bdcc68a047f728 Mon Sep 17 00:00:00 2001 From: jgilber2 Date: Wed, 20 May 2026 07:18:22 -0700 Subject: [PATCH 40/53] Fix concept casting issue on databricks --- circe/execution/ibis/codesets.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/circe/execution/ibis/codesets.py b/circe/execution/ibis/codesets.py index ceaa7b6..3ca3a34 100644 --- a/circe/execution/ibis/codesets.py +++ b/circe/execution/ibis/codesets.py @@ -169,10 +169,19 @@ def _build_codeset_expression( if not include_parts: return _empty_table(columns=(("concept_id", 0),)) + # Normalize column nullability for union compatibility across backends. + # _literal_select produces nullable int64 while vocabulary table columns + # (e.g. 
Databricks concept_ancestor.descendant_concept_id) may be non-nullable. + # Strict backends require exact schema match for UNION ALL. + if len(include_parts) > 1: + include_parts = [p.select(p.concept_id.cast("int64").name(CONCEPT_ID)) for p in include_parts] + result = _union_all_tables(include_parts) result = result.distinct() if exclude_parts: + if len(exclude_parts) > 1: + exclude_parts = [p.select(p.concept_id.cast("int64").name(CONCEPT_ID)) for p in exclude_parts] exclude_relation = _union_all_tables(exclude_parts).distinct() marked = exclude_relation.mutate(_cm=ibis.literal(1, type="int64")) result = result.join(marked, result.concept_id == marked.concept_id, how="left") @@ -332,6 +341,15 @@ def build_single_codeset_table( _create_table_impl(backend, table_name=name, schema=results_schema, obj=empty, overwrite=True) return _read_table(backend, table_name=name, schema=results_schema) + # Normalize column nullability for union compatibility across backends. + if len(parts) > 1: + parts = [ + p.select( + p.codeset_id.cast("int64").name("codeset_id"), p.concept_id.cast("int64").name(CONCEPT_ID) + ) + for p in parts + ] + combined = _union_all_tables(parts) _create_table_impl(backend, table_name=name, schema=results_schema, obj=combined, overwrite=True) return _read_table(backend, table_name=name, schema=results_schema) From 3ff5a300e8a30d865cdd20a342e110403b142316 Mon Sep 17 00:00:00 2001 From: jgilber2 Date: Wed, 20 May 2026 08:31:17 -0700 Subject: [PATCH 41/53] Fixed issue with use of source concepts --- circe/execution/lower/common.py | 8 ++++++++ circe/execution/normalize/criteria.py | 13 +++++++++++++ tests/execution/test_registry_dispatch.py | 2 ++ 3 files changed, 23 insertions(+) diff --git a/circe/execution/lower/common.py b/circe/execution/lower/common.py index 3b47498..989f6da 100644 --- a/circe/execution/lower/common.py +++ b/circe/execution/lower/common.py @@ -38,6 +38,14 @@ def lower_common_steps(criterion: NormalizedCriterion) -> list[PlanStep]: ) 
) + if criterion.source_codeset_id is not None and criterion.source_concept_column is not None: + steps.append( + FilterByCodeset( + column=criterion.source_concept_column, + codeset_id=int(criterion.source_codeset_id), + ) + ) + if criterion.person_filters.gender_concept_ids or criterion.person_filters.gender_codeset_id is not None: steps.append( FilterByPersonGender( diff --git a/circe/execution/normalize/criteria.py b/circe/execution/normalize/criteria.py index c3fb6eb..b7a7828 100644 --- a/circe/execution/normalize/criteria.py +++ b/circe/execution/normalize/criteria.py @@ -62,6 +62,7 @@ class NormalizedCriterion: source_concept_column: str | None visit_occurrence_column: str | None codeset_id: int | None + source_codeset_id: int | None first: bool occurrence_start_date: NormalizedDateRange | None occurrence_end_date: NormalizedDateRange | None @@ -119,6 +120,7 @@ def _build_normalized_criterion( source_concept_column: str | None, visit_occurrence_column: str | None, codeset_id: int | None, + source_codeset_id: int | None = None, first: bool, occurrence_start_date: NormalizedDateRange | None, occurrence_end_date: NormalizedDateRange | None, @@ -135,6 +137,7 @@ def _build_normalized_criterion( source_concept_column=source_concept_column, visit_occurrence_column=visit_occurrence_column, codeset_id=codeset_id, + source_codeset_id=source_codeset_id, first=first, occurrence_start_date=occurrence_start_date, occurrence_end_date=occurrence_end_date, @@ -156,6 +159,7 @@ def _normalize_condition_occurrence(criteria: ConditionOccurrence) -> Normalized source_concept_column="condition_source_concept_id", visit_occurrence_column="visit_occurrence_id", codeset_id=criteria.codeset_id, + source_codeset_id=criteria.condition_source_concept, first=bool(criteria.first), occurrence_start_date=normalize_date_range(criteria.occurrence_start_date), occurrence_end_date=normalize_date_range(criteria.occurrence_end_date), @@ -176,6 +180,7 @@ def _normalize_drug_exposure(criteria: 
DrugExposure) -> NormalizedCriterion: source_concept_column="drug_source_concept_id", visit_occurrence_column="visit_occurrence_id", codeset_id=criteria.codeset_id, + source_codeset_id=criteria.drug_source_concept, first=bool(criteria.first), occurrence_start_date=normalize_date_range(criteria.occurrence_start_date), occurrence_end_date=normalize_date_range(criteria.occurrence_end_date), @@ -196,6 +201,7 @@ def _normalize_visit_occurrence(criteria: VisitOccurrence) -> NormalizedCriterio source_concept_column="visit_source_concept_id", visit_occurrence_column="visit_occurrence_id", codeset_id=criteria.codeset_id, + source_codeset_id=criteria.visit_source_concept, first=bool(criteria.first), occurrence_start_date=normalize_date_range(criteria.occurrence_start_date), occurrence_end_date=normalize_date_range(criteria.occurrence_end_date), @@ -216,6 +222,7 @@ def _normalize_measurement(criteria: Measurement) -> NormalizedCriterion: source_concept_column="measurement_source_concept_id", visit_occurrence_column="visit_occurrence_id", codeset_id=criteria.codeset_id, + source_codeset_id=criteria.measurement_source_concept, first=bool(criteria.first), occurrence_start_date=normalize_date_range(criteria.occurrence_start_date), occurrence_end_date=normalize_date_range(criteria.occurrence_end_date), @@ -238,6 +245,7 @@ def _normalize_procedure_occurrence( source_concept_column="procedure_source_concept_id", visit_occurrence_column="visit_occurrence_id", codeset_id=criteria.codeset_id, + source_codeset_id=criteria.procedure_source_concept, first=bool(criteria.first), occurrence_start_date=normalize_date_range(criteria.occurrence_start_date), occurrence_end_date=normalize_date_range(criteria.occurrence_end_date), @@ -258,6 +266,7 @@ def _normalize_observation(criteria: Observation) -> NormalizedCriterion: source_concept_column="observation_source_concept_id", visit_occurrence_column="visit_occurrence_id", codeset_id=criteria.codeset_id, + 
source_codeset_id=criteria.observation_source_concept, first=bool(criteria.first), occurrence_start_date=normalize_date_range(criteria.occurrence_start_date), occurrence_end_date=normalize_date_range(criteria.occurrence_end_date), @@ -278,6 +287,7 @@ def _normalize_visit_detail(criteria: VisitDetail) -> NormalizedCriterion: source_concept_column="visit_detail_source_concept_id", visit_occurrence_column="visit_occurrence_id", codeset_id=criteria.codeset_id, + source_codeset_id=criteria.visit_detail_source_concept, first=bool(criteria.first), occurrence_start_date=normalize_date_range(criteria.visit_detail_start_date), occurrence_end_date=normalize_date_range(criteria.visit_detail_end_date), @@ -298,6 +308,7 @@ def _normalize_device_exposure(criteria: DeviceExposure) -> NormalizedCriterion: source_concept_column="device_source_concept_id", visit_occurrence_column="visit_occurrence_id", codeset_id=criteria.codeset_id, + source_codeset_id=criteria.device_source_concept, first=bool(criteria.first), occurrence_start_date=normalize_date_range(criteria.occurrence_start_date), occurrence_end_date=normalize_date_range(criteria.occurrence_end_date), @@ -318,6 +329,7 @@ def _normalize_specimen(criteria: Specimen) -> NormalizedCriterion: source_concept_column="specimen_source_concept_id", visit_occurrence_column="visit_occurrence_id", codeset_id=criteria.codeset_id, + source_codeset_id=criteria.specimen_source_concept, first=bool(criteria.first), occurrence_start_date=normalize_date_range(criteria.occurrence_start_date), occurrence_end_date=normalize_date_range(criteria.occurrence_end_date), @@ -338,6 +350,7 @@ def _normalize_death(criteria: Death) -> NormalizedCriterion: source_concept_column="cause_source_concept_id", visit_occurrence_column=None, codeset_id=criteria.codeset_id, + source_codeset_id=criteria.death_source_concept, first=False, occurrence_start_date=normalize_date_range(criteria.occurrence_start_date), occurrence_end_date=None, diff --git 
a/tests/execution/test_registry_dispatch.py b/tests/execution/test_registry_dispatch.py index 7d53701..ca1c98f 100644 --- a/tests/execution/test_registry_dispatch.py +++ b/tests/execution/test_registry_dispatch.py @@ -40,6 +40,7 @@ def test_registry_dispatch_round_trip(): source_concept_column=None, visit_occurrence_column=None, codeset_id=None, + source_codeset_id=None, first=False, occurrence_start_date=None, occurrence_end_date=None, @@ -87,6 +88,7 @@ class UnknownCriteria(Criteria): source_concept_column=None, visit_occurrence_column=None, codeset_id=None, + source_codeset_id=None, first=False, occurrence_start_date=None, occurrence_end_date=None, From 8b132c81de3695871e56727e9145917a45f5d57e Mon Sep 17 00:00:00 2001 From: jgilber2 Date: Wed, 20 May 2026 10:07:00 -0700 Subject: [PATCH 42/53] Tests and fixes for cohorts that are not identical from phenotype library --- circe/execution/engine/collapse.py | 14 +- circe/execution/engine/end_strategy.py | 21 +- circe/execution/engine/groups.py | 9 +- circe/execution/engine/primary.py | 40 +- circe/execution/plan/schema.py | 2 + tests/execution/test_correctness_bugs.py | 862 +++++++++++++++++++++++ 6 files changed, 935 insertions(+), 13 deletions(-) create mode 100644 tests/execution/test_correctness_bugs.py diff --git a/circe/execution/engine/collapse.py b/circe/execution/engine/collapse.py index b6d0cc3..ed5114a 100644 --- a/circe/execution/engine/collapse.py +++ b/circe/execution/engine/collapse.py @@ -2,7 +2,15 @@ import ibis -from ..plan.schema import END_DATE, PERSON_ID, START_DATE +from ..plan.schema import END_DATE, OP_END_DATE, OP_START_DATE, PERSON_ID, START_DATE + + +def _strip_op_columns(events): + """Remove internal observation period columns from final output.""" + cols_to_drop = [c for c in events.columns if c in (OP_START_DATE, OP_END_DATE)] + if cols_to_drop: + return events.drop(*cols_to_drop) + return events def _apply_censor_window(events, censor_window): @@ -71,11 +79,11 @@ def 
_collapse_era(intervals, era_pad: int): def collapse_events(events, collapse_settings, censor_window): if collapse_settings is None: - return _apply_censor_window(events, censor_window) + return _strip_op_columns(_apply_censor_window(events, censor_window)) collapse_type = (collapse_settings.collapse_type or "era").lower() if collapse_type == "no_collapse": - return _apply_censor_window(events, censor_window) + return _strip_op_columns(_apply_censor_window(events, censor_window)) intervals = events.select( events.person_id.cast("int64").name(PERSON_ID), diff --git a/circe/execution/engine/end_strategy.py b/circe/execution/engine/end_strategy.py index 4b8e5b9..2585b77 100644 --- a/circe/execution/engine/end_strategy.py +++ b/circe/execution/engine/end_strategy.py @@ -3,10 +3,19 @@ import ibis from ..errors import UnsupportedFeatureError -from ..plan.schema import END_DATE, PERSON_ID, START_DATE +from ..plan.schema import END_DATE, OP_END_DATE, OP_START_DATE, PERSON_ID, START_DATE def attach_observation_bounds(events, ctx): + """Attach observation period bounds to events. + + If events already carry op_start_date/op_end_date from the primary events + stage, use those directly (avoiding a re-join that creates duplicates when + overlapping OPs exist). Falls back to a re-join only if the columns are missing. 
+ """ + if OP_START_DATE in events.columns and OP_END_DATE in events.columns: + return events + observation_period = ctx.table("observation_period").select( PERSON_ID, "observation_period_start_date", @@ -20,8 +29,8 @@ def attach_observation_bounds(events, ctx): ) return joined.select( *[joined[c] for c in events.columns], - observation_period.observation_period_start_date.cast("date").name("op_start_date"), - observation_period.observation_period_end_date.cast("date").name("op_end_date"), + observation_period.observation_period_start_date.cast("date").name(OP_START_DATE), + observation_period.observation_period_end_date.cast("date").name(OP_END_DATE), ).distinct() @@ -39,7 +48,7 @@ def _apply_date_offset_strategy(with_bounds, strategy): ) candidate = base_date + ibis.interval(days=offset) - return ibis.least(candidate, with_bounds.op_end_date) + return ibis.least(candidate, with_bounds[OP_END_DATE]) def _replace_end_date(events, with_bounds, new_end_expr): @@ -57,7 +66,7 @@ def apply_end_strategy(events, strategy, ctx): with_bounds = attach_observation_bounds(events, ctx) if strategy is None: - return _replace_end_date(events, with_bounds, with_bounds.op_end_date) + return _replace_end_date(events, with_bounds, with_bounds[OP_END_DATE]) if strategy.kind == "date_offset": end_date_expr = _apply_date_offset_strategy(with_bounds, strategy) @@ -69,4 +78,4 @@ def apply_end_strategy(events, strategy, ctx): return apply_custom_era_strategy(events, strategy, ctx) # Fallback: preserve default semantics of op_end_date clipping. 
- return _replace_end_date(events, with_bounds, with_bounds.op_end_date) + return _replace_end_date(events, with_bounds, with_bounds[OP_END_DATE]) diff --git a/circe/execution/engine/groups.py b/circe/execution/engine/groups.py index 553307c..a6a1427 100644 --- a/circe/execution/engine/groups.py +++ b/circe/execution/engine/groups.py @@ -4,7 +4,7 @@ from ..ibis.context import ExecutionContext from ..normalize.groups import NormalizedCriteriaGroup -from ..plan.schema import EVENT_ID, PERSON_ID +from ..plan.schema import EVENT_ID, OP_END_DATE, OP_START_DATE, PERSON_ID from ..typing import Table from .group_demographics import demographic_match_keys from .group_keys import event_keys, union_all @@ -94,7 +94,12 @@ def apply_additional_criteria( if group is None or group.is_empty(): return events - index_events = attach_observation_period(events, ctx) + # Use pre-existing OP bounds from primary events if available, + # avoiding a re-join that creates duplicates when overlapping OPs exist. + if OP_START_DATE in events.columns and OP_END_DATE in events.columns: + index_events = events + else: + index_events = attach_observation_period(events, ctx) matched_keys = _evaluate_group(index_events, group, ctx) filtered = events.join( diff --git a/circe/execution/engine/primary.py b/circe/execution/engine/primary.py index eea5224..11ad060 100644 --- a/circe/execution/engine/primary.py +++ b/circe/execution/engine/primary.py @@ -7,7 +7,7 @@ from ..ibis.context import ExecutionContext from ..normalize.windows import NormalizedObservationWindow from ..plan.cohort import CohortPlan -from ..plan.schema import DOMAIN, EVENT_ID, PERSON_ID, START_DATE +from ..plan.schema import DOMAIN, EVENT_ID, OP_END_DATE, OP_START_DATE, PERSON_ID, START_DATE from ..typing import Table from .groups import apply_additional_criteria from .limits import apply_result_limit @@ -54,7 +54,39 @@ def _apply_observation_window( lower = joined.observation_period_start_date + ibis.interval(days=window.prior_days) 
upper = joined.observation_period_end_date - ibis.interval(days=window.post_days) filtered = joined.filter((joined[START_DATE] >= lower) & (joined[START_DATE] <= upper)) - return filtered.select(*[filtered[c] for c in events.columns]) + # Carry OP bounds through the pipeline (matching Java behavior). + # Drop any pre-existing op_ columns from earlier stages before re-attaching. + base_cols = [c for c in events.columns if c not in (OP_START_DATE, OP_END_DATE)] + return filtered.select( + *[filtered[c] for c in base_cols], + filtered.observation_period_start_date.cast("date").name(OP_START_DATE), + filtered.observation_period_end_date.cast("date").name(OP_END_DATE), + ) + + +def _attach_op_bounds(events, ctx: ExecutionContext): + """Attach observation period bounds to events without applying an observation window filter. + + This mirrors the Java/R behavior where op_start_date and op_end_date are + always present on primary events for use by end strategy and window constraints. + """ + observation_period = ctx.table("observation_period").select( + PERSON_ID, + "observation_period_start_date", + "observation_period_end_date", + ) + joined = events.join( + observation_period, + (events[PERSON_ID] == observation_period[PERSON_ID]) + & (events[START_DATE] >= observation_period.observation_period_start_date.cast("date")) + & (events[START_DATE] <= observation_period.observation_period_end_date.cast("date")), + ) + base_cols = [c for c in events.columns if c not in (OP_START_DATE, OP_END_DATE)] + return joined.select( + *[joined[c] for c in base_cols], + observation_period.observation_period_start_date.cast("date").name(OP_START_DATE), + observation_period.observation_period_end_date.cast("date").name(OP_END_DATE), + ) def build_primary_events(plan: CohortPlan, ctx: ExecutionContext) -> Table: @@ -74,6 +106,10 @@ def build_primary_events(plan: CohortPlan, ctx: ExecutionContext) -> Table: if plan.observation_window is not None: events = _apply_observation_window(events, 
ctx, plan.observation_window) + else: + # Always attach OP bounds even without an observation window, + # matching Java behavior where op_end_date is always available. + events = _attach_op_bounds(events, ctx) events = apply_result_limit(events, plan.primary_limit_type) return events diff --git a/circe/execution/plan/schema.py b/circe/execution/plan/schema.py index 061815f..375e6fc 100644 --- a/circe/execution/plan/schema.py +++ b/circe/execution/plan/schema.py @@ -22,6 +22,8 @@ CRITERION_INDEX = "criterion_index" CRITERION_TYPE = "criterion_type" SOURCE_TABLE = "source_table" +OP_START_DATE = "op_start_date" +OP_END_DATE = "op_end_date" STANDARD_EVENT_COLUMNS = ( PERSON_ID, diff --git a/tests/execution/test_correctness_bugs.py b/tests/execution/test_correctness_bugs.py new file mode 100644 index 0000000..7fb4788 --- /dev/null +++ b/tests/execution/test_correctness_bugs.py @@ -0,0 +1,862 @@ +"""Tests for known correctness bugs identified in benchmark comparison. + +Bug 2: End strategy re-joins observation_period, creating duplicates when a person + has overlapping observation periods → overcounting after ERA collapse. + +Bug 3: AdditionalCriteria with VisitOccurrence using EndWindow + UseEventEnd causes + undercounting when visit_end_date is NULL (comparison evaluates to NULL → row dropped). + +Bug 5: Severe undercounting in cohorts with nested CorrelatedCriteria inside + PrimaryCriteria or complex multi-rule inclusion logic. 
+""" + +from __future__ import annotations + +import pytest + +from circe.api import build_cohort +from circe.cohortdefinition import ( + CohortExpression, + ConditionOccurrence, + CorelatedCriteria, + CriteriaGroup, + Occurrence, + PrimaryCriteria, + ProcedureOccurrence, + VisitOccurrence, + Window, + WindowBound, +) +from circe.cohortdefinition.core import CollapseSettings, DateOffsetStrategy, ResultLimit +from circe.cohortdefinition.criteria import InclusionRule +from circe.vocabulary import Concept, ConceptSet, ConceptSetExpression, ConceptSetItem + + +def _make_concept_set(set_id: int, concept_id: int) -> ConceptSet: + return ConceptSet( + id=set_id, + expression=ConceptSetExpression(items=[ConceptSetItem(concept=Concept(conceptId=concept_id))]), + ) + + +# ────────────────────────────────────────────────────────────────────────────── +# Bug 2: Overlapping observation periods cause duplicate rows in end strategy +# ────────────────────────────────────────────────────────────────────────────── + + +class TestBug2OverlappingObservationPeriods: + """When a person has overlapping observation periods and end strategy uses + DateOffset with EndDate, the re-join to observation_period creates duplicate + rows (one per matching OP) with different op_end_date values, resulting in + different capped end dates. After ERA collapse, this produces more eras than + expected. + + In simple cases, ERA collapse merges the duplicates back (same start_date → + always overlap). However, `attach_observation_bounds` still creates structural + duplication that can cause overcounting in complex pipelines or on backends + with different NULL/tie-breaking semantics. + + Expected behavior: Each event should produce exactly one cohort era regardless + of how many observation periods overlap the event's start_date. 
+ """ + + @pytest.fixture + def conn(self): + ibis = pytest.importorskip("ibis") + _ = pytest.importorskip("duckdb") + conn = ibis.duckdb.connect() + + # Person 1 has TWO overlapping observation periods with DIFFERENT end dates. + # The shorter OP ends BEFORE the event's end_date + offset, so it caps the + # end date to a different value than the longer OP. + conn.create_table( + "person", + obj=ibis.memtable({"person_id": [1], "year_of_birth": [1980], "gender_concept_id": [8507]}), + overwrite=True, + ) + conn.create_table( + "observation_period", + obj=ibis.memtable( + { + "person_id": [1, 1], + "observation_period_id": [10, 11], + # OP 10: 2019-01-01 to 2020-04-15 (shorter — will cap the end date) + # OP 11: 2020-01-01 to 2021-12-31 (longer — won't cap) + "observation_period_start_date": ["2019-01-01", "2020-01-01"], + "observation_period_end_date": ["2020-04-15", "2021-12-31"], + } + ), + overwrite=True, + ) + # Condition event: start=2020-03-01, end=2020-03-25 + # DateOffset(EndDate, 30): target end_date = 2020-03-25 + 30 = 2020-04-24 + # Via OP 10 (end=2020-04-15): LEAST(2020-04-24, 2020-04-15) = 2020-04-15 + # Via OP 11 (end=2021-12-31): LEAST(2020-04-24, 2021-12-31) = 2020-04-24 + # → Two different end dates for the SAME event → duplicate row + conn.create_table( + "condition_occurrence", + obj=ibis.memtable( + { + "person_id": [1], + "condition_occurrence_id": [100], + "condition_concept_id": [111], + "condition_start_date": ["2020-03-01"], + "condition_end_date": ["2020-03-25"], + "visit_occurrence_id": [10], + } + ), + overwrite=True, + ) + return conn + + def test_attach_observation_bounds_skips_rejoin_when_op_columns_present(self, conn): + """When events already carry op_start_date/op_end_date from primary events, + attach_observation_bounds returns them directly without re-joining. 
+ + This is the fix for Bug 2: the primary events stage now always attaches + OP bounds, so end_strategy and correlated criteria use those instead of + re-joining (which would create duplicates for overlapping OPs). + """ + ibis = pytest.importorskip("ibis") + from circe.execution.engine.end_strategy import attach_observation_bounds + from circe.execution.ibis.context import make_execution_context + + ctx = make_execution_context(backend=conn, cdm_schema="main") + + # Build events WITH op_start_date/op_end_date (as they come from build_primary_events) + events = conn.create_table( + "__test_events_with_op", + obj=ibis.memtable( + { + "person_id": [1], + "event_id": [1], + "start_date": ["2020-03-01"], + "end_date": ["2020-03-25"], + "op_start_date": ["2020-01-01"], + "op_end_date": ["2021-12-31"], + } + ), + overwrite=True, + ) + events = events.mutate( + person_id=events.person_id.cast("int64"), + event_id=events.event_id.cast("int64"), + start_date=events.start_date.cast("date"), + end_date=events.end_date.cast("date"), + op_start_date=events.op_start_date.cast("date"), + op_end_date=events.op_end_date.cast("date"), + ) + + with_bounds = attach_observation_bounds(events, ctx) + result = with_bounds.execute() + + # With OP columns already present, no re-join happens → exactly 1 row + assert len(result) == 1, ( + f"attach_observation_bounds produced {len(result)} rows when OP columns " + "were already present. Should return events directly without re-joining." + ) + + def test_fallback_rejoin_produces_duplicates_for_overlapping_ops(self, conn): + """The fallback re-join path (for events WITHOUT OP columns) still creates + duplicates when overlapping OPs exist. This documents the limitation of the + fallback path, which is NOT used in the normal pipeline (build_primary_events + always attaches OP columns). 
+ """ + ibis = pytest.importorskip("ibis") + from circe.execution.engine.end_strategy import attach_observation_bounds + from circe.execution.ibis.context import make_execution_context + + ctx = make_execution_context(backend=conn, cdm_schema="main") + + # Build events WITHOUT op columns (triggering the fallback re-join) + events = conn.create_table( + "__test_events_no_op", + obj=ibis.memtable( + { + "person_id": [1], + "event_id": [1], + "start_date": ["2020-03-01"], + "end_date": ["2020-03-25"], + } + ), + overwrite=True, + ) + events = events.mutate( + person_id=events.person_id.cast("int64"), + event_id=events.event_id.cast("int64"), + start_date=events.start_date.cast("date"), + end_date=events.end_date.cast("date"), + ) + + with_bounds = attach_observation_bounds(events, ctx) + result = with_bounds.execute() + + # Fallback path: re-join produces 2 rows (one per matching OP) + # This is the known limitation that Bug 2 fix avoids by carrying OP from primary events + assert len(result) == 2, ( + f"Expected fallback re-join to produce 2 rows for overlapping OPs, got {len(result)}." + ) + + def test_single_event_produces_single_era(self, conn): + """One event in overlapping OPs should still produce exactly one cohort era. + + This test passes because ERA collapse merges the duplicate rows back + together (they share the same start_date). But the duplication is still + wasteful and can cause issues on backends with different tie-breaking. 
+ """ + expression = CohortExpression( + concept_sets=[_make_concept_set(1, 111)], + primary_criteria=PrimaryCriteria( + criteria_list=[ConditionOccurrence(codeset_id=1)], + primary_limit=ResultLimit(type="All"), + ), + end_strategy=DateOffsetStrategy(offset=30, date_field="EndDate"), + collapse_settings=CollapseSettings(collapse_type="ERA", era_pad=0), + expression_limit=ResultLimit(type="All"), + ) + + result = build_cohort(expression, backend=conn, cdm_schema="main").execute() + assert len(result) == 1 + # End date should use the LONGEST OP (not capped by shorter OP) + assert str(result.iloc[0]["end_date"])[:10] == "2020-04-24" + + def test_two_events_in_overlapping_ops_collapse_correctly(self, conn): + """Two close events that should merge into one era must not be split by OP duplication.""" + ibis = pytest.importorskip("ibis") + # Two events: both fall in both OPs, both get duplicated end dates + # Event 1: start=2020-03-01, end=2020-03-25 → target 2020-04-24 (capped to 2020-04-15 via OP10) + # Event 2: start=2020-03-10, end=2020-03-28 → target 2020-04-27 (capped to 2020-04-15 via OP10) + # These events overlap — should collapse to a single era + conn.create_table( + "condition_occurrence", + obj=ibis.memtable( + { + "person_id": [1, 1], + "condition_occurrence_id": [100, 101], + "condition_concept_id": [111, 111], + "condition_start_date": ["2020-03-01", "2020-03-10"], + "condition_end_date": ["2020-03-25", "2020-03-28"], + "visit_occurrence_id": [10, 10], + } + ), + overwrite=True, + ) + + expression = CohortExpression( + concept_sets=[_make_concept_set(1, 111)], + primary_criteria=PrimaryCriteria( + criteria_list=[ConditionOccurrence(codeset_id=1)], + primary_limit=ResultLimit(type="All"), + ), + end_strategy=DateOffsetStrategy(offset=30, date_field="EndDate"), + collapse_settings=CollapseSettings(collapse_type="ERA", era_pad=0), + expression_limit=ResultLimit(type="All"), + ) + + result = build_cohort(expression, backend=conn, cdm_schema="main").execute() 
+ # Both events overlap (event 2 starts before event 1's end+30) → single merged era + # Correct: 1 era from 2020-03-01 to 2020-04-27 + # Bug: OP duplication creates extra rows with different end dates, potentially + # splitting what should be a single era into multiple fragments + assert len(result) == 1, ( + f"Expected 1 merged era but got {len(result)}. " + "Overlapping OPs may have created duplicate rows preventing proper ERA merge." + ) + + +# ────────────────────────────────────────────────────────────────────────────── +# Bug 3: AdditionalCriteria with VisitOccurrence + EndWindow + UseEventEnd +# undercounts when visit_end_date is NULL +# ────────────────────────────────────────────────────────────────────────────── + + +class TestBug3VisitEndWindowUndercounting: + """When AdditionalCriteria requires a VisitOccurrence with EndWindow using + UseEventEnd=true, visits with NULL visit_end_date fail the comparison + (NULL >= date evaluates to NULL/false) and the event is excluded. + + The Java/R reference implementation behaves the same way (it compares the end date + without COALESCE), so the tests below expect these events to be excluded.
+ + Pattern from affected cohorts (71, 74, 260-263, 881, 898, 965, 967): + - StartWindow: visit starts on or before the index event + - EndWindow with UseEventEnd=true: visit ends on or after the index event + - This checks "the event occurred during a visit" + """ + + @pytest.fixture + def conn(self): + ibis = pytest.importorskip("ibis") + _ = pytest.importorskip("duckdb") + conn = ibis.duckdb.connect() + + conn.create_table( + "person", + obj=ibis.memtable( + {"person_id": [1, 2], "year_of_birth": [1980, 1975], "gender_concept_id": [8507, 8532]} + ), + overwrite=True, + ) + conn.create_table( + "observation_period", + obj=ibis.memtable( + { + "person_id": [1, 2], + "observation_period_id": [10, 20], + "observation_period_start_date": ["2019-01-01", "2019-01-01"], + "observation_period_end_date": ["2022-12-31", "2022-12-31"], + } + ), + overwrite=True, + ) + # Two condition events — one for each person + conn.create_table( + "condition_occurrence", + obj=ibis.memtable( + { + "person_id": [1, 2], + "condition_occurrence_id": [100, 200], + "condition_concept_id": [111, 111], + "condition_start_date": ["2020-06-15", "2020-06-15"], + "condition_end_date": ["2020-06-20", "2020-06-20"], + "visit_occurrence_id": [1000, 2000], + } + ), + overwrite=True, + ) + # Two visits: + # Person 1: visit with a proper end_date (2020-06-20) — should match + # Person 2: visit with NULL end_date (ongoing visit) — should NOT match (NULL fails the EndWindow comparison) + conn.create_table( + "visit_occurrence", + obj=ibis.memtable( + { + "person_id": [1, 2], + "visit_occurrence_id": [1000, 2000], + "visit_concept_id": [9201, 9201], # Inpatient + "visit_start_date": ["2020-06-10", "2020-06-10"], + "visit_end_date": ["2020-06-20", None], # Person 2 has NULL end + "visit_source_concept_id": [0, 0], + } + ), + overwrite=True, + ) + return conn + + def test_null_visit_end_date_excludes_from_end_window(self, conn): + """A visit with NULL end_date is correctly excluded by EndWindow with UseEventEnd=true.
+ + This matches Java/R behavior: CohortExpressionQueryBuilder.java line 586 uses + A.END_DATE directly without COALESCE. When visit_end_date is NULL, the comparison + `A.END_DATE >= ...` evaluates to NULL → row excluded. This is correct behavior + per OMOP CDM (visit_end_date is a required NOT NULL field). + """ + expression = CohortExpression( + concept_sets=[_make_concept_set(1, 111), _make_concept_set(2, 9201)], + primary_criteria=PrimaryCriteria( + criteria_list=[ConditionOccurrence(codeset_id=1)], + primary_limit=ResultLimit(type="All"), + ), + additional_criteria=CriteriaGroup( + type="ANY", + criteria_list=[ + CorelatedCriteria( + criteria=VisitOccurrence(codeset_id=2), + start_window=Window( + start=WindowBound(coeff=-1), # unbounded before + end=WindowBound(days=0, coeff=1), # up to index start + use_index_end=False, + use_event_end=False, + ), + end_window=Window( + start=WindowBound(days=0, coeff=-1), # from index start + end=WindowBound(coeff=1), # unbounded after + use_index_end=False, + use_event_end=True, # compare visit END date + ), + occurrence=Occurrence(type=2, count=1), # at least 1 + ) + ], + ), + end_strategy=DateOffsetStrategy(offset=30, date_field="EndDate"), + collapse_settings=CollapseSettings(collapse_type="ERA", era_pad=0), + expression_limit=ResultLimit(type="All"), + ) + + result = build_cohort(expression, backend=conn, cdm_schema="main").execute() + # Only Person 1 should be in the cohort: + # Person 1: visit (Jun 10 - Jun 20) encompasses condition (Jun 15) ✓ + # Person 2: visit (Jun 10 - NULL) → EndWindow check: NULL >= Jun 15 → NULL → excluded + # This matches Java behavior (no COALESCE on A.END_DATE) + assert len(result) == 1, ( + f"Expected 1 cohort entry (only person with valid visit_end_date) but got {len(result)}." 
+ ) + + def test_visit_not_encompassing_event_excluded(self, conn): + """Visits that genuinely don't encompass the event should still be excluded.""" + ibis = pytest.importorskip("ibis") + # Override: Person 2's visit ENDS BEFORE the condition starts + conn.create_table( + "visit_occurrence", + obj=ibis.memtable( + { + "person_id": [1, 2], + "visit_occurrence_id": [1000, 2000], + "visit_concept_id": [9201, 9201], + "visit_start_date": ["2020-06-10", "2020-06-01"], + "visit_end_date": ["2020-06-20", "2020-06-10"], # Person 2 visit ends before condition + "visit_source_concept_id": [0, 0], + } + ), + overwrite=True, + ) + + expression = CohortExpression( + concept_sets=[_make_concept_set(1, 111), _make_concept_set(2, 9201)], + primary_criteria=PrimaryCriteria( + criteria_list=[ConditionOccurrence(codeset_id=1)], + primary_limit=ResultLimit(type="All"), + ), + additional_criteria=CriteriaGroup( + type="ANY", + criteria_list=[ + CorelatedCriteria( + criteria=VisitOccurrence(codeset_id=2), + start_window=Window( + start=WindowBound(coeff=-1), + end=WindowBound(days=0, coeff=1), + use_index_end=False, + use_event_end=False, + ), + end_window=Window( + start=WindowBound(days=0, coeff=-1), + end=WindowBound(coeff=1), + use_index_end=False, + use_event_end=True, + ), + occurrence=Occurrence(type=2, count=1), + ) + ], + ), + end_strategy=DateOffsetStrategy(offset=30, date_field="EndDate"), + collapse_settings=CollapseSettings(collapse_type="ERA", era_pad=0), + expression_limit=ResultLimit(type="All"), + ) + + result = build_cohort(expression, backend=conn, cdm_schema="main").execute() + # Only person 1 should match — person 2's visit ended before the condition + assert len(result) == 1, ( + f"Expected 1 cohort entry (person 1 only) but got {len(result)}. " + "Person 2's visit ends before condition start and should be excluded." 
+ ) + assert int(result.iloc[0]["person_id"]) == 1 + + +# ────────────────────────────────────────────────────────────────────────────── +# Bug 5a: Nested CorrelatedCriteria inside PrimaryCriteria severe undercount +# ────────────────────────────────────────────────────────────────────────────── + + +class TestBug5NestedCorrelatedCriteria: + """Cohorts with CorrelatedCriteria nested inside PrimaryCriteria (e.g., + "ConditionOccurrence where a ProcedureOccurrence exists within X days") + produce severe undercounting (1-2% of expected results). + + Pattern from cohort 402: PrimaryCriteria has a ConditionOccurrence with + an inline CorrelatedCriteria requiring a ProcedureOccurrence nearby. + """ + + @pytest.fixture + def conn(self): + ibis = pytest.importorskip("ibis") + _ = pytest.importorskip("duckdb") + conn = ibis.duckdb.connect() + + conn.create_table( + "person", + obj=ibis.memtable( + { + "person_id": [1, 2, 3], + "year_of_birth": [1980, 1975, 1990], + "gender_concept_id": [8507, 8532, 8507], + } + ), + overwrite=True, + ) + conn.create_table( + "observation_period", + obj=ibis.memtable( + { + "person_id": [1, 2, 3], + "observation_period_id": [10, 20, 30], + "observation_period_start_date": ["2018-01-01", "2018-01-01", "2018-01-01"], + "observation_period_end_date": ["2022-12-31", "2022-12-31", "2022-12-31"], + } + ), + overwrite=True, + ) + # Three persons with conditions + conn.create_table( + "condition_occurrence", + obj=ibis.memtable( + { + "person_id": [1, 2, 3], + "condition_occurrence_id": [100, 200, 300], + "condition_concept_id": [111, 111, 111], + "condition_start_date": ["2020-06-01", "2020-07-01", "2020-08-01"], + "condition_end_date": ["2020-06-05", "2020-07-05", "2020-08-05"], + "visit_occurrence_id": [1000, 2000, 3000], + } + ), + overwrite=True, + ) + # Only persons 1 and 2 have a procedure within 7 days of their condition + conn.create_table( + "procedure_occurrence", + obj=ibis.memtable( + { + "person_id": [1, 2], + 
"procedure_occurrence_id": [500, 600], + "procedure_concept_id": [222, 222], + "procedure_date": ["2020-06-03", "2020-07-02"], + "procedure_source_concept_id": [0, 0], + "visit_occurrence_id": [1000, 2000], + } + ), + overwrite=True, + ) + # Need visit_occurrence table for schema completeness + conn.create_table( + "visit_occurrence", + obj=ibis.memtable( + { + "person_id": [1, 2, 3], + "visit_occurrence_id": [1000, 2000, 3000], + "visit_concept_id": [9201, 9201, 9201], + "visit_start_date": ["2020-06-01", "2020-07-01", "2020-08-01"], + "visit_end_date": ["2020-06-10", "2020-07-10", "2020-08-10"], + "visit_source_concept_id": [0, 0, 0], + } + ), + overwrite=True, + ) + return conn + + def test_primary_criteria_with_nested_correlated_criteria(self, conn): + """PrimaryCriteria with inline CorrelatedCriteria should correctly filter + primary events to those having a matching correlated event. + + Pattern: "Condition X where Procedure Y occurs within ±7 days" + Only persons 1 and 2 have procedures within 7 days of their condition. 
+ """ + expression = CohortExpression( + concept_sets=[_make_concept_set(1, 111), _make_concept_set(2, 222)], + primary_criteria=PrimaryCriteria( + criteria_list=[ + ConditionOccurrence( + codeset_id=1, + correlated_criteria=CriteriaGroup( + type="ALL", + criteria_list=[ + CorelatedCriteria( + criteria=ProcedureOccurrence(codeset_id=2), + start_window=Window( + start=WindowBound(days=7, coeff=-1), # 7 days before + end=WindowBound(days=7, coeff=1), # 7 days after + use_index_end=False, + use_event_end=False, + ), + occurrence=Occurrence(type=2, count=1), # at least 1 + ) + ], + ), + ) + ], + primary_limit=ResultLimit(type="First"), + ), + qualified_limit=ResultLimit(type="First"), + expression_limit=ResultLimit(type="All"), + ) + + result = build_cohort(expression, backend=conn, cdm_schema="main").execute() + # Persons 1 and 2 have procedures within 7 days of condition → match + # Person 3 has no procedure at all → excluded by correlated criteria + assert len(result) == 2, ( + f"Expected 2 cohort entries (persons 1 & 2) but got {len(result)}. " + "Nested CorrelatedCriteria in PrimaryCriteria may not be filtering correctly." 
+ ) + result_persons = sorted(result["person_id"].tolist()) + assert result_persons == [1, 2] + + def test_nested_correlated_excludes_non_matching(self, conn): + """With a same-day (±0 day) window no primary event has a matching procedure, so the cohort is empty.""" + pytest.importorskip("ibis") + # Make the window very tight (same day only) so that no procedure falls inside it + expression = CohortExpression( + concept_sets=[_make_concept_set(1, 111), _make_concept_set(2, 222)], + primary_criteria=PrimaryCriteria( + criteria_list=[ + ConditionOccurrence( + codeset_id=1, + correlated_criteria=CriteriaGroup( + type="ALL", + criteria_list=[ + CorelatedCriteria( + criteria=ProcedureOccurrence(codeset_id=2), + start_window=Window( + start=WindowBound(days=0, coeff=-1), # same day only + end=WindowBound(days=0, coeff=1), + use_index_end=False, + use_event_end=False, + ), + occurrence=Occurrence(type=2, count=1), + ) + ], + ), + ) + ], + primary_limit=ResultLimit(type="First"), + ), + qualified_limit=ResultLimit(type="First"), + expression_limit=ResultLimit(type="All"), + ) + + result = build_cohort(expression, backend=conn, cdm_schema="main").execute() + # No procedure falls on the same day as its condition (person 3 has no procedure at all): + # Person 1: condition=Jun 1, procedure=Jun 3 → 2 days apart → outside ±0 window + # Person 2: condition=Jul 1, procedure=Jul 2 → 1 day apart → outside ±0 window + # Neither should match with a 0-day window + assert len(result) == 0, ( + f"Expected 0 entries with 0-day window but got {len(result)}. " + "No procedures occur on the exact same day as the conditions."
+ ) + + +# ────────────────────────────────────────────────────────────────────────────── +# Bug 5b: Complex multi-rule inclusion logic causes excessive filtering +# ────────────────────────────────────────────────────────────────────────────── + + +class TestBug5ComplexInclusionRules: + """Cohorts with many inclusion rules (7+) where each rule has complex + correlated criteria may over-filter due to incorrect rule intersection logic. + + Pattern from cohort 726: Multiple inclusion rules, each requiring different + correlated criteria. The ALL logic requires ALL rules to pass for an event + to be included. If any rule's join logic is overly restrictive (e.g., inner + join instead of semi-join), events get dropped. + """ + + @pytest.fixture + def conn(self): + ibis = pytest.importorskip("ibis") + _ = pytest.importorskip("duckdb") + conn = ibis.duckdb.connect() + + conn.create_table( + "person", + obj=ibis.memtable( + {"person_id": [1, 2], "year_of_birth": [1980, 1975], "gender_concept_id": [8507, 8532]} + ), + overwrite=True, + ) + conn.create_table( + "observation_period", + obj=ibis.memtable( + { + "person_id": [1, 2], + "observation_period_id": [10, 20], + "observation_period_start_date": ["2018-01-01", "2018-01-01"], + "observation_period_end_date": ["2023-12-31", "2023-12-31"], + } + ), + overwrite=True, + ) + # Both persons have the primary condition + conn.create_table( + "condition_occurrence", + obj=ibis.memtable( + { + "person_id": [1, 2], + "condition_occurrence_id": [100, 200], + "condition_concept_id": [111, 111], + "condition_start_date": ["2020-06-01", "2020-06-01"], + "condition_end_date": ["2020-06-05", "2020-06-05"], + "visit_occurrence_id": [1000, 2000], + } + ), + overwrite=True, + ) + # Both persons have visit occurrences matching each inclusion rule + conn.create_table( + "visit_occurrence", + obj=ibis.memtable( + { + "person_id": [1, 1, 2, 2], + "visit_occurrence_id": [1000, 1001, 2000, 2001], + "visit_concept_id": [9201, 9202, 9201, 9202], # 
Inpatient, ER + "visit_start_date": ["2020-06-01", "2020-05-01", "2020-06-01", "2020-05-01"], + "visit_end_date": ["2020-06-10", "2020-05-02", "2020-06-10", "2020-05-02"], + "visit_source_concept_id": [0, 0, 0, 0], + } + ), + overwrite=True, + ) + # Both persons have procedures (for a second inclusion rule) + conn.create_table( + "procedure_occurrence", + obj=ibis.memtable( + { + "person_id": [1, 2], + "procedure_occurrence_id": [500, 600], + "procedure_concept_id": [333, 333], + "procedure_date": ["2020-06-02", "2020-06-02"], + "procedure_source_concept_id": [0, 0], + "visit_occurrence_id": [1000, 2000], + } + ), + overwrite=True, + ) + return conn + + def test_multiple_inclusion_rules_all_satisfied(self, conn): + """When a person satisfies ALL inclusion rules, they should remain in the cohort. + + Two inclusion rules: + 1. Must have an inpatient visit (9201) within ±30 days + 2. Must have a procedure (333) within ±30 days + Both persons satisfy both rules. + """ + expression = CohortExpression( + concept_sets=[ + _make_concept_set(1, 111), + _make_concept_set(2, 9201), + _make_concept_set(3, 333), + ], + primary_criteria=PrimaryCriteria( + criteria_list=[ConditionOccurrence(codeset_id=1)], + primary_limit=ResultLimit(type="First"), + ), + inclusion_rules=[ + InclusionRule( + name="Has inpatient visit", + expression=CriteriaGroup( + type="ALL", + criteria_list=[ + CorelatedCriteria( + criteria=VisitOccurrence(codeset_id=2), + start_window=Window( + start=WindowBound(days=30, coeff=-1), + end=WindowBound(days=30, coeff=1), + use_index_end=False, + use_event_end=False, + ), + occurrence=Occurrence(type=2, count=1), + ) + ], + ), + ), + InclusionRule( + name="Has procedure", + expression=CriteriaGroup( + type="ALL", + criteria_list=[ + CorelatedCriteria( + criteria=ProcedureOccurrence(codeset_id=3), + start_window=Window( + start=WindowBound(days=30, coeff=-1), + end=WindowBound(days=30, coeff=1), + use_index_end=False, + use_event_end=False, + ), + 
occurrence=Occurrence(type=2, count=1), + ) + ], + ), + ), + ], + expression_limit=ResultLimit(type="All"), + ) + + result = build_cohort(expression, backend=conn, cdm_schema="main").execute() + # Both persons have inpatient visit AND procedure within 30 days + assert len(result) == 2, ( + f"Expected 2 cohort entries but got {len(result)}. " + "Multiple inclusion rules may be incorrectly intersecting and dropping valid events." + ) + + def test_one_rule_not_satisfied_excludes_person(self, conn): + """Person failing one inclusion rule should be excluded.""" + ibis = pytest.importorskip("ibis") + # Override procedures — only person 1 has one + conn.create_table( + "procedure_occurrence", + obj=ibis.memtable( + { + "person_id": [1], + "procedure_occurrence_id": [500], + "procedure_concept_id": [333], + "procedure_date": ["2020-06-02"], + "procedure_source_concept_id": [0], + "visit_occurrence_id": [1000], + } + ), + overwrite=True, + ) + + expression = CohortExpression( + concept_sets=[ + _make_concept_set(1, 111), + _make_concept_set(2, 9201), + _make_concept_set(3, 333), + ], + primary_criteria=PrimaryCriteria( + criteria_list=[ConditionOccurrence(codeset_id=1)], + primary_limit=ResultLimit(type="First"), + ), + inclusion_rules=[ + InclusionRule( + name="Has inpatient visit", + expression=CriteriaGroup( + type="ALL", + criteria_list=[ + CorelatedCriteria( + criteria=VisitOccurrence(codeset_id=2), + start_window=Window( + start=WindowBound(days=30, coeff=-1), + end=WindowBound(days=30, coeff=1), + use_index_end=False, + use_event_end=False, + ), + occurrence=Occurrence(type=2, count=1), + ) + ], + ), + ), + InclusionRule( + name="Has procedure", + expression=CriteriaGroup( + type="ALL", + criteria_list=[ + CorelatedCriteria( + criteria=ProcedureOccurrence(codeset_id=3), + start_window=Window( + start=WindowBound(days=30, coeff=-1), + end=WindowBound(days=30, coeff=1), + use_index_end=False, + use_event_end=False, + ), + occurrence=Occurrence(type=2, count=1), + ) + 
], + ), + ), + ], + expression_limit=ResultLimit(type="All"), + ) + + result = build_cohort(expression, backend=conn, cdm_schema="main").execute() + # Person 1: has visit + procedure → passes both rules + # Person 2: has visit but NO procedure → fails rule 2 → excluded + assert len(result) == 1, f"Expected 1 cohort entry (person 1 only) but got {len(result)}." + assert int(result.iloc[0]["person_id"]) == 1 From 55f75b6a4dc0a797be6c528e762a73f493962d20 Mon Sep 17 00:00:00 2001 From: jgilber2 Date: Thu, 21 May 2026 09:47:32 -0700 Subject: [PATCH 43/53] Added benchmark report markdown file --- benchmark_output/benchmark_report.qmd | 465 ++++++++++++++++++++++++++ 1 file changed, 465 insertions(+) create mode 100644 benchmark_output/benchmark_report.qmd diff --git a/benchmark_output/benchmark_report.qmd b/benchmark_output/benchmark_report.qmd new file mode 100644 index 0000000..03e0cc6 --- /dev/null +++ b/benchmark_output/benchmark_report.qmd @@ -0,0 +1,465 @@ +--- +title: "CircePy vs CohortGenerator: Databricks Execution Time Benchmark" +format: + html: + toc: true + code-fold: true + theme: cosmo + self-contained: true +execute: + warning: false + message: false +--- + +## CIRCE Expression Model + +The CIRCE cohort expression schema defines a declarative structure for specifying +patient cohorts on the OMOP CDM. CircePy implements this schema as a typed Python +class hierarchy mirroring the Java original. 
+ +```{mermaid} +classDiagram + direction TB + + class CohortExpression { + +concept_sets: list~ConceptSet~ + +primary_criteria: PrimaryCriteria + +additional_criteria: CriteriaGroup + +inclusion_rules: list~InclusionRule~ + +end_strategy: EndStrategy + +collapse_settings: CollapseSettings + +censoring_criteria: list~Criteria~ + } + + class PrimaryCriteria { + +criteria_list: list~Criteria~ + +observation_window: ObservationFilter + +primary_limit: ResultLimit + } + + class Criteria { + <> + +codeset_id: int + +date_adjustment: DateAdjustment + +correlated_criteria: CriteriaGroup + } + + class ConditionOccurrence { + +condition_type: list~Concept~ + +stop_reason: TextFilter + +condition_status: list~Concept~ + } + + class DrugExposure { + +drug_type: list~Concept~ + +refills: NumericRange + +days_supply: NumericRange + +route_concept: list~Concept~ + } + + class VisitOccurrence { + +visit_type: list~Concept~ + +visit_length: NumericRange + } + + class CriteriaGroup { + +type: ALL | ANY | AT_LEAST | AT_MOST + +criteria_list: list~CorelatedCriteria~ + +groups: list~CriteriaGroup~ + +demographic_criteria_list: list~DemographicCriteria~ + } + + class CorelatedCriteria { + +criteria: Criteria + +start_window: Window + +end_window: Window + +occurrence: Occurrence + } + + class Window { + +start: WindowBound + +end: WindowBound + +use_event_end: bool + +use_index_end: bool + } + + class InclusionRule { + +name: str + +expression: CriteriaGroup + } + + class EndStrategy { + <> + } + + class DateOffsetStrategy { + +date_field: str + +offset: int + } + + class CustomEraStrategy { + +drug_codeset_id: int + +gap_days: int + +offset: int + } + + class ConceptSet { + +id: int + +name: str + +expression: ConceptSetExpression + } + + class CollapseSettings { + +collapse_type: ERA | NO_COLLAPSE + +era_pad: int + } + + CohortExpression *-- PrimaryCriteria + CohortExpression *-- CriteriaGroup : additional_criteria + CohortExpression *-- InclusionRule + CohortExpression *-- 
EndStrategy + CohortExpression *-- CollapseSettings + CohortExpression *-- ConceptSet + + PrimaryCriteria *-- Criteria + + Criteria <|-- ConditionOccurrence + Criteria <|-- DrugExposure + Criteria <|-- VisitOccurrence + + EndStrategy <|-- DateOffsetStrategy + EndStrategy <|-- CustomEraStrategy + + InclusionRule *-- CriteriaGroup + CriteriaGroup *-- CorelatedCriteria + CriteriaGroup *-- CriteriaGroup : nested groups + CorelatedCriteria *-- Criteria + CorelatedCriteria *-- Window +``` + +The model supports 16 OMOP CDM domain criteria (only three shown above for brevity), +each inheriting from the abstract `Criteria` base class. Temporal relationships between +events are expressed through `CorelatedCriteria` with `Window` bounds, enabling +arbitrary time-relative logic. + +## Overview + +This report compares SQL execution times between the R (CohortGenerator) and Python +(CircePy) implementations of the OHDSI CIRCE cohort expression compiler, both +executing against the same Databricks SQL Warehouse on the Merative CCAE CDM +(~150M patients). 
+ +Both implementations: + +- Compile identical CIRCE JSON cohort definitions into SQL +- Execute against the same Databricks SQL Warehouse endpoint +- Write results to the same `scratch.scratch_jgilber2` schema +- Produce row-level identical output (verified separately) + +```{python} +import pandas as pd +import numpy as np +from pathlib import Path + +# Load data +times = pd.read_csv(Path("execution_times.csv")) +counts = pd.read_csv(Path("cohort_row_counts.csv")) + +# Pivot to wide format for paired comparison +r_times = times[times.implementation == "R"].rename(columns={"duration_s": "r_seconds"}) +py_times = times[times.implementation == "Python"].rename(columns={"duration_s": "py_seconds"}) + +paired = pd.merge( + r_times[["cohort_definition_id", "r_seconds"]], + py_times[["cohort_definition_id", "py_seconds"]], + on="cohort_definition_id", +) +paired = pd.merge(paired, counts, on="cohort_definition_id", how="left") +paired["speedup"] = paired["r_seconds"] / paired["py_seconds"] +paired["diff_seconds"] = paired["py_seconds"] - paired["r_seconds"] +paired["pct_diff"] = (paired["diff_seconds"] / paired["r_seconds"]) * 100 +``` + +## Summary Statistics + +```{python} +#| label: tbl-summary +#| tbl-cap: "Aggregate execution time comparison (707 shared cohorts)" + +summary = pd.DataFrame({ + "Metric": [ + "Total wall-clock time", + "Mean per cohort", + "Median per cohort", + "Std dev", + "Min", + "Max", + "Cohorts where Python faster", + ], + "R (CohortGenerator)": [ + f"{paired.r_seconds.sum():,.0f}s ({paired.r_seconds.sum()/3600:.1f}h)", + f"{paired.r_seconds.mean():.1f}s", + f"{paired.r_seconds.median():.1f}s", + f"{paired.r_seconds.std():.1f}s", + f"{paired.r_seconds.min():.1f}s", + f"{paired.r_seconds.max():.1f}s", + "", + ], + "Python (CircePy)": [ + f"{paired.py_seconds.sum():,.0f}s ({paired.py_seconds.sum()/3600:.1f}h)", + f"{paired.py_seconds.mean():.1f}s", + f"{paired.py_seconds.median():.1f}s", + f"{paired.py_seconds.std():.1f}s", + 
f"{paired.py_seconds.min():.1f}s", + f"{paired.py_seconds.max():.1f}s", + f"{(paired.py_seconds < paired.r_seconds).sum()} / {len(paired)} ({(paired.py_seconds < paired.r_seconds).mean()*100:.0f}%)", + ], +}) + +summary.style.hide(axis="index") +``` + +```{python} +#| label: tbl-ratio +#| tbl-cap: "Speed ratio distribution (R time / Python time)" + +quantiles = paired.speedup.quantile([0.05, 0.25, 0.5, 0.75, 0.95]) +ratio_summary = pd.DataFrame({ + "Percentile": ["5th", "25th", "Median", "75th", "95th"], + "R/Python ratio": [f"{v:.2f}" for v in quantiles.values], + "Interpretation": [  # ratio = R seconds / Python seconds, so a value > 1 means Python finished sooner + f"Python {quantiles.iloc[0]:.1f}x faster" if quantiles.iloc[0] > 1 else f"Python {1/quantiles.iloc[0]:.1f}x slower", + f"Python {quantiles.iloc[1]:.1f}x faster" if quantiles.iloc[1] > 1 else f"Python {1/quantiles.iloc[1]:.1f}x slower", + f"Python {quantiles.iloc[2]:.1f}x faster" if quantiles.iloc[2] > 1 else f"Python {1/quantiles.iloc[2]:.1f}x slower", + f"Python {quantiles.iloc[3]:.1f}x faster" if quantiles.iloc[3] > 1 else f"Python {1/quantiles.iloc[3]:.1f}x slower", + f"Python {quantiles.iloc[4]:.1f}x faster" if quantiles.iloc[4] > 1 else f"Python {1/quantiles.iloc[4]:.1f}x slower", + ], +}) +ratio_summary.style.hide(axis="index") +``` + +## Execution Time Distribution + +```{python} +#| label: fig-distribution +#| fig-cap: "Distribution of per-cohort execution times by implementation" +#| fig-width: 10 +#| fig-height: 5 + +import matplotlib.pyplot as plt + +fig, axes = plt.subplots(1, 2, figsize=(10, 5)) + +# Histogram +axes[0].hist(paired.r_seconds, bins=50, alpha=0.6, label="R", color="#2196F3") +axes[0].hist(paired.py_seconds, bins=50, alpha=0.6, label="Python", color="#FF9800") +axes[0].set_xlabel("Execution time (seconds)") +axes[0].set_ylabel("Number of cohorts") +axes[0].set_title("Execution Time Distribution") +axes[0].legend() + +# Log-scale box plot +data_box = pd.DataFrame({ + "R": paired.r_seconds, + "Python": paired.py_seconds, +})
+axes[1].boxplot([paired.r_seconds, paired.py_seconds], labels=["R", "Python"]) +axes[1].set_ylabel("Execution time (seconds)") +axes[1].set_title("Execution Time (Box Plot)") +axes[1].set_yscale("log") + +plt.tight_layout() +plt.show() +``` + +## Paired Comparison (Scatter) + +```{python} +#| label: fig-scatter +#| fig-cap: "Per-cohort execution time: R vs Python (each point = one cohort definition)" +#| fig-width: 8 +#| fig-height: 7 + +fig, ax = plt.subplots(figsize=(8, 7)) + +max_val = max(paired.r_seconds.max(), paired.py_seconds.max()) * 1.05 + +ax.scatter(paired.r_seconds, paired.py_seconds, alpha=0.4, s=15, color="#455A64") +ax.plot([0, max_val], [0, max_val], "k--", linewidth=1, label="y = x (equal time)") +ax.set_xlabel("R execution time (seconds)") +ax.set_ylabel("Python execution time (seconds)") +ax.set_title("Per-Cohort Execution Time: R vs Python") +ax.legend() +ax.set_xlim(0, max_val) +ax.set_ylim(0, max_val) +ax.set_aspect("equal") + +plt.tight_layout() +plt.show() +``` + +```{python} +#| label: fig-scatter-log +#| fig-cap: "Same comparison on log-log scale to reveal patterns across magnitudes" +#| fig-width: 8 +#| fig-height: 7 + +fig, ax = plt.subplots(figsize=(8, 7)) + +ax.scatter(paired.r_seconds, paired.py_seconds, alpha=0.4, s=15, color="#455A64") + +lims = [ + min(paired.r_seconds.min(), paired.py_seconds.min()) * 0.8, + max(paired.r_seconds.max(), paired.py_seconds.max()) * 1.2, +] +ax.plot(lims, lims, "k--", linewidth=1, label="y = x (equal time)") +ax.set_xlabel("R execution time (seconds)") +ax.set_ylabel("Python execution time (seconds)") +ax.set_title("Per-Cohort Execution Time: R vs Python (Log Scale)") +ax.set_xscale("log") +ax.set_yscale("log") +ax.legend() + +plt.tight_layout() +plt.show() +``` + +## Speed Ratio Distribution + +```{python} +#| label: fig-speedup +#| fig-cap: "Distribution of R/Python speed ratio (>1 means Python is faster)" +#| fig-width: 9 +#| fig-height: 5 + +fig, ax = plt.subplots(figsize=(9, 5)) +
+ax.hist(paired.speedup, bins=50, color="#607D8B", edgecolor="white", linewidth=0.5) +ax.axvline(x=1.0, color="red", linestyle="--", linewidth=1.5, label="Equal speed") +ax.axvline(x=paired.speedup.median(), color="#FF9800", linestyle="-", linewidth=1.5, + label=f"Median = {paired.speedup.median():.2f}") +ax.set_xlabel("Speed ratio (R seconds / Python seconds)") +ax.set_ylabel("Number of cohorts") +ax.set_title("Speed Ratio Distribution") +ax.legend() + +plt.tight_layout() +plt.show() +``` + +## Per-Cohort Detail + +```{python} +#| label: fig-waterfall +#| fig-cap: "Per-cohort time difference (Python − R), sorted by magnitude" +#| fig-width: 10 +#| fig-height: 6 + +sorted_diff = paired.sort_values("diff_seconds").reset_index(drop=True) + +fig, ax = plt.subplots(figsize=(10, 6)) +colors = ["#4CAF50" if d < 0 else "#F44336" for d in sorted_diff.diff_seconds] +ax.bar(range(len(sorted_diff)), sorted_diff.diff_seconds, color=colors, width=1.0) +ax.axhline(y=0, color="black", linewidth=0.5) +ax.set_xlabel("Cohort (sorted by time difference)") +ax.set_ylabel("Time difference: Python − R (seconds)") +ax.set_title("Per-Cohort Time Difference (green = Python faster, red = R faster)") + +plt.tight_layout() +plt.show() +``` + +## Execution Time vs Cohort Size + +```{python} +#| label: fig-size-vs-time +#| fig-cap: "Relationship between cohort output size and execution time" +#| fig-width: 10 +#| fig-height: 5 + +has_counts = paired.dropna(subset=["n_rows"]) + +if len(has_counts) > 0: + fig, axes = plt.subplots(1, 2, figsize=(10, 5)) + + axes[0].scatter(has_counts.n_rows, has_counts.r_seconds, alpha=0.3, s=10, label="R", color="#2196F3") + axes[0].scatter(has_counts.n_rows, has_counts.py_seconds, alpha=0.3, s=10, label="Python", color="#FF9800") + axes[0].set_xlabel("Cohort output rows") + axes[0].set_ylabel("Execution time (seconds)") + axes[0].set_xscale("log") + axes[0].set_yscale("log") + axes[0].set_title("Execution Time vs Cohort Size") + axes[0].legend() + + 
axes[1].scatter(has_counts.n_rows, has_counts.speedup, alpha=0.3, s=10, color="#455A64") + axes[1].axhline(y=1.0, color="red", linestyle="--", linewidth=1) + axes[1].set_xlabel("Cohort output rows") + axes[1].set_ylabel("R/Python speed ratio") + axes[1].set_xscale("log") + axes[1].set_title("Speed Ratio vs Cohort Size") + + plt.tight_layout() + plt.show() +``` + +## Slowest Cohorts + +```{python} +#| label: tbl-slowest +#| tbl-cap: "Top 20 slowest cohorts (by Python execution time)" + +top20 = paired.nlargest(20, "py_seconds")[ + ["cohort_definition_id", "r_seconds", "py_seconds", "diff_seconds", "speedup", "n_rows"] +].copy() +top20.columns = ["Cohort ID", "R (s)", "Python (s)", "Diff (s)", "R/Py Ratio", "Output Rows"] +top20["R (s)"] = top20["R (s)"].round(1) +top20["Python (s)"] = top20["Python (s)"].round(1) +top20["Diff (s)"] = top20["Diff (s)"].round(1) +top20["R/Py Ratio"] = top20["R/Py Ratio"].round(2) +top20["Output Rows"] = top20["Output Rows"].apply(lambda x: f"{x:,.0f}" if pd.notna(x) else "N/A") +top20.style.hide(axis="index") +``` + +## Largest Python Advantage + +```{python} +#| label: tbl-py-faster +#| tbl-cap: "Top 20 cohorts where Python is fastest relative to R" + +py_wins = paired.nlargest(20, "speedup")[ + ["cohort_definition_id", "r_seconds", "py_seconds", "speedup", "n_rows"] +].copy() +py_wins.columns = ["Cohort ID", "R (s)", "Python (s)", "R/Py Ratio", "Output Rows"] +py_wins["R (s)"] = py_wins["R (s)"].round(1) +py_wins["Python (s)"] = py_wins["Python (s)"].round(1) +py_wins["R/Py Ratio"] = py_wins["R/Py Ratio"].round(2) +py_wins["Output Rows"] = py_wins["Output Rows"].apply(lambda x: f"{x:,.0f}" if pd.notna(x) else "N/A") +py_wins.style.hide(axis="index") +``` + +## Largest R Advantage + +```{python} +#| label: tbl-r-faster +#| tbl-cap: "Top 20 cohorts where R is fastest relative to Python" + +r_wins = paired.nsmallest(20, "speedup")[ + ["cohort_definition_id", "r_seconds", "py_seconds", "speedup", "n_rows"] +].copy() +r_wins.columns = 
["Cohort ID", "R (s)", "Python (s)", "R/Py Ratio", "Output Rows"] +r_wins["R (s)"] = r_wins["R (s)"].round(1) +r_wins["Python (s)"] = r_wins["Python (s)"].round(1) +r_wins["R/Py Ratio"] = r_wins["R/Py Ratio"].round(2) +r_wins["Output Rows"] = r_wins["Output Rows"].apply(lambda x: f"{x:,.0f}" if pd.notna(x) else "N/A") +r_wins.style.hide(axis="index") +``` + +## Methodology + +- **CDM**: Merative CCAE v3909 (~150M patients) +- **Backend**: Databricks SQL Warehouse (shared endpoint) +- **R stack**: CohortGenerator → DatabaseConnector → JDBC → Spark SQL +- **Python stack**: CircePy → ibis-framework → databricks-sql-connector → Spark SQL +- **Timing**: Wall-clock time from checksum table (`end_time - start_time`) — includes SQL compilation, network round-trips, and warehouse execution +- **Cohort definitions**: 707 PhenotypeLibrary phenotypes compiled and executed by both implementations +- **Correctness**: Row-level output verified identical between implementations From ed2cec3a9ce3df9c16392103c91c1e45f7d2a411 Mon Sep 17 00:00:00 2001 From: jgilber2 Date: Mon, 1 Jun 2026 11:48:51 -0700 Subject: [PATCH 44/53] Benchmark script changes for local run on databricks --- benchmarks/README.md | 29 +++++++ benchmarks/_backend.py | 125 ++++++++++++++++++++++++++---- benchmarks/benchmark_run_py.py | 11 +++ benchmarks/benchmark_run_r.R | 134 +++++++++++++++++++++++++-------- 4 files changed, 254 insertions(+), 45 deletions(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index d591ae1..4f0cb74 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -22,6 +22,12 @@ Install the Python package with DuckDB support: pip install -e ".[dev]" ``` +For Databricks benchmarks, install the Databricks backend extra as well: + +```bash +pip install -e ".[dev,ibis-databricks]" +``` + ## Quick Start ```bash @@ -38,6 +44,29 @@ python benchmarks/benchmark_run_py.py python benchmarks/benchmark_analyze_duckdb.py ``` +## Databricks + +The benchmark scripts can also run against a 
Databricks SQL warehouse. Set the +connection and schema environment variables referenced by +`benchmark_db_config.yaml`, then run the same entry points with +`--backend databricks`: + +```bash +export DATABRICKS_HOST="adb-..databricks.net" +export DATABRICKS_HTTP_PATH="/sql/1.0/warehouses/" +export DATABRICKS_TOKEN="..." +export DATABRICKS_CDM_SCHEMA="catalog.schema" +export DATABRICKS_VOCABULARY_SCHEMA="catalog.schema" +export DATABRICKS_RESULTS_SCHEMA="catalog.schema" + +Rscript benchmarks/benchmark_run_r.R --backend databricks +python benchmarks/benchmark_run_py.py --backend databricks +python benchmarks/benchmark_analyze_duckdb.py --backend databricks +``` + +If you use Unity Catalog defaults, you can also set `DATABRICKS_CATALOG` and +`DATABRICKS_DATABASE` for the Python Ibis connection. + ## Files | File | Language | Purpose | diff --git a/benchmarks/_backend.py b/benchmarks/_backend.py index 004ef71..32b69a5 100644 --- a/benchmarks/_backend.py +++ b/benchmarks/_backend.py @@ -37,16 +37,60 @@ R_CSV = OUTPUT_DIR / "r_checksum_times.csv" PY_CSV = OUTPUT_DIR / "py_checksum_times.csv" +ENV_PATH = REPO_ROOT / ".env" + + +def _strip_wrapping_quotes(value: str) -> str: + if len(value) >= 2 and value[0] == value[-1] and value[0] in {"'", '"'}: + return value[1:-1] + return value + + +def _load_env_file() -> None: + """Load repo-local environment variables without overriding the shell.""" + if not ENV_PATH.exists(): + return + + for raw_line in ENV_PATH.read_text().splitlines(): + line = raw_line.strip() + if not line or line.startswith("#") or "=" not in line: + continue + + key, value = line.split("=", 1) + key = key.strip() + if not key or key in os.environ: + continue + + os.environ[key] = _strip_wrapping_quotes(value.strip()) + + +_load_env_file() + def _expandvars(text: str) -> str: """Expand ``${ENV_VAR}`` patterns in *text*, falling back to an empty string.""" return re.sub( r"\$\{(\w+)\}", - lambda m: os.environ.get(m.group(1), ""), + lambda m: 
_get_env_var(m.group(1)), text, ) +def _get_env_var(name: str) -> str: + """Return an environment variable, including benchmark-specific aliases.""" + value = os.environ.get(name) + if value: + return value + + aliases = { + "DATABRICKS_RESULTS_SCHEMA": "DATABRICKS_SCRATCH_SCHEMA", + } + alias = aliases.get(name) + if alias is None: + return "" + return os.environ.get(alias, "") + + def _expandvars_recursive(obj: Any) -> Any: """Expand environment variables throughout a nested dict/list.""" if isinstance(obj, str): @@ -58,6 +102,51 @@ def _expandvars_recursive(obj: Any) -> Any: return obj +def _require_config_value(cfg: dict[str, Any], path: tuple[str, ...], env_var: str | None = None) -> str: + """Return a non-empty configuration value or raise a helpful error.""" + current: Any = cfg + for key in path: + if not isinstance(current, dict): + current = None + break + current = current.get(key) + + if isinstance(current, str) and current: + return current + + dotted = ".".join(path) + if env_var is not None: + raise ValueError( + f"Missing Databricks config value '{dotted}'. Set {env_var} or update {CONFIG_PATH}." 
+ ) + raise ValueError(f"Missing Databricks config value '{dotted}' in {CONFIG_PATH}.") + + +def _split_catalog_schema(qualified_schema: str | None) -> tuple[str | None, str | None]: + """Split a qualified Databricks schema into catalog and schema parts.""" + if not qualified_schema: + return None, None + + parts = [part for part in qualified_schema.split(".") if part] + if len(parts) >= 2: + return parts[0], parts[1] + return None, parts[0] if parts else None + + +def _infer_databricks_namespace(cfg: dict[str, Any]) -> tuple[str | None, str | None]: + """Infer a sensible catalog/schema for the initial Databricks connection.""" + conn_cfg = cfg.get("connection", {}) + if conn_cfg.get("catalog") or conn_cfg.get("schema"): + return conn_cfg.get("catalog"), conn_cfg.get("schema") + + for key in ("results_schema", "cdm_schema", "vocabulary_schema"): + catalog, schema = _split_catalog_schema(cfg.get(key)) + if catalog or schema: + return catalog, schema + + return None, None + + @dataclass class BackendConnection: """Hold the configured connection and schema information for a benchmark run.""" @@ -123,24 +212,34 @@ def connect_backend(backend_name: str) -> BackendConnection: "pip install 'ibis-framework[databricks]'" ) - conn_cfg = cfg["connection"] + catalog, schema = _infer_databricks_namespace(cfg) db_cfg: dict[str, Any] = { - "host": conn_cfg["server_hostname"], - "http_path": conn_cfg["http_path"], + "server_hostname": _require_config_value( + cfg, ("connection", "server_hostname"), env_var="DATABRICKS_HOST" + ), + "http_path": _require_config_value( + cfg, ("connection", "http_path"), env_var="DATABRICKS_HTTP_PATH" + ), } - if conn_cfg.get("personal_access_token"): - db_cfg["token"] = conn_cfg["personal_access_token"] - if conn_cfg.get("catalog"): - db_cfg["catalog"] = conn_cfg["catalog"] - if conn_cfg.get("schema"): - db_cfg["schema"] = conn_cfg["schema"] + token = _require_config_value( + cfg, ("connection", "personal_access_token"), env_var="DATABRICKS_TOKEN" + 
) + if token: + db_cfg["access_token"] = token + if catalog: + db_cfg["catalog"] = catalog + if schema: + db_cfg["schema"] = schema backend = ibis.databricks.connect(**db_cfg) return BackendConnection( backend=backend, - cdm_schema=cfg["cdm_schema"], - results_schema=cfg["results_schema"], - vocabulary_schema=cfg.get("vocabulary_schema", cfg["cdm_schema"]), + cdm_schema=_require_config_value(cfg, ("cdm_schema",), env_var="DATABRICKS_CDM_SCHEMA"), + results_schema=_require_config_value( + cfg, ("results_schema",), env_var="DATABRICKS_RESULTS_SCHEMA" + ), + vocabulary_schema=cfg.get("vocabulary_schema") + or _require_config_value(cfg, ("cdm_schema",), env_var="DATABRICKS_CDM_SCHEMA"), r_cohort_table=cfg.get("r_cohort_table", R_COHORT_TABLE), py_cohort_table=cfg.get("py_cohort_table", PY_COHORT_TABLE), r_checksum_table=cfg.get("r_checksum_table", R_CHECKSUM_TABLE), diff --git a/benchmarks/benchmark_run_py.py b/benchmarks/benchmark_run_py.py index 7a8e635..02d81b7 100644 --- a/benchmarks/benchmark_run_py.py +++ b/benchmarks/benchmark_run_py.py @@ -36,6 +36,17 @@ datefmt="%H:%M:%S", ) +logging.getLogger("circe").setLevel(logging.INFO) + +for logger_name in ( + "databricks", + "databricks.sql", + "databricks.sql.client", + "databricks.sql.http", + "urllib3", +): + logging.getLogger(logger_name).setLevel(logging.WARNING) + REPO_ROOT = Path(__file__).resolve().parent.parent OUTPUT_DIR = REPO_ROOT / "benchmark_output" JSON_DIR = OUTPUT_DIR / "phenotype_jsons" diff --git a/benchmarks/benchmark_run_r.R b/benchmarks/benchmark_run_r.R index 7e31b78..a1aba7c 100644 --- a/benchmarks/benchmark_run_r.R +++ b/benchmarks/benchmark_run_r.R @@ -42,6 +42,84 @@ Sys.setenv(EUNOMIA_DATA_FOLDER = EUNOMIA_DATA_DIR) DUCKDB_PATH <- file.path(OUTPUT_DIR, "eunomia.duckdb") +cfg_value <- function(value, default = "") { + if (is.null(value) || identical(value, "")) { + return(default) + } + value +} + +trim_quotes <- function(value) { + sub("^(['\"])", "", sub("(['\"])$", "", trimws(value))) +} + 
+expand_env_vars <- function(value) { + matches <- gregexpr("\\$\\{([A-Za-z0-9_]+)\\}", value, perl = TRUE) + tokens <- regmatches(value, matches)[[1]] + if (length(tokens) == 0) { + return(value) + } + + expanded <- value + for (token in unique(tokens)) { + var_name <- sub("^\\$\\{", "", sub("\\}$", "", token)) + expanded <- gsub(token, Sys.getenv(var_name, unset = ""), expanded, fixed = TRUE) + } + expanded +} + +load_databricks_config <- function(config_path) { + if (!file.exists(config_path)) { + stop(sprintf("Config not found: %s", config_path)) + } + + lines <- readLines(config_path, warn = FALSE) + section_started <- FALSE + current_group <- NULL + values <- list() + + for (line in lines) { + if (grepl("^\\s*$", line) || grepl("^\\s*#", line)) { + next + } + + if (!section_started) { + if (grepl("^databricks:\\s*$", line)) { + section_started <- TRUE + } + next + } + + if (grepl("^[A-Za-z0-9_-]+:\\s*$", line)) { + break + } + + if (grepl("^ [A-Za-z0-9_-]+:\\s*$", line)) { + current_group <- sub("^ ([A-Za-z0-9_-]+):\\s*$", "\\1", line) + if (is.null(values[[current_group]])) { + values[[current_group]] <- list() + } + next + } + + if (grepl("^ [A-Za-z0-9_-]+:\\s*", line)) { + key <- sub("^ ([A-Za-z0-9_-]+):.*$", "\\1", line) + raw_value <- sub("^ [A-Za-z0-9_-]+:\\s*", "", line) + values[[key]] <- expand_env_vars(trim_quotes(raw_value)) + current_group <- NULL + next + } + + if (!is.null(current_group) && grepl("^ [A-Za-z0-9_-]+:\\s*", line)) { + key <- sub("^ ([A-Za-z0-9_-]+):.*$", "\\1", line) + raw_value <- sub("^ [A-Za-z0-9_-]+:\\s*", "", line) + values[[current_group]][[key]] <- expand_env_vars(trim_quotes(raw_value)) + } + } + + values +} + # --------------------------------------------------------------------------- # 1. 
Load phenotype definitions from PhenotypeLibrary # --------------------------------------------------------------------------- @@ -101,46 +179,35 @@ if (backend == "duckdb") { cat("Setting up Databricks connection...\n") - # Read YAML config config_path <- file.path(dirname(script_path), "benchmark_db_config.yaml") - if (!file.exists(config_path)) { - stop(sprintf("Config not found: %s", config_path)) - } + cfg <- load_databricks_config(config_path) + conn_cfg <- cfg$connection - # Simple YAML reader — extracts top-level key's connection block - yaml_txt <- readLines(config_path, warn = FALSE) - yaml_txt <- yaml_txt[!grepl("^\\s*#", yaml_txt)] # strip comments + server_hostname <- cfg_value(conn_cfg$server_hostname) + http_path <- cfg_value(conn_cfg$http_path) + databricks_token <- cfg_value(conn_cfg$personal_access_token) - extract_yaml <- function(key) { - pattern <- sprintf("^\\s*%s\\s*:\\s*[\"']?(.+?)[\"']?\\s*$", key) - line <- grep(pattern, yaml_txt, value = TRUE) - if (length(line) == 0) return("") - sub(pattern, "\\1", line[1]) + if (server_hostname == "" || http_path == "" || databricks_token == "") { + stop( + paste( + "Databricks credentials not found in benchmarks/benchmark_db_config.yaml.", + "Set DATABRICKS_HOST, DATABRICKS_HTTP_PATH, and DATABRICKS_TOKEN", + "before running the benchmarks." + ) + ) } - resolve_env <- function(val) { - # Expand ${VAR} placeholders - gsub("\\$\\{(\\w+)\\}", function(m) { - v <- Sys.getenv(gsub("[${}]", "", m), unset = "") - v - }, val, perl = TRUE) + CDM_SCHEMA <- cfg_value(cfg$cdm_schema) + RESULTS_SCHEMA <- cfg_value(cfg$results_schema) + VOCABULARY_SCHEMA <- cfg_value(cfg$vocabulary_schema, CDM_SCHEMA) + if (CDM_SCHEMA == "") { + stop("Databricks cdm_schema is required. 
Set DATABRICKS_CDM_SCHEMA in the environment or update the benchmark config.") } - - server_hostname <- resolve_env(extract_yaml("server_hostname")) - http_path <- resolve_env(extract_yaml("http_path")) - databricks_token <- resolve_env(extract_yaml("personal_access_token")) - - if (server_hostname == "" || http_path == "" || databricks_token == "") { - stop("Databricks credentials not found. Set DATABRICKS_HOST, DATABRICKS_HTTP_PATH, and DATABRICKS_TOKEN environment variables.") + if (RESULTS_SCHEMA == "") { + stop("Databricks results_schema is required. Set DATABRICKS_RESULTS_SCHEMA in the environment or update the benchmark config.") } - CDM_SCHEMA <- resolve_env(extract_yaml("cdm_schema")) - if (CDM_SCHEMA == "") CDM_SCHEMA <- "hive_metastore.omop_cdm" - - RESULTS_SCHEMA <- resolve_env(extract_yaml("results_schema")) - if (RESULTS_SCHEMA == "") RESULTS_SCHEMA <- "hive_metastore.results" - - COHORT_TABLE <- "cohort_r" + COHORT_TABLE <- cfg_value(cfg$r_cohort_table, "cohort") TEMP_EMULATION_SCHEMA <- RESULTS_SCHEMA # Databricks needs a real schema for temp conn_string <- paste0( @@ -160,6 +227,9 @@ if (backend == "duckdb") { } cat(sprintf("CDM schema : %s\n", CDM_SCHEMA)) +if (backend == "databricks") { + cat(sprintf("Vocabulary schema: %s\n", VOCABULARY_SCHEMA)) +} cat(sprintf("Results schema : %s\n", RESULTS_SCHEMA)) cat(sprintf("Cohort table : %s\n", COHORT_TABLE)) From 441290283e89f68f40f4413c70d50b9950343206 Mon Sep 17 00:00:00 2001 From: jgilber2 Date: Mon, 1 Jun 2026 11:51:33 -0700 Subject: [PATCH 45/53] git ignore --- .gitignore | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 31de11d..e0c867f 100644 --- a/.gitignore +++ b/.gitignore @@ -179,4 +179,8 @@ debug_app/user_overrides.json debug_app/test_results.json .test_baseline.json -.test_final.json \ No newline at end of file +.test_final.json + +# Benchmark outputs (generated by running benchmarks) +benchmark_output/ +renv.lock \ No newline at end of 
file From f0dca61f5f6e2a3201137e82103e2e17045aceb4 Mon Sep 17 00:00:00 2001 From: jgilber2 Date: Mon, 1 Jun 2026 11:51:49 -0700 Subject: [PATCH 46/53] git ignore --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index e0c867f..ea2f543 100644 --- a/.gitignore +++ b/.gitignore @@ -183,4 +183,5 @@ debug_app/test_results.json # Benchmark outputs (generated by running benchmarks) benchmark_output/ -renv.lock \ No newline at end of file +renv.lock +eunomia_data/ \ No newline at end of file From e92f452209320bc809893ac24b896dbde85a050a Mon Sep 17 00:00:00 2001 From: jgilber2 Date: Mon, 1 Jun 2026 11:51:59 -0700 Subject: [PATCH 47/53] git ignore --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index ea2f543..b26d71d 100644 --- a/.gitignore +++ b/.gitignore @@ -184,4 +184,5 @@ debug_app/test_results.json # Benchmark outputs (generated by running benchmarks) benchmark_output/ renv.lock -eunomia_data/ \ No newline at end of file +eunomia_data/ +renv/ \ No newline at end of file From 77da42fff1b123efa07c619f1cdfe6db48dcce1f Mon Sep 17 00:00:00 2001 From: jgilber2 Date: Mon, 1 Jun 2026 11:52:16 -0700 Subject: [PATCH 48/53] git ignore --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index b26d71d..1e6e7b7 100644 --- a/.gitignore +++ b/.gitignore @@ -185,4 +185,5 @@ debug_app/test_results.json benchmark_output/ renv.lock eunomia_data/ -renv/ \ No newline at end of file +renv/ +.Rprofile \ No newline at end of file From f830cb5171c1e194e3a26bc3967c467b04d4d294 Mon Sep 17 00:00:00 2001 From: Jamie Gilbert Date: Wed, 17 Jun 2026 09:17:46 -0700 Subject: [PATCH 49/53] removed benchmarking code (moved to stand alone repository) --- .gitignore | 1 + benchmark_output/benchmark_report.qmd | 465 ------------------------- benchmarks/README.md | 96 ----- benchmarks/_backend.py | 249 ------------- 
benchmarks/benchmark_analyze_duckdb.py | 230 ------------ benchmarks/benchmark_db_config.yaml | 73 ---- benchmarks/benchmark_run_py.py | 160 --------- benchmarks/benchmark_run_r.R | 282 --------------- benchmarks/compare_cohort_outputs.py | 270 -------------- benchmarks/error_repoer.txt | 90 ----- benchmarks/export_phenotypes.R | 43 --- 11 files changed, 1 insertion(+), 1958 deletions(-) delete mode 100644 benchmark_output/benchmark_report.qmd delete mode 100644 benchmarks/README.md delete mode 100644 benchmarks/_backend.py delete mode 100644 benchmarks/benchmark_analyze_duckdb.py delete mode 100644 benchmarks/benchmark_db_config.yaml delete mode 100644 benchmarks/benchmark_run_py.py delete mode 100644 benchmarks/benchmark_run_r.R delete mode 100644 benchmarks/compare_cohort_outputs.py delete mode 100644 benchmarks/error_repoer.txt delete mode 100644 benchmarks/export_phenotypes.R diff --git a/.gitignore b/.gitignore index 1e6e7b7..4aa9d29 100644 --- a/.gitignore +++ b/.gitignore @@ -183,6 +183,7 @@ debug_app/test_results.json # Benchmark outputs (generated by running benchmarks) benchmark_output/ +circepy_benchmarks/ renv.lock eunomia_data/ renv/ diff --git a/benchmark_output/benchmark_report.qmd b/benchmark_output/benchmark_report.qmd deleted file mode 100644 index 03e0cc6..0000000 --- a/benchmark_output/benchmark_report.qmd +++ /dev/null @@ -1,465 +0,0 @@ ---- -title: "CircePy vs CohortGenerator: Databricks Execution Time Benchmark" -format: - html: - toc: true - code-fold: true - theme: cosmo - self-contained: true -execute: - warning: false - message: false ---- - -## CIRCE Expression Model - -The CIRCE cohort expression schema defines a declarative structure for specifying -patient cohorts on the OMOP CDM. CircePy implements this schema as a typed Python -class hierarchy mirroring the Java original. 
- -```{mermaid} -classDiagram - direction TB - - class CohortExpression { - +concept_sets: list~ConceptSet~ - +primary_criteria: PrimaryCriteria - +additional_criteria: CriteriaGroup - +inclusion_rules: list~InclusionRule~ - +end_strategy: EndStrategy - +collapse_settings: CollapseSettings - +censoring_criteria: list~Criteria~ - } - - class PrimaryCriteria { - +criteria_list: list~Criteria~ - +observation_window: ObservationFilter - +primary_limit: ResultLimit - } - - class Criteria { - <> - +codeset_id: int - +date_adjustment: DateAdjustment - +correlated_criteria: CriteriaGroup - } - - class ConditionOccurrence { - +condition_type: list~Concept~ - +stop_reason: TextFilter - +condition_status: list~Concept~ - } - - class DrugExposure { - +drug_type: list~Concept~ - +refills: NumericRange - +days_supply: NumericRange - +route_concept: list~Concept~ - } - - class VisitOccurrence { - +visit_type: list~Concept~ - +visit_length: NumericRange - } - - class CriteriaGroup { - +type: ALL | ANY | AT_LEAST | AT_MOST - +criteria_list: list~CorelatedCriteria~ - +groups: list~CriteriaGroup~ - +demographic_criteria_list: list~DemographicCriteria~ - } - - class CorelatedCriteria { - +criteria: Criteria - +start_window: Window - +end_window: Window - +occurrence: Occurrence - } - - class Window { - +start: WindowBound - +end: WindowBound - +use_event_end: bool - +use_index_end: bool - } - - class InclusionRule { - +name: str - +expression: CriteriaGroup - } - - class EndStrategy { - <> - } - - class DateOffsetStrategy { - +date_field: str - +offset: int - } - - class CustomEraStrategy { - +drug_codeset_id: int - +gap_days: int - +offset: int - } - - class ConceptSet { - +id: int - +name: str - +expression: ConceptSetExpression - } - - class CollapseSettings { - +collapse_type: ERA | NO_COLLAPSE - +era_pad: int - } - - CohortExpression *-- PrimaryCriteria - CohortExpression *-- CriteriaGroup : additional_criteria - CohortExpression *-- InclusionRule - CohortExpression *-- 
EndStrategy - CohortExpression *-- CollapseSettings - CohortExpression *-- ConceptSet - - PrimaryCriteria *-- Criteria - - Criteria <|-- ConditionOccurrence - Criteria <|-- DrugExposure - Criteria <|-- VisitOccurrence - - EndStrategy <|-- DateOffsetStrategy - EndStrategy <|-- CustomEraStrategy - - InclusionRule *-- CriteriaGroup - CriteriaGroup *-- CorelatedCriteria - CriteriaGroup *-- CriteriaGroup : nested groups - CorelatedCriteria *-- Criteria - CorelatedCriteria *-- Window -``` - -The model supports 16 OMOP CDM domain criteria (only three shown above for brevity), -each inheriting from the abstract `Criteria` base class. Temporal relationships between -events are expressed through `CorelatedCriteria` with `Window` bounds, enabling -arbitrary time-relative logic. - -## Overview - -This report compares SQL execution times between the R (CohortGenerator) and Python -(CircePy) implementations of the OHDSI CIRCE cohort expression compiler, both -executing against the same Databricks SQL Warehouse on the Merative CCAE CDM -(~150M patients). 
- -Both implementations: - -- Compile identical CIRCE JSON cohort definitions into SQL -- Execute against the same Databricks SQL Warehouse endpoint -- Write results to the same `scratch.scratch_jgilber2` schema -- Produce row-level identical output (verified separately) - -```{python} -import pandas as pd -import numpy as np -from pathlib import Path - -# Load data -times = pd.read_csv(Path("execution_times.csv")) -counts = pd.read_csv(Path("cohort_row_counts.csv")) - -# Pivot to wide format for paired comparison -r_times = times[times.implementation == "R"].rename(columns={"duration_s": "r_seconds"}) -py_times = times[times.implementation == "Python"].rename(columns={"duration_s": "py_seconds"}) - -paired = pd.merge( - r_times[["cohort_definition_id", "r_seconds"]], - py_times[["cohort_definition_id", "py_seconds"]], - on="cohort_definition_id", -) -paired = pd.merge(paired, counts, on="cohort_definition_id", how="left") -paired["speedup"] = paired["r_seconds"] / paired["py_seconds"] -paired["diff_seconds"] = paired["py_seconds"] - paired["r_seconds"] -paired["pct_diff"] = (paired["diff_seconds"] / paired["r_seconds"]) * 100 -``` - -## Summary Statistics - -```{python} -#| label: tbl-summary -#| tbl-cap: "Aggregate execution time comparison (707 shared cohorts)" - -summary = pd.DataFrame({ - "Metric": [ - "Total wall-clock time", - "Mean per cohort", - "Median per cohort", - "Std dev", - "Min", - "Max", - "Cohorts where Python faster", - ], - "R (CohortGenerator)": [ - f"{paired.r_seconds.sum():,.0f}s ({paired.r_seconds.sum()/3600:.1f}h)", - f"{paired.r_seconds.mean():.1f}s", - f"{paired.r_seconds.median():.1f}s", - f"{paired.r_seconds.std():.1f}s", - f"{paired.r_seconds.min():.1f}s", - f"{paired.r_seconds.max():.1f}s", - "", - ], - "Python (CircePy)": [ - f"{paired.py_seconds.sum():,.0f}s ({paired.py_seconds.sum()/3600:.1f}h)", - f"{paired.py_seconds.mean():.1f}s", - f"{paired.py_seconds.median():.1f}s", - f"{paired.py_seconds.std():.1f}s", - 
f"{paired.py_seconds.min():.1f}s", - f"{paired.py_seconds.max():.1f}s", - f"{(paired.py_seconds < paired.r_seconds).sum()} / {len(paired)} ({(paired.py_seconds < paired.r_seconds).mean()*100:.0f}%)", - ], -}) - -summary.style.hide(axis="index") -``` - -```{python} -#| label: tbl-ratio -#| tbl-cap: "Speed ratio distribution (R time / Python time)" - -quantiles = paired.speedup.quantile([0.05, 0.25, 0.5, 0.75, 0.95]) -ratio_summary = pd.DataFrame({ - "Percentile": ["5th", "25th", "Median", "75th", "95th"], - "R/Python ratio": [f"{v:.2f}" for v in quantiles.values], - "Interpretation": [ - f"Python {1/quantiles.iloc[0]:.1f}x slower" if quantiles.iloc[0] > 1 else f"Python {quantiles.iloc[0]:.1f}x faster", - f"Python {1/quantiles.iloc[1]:.1f}x slower" if quantiles.iloc[1] > 1 else f"Python {quantiles.iloc[1]:.1f}x faster", - f"Python {1/quantiles.iloc[2]:.1f}x slower" if quantiles.iloc[2] > 1 else f"Python {quantiles.iloc[2]:.1f}x faster", - f"Python {1/quantiles.iloc[3]:.1f}x slower" if quantiles.iloc[3] > 1 else f"Python {quantiles.iloc[3]:.1f}x faster", - f"Python {1/quantiles.iloc[4]:.1f}x slower" if quantiles.iloc[4] > 1 else f"Python {quantiles.iloc[4]:.1f}x faster", - ], -}) -ratio_summary.style.hide(axis="index") -``` - -## Execution Time Distribution - -```{python} -#| label: fig-distribution -#| fig-cap: "Distribution of per-cohort execution times by implementation" -#| fig-width: 10 -#| fig-height: 5 - -import matplotlib.pyplot as plt - -fig, axes = plt.subplots(1, 2, figsize=(10, 5)) - -# Histogram -axes[0].hist(paired.r_seconds, bins=50, alpha=0.6, label="R", color="#2196F3") -axes[0].hist(paired.py_seconds, bins=50, alpha=0.6, label="Python", color="#FF9800") -axes[0].set_xlabel("Execution time (seconds)") -axes[0].set_ylabel("Number of cohorts") -axes[0].set_title("Execution Time Distribution") -axes[0].legend() - -# Log-scale box plot -data_box = pd.DataFrame({ - "R": paired.r_seconds, - "Python": paired.py_seconds, -}) 
-axes[1].boxplot([paired.r_seconds, paired.py_seconds], labels=["R", "Python"]) -axes[1].set_ylabel("Execution time (seconds)") -axes[1].set_title("Execution Time (Box Plot)") -axes[1].set_yscale("log") - -plt.tight_layout() -plt.show() -``` - -## Paired Comparison (Scatter) - -```{python} -#| label: fig-scatter -#| fig-cap: "Per-cohort execution time: R vs Python (each point = one cohort definition)" -#| fig-width: 8 -#| fig-height: 7 - -fig, ax = plt.subplots(figsize=(8, 7)) - -max_val = max(paired.r_seconds.max(), paired.py_seconds.max()) * 1.05 - -ax.scatter(paired.r_seconds, paired.py_seconds, alpha=0.4, s=15, color="#455A64") -ax.plot([0, max_val], [0, max_val], "k--", linewidth=1, label="y = x (equal time)") -ax.set_xlabel("R execution time (seconds)") -ax.set_ylabel("Python execution time (seconds)") -ax.set_title("Per-Cohort Execution Time: R vs Python") -ax.legend() -ax.set_xlim(0, max_val) -ax.set_ylim(0, max_val) -ax.set_aspect("equal") - -plt.tight_layout() -plt.show() -``` - -```{python} -#| label: fig-scatter-log -#| fig-cap: "Same comparison on log-log scale to reveal patterns across magnitudes" -#| fig-width: 8 -#| fig-height: 7 - -fig, ax = plt.subplots(figsize=(8, 7)) - -ax.scatter(paired.r_seconds, paired.py_seconds, alpha=0.4, s=15, color="#455A64") - -lims = [ - min(paired.r_seconds.min(), paired.py_seconds.min()) * 0.8, - max(paired.r_seconds.max(), paired.py_seconds.max()) * 1.2, -] -ax.plot(lims, lims, "k--", linewidth=1, label="y = x (equal time)") -ax.set_xlabel("R execution time (seconds)") -ax.set_ylabel("Python execution time (seconds)") -ax.set_title("Per-Cohort Execution Time: R vs Python (Log Scale)") -ax.set_xscale("log") -ax.set_yscale("log") -ax.legend() - -plt.tight_layout() -plt.show() -``` - -## Speed Ratio Distribution - -```{python} -#| label: fig-speedup -#| fig-cap: "Distribution of R/Python speed ratio (>1 means R is faster)" -#| fig-width: 9 -#| fig-height: 5 - -fig, ax = plt.subplots(figsize=(9, 5)) - 
-ax.hist(paired.speedup, bins=50, color="#607D8B", edgecolor="white", linewidth=0.5) -ax.axvline(x=1.0, color="red", linestyle="--", linewidth=1.5, label="Equal speed") -ax.axvline(x=paired.speedup.median(), color="#FF9800", linestyle="-", linewidth=1.5, - label=f"Median = {paired.speedup.median():.2f}") -ax.set_xlabel("Speed ratio (R seconds / Python seconds)") -ax.set_ylabel("Number of cohorts") -ax.set_title("Speed Ratio Distribution") -ax.legend() - -plt.tight_layout() -plt.show() -``` - -## Per-Cohort Detail - -```{python} -#| label: fig-waterfall -#| fig-cap: "Per-cohort time difference (Python − R), sorted by magnitude" -#| fig-width: 10 -#| fig-height: 6 - -sorted_diff = paired.sort_values("diff_seconds").reset_index(drop=True) - -fig, ax = plt.subplots(figsize=(10, 6)) -colors = ["#4CAF50" if d < 0 else "#F44336" for d in sorted_diff.diff_seconds] -ax.bar(range(len(sorted_diff)), sorted_diff.diff_seconds, color=colors, width=1.0) -ax.axhline(y=0, color="black", linewidth=0.5) -ax.set_xlabel("Cohort (sorted by time difference)") -ax.set_ylabel("Time difference: Python − R (seconds)") -ax.set_title("Per-Cohort Time Difference (green = Python faster, red = R faster)") - -plt.tight_layout() -plt.show() -``` - -## Execution Time vs Cohort Size - -```{python} -#| label: fig-size-vs-time -#| fig-cap: "Relationship between cohort output size and execution time" -#| fig-width: 10 -#| fig-height: 5 - -has_counts = paired.dropna(subset=["n_rows"]) - -if len(has_counts) > 0: - fig, axes = plt.subplots(1, 2, figsize=(10, 5)) - - axes[0].scatter(has_counts.n_rows, has_counts.r_seconds, alpha=0.3, s=10, label="R", color="#2196F3") - axes[0].scatter(has_counts.n_rows, has_counts.py_seconds, alpha=0.3, s=10, label="Python", color="#FF9800") - axes[0].set_xlabel("Cohort output rows") - axes[0].set_ylabel("Execution time (seconds)") - axes[0].set_xscale("log") - axes[0].set_yscale("log") - axes[0].set_title("Execution Time vs Cohort Size") - axes[0].legend() - - 
axes[1].scatter(has_counts.n_rows, has_counts.speedup, alpha=0.3, s=10, color="#455A64") - axes[1].axhline(y=1.0, color="red", linestyle="--", linewidth=1) - axes[1].set_xlabel("Cohort output rows") - axes[1].set_ylabel("R/Python speed ratio") - axes[1].set_xscale("log") - axes[1].set_title("Speed Ratio vs Cohort Size") - - plt.tight_layout() - plt.show() -``` - -## Slowest Cohorts - -```{python} -#| label: tbl-slowest -#| tbl-cap: "Top 20 slowest cohorts (by Python execution time)" - -top20 = paired.nlargest(20, "py_seconds")[ - ["cohort_definition_id", "r_seconds", "py_seconds", "diff_seconds", "speedup", "n_rows"] -].copy() -top20.columns = ["Cohort ID", "R (s)", "Python (s)", "Diff (s)", "R/Py Ratio", "Output Rows"] -top20["R (s)"] = top20["R (s)"].round(1) -top20["Python (s)"] = top20["Python (s)"].round(1) -top20["Diff (s)"] = top20["Diff (s)"].round(1) -top20["R/Py Ratio"] = top20["R/Py Ratio"].round(2) -top20["Output Rows"] = top20["Output Rows"].apply(lambda x: f"{x:,.0f}" if pd.notna(x) else "N/A") -top20.style.hide(axis="index") -``` - -## Largest Python Advantage - -```{python} -#| label: tbl-py-faster -#| tbl-cap: "Top 20 cohorts where Python is fastest relative to R" - -py_wins = paired.nlargest(20, "speedup")[ - ["cohort_definition_id", "r_seconds", "py_seconds", "speedup", "n_rows"] -].copy() -py_wins.columns = ["Cohort ID", "R (s)", "Python (s)", "R/Py Ratio", "Output Rows"] -py_wins["R (s)"] = py_wins["R (s)"].round(1) -py_wins["Python (s)"] = py_wins["Python (s)"].round(1) -py_wins["R/Py Ratio"] = py_wins["R/Py Ratio"].round(2) -py_wins["Output Rows"] = py_wins["Output Rows"].apply(lambda x: f"{x:,.0f}" if pd.notna(x) else "N/A") -py_wins.style.hide(axis="index") -``` - -## Largest R Advantage - -```{python} -#| label: tbl-r-faster -#| tbl-cap: "Top 20 cohorts where R is fastest relative to Python" - -r_wins = paired.nsmallest(20, "speedup")[ - ["cohort_definition_id", "r_seconds", "py_seconds", "speedup", "n_rows"] -].copy() -r_wins.columns = 
["Cohort ID", "R (s)", "Python (s)", "R/Py Ratio", "Output Rows"] -r_wins["R (s)"] = r_wins["R (s)"].round(1) -r_wins["Python (s)"] = r_wins["Python (s)"].round(1) -r_wins["R/Py Ratio"] = r_wins["R/Py Ratio"].round(2) -r_wins["Output Rows"] = r_wins["Output Rows"].apply(lambda x: f"{x:,.0f}" if pd.notna(x) else "N/A") -r_wins.style.hide(axis="index") -``` - -## Methodology - -- **CDM**: Merative CCAE v3909 (~150M patients) -- **Backend**: Databricks SQL Warehouse (shared endpoint) -- **R stack**: CohortGenerator → DatabaseConnector → JDBC → Spark SQL -- **Python stack**: CircePy → ibis-framework → databricks-sql-connector → Spark SQL -- **Timing**: Wall-clock time from checksum table (`end_time - start_time`) — includes SQL compilation, network round-trips, and warehouse execution -- **Cohort definitions**: 707 PhenotypeLibrary phenotypes compiled and executed by both implementations -- **Correctness**: Row-level output verified identical between implementations diff --git a/benchmarks/README.md b/benchmarks/README.md deleted file mode 100644 index 4f0cb74..0000000 --- a/benchmarks/README.md +++ /dev/null @@ -1,96 +0,0 @@ -# CircePy CohortGeneration Benchmarks - -R and Python benchmarks that generate OHDSI PhenotypeLibrary cohort definitions -against the Eunomia synthetic OMOP CDM (DuckDB backend). Measures per-cohort -generation time and compares R `CohortGenerator` vs Python `circe` performance. - -## Prerequisites - -Install the required R packages: - -```r -install.packages("remotes") -remotes::install_github("OHDSI/Eunomia") -remotes::install_github("OHDSI/CohortGenerator") -remotes::install_github("OHDSI/DatabaseConnector") -remotes::install_github("OHDSI/PhenotypeLibrary") -``` - -Install the Python package with DuckDB support: - -```bash -pip install -e ".[dev]" -``` - -For Databricks benchmarks, install the Databricks backend extra as well: - -```bash -pip install -e ".[dev,ibis-databricks]" -``` - -## Quick Start - -```bash -# 1. 
Export PhenotypeLibrary cohort JSONs (one-time setup) -Rscript benchmarks/export_phenotypes.R - -# 2. Run R benchmark (creates Eunomia DuckDB, generates cohorts) -Rscript benchmarks/benchmark_run_r.R - -# 3. Run Python benchmark (reuses same DuckDB, generates cohorts) -python benchmarks/benchmark_run_py.py - -# 4. Analyze and compare results -python benchmarks/benchmark_analyze_duckdb.py -``` - -## Databricks - -The benchmark scripts can also run against a Databricks SQL warehouse. Set the -connection and schema environment variables referenced by -`benchmark_db_config.yaml`, then run the same entry points with -`--backend databricks`: - -```bash -export DATABRICKS_HOST="adb-..databricks.net" -export DATABRICKS_HTTP_PATH="/sql/1.0/warehouses/" -export DATABRICKS_TOKEN="..." -export DATABRICKS_CDM_SCHEMA="catalog.schema" -export DATABRICKS_VOCABULARY_SCHEMA="catalog.schema" -export DATABRICKS_RESULTS_SCHEMA="catalog.schema" - -Rscript benchmarks/benchmark_run_r.R --backend databricks -python benchmarks/benchmark_run_py.py --backend databricks -python benchmarks/benchmark_analyze_duckdb.py --backend databricks -``` - -If you use Unity Catalog defaults, you can also set `DATABRICKS_CATALOG` and -`DATABRICKS_DATABASE` for the Python Ibis connection. 
- -## Files - -| File | Language | Purpose | -|------|----------|---------| -| `export_phenotypes.R` | R | Exports PhenotypeLibrary cohort JSONs for Python consumption | -| `benchmark_run_r.R` | R | Runs R CohortGenerator against Eunomia DuckDB | -| `benchmark_run_py.py` | Python | Runs Python `generate_cohort_set()` against Eunomia DuckDB | -| `benchmark_analyze_duckdb.py` | Python | Side-by-side comparison of R vs Python timing, cross-validation | -| `benchmark_db_config.yaml` | — | Database backend configurations | - -## Output - -All output is written to `benchmark_output/`: - -| File | Source | Description | -|------|--------|-------------| -| `eunomia.duckdb` | R | Persistent DuckDB with Eunomia GiBleed CDM | -| `phenotype_jsons/` | R export | One Circe JSON per PhenotypeLibrary cohort | -| `phenotype_manifest.csv` | R export | Cohort ID and name mapping | -| `r_checksum_times.csv` | R benchmark | Per-cohort generation timing from R | -| `py_checksum_times.csv` | Python benchmark | Per-cohort generation timing from Python | - -## Incremental Mode - -Both benchmarks use incremental generation. After the first run, unchanged cohorts -are skipped based on SHA-256 checksums of their definitions. Timing from the -original run is preserved in the checksum history table. diff --git a/benchmarks/_backend.py b/benchmarks/_backend.py deleted file mode 100644 index 32b69a5..0000000 --- a/benchmarks/_backend.py +++ /dev/null @@ -1,249 +0,0 @@ -"""Shared backend connection helpers for the benchmarks. - -Used by both :file:`benchmark_run_py.py` and :file:`benchmark_analyze_duckdb.py` -to connect to DuckDB (local file) or Databricks (via YAML config) uniformly. -""" - -from __future__ import annotations - -import os -import re -from dataclasses import dataclass -from pathlib import Path -from typing import Any - -import ibis -import yaml - -# Optional Databricks support — checked lazily at connect time. 
-try: - import ibis.backends.databricks # noqa: F401 - - _HAS_IBIS_DATABRICKS = True -except ImportError: - _HAS_IBIS_DATABRICKS = False - -REPO_ROOT = Path(__file__).resolve().parent.parent -OUTPUT_DIR = REPO_ROOT / "benchmark_output" -CONFIG_PATH = Path(__file__).resolve().parent / "benchmark_db_config.yaml" -DUCKDB_PATH = OUTPUT_DIR / "eunomia.duckdb" - -R_COHORT_TABLE = "cohort" -PY_COHORT_TABLE = "cohort_py" -R_CHECKSUM_TABLE = "cohort_checksum" -PY_CHECKSUM_TABLE = "cohort_py_checksum" - -# CSV paths -R_CSV = OUTPUT_DIR / "r_checksum_times.csv" -PY_CSV = OUTPUT_DIR / "py_checksum_times.csv" - -ENV_PATH = REPO_ROOT / ".env" - - -def _strip_wrapping_quotes(value: str) -> str: - if len(value) >= 2 and value[0] == value[-1] and value[0] in {"'", '"'}: - return value[1:-1] - return value - - -def _load_env_file() -> None: - """Load repo-local environment variables without overriding the shell.""" - if not ENV_PATH.exists(): - return - - for raw_line in ENV_PATH.read_text().splitlines(): - line = raw_line.strip() - if not line or line.startswith("#") or "=" not in line: - continue - - key, value = line.split("=", 1) - key = key.strip() - if not key or key in os.environ: - continue - - os.environ[key] = _strip_wrapping_quotes(value.strip()) - - -_load_env_file() - - -def _expandvars(text: str) -> str: - """Expand ``${ENV_VAR}`` patterns in *text*, falling back to an empty string.""" - return re.sub( - r"\$\{(\w+)\}", - lambda m: _get_env_var(m.group(1)), - text, - ) - - -def _get_env_var(name: str) -> str: - """Return an environment variable, including benchmark-specific aliases.""" - value = os.environ.get(name) - if value: - return value - - aliases = { - "DATABRICKS_RESULTS_SCHEMA": "DATABRICKS_SCRATCH_SCHEMA", - } - alias = aliases.get(name) - if alias is None: - return "" - return os.environ.get(alias, "") - - -def _expandvars_recursive(obj: Any) -> Any: - """Expand environment variables throughout a nested dict/list.""" - if isinstance(obj, str): - return 
_expandvars(obj) - if isinstance(obj, dict): - return {k: _expandvars_recursive(v) for k, v in obj.items()} - if isinstance(obj, list): - return [_expandvars_recursive(v) for v in obj] - return obj - - -def _require_config_value(cfg: dict[str, Any], path: tuple[str, ...], env_var: str | None = None) -> str: - """Return a non-empty configuration value or raise a helpful error.""" - current: Any = cfg - for key in path: - if not isinstance(current, dict): - current = None - break - current = current.get(key) - - if isinstance(current, str) and current: - return current - - dotted = ".".join(path) - if env_var is not None: - raise ValueError( - f"Missing Databricks config value '{dotted}'. Set {env_var} or update {CONFIG_PATH}." - ) - raise ValueError(f"Missing Databricks config value '{dotted}' in {CONFIG_PATH}.") - - -def _split_catalog_schema(qualified_schema: str | None) -> tuple[str | None, str | None]: - """Split a qualified Databricks schema into catalog and schema parts.""" - if not qualified_schema: - return None, None - - parts = [part for part in qualified_schema.split(".") if part] - if len(parts) >= 2: - return parts[0], parts[1] - return None, parts[0] if parts else None - - -def _infer_databricks_namespace(cfg: dict[str, Any]) -> tuple[str | None, str | None]: - """Infer a sensible catalog/schema for the initial Databricks connection.""" - conn_cfg = cfg.get("connection", {}) - if conn_cfg.get("catalog") or conn_cfg.get("schema"): - return conn_cfg.get("catalog"), conn_cfg.get("schema") - - for key in ("results_schema", "cdm_schema", "vocabulary_schema"): - catalog, schema = _split_catalog_schema(cfg.get(key)) - if catalog or schema: - return catalog, schema - - return None, None - - -@dataclass -class BackendConnection: - """Hold the configured connection and schema information for a benchmark run.""" - - backend: ibis.BaseBackend - cdm_schema: str - results_schema: str - vocabulary_schema: str - r_cohort_table: str - py_cohort_table: str - 
r_checksum_table: str - py_checksum_table: str - - -def load_config(backend_name: str) -> dict[str, Any]: - """Load the YAML configuration for *backend_name*. - - ``${ENV_VAR}`` placeholders are expanded from the process environment. - """ - if not CONFIG_PATH.exists(): - raise FileNotFoundError(f"Config not found: {CONFIG_PATH}") - - config = yaml.safe_load(CONFIG_PATH.read_text()) - section = config.get(backend_name) - if section is None: - available = [k for k in config if k != "eunomia"] - raise ValueError(f"Unknown backend '{backend_name}'. Available: {', '.join(available)}") - - return _expandvars_recursive(section) - - -def connect_backend(backend_name: str) -> BackendConnection: - """Create and return a backend connection from the YAML config. - - For ``duckdb`` the configuration is pre-set (points to the local Eunomia - DuckDB file). For ``databricks`` the configuration must be provided in - :file:`benchmarks/benchmark_db_config.yaml`. - """ - if backend_name == "duckdb": - if not DUCKDB_PATH.exists(): - raise FileNotFoundError( - f"{DUCKDB_PATH} not found. Run 'Rscript benchmarks/benchmark_run_r.R' first." - ) - backend = ibis.duckdb.connect(str(DUCKDB_PATH)) - return BackendConnection( - backend=backend, - cdm_schema="main", - results_schema="main", - vocabulary_schema="main", - r_cohort_table=R_COHORT_TABLE, - py_cohort_table=PY_COHORT_TABLE, - r_checksum_table=R_CHECKSUM_TABLE, - py_checksum_table=PY_CHECKSUM_TABLE, - ) - - cfg = load_config(backend_name) - driver = cfg.get("driver", backend_name) - - if driver == "databricks": - if not _HAS_IBIS_DATABRICKS: - raise ImportError( - "ibis-framework[databricks] is required. 
Install with: " - "pip install 'ibis-framework[databricks]'" - ) - - catalog, schema = _infer_databricks_namespace(cfg) - db_cfg: dict[str, Any] = { - "server_hostname": _require_config_value( - cfg, ("connection", "server_hostname"), env_var="DATABRICKS_HOST" - ), - "http_path": _require_config_value( - cfg, ("connection", "http_path"), env_var="DATABRICKS_HTTP_PATH" - ), - } - token = _require_config_value( - cfg, ("connection", "personal_access_token"), env_var="DATABRICKS_TOKEN" - ) - if token: - db_cfg["access_token"] = token - if catalog: - db_cfg["catalog"] = catalog - if schema: - db_cfg["schema"] = schema - - backend = ibis.databricks.connect(**db_cfg) - return BackendConnection( - backend=backend, - cdm_schema=_require_config_value(cfg, ("cdm_schema",), env_var="DATABRICKS_CDM_SCHEMA"), - results_schema=_require_config_value( - cfg, ("results_schema",), env_var="DATABRICKS_RESULTS_SCHEMA" - ), - vocabulary_schema=cfg.get("vocabulary_schema") - or _require_config_value(cfg, ("cdm_schema",), env_var="DATABRICKS_CDM_SCHEMA"), - r_cohort_table=cfg.get("r_cohort_table", R_COHORT_TABLE), - py_cohort_table=cfg.get("py_cohort_table", PY_COHORT_TABLE), - r_checksum_table=cfg.get("r_checksum_table", R_CHECKSUM_TABLE), - py_checksum_table=cfg.get("py_checksum_table", PY_CHECKSUM_TABLE), - ) - - raise ValueError(f"Unsupported driver: {driver}") diff --git a/benchmarks/benchmark_analyze_duckdb.py b/benchmarks/benchmark_analyze_duckdb.py deleted file mode 100644 index b897332..0000000 --- a/benchmarks/benchmark_analyze_duckdb.py +++ /dev/null @@ -1,230 +0,0 @@ -#!/usr/bin/env python3 -"""Analyse R and Python benchmark results side-by-side. - -Reads the checksum timing CSVs produced by -:file:`benchmarks/benchmark_run_r.R` and :file:`benchmarks/benchmark_run_py.py`, -queries the persisted history tables directly from the database for -cross-validation, and prints a paper-ready comparative summary. 
- -Usage:: - - python benchmarks/benchmark_analyze_duckdb.py # DuckDB - python benchmarks/benchmark_analyze_duckdb.py --backend databricks -""" - -from __future__ import annotations - -import argparse -from pathlib import Path - -import pandas as pd -from _backend import PY_CSV, R_CSV, connect_backend -from compare_cohort_outputs import compare_cohort_outputs, print_comparison_report - -REPO_ROOT = Path(__file__).resolve().parent.parent - - -def _parse_args() -> argparse.Namespace: - p = argparse.ArgumentParser(description="CircePy benchmark result analyzer") - p.add_argument( - "--backend", - default="duckdb", - choices=("duckdb", "databricks"), - help="Target database backend for cross-validation (default: duckdb)", - ) - return p.parse_args() - - -def load_csv(path: Path) -> pd.DataFrame | None: - """Load a benchmark timing CSV, returning None if it does not exist.""" - if not path.exists(): - print(f" [WARN] {path} not found — skipping") - return None - return pd.read_csv(path) - - -def _has_status(df: pd.DataFrame) -> bool: - return "status" in df.columns - - -def print_coverage(label: str, df: pd.DataFrame) -> None: - n = len(df) - if _has_status(df): - n_ok = (df["status"] == "COMPLETE").sum() - n_fail = (df["status"] == "FAILED").sum() - n_skip = (df["status"] == "SKIPPED").sum() - print(f" {label}: {n} cohorts — {n_ok} COMPLETE, {n_fail} FAILED, {n_skip} SKIPPED") - else: - print(f" {label}: {n} cohorts (checksum table — all assumed COMPLETE)") - - -def print_timing(label: str, df: pd.DataFrame) -> None: - complete = df[df["status"] == "COMPLETE"] if _has_status(df) else df - if complete.empty: - print(f" {label}: no completed cohorts to report timing") - return - secs = complete["generation_seconds"] - print(f" {label} timing (n={len(secs)}):") - print(f" Total : {secs.sum():.4f}s") - print(f" Mean : {secs.mean():.4f}s") - print(f" Median: {secs.median():.4f}s") - print(f" Std : {secs.std():.4f}s") - print(f" Min : {secs.min():.4f}s") - print(f" Max : 
{secs.max():.4f}s") - - -def cross_validate( - label: str, - csv_df: pd.DataFrame, - conn, - cohort_table: str, - checksum_table: str, -) -> None: - """Read the persisted checksum table and compare with the CSV.""" - try: - history = conn.backend.table(checksum_table, database=conn.results_schema).execute() - except Exception: - print(f" {label} cross-validation: checksum table '{checksum_table}' not found") - return - - if history.empty: - print(f" {label} cross-validation: checksum table is empty") - return - - complete_csv = csv_df[csv_df["status"] == "COMPLETE"] if _has_status(csv_df) else csv_df - if complete_csv.empty: - return - - history_complete = history[history["status"] == "COMPLETE"] if _has_status(history) else history - print(f" {label} cross-validation:") - print(f" CSV rows : {len(complete_csv)}") - print(f" DB rows : {len(history_complete)}") - - csv_total = complete_csv["generation_seconds"].sum() - if "start_time" in history_complete.columns and "end_time" in history_complete.columns: - starts = history_complete["start_time"] - ends = history_complete["end_time"] - if pd.api.types.is_datetime64_any_dtype(starts): - db_total = (ends - starts).dt.total_seconds().sum() - else: - db_total = ((ends.astype(float) - starts.astype(float)) / 1000.0).sum() - delta = abs(csv_total - db_total) - print(f" CSV total time : {csv_total:.4f}s") - print(f" DB total time : {db_total:.4f}s") - print(f" Delta : {delta:.4f}s {'✓' if delta < 1.0 else '✗'}") - - -def print_cohort_row_counts(label: str, conn, cohort_table: str) -> None: - """Print row count summary from the cohort output table.""" - try: - rows = conn.backend.table(cohort_table, database=conn.results_schema).execute() - except Exception: - print(f" {label} row counts: table '{cohort_table}' not found") - return - if rows.empty: - print(f" {label}: no cohort rows") - return - counts = rows.groupby("cohort_definition_id").size() - print(f" {label} row counts: {len(counts)} cohorts, {counts.sum()} total 
rows") - print(f" Mean per cohort: {counts.mean():.1f}") - - -def compare_shared(label_prefix: str, r_df: pd.DataFrame, py_df: pd.DataFrame) -> None: - """Compare timing for cohorts present in both runs.""" - r_complete = r_df[r_df["status"] == "COMPLETE"].copy() if _has_status(r_df) else r_df.copy() - py_complete = py_df[py_df["status"] == "COMPLETE"].copy() if _has_status(py_df) else py_df.copy() - if r_complete.empty or py_complete.empty: - return - - r_lookup = r_complete.set_index("cohort_definition_id")["generation_seconds"] - py_lookup = py_complete.set_index("cohort_definition_id")["generation_seconds"] - shared = r_lookup.index.intersection(py_lookup.index) - if len(shared) < 2: - return - - r_shared = r_lookup[shared] - py_shared = py_lookup[shared] - ratio = (py_shared / r_shared.replace(0, float("nan"))).dropna() - ratio = ratio.replace([float("inf"), -float("inf")], float("nan")).dropna() - - print(f"\n {label_prefix} per-cohort comparison ({len(shared)} shared cohorts):") - print(f" R total (shared) : {r_shared.sum():.4f}s") - print(f" Py total (shared) : {py_shared.sum():.4f}s") - print(f" R mean (shared) : {r_shared.mean():.4f}s") - print(f" Py mean (shared) : {py_shared.mean():.4f}s") - if len(ratio) > 0: - print(f" Py/R ratio median : {ratio.median():.2f}x") - - -def main() -> None: - args = _parse_args() - backend_label = args.backend - - print("=" * 60) - print(f"R vs Python CohortGenerator Benchmark Comparison (backend={backend_label})") - print("=" * 60) - - r_df = load_csv(R_CSV) - py_df = load_csv(PY_CSV) - - if r_df is None and py_df is None: - print("\nNo benchmark results found. 
Run the benchmarks first:") - print(" Rscript benchmarks/benchmark_run_r.R") - print(" python benchmarks/benchmark_run_py.py") - return - - # ── Coverage ──────────────────────────────────────────────────────── - print("\nTable 1 — Coverage") - if r_df is not None: - print_coverage("R ", r_df) - if py_df is not None: - print_coverage("Py", py_df) - - # ── Generation timing ──────────────────────────────────────────────── - print("\nTable 2 — Generation timing") - if r_df is not None: - print_timing("R ", r_df) - if py_df is not None: - print_timing("Py", py_df) - - # ── Cross-validation & row counts (needs a backend connection) ────── - print("\nTable 3 — Cross-validation (CSV vs persisted checksum table)") - try: - conn = connect_backend(backend_label) - except Exception as exc: - print(f" Cannot connect to {backend_label}: {exc}") - conn = None - - if conn is not None: - if r_df is not None: - cross_validate("R ", r_df, conn, conn.r_cohort_table, conn.r_checksum_table) - if py_df is not None: - cross_validate("Py", py_df, conn, conn.py_cohort_table, conn.py_checksum_table) - - print("\nTable 4 — Cohort row counts") - if r_df is not None: - print_cohort_row_counts("R ", conn, conn.r_cohort_table) - if py_df is not None: - print_cohort_row_counts("Py", conn, conn.py_cohort_table) - - print("\nTable 6 — Row-level parity (R vs Python)") - report = compare_cohort_outputs( - conn.backend, - r_table=conn.r_cohort_table, - py_table=conn.py_cohort_table, - schema=conn.results_schema, - ) - print_comparison_report(report) - - # ── R vs Python shared-cohort comparison ───────────────────────────── - if r_df is not None and py_df is not None: - print("\nTable 5 — R vs Python shared-cohort comparison") - compare_shared("=>", r_df, py_df) - - print(f"\n{'=' * 60}") - print("Analysis complete") - print(f"{'=' * 60}\n") - - -if __name__ == "__main__": - main() diff --git a/benchmarks/benchmark_db_config.yaml b/benchmarks/benchmark_db_config.yaml deleted file mode 100644 index 
fd84994..0000000 --- a/benchmarks/benchmark_db_config.yaml +++ /dev/null @@ -1,73 +0,0 @@ -# CircePy OHDSI Phenotype Benchmark - Database Configuration -# This file defines database backends for running comprehensive benchmarks -# Add additional backends as needed for your infrastructure - -# Primary benchmark backend (DuckDB - recommended for local development) -duckdb: - driver: "duckdb" - description: "DuckDB in-memory database (fast, no external dependencies)" - connection: - database: ":memory:" # Use in-memory for speed, or provide file path for persistence - cdm_schema: "main" - vocabulary_schema: "main" - results_schema: "main" - data_source: "eunomia" # Use OHDSI Eunomia synthetic OMOP CDM data - notes: | - DuckDB provides fast iteration for benchmark development. - Set database to a file path (e.g., /tmp/benchmark_eunomia.duckdb) for persistence. - -# PostgreSQL backend (for production-like validation) -# Uncomment and configure to test against PostgreSQL -# postgresql: -# driver: "postgresql" -# description: "PostgreSQL database" -# connection: -# host: "localhost" -# port: 5432 -# database: "omop_cdm" -# user: "postgres" -# password: "${POSTGRES_PASSWORD}" # Use environment variable -# cdm_schema: "public" -# vocabulary_schema: "public" -# results_schema: "results" -# data_source: "eunomia" -# notes: | -# Requires PostgreSQL to be running and populated with OMOP CDM data. -# Use environment variables for sensitive credentials. - -# Databricks backend (for cloud-scale validation) -# Set the required environment variables before running: -# export DATABRICKS_HOST="..." -# export DATABRICKS_HTTP_PATH="..." -# export DATABRICKS_TOKEN="..." 
-databricks: - driver: "databricks" - description: "Databricks SQL warehouse" - connection: - server_hostname: "${DATABRICKS_HOST}" - http_path: "${DATABRICKS_HTTP_PATH}" - personal_access_token: "${DATABRICKS_TOKEN}" - # catalog: "main" # optional — set if using Unity Catalog - # schema: "default" # optional — default schema - cdm_schema: "hive_metastore.omop_cdm" - vocabulary_schema: "hive_metastore.omop_cdm" - results_schema: "hive_metastore.results" - r_cohort_table: "cohort_r" - py_cohort_table: "cohort_py" - r_checksum_table: "cohort_r_checksum" - py_checksum_table: "cohort_py_checksum" - notes: | - Requires Databricks workspace, valid credentials via env vars, and - OMOP CDM data already loaded into the configured schema. - Uses ibis-framework[databricks] for Python connectivity. - -# Eunomia data source configuration -eunomia: - description: "OHDSI Eunomia synthetic OMOP CDM dataset" - url: "https://github.com/OHDSI/Eunomia/releases/download/v2.0.0/GimlettData_5.4.zip" - size: "~10MB compressed, ~100MB uncompressed" - patient_count: "~2500 patients" - notes: | - Synthetic OMOP CDM dataset for testing and development. - Includes realistic OMOP structure with conditions, drugs, visits, procedures, measurements, etc. - Ideal for benchmarking SQL generation and execution without privacy concerns. diff --git a/benchmarks/benchmark_run_py.py b/benchmarks/benchmark_run_py.py deleted file mode 100644 index 02d81b7..0000000 --- a/benchmarks/benchmark_run_py.py +++ /dev/null @@ -1,160 +0,0 @@ -#!/usr/bin/env python3 -"""Runnable Python benchmark of PhenotypeLibrary cohorts. 
- -Usage:: - - # Export PhenotypeLibrary cohort JSONs (one-time setup) - Rscript benchmarks/export_phenotypes.R - - # DuckDB (default — needs Eunomia DB from R) - python benchmarks/benchmark_run_py.py - - # Databricks (set DATABRICKS_HOST, DATABRICKS_HTTP_PATH, DATABRICKS_TOKEN) - python benchmarks/benchmark_run_py.py --backend databricks - -Output (written to *benchmark_output/*):: - - py_checksum_times.csv -- per-phenotype generation timing and status -""" - -from __future__ import annotations - -import argparse -import logging -import sys -from pathlib import Path - -import pandas as pd -from _backend import connect_backend - -from circe.cohort_definition_set import CohortDefinitionSet, generate_cohort_set -from circe.cohortdefinition import CohortExpression - -logging.basicConfig( - level=logging.INFO, - format="%(asctime)s [%(levelname)s] %(message)s", - datefmt="%H:%M:%S", -) - -logging.getLogger("circe").setLevel(logging.INFO) - -for logger_name in ( - "databricks", - "databricks.sql", - "databricks.sql.client", - "databricks.sql.http", - "urllib3", -): - logging.getLogger(logger_name).setLevel(logging.WARNING) - -REPO_ROOT = Path(__file__).resolve().parent.parent -OUTPUT_DIR = REPO_ROOT / "benchmark_output" -JSON_DIR = OUTPUT_DIR / "phenotype_jsons" -MANIFEST_PATH = OUTPUT_DIR / "phenotype_manifest.csv" -RESULTS_CSV = OUTPUT_DIR / "py_checksum_times.csv" - - -def _parse_args() -> argparse.Namespace: - p = argparse.ArgumentParser(description="Python circe cohort benchmark runner") - p.add_argument( - "--backend", - default="duckdb", - choices=("duckdb", "databricks"), - help="Target database backend (default: duckdb)", - ) - return p.parse_args() - - -def main() -> None: - args = _parse_args() - backend_label = args.backend - - # ── 1. Load phenotype definitions ──────────────────────────────────── - print("Loading phenotype definitions ...") - if not MANIFEST_PATH.exists(): - print( - f" {MANIFEST_PATH} not found. 
Run 'Rscript benchmarks/export_phenotypes.R' first.", - file=sys.stderr, - ) - sys.exit(1) - - manifest = pd.read_csv(MANIFEST_PATH) - print(f" Manifest has {len(manifest)} cohorts") - - cds = CohortDefinitionSet() - skipped = 0 - for _, row in manifest.iterrows(): - cohort_id = int(row["cohortId"]) - cohort_name = str(row["cohortName"]) - json_path = JSON_DIR / f"{cohort_id}.json" - if not json_path.exists(): - skipped += 1 - continue - expression = CohortExpression.model_validate_json(json_path.read_text()) - cds.add(cohort_id=cohort_id, cohort_name=cohort_name, expression=expression) - - if skipped: - print(f" Skipped {skipped} cohorts with missing JSON files") - print(f" Loaded {len(cds)} cohorts into CohortDefinitionSet") - - # ── 2. Connect to backend ──────────────────────────────────────────── - print(f"Connecting to backend: {backend_label}") - conn = connect_backend(backend_label) - - # ── 3. Generate cohorts ────────────────────────────────────────────── - checksum_table = conn.py_checksum_table - cohort_table = conn.py_cohort_table - print(f"Generating cohorts (incremental) → {conn.results_schema}.{cohort_table}") - results = generate_cohort_set( - cds, - backend=conn.backend, - cdm_schema=conn.cdm_schema, - cohort_table=cohort_table, - results_schema=conn.results_schema, - vocabulary_schema=conn.vocabulary_schema, - incremental=True, - checksum_table=checksum_table, - stop_on_error=False, - ) - - # ── 4. 
Extract timing ──────────────────────────────────────────────── - print("Extracting timing ...") - rows = [] - for r in results: - generation_seconds = (r.end_time - r.start_time).total_seconds() - rows.append( - { - "cohort_definition_id": r.cohort_id, - "cohort_name": r.cohort_name, - "checksum": r.checksum, - "status": r.status, - "generation_seconds": generation_seconds, - "start_time": r.start_time.isoformat(), - "end_time": r.end_time.isoformat(), - } - ) - - df = pd.DataFrame(rows) - df.to_csv(RESULTS_CSV, index=False) - print(f" Wrote {len(df)} rows to {RESULTS_CSV}") - - # ── 5. Summary ──────────────────────────────────────────────────────── - complete_df = df[df["status"] == "COMPLETE"] - failed_df = df[df["status"] == "FAILED"] - skipped_df = df[df["status"] == "SKIPPED"] - - print(f"\n{'=' * 55}") - print(f"Python benchmark complete (backend={backend_label})") - print(f" Phenotypes loaded : {len(manifest)}") - print(f" COMPLETE : {len(complete_df)}") - print(f" FAILED : {len(failed_df)}") - print(f" SKIPPED : {len(skipped_df)}") - if len(complete_df) > 0: - print(f" Total time (sum) : {complete_df['generation_seconds'].sum():.4f}s") - print(f" Median per-cohort : {complete_df['generation_seconds'].median():.4f}s") - print(f" Timings written to : {RESULTS_CSV}") - print(f"{'=' * 55}\n") - - -if __name__ == "__main__": - main() diff --git a/benchmarks/benchmark_run_r.R b/benchmarks/benchmark_run_r.R deleted file mode 100644 index a1aba7c..0000000 --- a/benchmarks/benchmark_run_r.R +++ /dev/null @@ -1,282 +0,0 @@ -#!/usr/bin/env Rscript -# benchmark_run_r.R -# -# R CohortGenerator benchmark — PhenotypeLibrary on Eunomia (DuckDB) or Databricks. 
-# -# Usage: -# Rscript benchmarks/benchmark_run_r.R # DuckDB (default) -# Rscript benchmarks/benchmark_run_r.R --backend databricks -# -# Output (in benchmark_output/): -# r_checksum_times.csv -- per-phenotype generation timing from checksum table - -suppressPackageStartupMessages({ - library(CohortGenerator) - library(Eunomia) - library(DatabaseConnector) - library(PhenotypeLibrary) - library(dplyr) -}) - -# --------------------------------------------------------------------------- -# CLI -# --------------------------------------------------------------------------- -args <- commandArgs(trailingOnly = TRUE) -backend <- "duckdb" -if ("--backend" %in% args) { - idx <- which(args == "--backend") - if (idx < length(args)) backend <- args[idx + 1] -} - -# --------------------------------------------------------------------------- -# Paths -# --------------------------------------------------------------------------- -script_path <- normalizePath(sub("--file=", "", commandArgs()[grep("--file=", commandArgs())])) -REPO_ROOT <- dirname(dirname(script_path)) -OUTPUT_DIR <- file.path(REPO_ROOT, "benchmark_output") -dir.create(OUTPUT_DIR, showWarnings = FALSE, recursive = TRUE) - -EUNOMIA_DATA_DIR <- file.path(REPO_ROOT, "eunomia_data") -dir.create(EUNOMIA_DATA_DIR, showWarnings = FALSE, recursive = TRUE) -Sys.setenv(EUNOMIA_DATA_FOLDER = EUNOMIA_DATA_DIR) - -DUCKDB_PATH <- file.path(OUTPUT_DIR, "eunomia.duckdb") - -cfg_value <- function(value, default = "") { - if (is.null(value) || identical(value, "")) { - return(default) - } - value -} - -trim_quotes <- function(value) { - sub("^(['\"])", "", sub("(['\"])$", "", trimws(value))) -} - -expand_env_vars <- function(value) { - matches <- gregexpr("\\$\\{([A-Za-z0-9_]+)\\}", value, perl = TRUE) - tokens <- regmatches(value, matches)[[1]] - if (length(tokens) == 0) { - return(value) - } - - expanded <- value - for (token in unique(tokens)) { - var_name <- sub("^\\$\\{", "", sub("\\}$", "", token)) - expanded <- gsub(token, 
Sys.getenv(var_name, unset = ""), expanded, fixed = TRUE) - } - expanded -} - -load_databricks_config <- function(config_path) { - if (!file.exists(config_path)) { - stop(sprintf("Config not found: %s", config_path)) - } - - lines <- readLines(config_path, warn = FALSE) - section_started <- FALSE - current_group <- NULL - values <- list() - - for (line in lines) { - if (grepl("^\\s*$", line) || grepl("^\\s*#", line)) { - next - } - - if (!section_started) { - if (grepl("^databricks:\\s*$", line)) { - section_started <- TRUE - } - next - } - - if (grepl("^[A-Za-z0-9_-]+:\\s*$", line)) { - break - } - - if (grepl("^ [A-Za-z0-9_-]+:\\s*$", line)) { - current_group <- sub("^ ([A-Za-z0-9_-]+):\\s*$", "\\1", line) - if (is.null(values[[current_group]])) { - values[[current_group]] <- list() - } - next - } - - if (grepl("^ [A-Za-z0-9_-]+:\\s*", line)) { - key <- sub("^ ([A-Za-z0-9_-]+):.*$", "\\1", line) - raw_value <- sub("^ [A-Za-z0-9_-]+:\\s*", "", line) - values[[key]] <- expand_env_vars(trim_quotes(raw_value)) - current_group <- NULL - next - } - - if (!is.null(current_group) && grepl("^ [A-Za-z0-9_-]+:\\s*", line)) { - key <- sub("^ ([A-Za-z0-9_-]+):.*$", "\\1", line) - raw_value <- sub("^ [A-Za-z0-9_-]+:\\s*", "", line) - values[[current_group]][[key]] <- expand_env_vars(trim_quotes(raw_value)) - } - } - - values -} - -# --------------------------------------------------------------------------- -# 1. Load phenotype definitions from PhenotypeLibrary -# --------------------------------------------------------------------------- -cat("Loading phenotypes from PhenotypeLibrary...\n") -phenotype_log <- PhenotypeLibrary::getPhenotypeLog() -cds <- PhenotypeLibrary::getPlCohortDefinitionSet(cohortIds = phenotype_log$cohortId) -cat(sprintf(" Loaded %d phenotype definitions\n", nrow(cds))) - -# --------------------------------------------------------------------------- -# 1b. 
Export phenotype JSONs and manifest for the Python benchmark -# --------------------------------------------------------------------------- -cat("Exporting phenotype JSONs and manifest ...\n") -json_dir <- file.path(OUTPUT_DIR, "phenotype_jsons") -dir.create(json_dir, showWarnings = FALSE, recursive = TRUE) - -for (i in seq_len(nrow(cds))) { - cohort_id <- cds$cohortId[i] - json_path <- file.path(json_dir, sprintf("%d.json", cohort_id)) - writeLines(cds$json[i], json_path) -} - -manifest <- data.frame( - cohortId = cds$cohortId, - cohortName = cds$cohortName, - stringsAsFactors = FALSE -) -write.csv(manifest, file.path(OUTPUT_DIR, "phenotype_manifest.csv"), row.names = FALSE) -cat(sprintf(" Wrote %d JSONs and manifest\n", nrow(cds))) - -# --------------------------------------------------------------------------- -# 2. Set up database connection -# --------------------------------------------------------------------------- -if (backend == "duckdb") { - - cat("Setting up Eunomia DuckDB...\n") - if (!file.exists(DUCKDB_PATH)) { - dbPath <- Eunomia::getDatabaseFile( - datasetName = "GiBleed", - dbms = "duckdb", - databaseFile = DUCKDB_PATH - ) - } else { - dbPath <- DUCKDB_PATH - } - cat(sprintf(" Database: %s\n", dbPath)) - - connectionDetails <- DatabaseConnector::createConnectionDetails( - dbms = "duckdb", - server = dbPath - ) - CDM_SCHEMA <- "main" - RESULTS_SCHEMA <- "main" - COHORT_TABLE <- "cohort" - TEMP_EMULATION_SCHEMA <- NULL - -} else if (backend == "databricks") { - - cat("Setting up Databricks connection...\n") - - config_path <- file.path(dirname(script_path), "benchmark_db_config.yaml") - cfg <- load_databricks_config(config_path) - conn_cfg <- cfg$connection - - server_hostname <- cfg_value(conn_cfg$server_hostname) - http_path <- cfg_value(conn_cfg$http_path) - databricks_token <- cfg_value(conn_cfg$personal_access_token) - - if (server_hostname == "" || http_path == "" || databricks_token == "") { - stop( - paste( - "Databricks credentials not 
found in benchmarks/benchmark_db_config.yaml.", - "Set DATABRICKS_HOST, DATABRICKS_HTTP_PATH, and DATABRICKS_TOKEN", - "before running the benchmarks." - ) - ) - } - - CDM_SCHEMA <- cfg_value(cfg$cdm_schema) - RESULTS_SCHEMA <- cfg_value(cfg$results_schema) - VOCABULARY_SCHEMA <- cfg_value(cfg$vocabulary_schema, CDM_SCHEMA) - if (CDM_SCHEMA == "") { - stop("Databricks cdm_schema is required. Set DATABRICKS_CDM_SCHEMA in the environment or update the benchmark config.") - } - if (RESULTS_SCHEMA == "") { - stop("Databricks results_schema is required. Set DATABRICKS_RESULTS_SCHEMA in the environment or update the benchmark config.") - } - - COHORT_TABLE <- cfg_value(cfg$r_cohort_table, "cohort") - TEMP_EMULATION_SCHEMA <- RESULTS_SCHEMA # Databricks needs a real schema for temp - - conn_string <- paste0( - "jdbc:databricks://", server_hostname, ":443/default;", - "transportMode=http;ssl=1;", - "httpPath=", http_path, ";", - "AuthMech=3;UID=token;PWD=", databricks_token - ) - connectionDetails <- DatabaseConnector::createConnectionDetails( - dbms = "spark", connectionString = conn_string - ) - - cat(sprintf("Databricks host: %s\n", server_hostname)) - -} else { - stop(sprintf("Unknown backend: %s. Use 'duckdb' or 'databricks'.", backend)) -} - -cat(sprintf("CDM schema : %s\n", CDM_SCHEMA)) -if (backend == "databricks") { - cat(sprintf("Vocabulary schema: %s\n", VOCABULARY_SCHEMA)) -} -cat(sprintf("Results schema : %s\n", RESULTS_SCHEMA)) -cat(sprintf("Cohort table : %s\n", COHORT_TABLE)) - -# --------------------------------------------------------------------------- -# 3. 
Generate cohorts using runCohortGeneration (incremental mode) -# --------------------------------------------------------------------------- -cat("Generating cohorts (incremental)...\n") -CohortGenerator::runCohortGeneration( - connectionDetails = connectionDetails, - cdmDatabaseSchema = CDM_SCHEMA, - cohortDatabaseSchema = RESULTS_SCHEMA, - tempEmulationSchema = TEMP_EMULATION_SCHEMA, - cohortDefinitionSet = cds, - incremental = TRUE, - outputFolder = OUTPUT_DIR, - databaseId = backend, - stopOnError = FALSE -) - -# --------------------------------------------------------------------------- -# 4. Extract timing from checksum table -# --------------------------------------------------------------------------- -cat("Extracting checksum timing...\n") -checksums <- CohortGenerator::getLastGeneratedCohortChecksums( - connectionDetails = connectionDetails, - cohortDatabaseSchema = RESULTS_SCHEMA -) - -times <- checksums %>% - transmute( - cohort_definition_id = cohortDefinitionId, - checksum = checksum, - generation_seconds = as.numeric(difftime(endTime, startTime, units = "secs")), - start_time = startTime, - end_time = endTime - ) - -out_file <- file.path(OUTPUT_DIR, "r_checksum_times.csv") -write.csv(times, out_file, row.names = FALSE) - -# --------------------------------------------------------------------------- -# Summary -# --------------------------------------------------------------------------- -cat(sprintf("\n%s\n", paste(rep("=", 55), collapse = ""))) -cat(sprintf("R benchmark complete (backend=%s)\n", backend)) -cat(sprintf(" Phenotypes loaded : %d\n", nrow(cds))) -cat(sprintf(" Cohorts generated : %d\n", nrow(times))) -cat(sprintf(" Total time (sum) : %.4fs\n", sum(times$generation_seconds, na.rm = TRUE))) -cat(sprintf(" Times written to : %s\n", out_file)) -cat(sprintf("%s\n\n", paste(rep("=", 55), collapse = ""))) diff --git a/benchmarks/compare_cohort_outputs.py b/benchmarks/compare_cohort_outputs.py deleted file mode 100644 index ea92b94..0000000 --- 
a/benchmarks/compare_cohort_outputs.py +++ /dev/null @@ -1,270 +0,0 @@ -#!/usr/bin/env python3 -"""Cross-implementation cohort output validator. - -Compares the row-level output of two cohort generation implementations -(e.g. R/CohortGenerator vs Python/circe) to verify they produce identical -``(subject_id, cohort_start_date, cohort_end_date)`` rows for each shared -cohort. - -Usage:: - - import ibis - from benchmarks.compare_cohort_outputs import compare_cohort_outputs, print_comparison_report - - backend = ibis.duckdb.connect("benchmark_output/eunomia.duckdb") - report = compare_cohort_outputs( - backend, r_table="cohort", py_table="cohort_py", - ) - print_comparison_report(report) - - # Optionally validate programmatically: - assert report.n_cohorts_matched_exactly == report.n_cohorts_shared -""" - -from __future__ import annotations - -from dataclasses import dataclass, field - -import pandas as pd - -from circe.execution.typing import IbisBackendLike - - -@dataclass -class CohortMatchSummary: - """Per-cohort row-level comparison result.""" - - cohort_id: int - """Cohort definition identifier.""" - - n_r: int - """Row count in the reference (R) table.""" - - n_py: int - """Row count in the Python table.""" - - n_matched: int - """Rows found identically in both tables.""" - - n_only_r: int - """Rows present only in the reference (R) table.""" - - n_only_py: int - """Rows present only in the Python table.""" - - sample_only_r: list[tuple[int, str, str]] = field(default_factory=list) - """Up to 3 sample rows from the reference table not found in Python.""" - - sample_only_py: list[tuple[int, str, str]] = field(default_factory=list) - """Up to 3 sample rows from the Python table not found in the reference.""" - - @property - def is_exact_match(self) -> bool: - """True when the two implementations produce identical row sets.""" - return self.n_only_r == 0 and self.n_only_py == 0 - - @property - def pass_ratio(self) -> float: - """Fraction of rows found in both 
implementations (0-1).""" - denom = max(self.n_r, self.n_py) - return self.n_matched / denom if denom > 0 else 1.0 - - -@dataclass -class CohortComparisonReport: - """Aggregate row-level comparison across implementations.""" - - per_cohort: list[CohortMatchSummary] - """Per-cohort comparison results.""" - - n_cohorts_shared: int - """Number of cohorts present in both tables.""" - - n_cohorts_matched_exactly: int - """Number of cohorts with zero row-level differences.""" - - total_r_rows: int - """Total row count in the reference table (all compared cohorts).""" - - total_py_rows: int - """Total row count in the Python table (all compared cohorts).""" - - total_matched: int - """Total matched rows across all compared cohorts.""" - - total_only_r: int - """Total rows only in the reference table.""" - - total_only_py: int - """Total rows only in the Python table.""" - - @property - def exact_match_pct(self) -> float: - """Percentage of shared cohorts that match exactly.""" - return self.n_cohorts_matched_exactly / self.n_cohorts_shared * 100 if self.n_cohorts_shared else 0.0 - - -def _read_cohort_table( - backend: IbisBackendLike, - table_name: str, - schema: str | None, - label: str, -) -> pd.DataFrame | None: - """Read a cohort output table, cast columns to canonical types, return a DataFrame.""" - try: - raw = backend.table(table_name, database=schema).execute() - except Exception: - print(f" [WARN] {label} table '{table_name}' not found") - return None - - if raw.empty: - return None - - df = pd.DataFrame( - { - "cohort_definition_id": pd.to_numeric(raw["cohort_definition_id"], errors="coerce").astype( - "int64" - ), - "subject_id": pd.to_numeric(raw["subject_id"], errors="coerce").astype("int64"), - "cohort_start_date": pd.to_datetime(raw["cohort_start_date"], errors="coerce").dt.date, - "cohort_end_date": pd.to_datetime(raw["cohort_end_date"], errors="coerce").dt.date, - } - ) - return df.drop_duplicates().dropna() - - -def _compare_single_cohort( - cohort_id: 
int, - r_rows: pd.DataFrame, - py_rows: pd.DataFrame, -) -> CohortMatchSummary: - """Compare row-level output for a single cohort.""" - key_cols = ["subject_id", "cohort_start_date", "cohort_end_date"] - - r_set = tuple(tuple(row) for row in r_rows[key_cols].itertuples(index=False)) - py_set = tuple(tuple(row) for row in py_rows[key_cols].itertuples(index=False)) - - r_unique = set(r_set) - py_unique = set(py_set) - - only_r = sorted(r_unique - py_unique) - only_py = sorted(py_unique - r_unique) - matched = r_unique & py_unique - - return CohortMatchSummary( - cohort_id=cohort_id, - n_r=len(r_unique), - n_py=len(py_unique), - n_matched=len(matched), - n_only_r=len(only_r), - n_only_py=len(only_py), - sample_only_r=[(int(s), str(d), str(e)) for s, d, e in only_r[:3]], - sample_only_py=[(int(s), str(d), str(e)) for s, d, e in only_py[:3]], - ) - - -def compare_cohort_outputs( - backend: IbisBackendLike, - r_table: str = "cohort", - py_table: str = "cohort_py", - schema: str | None = "main", - *, - cohort_ids: list[int] | None = None, -) -> CohortComparisonReport: - """Compare row-level cohort output between two implementations. - - Args: - backend: Ibis backend connection pointing at the database. - r_table: Name of the reference (R/CohortGenerator) cohort output table. - py_table: Name of the Python/circe cohort output table. - schema: Database schema where both tables reside. - cohort_ids: Specific cohort IDs to compare (``None`` = all shared). - - Returns: - :class:`CohortComparisonReport` with per-cohort and aggregate results. 
- """ - r_df = _read_cohort_table(backend, r_table, schema, "R") - py_df = _read_cohort_table(backend, py_table, schema, "Py") - - if r_df is None or py_df is None: - return CohortComparisonReport( - per_cohort=[], - n_cohorts_shared=0, - n_cohorts_matched_exactly=0, - total_r_rows=0, - total_py_rows=0, - total_matched=0, - total_only_r=0, - total_only_py=0, - ) - - r_ids = set(r_df["cohort_definition_id"].unique()) - py_ids = set(py_df["cohort_definition_id"].unique()) - - shared = sorted(r_ids & py_ids & set(cohort_ids)) if cohort_ids is not None else sorted(r_ids & py_ids) - - per_cohort: list[CohortMatchSummary] = [] - total_r = 0 - total_py = 0 - total_m = 0 - total_o_r = 0 - total_o_py = 0 - exact_count = 0 - - for cid in shared: - r_rows = r_df[r_df["cohort_definition_id"] == cid] - py_rows = py_df[py_df["cohort_definition_id"] == cid] - summary = _compare_single_cohort(cid, r_rows, py_rows) - per_cohort.append(summary) - total_r += summary.n_r - total_py += summary.n_py - total_m += summary.n_matched - total_o_r += summary.n_only_r - total_o_py += summary.n_only_py - if summary.is_exact_match: - exact_count += 1 - - return CohortComparisonReport( - per_cohort=per_cohort, - n_cohorts_shared=len(shared), - n_cohorts_matched_exactly=exact_count, - total_r_rows=total_r, - total_py_rows=total_py, - total_matched=total_m, - total_only_r=total_o_r, - total_only_py=total_o_py, - ) - - -def print_comparison_report(report: CohortComparisonReport) -> None: - """Print a human-readable row-level parity report.""" - print("\nTable 6 — Row-level parity (R vs Python)") - - if report.n_cohorts_shared == 0: - print(" No shared cohorts to compare.") - return - - print(f" Shared cohorts: {report.n_cohorts_shared}") - print(f" Exactly matched: {report.n_cohorts_matched_exactly} ({report.exact_match_pct:.1f}%)") - print(f" Total R rows: {report.total_r_rows:,}") - print(f" Total Py rows: {report.total_py_rows:,}") - print(f" Total matched: {report.total_matched:,}") - print(f" 
Total only in R: {report.total_only_r:,}") - print(f" Total only in Py: {report.total_only_py:,}") - - mismatched = [c for c in report.per_cohort if not c.is_exact_match] - if mismatched: - print(f"\n Cohort mismatches ({len(mismatched)}):") - for c in mismatched: - print( - f" {c.cohort_id:>5d} " - f"R={c.n_r:<6d} Py={c.n_py:<6d} " - f"matched={c.n_matched:<6d} " - f"only_R={c.n_only_r:<4d} only_Py={c.n_only_py:<4d}" - ) - if c.sample_only_r: - print(f" samples only_R: {c.sample_only_r[:3]}") - if c.sample_only_py: - print(f" samples only_Py: {c.sample_only_py[:3]}") - else: - print(f"\n ✓ All {report.n_cohorts_shared} shared cohorts match exactly.") diff --git a/benchmarks/error_repoer.txt b/benchmarks/error_repoer.txt deleted file mode 100644 index 71aa831..0000000 --- a/benchmarks/error_repoer.txt +++ /dev/null @@ -1,90 +0,0 @@ -gh or Sputum) -- duration 105.8s -Traceback (most recent call last): - File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\benchmarks\benchmark_run_py.py", line 160, in - main() - ~~~~^^ - File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\benchmarks\benchmark_run_py.py", line 108, in main - results = generate_cohort_set( - cds, - ...<7 lines>... - stop_on_error=False, - ) - File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\circe\cohort_definition_set\_generate.py", line 346, in generate_cohort_set - return asyncio.run( - ~~~~~~~~~~~^ - async_generate_cohort_set( - ^^^^^^^^^^^^^^^^^^^^^^^^^^ - ...<9 lines>... 
- ) - ^ - ) - ^ - File "C:\Users\admin_jgilber2\AppData\Local\Python\pythoncore-3.14-64\Lib\asyncio\runners.py", line 204, in run - return runner.run(main) - ~~~~~~~~~~^^^^^^ - File "C:\Users\admin_jgilber2\AppData\Local\Python\pythoncore-3.14-64\Lib\asyncio\runners.py", line 127, in run - return self._loop.run_until_complete(task) - ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^ - File "C:\Users\admin_jgilber2\AppData\Local\Python\pythoncore-3.14-64\Lib\asyncio\base_events.py", line 719, in run_until_complete - return future.result() - ~~~~~~~~~~~~~^^ - File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\circe\cohort_definition_set\_generate.py", line 294, in async_generate_cohort_set - upsert_generation_history( - ~~~~~~~~~~~~~~~~~~~~~~~~~^ - backend, - ^^^^^^^^ - ...<6 lines>... - end_time=end_time or datetime.now(), - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - ) - ^ - File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\circe\cohort_definition_set\_checksum_store.py", line 265, in upsert_generation_history - create_table( - ~~~~~~~~~~~~^ - backend, - ^^^^^^^^ - ...<11 lines>... - overwrite=False, - ^^^^^^^^^^^^^^^^ - ) - ^ - File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\circe\execution\ibis\operations.py", line 64, in create_table - _call_with_optional_database( - ~~~~~~~~~~~~~~~~~~~~~~~~~~~~^ - backend.create_table, - ^^^^^^^^^^^^^^^^^^^^^ - ...<2 lines>... 
- **kwargs, - ^^^^^^^^^ - ) - ^ - File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\circe\execution\ibis\operations.py", line 13, in _call_with_optional_database - return method(*args, database=database, **kwargs) - File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\.venv\Lib\site-packages\ibis\backends\databricks\__init__.py", line 200, in create_table - self._run_pre_execute_hooks(table) - ~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^ - File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\.venv\Lib\site-packages\ibis\backends\__init__.py", line 1300, in _run_pre_execute_hooks - self._register_in_memory_tables(expr) - ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^ - File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\.venv\Lib\site-packages\ibis\backends\__init__.py", line 1277, in _register_in_memory_tables - self._register_in_memory_table(memtable) - ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^ - File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\.venv\Lib\site-packages\ibis\backends\databricks\__init__.py", line 476, in _register_in_memory_table - cur.execute(put_into) - ~~~~~~~~~~~^^^^^^^^^^ - File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\.venv\Lib\site-packages\databricks\sql\telemetry\latency_logger.py", line 182, in wrapper - return func(self, *args, **kwargs) - File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\.venv\Lib\site-packages\databricks\sql\client.py", line 1357, in execute - self._handle_staging_operation( - ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^ - staging_allowed_local_path=self.connection.staging_allowed_local_path, - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - input_stream=input_stream, - ^^^^^^^^^^^^^^^^^^^^^^^^^^ - ) - ^ - File "C:\Users\admin_jgilber2\PycharmProjects\Circepy\.venv\Lib\site-packages\databricks\sql\client.py", line 1103, in _handle_staging_operation - raise ProgrammingError( - ...<3 lines>... 
- ) -databricks.sql.exc.ProgrammingError: Local file operations are restricted to paths within the configured staging_allowed_local_path \ No newline at end of file diff --git a/benchmarks/export_phenotypes.R b/benchmarks/export_phenotypes.R deleted file mode 100644 index 7f6f088..0000000 --- a/benchmarks/export_phenotypes.R +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env Rscript -# export_phenotypes.R -# -# Export PhenotypeLibrary cohort JSONs for Python consumption. -# -# Usage: -# Rscript benchmarks/export_phenotypes.R -# -# Output: -# benchmark_output/phenotype_jsons/.json -- one JSON per cohort -# benchmark_output/phenotype_manifest.csv -- cohortId, cohortName - -suppressPackageStartupMessages({ - library(PhenotypeLibrary) -}) - -script_path <- normalizePath(sub("--file=", "", commandArgs()[grep("--file=", commandArgs())])) -REPO_ROOT <- dirname(dirname(script_path)) -OUTPUT_DIR <- file.path(REPO_ROOT, "benchmark_output") -JSON_DIR <- file.path(OUTPUT_DIR, "phenotype_jsons") -dir.create(JSON_DIR, showWarnings = FALSE, recursive = TRUE) - -cat("Loading phenotypes from PhenotypeLibrary...\n") -phenotype_log <- PhenotypeLibrary::getPhenotypeLog() -cds <- PhenotypeLibrary::getPlCohortDefinitionSet(cohortIds = phenotype_log$cohortId) -cat(sprintf(" Loaded %d phenotype definitions\n", nrow(cds))) - -cat(sprintf("Writing JSONs to %s...\n", JSON_DIR)) -for (i in seq_len(nrow(cds))) { - cohort_id <- cds$cohortId[i] - json_path <- file.path(JSON_DIR, sprintf("%d.json", cohort_id)) - writeLines(cds$json[i], json_path) -} - -manifest <- data.frame( - cohortId = cds$cohortId, - cohortName = cds$cohortName, - stringsAsFactors = FALSE -) -manifest_path <- file.path(OUTPUT_DIR, "phenotype_manifest.csv") -write.csv(manifest, manifest_path, row.names = FALSE) - -cat(sprintf("Wrote %d JSONs and manifest to %s\n", nrow(cds), manifest_path)) From 023b28d4f2e0b294548bf9d83104aff10c0aa82f Mon Sep 17 00:00:00 2001 From: Jamie Gilbert Date: Wed, 17 Jun 2026 09:42:29 -0700 Subject: 
[PATCH 50/53] Fixing tests and removing python 3.9 support --- .github/workflows/basic_tests.yml | 2 +- circe/cohort_definition_set/_generate.py | 5 +++-- pyproject.toml | 5 ++--- tox.ini | 3 +-- 4 files changed, 7 insertions(+), 8 deletions(-) diff --git a/.github/workflows/basic_tests.yml b/.github/workflows/basic_tests.yml index 33fed6a..5eb8154 100644 --- a/.github/workflows/basic_tests.yml +++ b/.github/workflows/basic_tests.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13", "3.14" ] + python-version: [ "3.10", "3.11", "3.12", "3.13", "3.14" ] steps: - name: Check out repository diff --git a/circe/cohort_definition_set/_generate.py b/circe/cohort_definition_set/_generate.py index ce981dd..4edf3e0 100644 --- a/circe/cohort_definition_set/_generate.py +++ b/circe/cohort_definition_set/_generate.py @@ -10,8 +10,6 @@ from datetime import datetime from typing import TYPE_CHECKING, Literal -from ..execution.api import build_cohort, write_cohort -from ..execution.ibis.materialize import project_to_ohdsi_cohort_table from ._checksum_store import load_checksums, upsert_generation_history from ._core import CohortDefinition, CohortDefinitionSet, CohortGenerationResult @@ -50,6 +48,9 @@ def _process_single_cohort( Each cohort gets its own per-cohort codeset table populated and dropped as it runs, mirroring the Java ``#Codesets`` pattern. 
""" + from ..execution.api import build_cohort, write_cohort + from ..execution.ibis.materialize import project_to_ohdsi_cohort_table + with _backend_lock: start_time = datetime.now() new_rows = build_cohort( diff --git a/pyproject.toml b/pyproject.toml index 4f0b2a0..068e9b8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,6 @@ classifiers = [ "License :: OSI Approved :: Apache Software License", "Operating System :: OS Independent", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", @@ -32,7 +31,7 @@ classifiers = [ "Topic :: Software Development :: Libraries :: Python Modules", "Typing :: Typed", ] -requires-python = ">=3.9" +requires-python = ">=3.10" dependencies = [ "pydantic>=2.0.0", "typing-extensions>=4.0.0", @@ -97,7 +96,7 @@ circe = ["py.typed"] "circe.extensions.waveform" = ["templates/*.j2"] [tool.mypy] -python_version = "3.9" +python_version = "3.10" warn_return_any = true warn_unused_configs = true disallow_untyped_defs = true diff --git a/tox.ini b/tox.ini index 2d25e30..75d0cbf 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = py39, py310, py311, py312, py313, py314 +envlist = py310, py311, py312, py313, py314 skip_missing_interpreters = true isolated_build = true @@ -21,7 +21,6 @@ commands = [gh-actions] python = - 3.9: py39 3.10: py310 3.11: py311 3.12: py312 From 1b2b1e302523651d45b4e2a3ab2b9094991122f4 Mon Sep 17 00:00:00 2001 From: Jamie Gilbert Date: Wed, 17 Jun 2026 09:58:08 -0700 Subject: [PATCH 51/53] Actual fix - make ibis a core dependency of the package and update required python version --- circe/cohort_definition_set/_generate.py | 5 ++--- pyproject.toml | 13 ++++--------- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/circe/cohort_definition_set/_generate.py b/circe/cohort_definition_set/_generate.py index 4edf3e0..ce981dd 100644 --- 
a/circe/cohort_definition_set/_generate.py +++ b/circe/cohort_definition_set/_generate.py @@ -10,6 +10,8 @@ from datetime import datetime from typing import TYPE_CHECKING, Literal +from ..execution.api import build_cohort, write_cohort +from ..execution.ibis.materialize import project_to_ohdsi_cohort_table from ._checksum_store import load_checksums, upsert_generation_history from ._core import CohortDefinition, CohortDefinitionSet, CohortGenerationResult @@ -48,9 +50,6 @@ def _process_single_cohort( Each cohort gets its own per-cohort codeset table populated and dropped as it runs, mirroring the Java ``#Codesets`` pattern. """ - from ..execution.api import build_cohort, write_cohort - from ..execution.ibis.materialize import project_to_ohdsi_cohort_table - with _backend_lock: start_time = datetime.now() new_rows = build_cohort( diff --git a/pyproject.toml b/pyproject.toml index 068e9b8..d46c64b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,7 +36,8 @@ dependencies = [ "pydantic>=2.0.0", "typing-extensions>=4.0.0", "jinja2>=3.1.0", - "PyYAML>=6.0" + "PyYAML>=6.0", + "ibis-framework[duckdb]>=11.0.0", ] [project.optional-dependencies] @@ -58,17 +59,11 @@ docs = [ "sphinx-rtd-theme>=1.0.0", "myst-parser>=0.18.0", ] -ibis = [ - "ibis-framework>=11.0.0; python_version >= '3.9'", -] -ibis-duckdb = [ - "ibis-framework[duckdb]>=11.0.0; python_version >= '3.9'", -] ibis-postgres = [ - "ibis-framework[postgres]>=11.0.0; python_version >= '3.9'", + "ibis-framework[postgres]>=11.0.0; python_version >= '3.10'", ] ibis-databricks = [ - "ibis-framework[databricks]>=11.0.0; python_version >= '3.9'", + "ibis-framework[databricks]>=11.0.0; python_version >= '3.10'", ] waveform = [ "pydantic>=2.0.0", From b7a433a288fd5227192cd7ad1735becb2b302650 Mon Sep 17 00:00:00 2001 From: Jamie Gilbert Date: Wed, 17 Jun 2026 10:22:00 -0700 Subject: [PATCH 52/53] Ruff fixes --- circe/api.py | 16 +- .../checkers/attribute_checker_factory.py | 3 +- 
circe/check/checkers/base_checker_factory.py | 2 +- circe/check/checkers/comparisons.py | 2 +- .../check/checkers/concept_checker_factory.py | 4 +- .../concept_set_selection_checker_factory.py | 3 +- .../checkers/criteria_checker_factory.py | 5 +- circe/check/checkers/drug_domain_check.py | 2 +- .../checkers/events_progression_check.py | 6 +- circe/check/checkers/range_checker_factory.py | 3 +- circe/check/checkers/text_checker_factory.py | 3 +- circe/check/checkers/time_window_check.py | 4 +- circe/check/checkers/unused_concepts_check.py | 5 +- circe/check/operations/__init__.py | 2 +- .../operations/conditional_operations.py | 3 +- .../check/operations/executive_operations.py | 3 +- circe/check/operations/operations.py | 9 +- circe/check/warnings/concept_set_warning.py | 5 +- circe/cohortdefinition/builders/base.py | 12 +- .../builders/condition_era.py | 7 +- .../builders/condition_occurrence.py | 7 +- circe/cohortdefinition/builders/death.py | 7 +- circe/cohortdefinition/builders/dose_era.py | 7 +- circe/cohortdefinition/builders/drug_era.py | 7 +- .../builders/drug_exposure.py | 7 +- .../builders/location_region.py | 7 +- .../cohortdefinition/builders/measurement.py | 9 +- .../cohortdefinition/builders/observation.py | 7 +- .../builders/observation_period.py | 9 +- .../builders/payer_plan_period.py | 9 +- .../builders/procedure_occurrence.py | 7 +- circe/cohortdefinition/builders/specimen.py | 5 +- circe/cohortdefinition/builders/utils.py | 16 +- .../cohortdefinition/builders/visit_detail.py | 9 +- .../builders/visit_occurrence.py | 7 +- circe/cohortdefinition/cohort.py | 24 +- .../cohort_expression_query_builder.py | 30 +- circe/cohortdefinition/core.py | 48 +- circe/cohortdefinition/criteria.py | 594 +++++++++--------- circe/cohortdefinition/interfaces.py | 41 +- .../printfriendly/markdown_render.py | 19 +- circe/execution/_dataclass.py | 9 +- circe/execution/plan/events.py | 44 +- circe/execution/typing.py | 4 +- circe/extensions/__init__.py | 11 +- 
circe/extensions/waveform/criteria.py | 77 ++- circe/io.py | 4 +- circe/vocabulary/concept.py | 56 +- tests/test_cohort_definition_set.py | 2 +- tests/test_extension_system.py | 5 +- tests/test_real_example_cohorts.py | 11 +- 51 files changed, 590 insertions(+), 608 deletions(-) diff --git a/circe/api.py b/circe/api.py index 1279fc7..04e2e6a 100644 --- a/circe/api.py +++ b/circe/api.py @@ -9,7 +9,7 @@ - cohort_print_friendly(): Generate Markdown from cohort expression """ -from typing import TYPE_CHECKING, Any, Literal, Optional +from typing import TYPE_CHECKING, Any, Literal from .cohort_definition_set import ( # noqa: F401 CohortDefinition, @@ -122,7 +122,7 @@ def cohort_expression_from_yaml(yaml_str: str) -> CohortExpression: def build_cohort_query( expression: CohortExpression, - options: Optional[BuildExpressionQueryOptions] = None, + options: BuildExpressionQueryOptions | None = None, ) -> str: """Generate SQL query from a cohort expression. @@ -155,8 +155,8 @@ def build_cohort( *, backend: IbisBackendLike, cdm_schema: str, - vocabulary_schema: Optional[str] = None, - results_schema: Optional[str] = None, + vocabulary_schema: str | None = None, + results_schema: str | None = None, ) -> Table: """Build a cohort as a relational table expression. @@ -207,8 +207,8 @@ def write_cohort( cdm_schema: str, cohort_table: str, cohort_id: int, - vocabulary_schema: Optional[str] = None, - results_schema: Optional[str] = None, + vocabulary_schema: str | None = None, + results_schema: str | None = None, if_exists: Literal["fail", "replace"] = "fail", ) -> None: """Build and write an OHDSI cohort table. @@ -268,8 +268,8 @@ def write_cohort( def cohort_print_friendly( expression: CohortExpression, - concept_sets: Optional[list[ConceptSet]] = None, - title: Optional[str] = None, + concept_sets: list[ConceptSet] | None = None, + title: str | None = None, include_concept_sets: bool = False, ) -> str: """Generate human-readable Markdown from a cohort expression. 
diff --git a/circe/check/checkers/attribute_checker_factory.py b/circe/check/checkers/attribute_checker_factory.py index 2d7681d..7b918c3 100644 --- a/circe/check/checkers/attribute_checker_factory.py +++ b/circe/check/checkers/attribute_checker_factory.py @@ -8,7 +8,8 @@ Reference: JAVA_CLASS_MAPPINGS.md for Java equivalents. """ -from typing import Any, Callable +from collections.abc import Callable +from typing import Any from ..constants import Constants from .base_checker_factory import BaseCheckerFactory diff --git a/circe/check/checkers/base_checker_factory.py b/circe/check/checkers/base_checker_factory.py index 10b3eb0..78839ba 100644 --- a/circe/check/checkers/base_checker_factory.py +++ b/circe/check/checkers/base_checker_factory.py @@ -9,7 +9,7 @@ Reference: JAVA_CLASS_MAPPINGS.md for Java equivalents. """ -from typing import Callable +from collections.abc import Callable from .warning_reporter import WarningReporter diff --git a/circe/check/checkers/comparisons.py b/circe/check/checkers/comparisons.py index 29306aa..8c379cc 100644 --- a/circe/check/checkers/comparisons.py +++ b/circe/check/checkers/comparisons.py @@ -76,7 +76,7 @@ def start_is_greater_than_end(range_val) -> bool: return False @staticmethod - def is_date_valid(date: Optional[str]) -> bool: + def is_date_valid(date: str | None) -> bool: """Check if a date string is valid. Args: diff --git a/circe/check/checkers/concept_checker_factory.py b/circe/check/checkers/concept_checker_factory.py index 118058d..c987cc7 100644 --- a/circe/check/checkers/concept_checker_factory.py +++ b/circe/check/checkers/concept_checker_factory.py @@ -8,7 +8,7 @@ Reference: JAVA_CLASS_MAPPINGS.md for Java equivalents. 
""" -from typing import Callable, Optional +from collections.abc import Callable from ..constants import Constants from ..operations.operations import Operations @@ -427,7 +427,7 @@ def check(c: "DemographicCriteria") -> None: return check - def _check_concept(self, concepts: Optional[list["Concept"]], criteria_name: str, attribute: str) -> None: + def _check_concept(self, concepts: list["Concept"] | None, criteria_name: str, attribute: str) -> None: """Check if a concept array is empty. Args: diff --git a/circe/check/checkers/concept_set_selection_checker_factory.py b/circe/check/checkers/concept_set_selection_checker_factory.py index 3967a9f..3e08370 100644 --- a/circe/check/checkers/concept_set_selection_checker_factory.py +++ b/circe/check/checkers/concept_set_selection_checker_factory.py @@ -8,7 +8,8 @@ Reference: JAVA_CLASS_MAPPINGS.md for Java equivalents. """ -from typing import Callable, Optional +from collections.abc import Callable +from typing import Optional from ..constants import Constants from ..operations.operations import Operations diff --git a/circe/check/checkers/criteria_checker_factory.py b/circe/check/checkers/criteria_checker_factory.py index 47f19fe..6958121 100644 --- a/circe/check/checkers/criteria_checker_factory.py +++ b/circe/check/checkers/criteria_checker_factory.py @@ -8,7 +8,8 @@ Reference: JAVA_CLASS_MAPPINGS.md for Java equivalents. 
""" -from typing import Callable, Optional +from collections.abc import Callable +from typing import Optional # Import at runtime to avoid circular dependencies try: @@ -213,7 +214,7 @@ def _get_concept_set_selection_suppliers( Returns: A list of functions that return ConceptSetSelection objects """ - suppliers: list[Callable[[], Optional[ConceptSetSelection]]] = [] + suppliers: list[Callable[[], ConceptSetSelection | None]] = [] suppliers.append(lambda: criteria.place_of_service_cs) suppliers.append(lambda: criteria.gender_cs) suppliers.append(lambda: criteria.provider_specialty_cs) diff --git a/circe/check/checkers/drug_domain_check.py b/circe/check/checkers/drug_domain_check.py index c251a72..3d14e37 100644 --- a/circe/check/checkers/drug_domain_check.py +++ b/circe/check/checkers/drug_domain_check.py @@ -78,7 +78,7 @@ def _check(self, expression: "CohortExpression", reporter: WarningReporter) -> N title = "Concept sets" if len(concept_sets) > 1 else "Concept set" reporter(self.MESSAGE, title, names) - def _map_criteria(self, criteria: "Criteria") -> Optional[int]: + def _map_criteria(self, criteria: "Criteria") -> int | None: """Map a criteria to its codeset ID. Args: diff --git a/circe/check/checkers/events_progression_check.py b/circe/check/checkers/events_progression_check.py index 5a46211..e7c04e7 100644 --- a/circe/check/checkers/events_progression_check.py +++ b/circe/check/checkers/events_progression_check.py @@ -38,7 +38,7 @@ class LimitType(Enum): LATEST = (1, "Last") ALL = (2, "All") - def __init__(self, weight: int, name: Optional[str]): + def __init__(self, weight: int, name: str | None): """Initialize a limit type. Args: @@ -58,7 +58,7 @@ def weight(self) -> int: return self._weight @property - def name(self) -> Optional[str]: + def name(self) -> str | None: """Get the name of this limit type. 
Returns: @@ -67,7 +67,7 @@ def name(self) -> Optional[str]: return self._name @staticmethod - def from_name(name: Optional[str]) -> "LimitType": + def from_name(name: str | None) -> "LimitType": """Get a limit type from its name. Args: diff --git a/circe/check/checkers/range_checker_factory.py b/circe/check/checkers/range_checker_factory.py index 3c9f36c..a9e5ab1 100644 --- a/circe/check/checkers/range_checker_factory.py +++ b/circe/check/checkers/range_checker_factory.py @@ -8,7 +8,8 @@ Reference: JAVA_CLASS_MAPPINGS.md for Java equivalents. """ -from typing import Any, Callable, Optional +from collections.abc import Callable +from typing import Any, Optional from ..constants import Constants from ..operations.operations import Operations diff --git a/circe/check/checkers/text_checker_factory.py b/circe/check/checkers/text_checker_factory.py index d002f2d..42f1a82 100644 --- a/circe/check/checkers/text_checker_factory.py +++ b/circe/check/checkers/text_checker_factory.py @@ -8,7 +8,8 @@ Reference: JAVA_CLASS_MAPPINGS.md for Java equivalents. """ -from typing import Callable, Optional +from collections.abc import Callable +from typing import Optional from ..constants import Constants from ..operations.operations import Operations diff --git a/circe/check/checkers/time_window_check.py b/circe/check/checkers/time_window_check.py index 410f34d..e9aefbe 100644 --- a/circe/check/checkers/time_window_check.py +++ b/circe/check/checkers/time_window_check.py @@ -8,7 +8,7 @@ Reference: JAVA_CLASS_MAPPINGS.md for Java equivalents. 
""" -from typing import Any, Optional +from typing import Any from ..operations.operations import Operations from ..utils.criteria_name_helper import CriteriaNameHelper @@ -42,7 +42,7 @@ class TimeWindowCheck(BaseCorelatedCriteriaCheck): def __init__(self): """Initialize the time window check.""" super().__init__() - self._observation_filter: Optional[ObservationFilter] = None + self._observation_filter: ObservationFilter | None = None def _define_severity(self) -> WarningSeverity: """Define the severity level for this check. diff --git a/circe/check/checkers/unused_concepts_check.py b/circe/check/checkers/unused_concepts_check.py index 3cf065a..0695791 100644 --- a/circe/check/checkers/unused_concepts_check.py +++ b/circe/check/checkers/unused_concepts_check.py @@ -8,7 +8,6 @@ Reference: JAVA_CLASS_MAPPINGS.md for Java equivalents. """ -from typing import Optional from ..warning_severity import WarningSeverity from ..warnings.concept_set_warning import ConceptSetWarning @@ -243,7 +242,7 @@ def _correlated_criteria_to_list(self, correlated_criteria) -> list["Criteria"]: ) return criteria_list - def _to_criteria_list(self, criteria_list: Optional[list["CorelatedCriteria"]]) -> list["Criteria"]: + def _to_criteria_list(self, criteria_list: list["CorelatedCriteria"] | None) -> list["Criteria"]: """Convert a list of CorelatedCriteria to a list of Criteria. Args: @@ -256,7 +255,7 @@ def _to_criteria_list(self, criteria_list: Optional[list["CorelatedCriteria"]]) return [] return [c.criteria for c in criteria_list if hasattr(c, "criteria") and c.criteria] - def _to_criteria_list_from_groups(self, groups: Optional[list["CriteriaGroup"]]) -> list["Criteria"]: + def _to_criteria_list_from_groups(self, groups: list["CriteriaGroup"] | None) -> list["Criteria"]: """Convert groups to a list of criteria. 
Args: diff --git a/circe/check/operations/__init__.py b/circe/check/operations/__init__.py index b23bc8c..3b6fb8e 100644 --- a/circe/check/operations/__init__.py +++ b/circe/check/operations/__init__.py @@ -5,7 +5,7 @@ """ # Type alias for convenience (Callable[[], None]) -from typing import Callable +from collections.abc import Callable from .conditional_operations import ConditionalOperations from .execution import Execution diff --git a/circe/check/operations/conditional_operations.py b/circe/check/operations/conditional_operations.py index 700aadb..ed38f62 100644 --- a/circe/check/operations/conditional_operations.py +++ b/circe/check/operations/conditional_operations.py @@ -9,7 +9,8 @@ Reference: JAVA_CLASS_MAPPINGS.md for Java equivalents. """ -from typing import TYPE_CHECKING, Callable, Generic, Protocol, TypeVar +from collections.abc import Callable +from typing import TYPE_CHECKING, Generic, Protocol, TypeVar if TYPE_CHECKING: from .executive_operations import ExecutiveOperations diff --git a/circe/check/operations/executive_operations.py b/circe/check/operations/executive_operations.py index aa1ce75..1b0b266 100644 --- a/circe/check/operations/executive_operations.py +++ b/circe/check/operations/executive_operations.py @@ -9,7 +9,8 @@ Reference: JAVA_CLASS_MAPPINGS.md for Java equivalents. """ -from typing import Callable, Generic, Protocol, TypeVar, overload +from collections.abc import Callable +from typing import Generic, Protocol, TypeVar, overload from .conditional_operations import ConditionalOperations from .execution import Execution diff --git a/circe/check/operations/operations.py b/circe/check/operations/operations.py index 3e4f0ca..0faa199 100644 --- a/circe/check/operations/operations.py +++ b/circe/check/operations/operations.py @@ -9,7 +9,8 @@ Reference: JAVA_CLASS_MAPPINGS.md for Java equivalents. 
""" -from typing import Any, Callable, Generic, Optional, TypeVar +from collections.abc import Callable +from typing import Any, Generic, TypeVar from .conditional_operations import ConditionalOperations from .executive_operations import ExecutiveOperations @@ -34,8 +35,8 @@ def __init__(self, value: T): value: The value to match against """ self._value = value - self._result: Optional[bool] = None - self._return_value: Optional[V] = None + self._result: bool | None = None + self._return_value: V | None = None @staticmethod def match(value: T) -> ConditionalOperations[T, V]: @@ -113,7 +114,7 @@ def or_else(self, consumer: Callable[[T], None]) -> None: if not self._result: consumer(self._value) - def value(self) -> Optional[V]: + def value(self) -> V | None: """Get the return value from then_return operations. Returns: diff --git a/circe/check/warnings/concept_set_warning.py b/circe/check/warnings/concept_set_warning.py index 0be3c8c..1f23924 100644 --- a/circe/check/warnings/concept_set_warning.py +++ b/circe/check/warnings/concept_set_warning.py @@ -8,7 +8,6 @@ Reference: JAVA_CLASS_MAPPINGS.md for Java equivalents. """ -from typing import Optional from ...vocabulary.concept import ConceptSet from ..warning_severity import WarningSeverity @@ -28,7 +27,7 @@ def __init__( self, severity: WarningSeverity, template: str, - concept_set: Optional[ConceptSet], + concept_set: ConceptSet | None, ): """Initialize a concept set warning. @@ -42,7 +41,7 @@ def __init__( self._concept_set = concept_set @property - def concept_set(self) -> Optional[ConceptSet]: + def concept_set(self) -> ConceptSet | None: """Get the concept set associated with this warning. 
Returns: diff --git a/circe/cohortdefinition/builders/base.py b/circe/cohortdefinition/builders/base.py index 3cf9c9f..4d158ba 100644 --- a/circe/cohortdefinition/builders/base.py +++ b/circe/cohortdefinition/builders/base.py @@ -10,7 +10,7 @@ """ from abc import ABC, abstractmethod -from typing import Generic, Optional, TypeVar +from typing import Generic, TypeVar from ..criteria import Criteria from .utils import BuilderOptions, CriteriaColumn @@ -24,14 +24,14 @@ class CriteriaSqlBuilder(ABC, Generic[T]): Java equivalent: org.ohdsi.circe.cohortdefinition.builders.CriteriaSqlBuilder """ - def get_criteria_sql(self, criteria: T, options: Optional[BuilderOptions] = None) -> str: + def get_criteria_sql(self, criteria: T, options: BuilderOptions | None = None) -> str: """Get SQL query for criteria. Java equivalent: CriteriaSqlBuilder.getCriteriaSql(T criteria) """ return self.get_criteria_sql_with_options(criteria, options) - def get_criteria_sql_with_options(self, criteria: T, options: Optional[BuilderOptions]) -> str: + def get_criteria_sql_with_options(self, criteria: T, options: BuilderOptions | None) -> str: """Get SQL query for criteria with builder options. Java equivalent: CriteriaSqlBuilder.getCriteriaSql(T criteria, BuilderOptions options) @@ -99,7 +99,7 @@ def embed_codeset_clause(self, query: str, criteria: T) -> str: # This would need to be implemented based on the Java logic return query.replace("@codesetClause", "") - def resolve_select_clauses(self, criteria: T, options: Optional[BuilderOptions] = None) -> list[str]: + def resolve_select_clauses(self, criteria: T, options: BuilderOptions | None = None) -> list[str]: """Resolve select clauses for criteria. 
Java equivalent: CriteriaSqlBuilder.resolveSelectClauses() @@ -107,7 +107,7 @@ def resolve_select_clauses(self, criteria: T, options: Optional[BuilderOptions] # This would need to be implemented based on the Java logic return [] - def resolve_join_clauses(self, criteria: T, options: Optional[BuilderOptions] = None) -> list[str]: + def resolve_join_clauses(self, criteria: T, options: BuilderOptions | None = None) -> list[str]: """Resolve join clauses for criteria. Java equivalent: CriteriaSqlBuilder.resolveJoinClauses() @@ -115,7 +115,7 @@ def resolve_join_clauses(self, criteria: T, options: Optional[BuilderOptions] = # This would need to be implemented based on the Java logic return [] - def resolve_where_clauses(self, criteria: T, options: Optional[BuilderOptions] = None) -> list[str]: + def resolve_where_clauses(self, criteria: T, options: BuilderOptions | None = None) -> list[str]: """Resolve where clauses for criteria. Java equivalent: CriteriaSqlBuilder.resolveWhereClauses() diff --git a/circe/cohortdefinition/builders/condition_era.py b/circe/cohortdefinition/builders/condition_era.py index 180480c..577d51d 100644 --- a/circe/cohortdefinition/builders/condition_era.py +++ b/circe/cohortdefinition/builders/condition_era.py @@ -8,7 +8,6 @@ Reference: JAVA_CLASS_MAPPINGS.md for Java equivalents. 
""" -from typing import Optional from ..criteria import ConditionEra from .base import CriteriaSqlBuilder @@ -90,7 +89,7 @@ def embed_ordinal_expression(self, query: str, criteria: ConditionEra, where_cla def resolve_select_clauses( self, criteria: ConditionEra, - options: Optional[BuilderOptions] = None, + options: BuilderOptions | None = None, ) -> list[str]: """Resolve select clauses for condition era criteria.""" select_cols = list(self.DEFAULT_SELECT_COLUMNS) @@ -120,7 +119,7 @@ def resolve_select_clauses( return select_cols def resolve_join_clauses( - self, criteria: ConditionEra, options: Optional[BuilderOptions] = None + self, criteria: ConditionEra, options: BuilderOptions | None = None ) -> list[str]: """Resolve join clauses for condition era criteria.""" join_clauses = [] @@ -139,7 +138,7 @@ def resolve_join_clauses( def resolve_where_clauses( self, criteria: ConditionEra, - options: Optional[BuilderOptions] = None, + options: BuilderOptions | None = None, ) -> list[str]: """Resolve where clauses for condition era criteria.""" where_clauses = [] diff --git a/circe/cohortdefinition/builders/condition_occurrence.py b/circe/cohortdefinition/builders/condition_occurrence.py index d9173fd..3399274 100644 --- a/circe/cohortdefinition/builders/condition_occurrence.py +++ b/circe/cohortdefinition/builders/condition_occurrence.py @@ -8,7 +8,6 @@ Reference: JAVA_CLASS_MAPPINGS.md for Java equivalents. 
""" -from typing import Optional from ..criteria import ConditionOccurrence from .base import CriteriaSqlBuilder @@ -104,7 +103,7 @@ def embed_ordinal_expression( def resolve_select_clauses( self, criteria: ConditionOccurrence, - options: Optional[BuilderOptions] = None, + options: BuilderOptions | None = None, ) -> list[str]: """Resolve select clauses for condition occurrence criteria.""" select_cols = list(self.DEFAULT_SELECT_COLUMNS) @@ -158,7 +157,7 @@ def resolve_select_clauses( def resolve_join_clauses( self, criteria: ConditionOccurrence, - options: Optional[BuilderOptions] = None, + options: BuilderOptions | None = None, ) -> list[str]: """Resolve join clauses for condition occurrence criteria.""" join_clauses = [] @@ -190,7 +189,7 @@ def resolve_join_clauses( return join_clauses def resolve_where_clauses( - self, criteria: ConditionOccurrence, options: Optional[BuilderOptions] = None + self, criteria: ConditionOccurrence, options: BuilderOptions | None = None ) -> list[str]: """Resolve where clauses for condition occurrence criteria.""" where_clauses = [] diff --git a/circe/cohortdefinition/builders/death.py b/circe/cohortdefinition/builders/death.py index 2ae267c..e6d8748 100644 --- a/circe/cohortdefinition/builders/death.py +++ b/circe/cohortdefinition/builders/death.py @@ -8,7 +8,6 @@ Reference: JAVA_CLASS_MAPPINGS.md for Java equivalents. 
""" -from typing import Optional from ..criteria import Death from .base import CriteriaSqlBuilder @@ -81,7 +80,7 @@ def embed_ordinal_expression(self, query: str, criteria: Death, where_clauses: l """ return query - def resolve_select_clauses(self, criteria: Death, options: Optional[BuilderOptions] = None) -> list[str]: + def resolve_select_clauses(self, criteria: Death, options: BuilderOptions | None = None) -> list[str]: """Resolve select clauses for death criteria.""" select_cols = ["d.person_id", "d.cause_concept_id"] @@ -106,7 +105,7 @@ def resolve_select_clauses(self, criteria: Death, options: Optional[BuilderOptio return select_cols - def resolve_join_clauses(self, criteria: Death, options: Optional[BuilderOptions] = None) -> list[str]: + def resolve_join_clauses(self, criteria: Death, options: BuilderOptions | None = None) -> list[str]: """Resolve join clauses for death criteria.""" joins = [] @@ -120,7 +119,7 @@ def resolve_join_clauses(self, criteria: Death, options: Optional[BuilderOptions return joins - def resolve_where_clauses(self, criteria: Death, options: Optional[BuilderOptions] = None) -> list[str]: + def resolve_where_clauses(self, criteria: Death, options: BuilderOptions | None = None) -> list[str]: """Resolve where clauses for death criteria.""" where_clauses = super().resolve_where_clauses(criteria) diff --git a/circe/cohortdefinition/builders/dose_era.py b/circe/cohortdefinition/builders/dose_era.py index f0b9c59..812eaa9 100644 --- a/circe/cohortdefinition/builders/dose_era.py +++ b/circe/cohortdefinition/builders/dose_era.py @@ -8,7 +8,6 @@ Reference: JAVA_CLASS_MAPPINGS.md for Java equivalents. 
""" -from typing import Optional from ..criteria import DoseEra from .base import CriteriaSqlBuilder @@ -97,7 +96,7 @@ def embed_ordinal_expression(self, query: str, criteria: DoseEra, where_clauses: def resolve_select_clauses( self, criteria: DoseEra, - options: Optional[BuilderOptions] = None, + options: BuilderOptions | None = None, ) -> list[str]: """Resolve select clauses for dose era criteria.""" select_cols = list(self.DEFAULT_SELECT_COLUMNS) @@ -124,7 +123,7 @@ def resolve_select_clauses( return select_cols - def resolve_join_clauses(self, criteria: DoseEra, options: Optional[BuilderOptions] = None) -> list[str]: + def resolve_join_clauses(self, criteria: DoseEra, options: BuilderOptions | None = None) -> list[str]: """Resolve join clauses for dose era criteria.""" join_clauses = [] @@ -139,7 +138,7 @@ def resolve_join_clauses(self, criteria: DoseEra, options: Optional[BuilderOptio return join_clauses - def resolve_where_clauses(self, criteria: DoseEra, options: Optional[BuilderOptions] = None) -> list[str]: + def resolve_where_clauses(self, criteria: DoseEra, options: BuilderOptions | None = None) -> list[str]: """Resolve where clauses for dose era criteria.""" where_clauses = [] diff --git a/circe/cohortdefinition/builders/drug_era.py b/circe/cohortdefinition/builders/drug_era.py index a44c9b4..ff94a07 100644 --- a/circe/cohortdefinition/builders/drug_era.py +++ b/circe/cohortdefinition/builders/drug_era.py @@ -8,7 +8,6 @@ Reference: JAVA_CLASS_MAPPINGS.md for Java equivalents. 
""" -from typing import Optional from ..criteria import DrugEra from .base import CriteriaSqlBuilder @@ -101,7 +100,7 @@ def embed_ordinal_expression(self, query: str, criteria: DrugEra, where_clauses: def resolve_select_clauses( self, criteria: DrugEra, - options: Optional[BuilderOptions] = None, + options: BuilderOptions | None = None, ) -> list[str]: """Resolve select clauses for drug era criteria.""" select_cols = list(self.DEFAULT_SELECT_COLUMNS) @@ -130,7 +129,7 @@ def resolve_select_clauses( return select_cols - def resolve_join_clauses(self, criteria: DrugEra, options: Optional[BuilderOptions] = None) -> list[str]: + def resolve_join_clauses(self, criteria: DrugEra, options: BuilderOptions | None = None) -> list[str]: """Resolve join clauses for drug era criteria.""" join_clauses = [] @@ -145,7 +144,7 @@ def resolve_join_clauses(self, criteria: DrugEra, options: Optional[BuilderOptio return join_clauses - def resolve_where_clauses(self, criteria: DrugEra, options: Optional[BuilderOptions] = None) -> list[str]: + def resolve_where_clauses(self, criteria: DrugEra, options: BuilderOptions | None = None) -> list[str]: """Resolve where clauses for drug era criteria.""" where_clauses = [] diff --git a/circe/cohortdefinition/builders/drug_exposure.py b/circe/cohortdefinition/builders/drug_exposure.py index 6ae1994..d8125c3 100644 --- a/circe/cohortdefinition/builders/drug_exposure.py +++ b/circe/cohortdefinition/builders/drug_exposure.py @@ -9,7 +9,6 @@ Reference: JAVA_CLASS_MAPPINGS.md for Java equivalents. """ -from typing import Optional from ..criteria import DrugExposure from .base import CriteriaSqlBuilder @@ -122,7 +121,7 @@ def embed_ordinal_expression(self, query: str, criteria: DrugExposure, where_cla def resolve_select_clauses( self, criteria: DrugExposure, - options: Optional[BuilderOptions] = None, + options: BuilderOptions | None = None, ) -> list[str]: """Resolve select clauses for drug exposure criteria. 
@@ -192,7 +191,7 @@ def resolve_select_clauses( def resolve_join_clauses( self, criteria: DrugExposure, - options: Optional[BuilderOptions] = None, + options: BuilderOptions | None = None, ) -> list[str]: """Resolve join clauses for drug exposure criteria. @@ -223,7 +222,7 @@ def resolve_join_clauses( def resolve_where_clauses( self, criteria: DrugExposure, - options: Optional[BuilderOptions] = None, + options: BuilderOptions | None = None, ) -> list[str]: """Resolve where clauses for drug exposure criteria. diff --git a/circe/cohortdefinition/builders/location_region.py b/circe/cohortdefinition/builders/location_region.py index e305408..98bfc7d 100644 --- a/circe/cohortdefinition/builders/location_region.py +++ b/circe/cohortdefinition/builders/location_region.py @@ -8,7 +8,6 @@ Reference: JAVA_CLASS_MAPPINGS.md for Java equivalents. """ -from typing import Optional from ..criteria import LocationRegion from .base import CriteriaSqlBuilder @@ -80,7 +79,7 @@ def embed_ordinal_expression(self, query: str, criteria: LocationRegion, where_c def resolve_select_clauses( self, criteria: LocationRegion, - options: Optional[BuilderOptions] = None, + options: BuilderOptions | None = None, ) -> list[str]: """Resolve select clauses for location region criteria.""" # Default select columns that are always returned @@ -101,7 +100,7 @@ def resolve_select_clauses( def resolve_join_clauses( self, criteria: LocationRegion, - options: Optional[BuilderOptions] = None, + options: BuilderOptions | None = None, ) -> list[str]: """Resolve join clauses for location region criteria.""" return [] @@ -109,7 +108,7 @@ def resolve_join_clauses( def resolve_where_clauses( self, criteria: LocationRegion, - options: Optional[BuilderOptions] = None, + options: BuilderOptions | None = None, ) -> list[str]: """Resolve where clauses for location region criteria.""" return [] diff --git a/circe/cohortdefinition/builders/measurement.py b/circe/cohortdefinition/builders/measurement.py index 
f00f9b2..2b6515f 100644 --- a/circe/cohortdefinition/builders/measurement.py +++ b/circe/cohortdefinition/builders/measurement.py @@ -8,7 +8,6 @@ Reference: JAVA_CLASS_MAPPINGS.md for Java equivalents. """ -from typing import Optional from ..criteria import Measurement from .base import CriteriaSqlBuilder @@ -89,7 +88,7 @@ def embed_codeset_clause(self, query: str, criteria: Measurement) -> str: def resolve_select_clauses( self, criteria: Measurement, - options: Optional[BuilderOptions] = None, + options: BuilderOptions | None = None, ) -> list[str]: """Resolve select clauses for measurement criteria. @@ -155,7 +154,7 @@ def resolve_select_clauses( def resolve_join_clauses( self, criteria: Measurement, - options: Optional[BuilderOptions] = None, + options: BuilderOptions | None = None, ) -> list[str]: """Resolve join clauses for measurement criteria. @@ -193,7 +192,7 @@ def resolve_join_clauses( def resolve_ordinal_expression( self, criteria: Measurement, - options: Optional[BuilderOptions] = None, + options: BuilderOptions | None = None, ) -> str: """Resolve ordinal expression for measurement criteria.""" if criteria.first: @@ -203,7 +202,7 @@ def resolve_ordinal_expression( def resolve_where_clauses( self, criteria: Measurement, - options: Optional[BuilderOptions] = None, + options: BuilderOptions | None = None, ) -> list[str]: """Resolve where clauses for measurement criteria. diff --git a/circe/cohortdefinition/builders/observation.py b/circe/cohortdefinition/builders/observation.py index 8efdcb7..281f7e6 100644 --- a/circe/cohortdefinition/builders/observation.py +++ b/circe/cohortdefinition/builders/observation.py @@ -8,7 +8,6 @@ Reference: JAVA_CLASS_MAPPINGS.md for Java equivalents. 
""" -from typing import Optional from ..criteria import Observation from .base import CriteriaSqlBuilder @@ -72,7 +71,7 @@ def embed_codeset_clause(self, query: str, criteria: Observation) -> str: def resolve_select_clauses( self, criteria: Observation, - options: Optional[BuilderOptions] = None, + options: BuilderOptions | None = None, ) -> list[str]: """Resolve select clauses for observation criteria. @@ -111,7 +110,7 @@ def resolve_select_clauses( return select_cols def resolve_join_clauses( - self, criteria: Observation, options: Optional[BuilderOptions] = None + self, criteria: Observation, options: BuilderOptions | None = None ) -> list[str]: """Resolve join clauses for observation criteria. @@ -149,7 +148,7 @@ def resolve_join_clauses( def resolve_where_clauses( self, criteria: Observation, - options: Optional[BuilderOptions] = None, + options: BuilderOptions | None = None, ) -> list[str]: """Resolve where clauses for observation criteria.""" where_clauses = super().resolve_where_clauses(criteria) diff --git a/circe/cohortdefinition/builders/observation_period.py b/circe/cohortdefinition/builders/observation_period.py index 2ece7b5..2b620bf 100644 --- a/circe/cohortdefinition/builders/observation_period.py +++ b/circe/cohortdefinition/builders/observation_period.py @@ -8,7 +8,6 @@ Reference: JAVA_CLASS_MAPPINGS.md for Java equivalents. 
""" -from typing import Optional from ..criteria import ObservationPeriod from .base import CriteriaSqlBuilder @@ -73,7 +72,7 @@ def get_table_column_for_criteria_column(self, criteria_column: CriteriaColumn) def get_criteria_sql_with_options( self, criteria: ObservationPeriod, - options: Optional[BuilderOptions], + options: BuilderOptions | None, ) -> str: """Get SQL query for criteria with builder options.""" query = super().get_criteria_sql_with_options(criteria, options) @@ -112,7 +111,7 @@ def embed_ordinal_expression( def resolve_select_clauses( self, criteria: ObservationPeriod, - options: Optional[BuilderOptions] = None, + options: BuilderOptions | None = None, ) -> list[str]: """Resolve select clauses for observation period criteria. @@ -148,7 +147,7 @@ def resolve_select_clauses( def resolve_join_clauses( self, criteria: ObservationPeriod, - options: Optional[BuilderOptions] = None, + options: BuilderOptions | None = None, ) -> list[str]: """Resolve join clauses for observation period criteria.""" join_clauses = [] @@ -162,7 +161,7 @@ def resolve_join_clauses( def resolve_where_clauses( self, criteria: ObservationPeriod, - options: Optional[BuilderOptions] = None, + options: BuilderOptions | None = None, ) -> list[str]: """Resolve where clauses for observation period criteria.""" where_clauses = [] diff --git a/circe/cohortdefinition/builders/payer_plan_period.py b/circe/cohortdefinition/builders/payer_plan_period.py index 27466f7..fb25d62 100644 --- a/circe/cohortdefinition/builders/payer_plan_period.py +++ b/circe/cohortdefinition/builders/payer_plan_period.py @@ -8,7 +8,6 @@ Reference: JAVA_CLASS_MAPPINGS.md for Java equivalents. 
""" -from typing import Optional from ..criteria import PayerPlanPeriod from .base import CriteriaSqlBuilder @@ -77,7 +76,7 @@ def get_table_column_for_criteria_column(self, criteria_column: CriteriaColumn) def get_criteria_sql_with_options( self, criteria: PayerPlanPeriod, - options: Optional[BuilderOptions], + options: BuilderOptions | None, ) -> str: """Get SQL query for criteria with builder options.""" query = super().get_criteria_sql_with_options(criteria, options) @@ -115,7 +114,7 @@ def embed_ordinal_expression( def resolve_select_clauses( self, criteria: PayerPlanPeriod, - options: Optional[BuilderOptions] = None, + options: BuilderOptions | None = None, ) -> list[str]: """Resolve select clauses for payer plan period criteria.""" select_cols = list(self.DEFAULT_SELECT_COLUMNS) @@ -184,7 +183,7 @@ def resolve_select_clauses( def resolve_join_clauses( self, criteria: PayerPlanPeriod, - options: Optional[BuilderOptions] = None, + options: BuilderOptions | None = None, ) -> list[str]: """Resolve join clauses for payer plan period criteria.""" join_clauses = [] @@ -202,7 +201,7 @@ def resolve_join_clauses( def resolve_where_clauses( self, criteria: PayerPlanPeriod, - options: Optional[BuilderOptions] = None, + options: BuilderOptions | None = None, ) -> list[str]: """Resolve where clauses for payer plan period criteria.""" where_clauses = [] diff --git a/circe/cohortdefinition/builders/procedure_occurrence.py b/circe/cohortdefinition/builders/procedure_occurrence.py index de1d7bc..16dec87 100644 --- a/circe/cohortdefinition/builders/procedure_occurrence.py +++ b/circe/cohortdefinition/builders/procedure_occurrence.py @@ -9,7 +9,6 @@ Reference: JAVA_CLASS_MAPPINGS.md for Java equivalents. 
""" -from typing import Optional from ..criteria import Criteria from .base import CriteriaSqlBuilder @@ -129,7 +128,7 @@ def embed_codeset_clause(self, query: str, criteria: Criteria) -> str: def resolve_select_clauses( self, criteria: Criteria, - options: Optional[BuilderOptions] = None, + options: BuilderOptions | None = None, ) -> list[str]: """Resolve select clauses for criteria. @@ -183,7 +182,7 @@ def resolve_select_clauses( return select_cols - def resolve_join_clauses(self, criteria: Criteria, options: Optional[BuilderOptions] = None) -> list[str]: + def resolve_join_clauses(self, criteria: Criteria, options: BuilderOptions | None = None) -> list[str]: """Resolve join clauses for criteria. Java equivalent: ProcedureOccurrenceSqlBuilder.resolveJoinClauses() @@ -221,7 +220,7 @@ def resolve_join_clauses(self, criteria: Criteria, options: Optional[BuilderOpti def resolve_where_clauses( self, criteria: Criteria, - options: Optional[BuilderOptions] = None, + options: BuilderOptions | None = None, ) -> list[str]: """Resolve where clauses for criteria. diff --git a/circe/cohortdefinition/builders/specimen.py b/circe/cohortdefinition/builders/specimen.py index 7aaea3d..2670cd4 100644 --- a/circe/cohortdefinition/builders/specimen.py +++ b/circe/cohortdefinition/builders/specimen.py @@ -8,7 +8,6 @@ Reference: JAVA_CLASS_MAPPINGS.md for Java equivalents. 
""" -from typing import Optional from ..criteria import Specimen from .base import CriteriaSqlBuilder @@ -82,7 +81,7 @@ def embed_ordinal_expression(self, query: str, criteria: Specimen, where_clauses query = query.replace("@ordinalExpression", "") return query - def resolve_join_clauses(self, criteria: Specimen, options: Optional[BuilderOptions] = None) -> list[str]: + def resolve_join_clauses(self, criteria: Specimen, options: BuilderOptions | None = None) -> list[str]: """Resolve join clauses for specimen criteria.""" joins = [] @@ -99,7 +98,7 @@ def resolve_join_clauses(self, criteria: Specimen, options: Optional[BuilderOpti def resolve_where_clauses( self, criteria: Specimen, - options: Optional[BuilderOptions] = None, + options: BuilderOptions | None = None, ) -> list[str]: """Resolve where clauses for specimen criteria.""" where_clauses = [] diff --git a/circe/cohortdefinition/builders/utils.py b/circe/cohortdefinition/builders/utils.py index 5ef033f..03af850 100644 --- a/circe/cohortdefinition/builders/utils.py +++ b/circe/cohortdefinition/builders/utils.py @@ -9,7 +9,7 @@ Reference: JAVA_CLASS_MAPPINGS.md for Java equivalents. """ -from typing import Any, Optional +from typing import Any from ...vocabulary.concept import Concept from ..core import DateAdjustment, DateRange, NumericRange @@ -62,9 +62,9 @@ def get_date_adjustment_expression( @staticmethod def get_codeset_join_expression( - standard_codeset_id: Optional[int], + standard_codeset_id: int | None, standard_concept_column: str, - source_codeset_id: Optional[int], + source_codeset_id: int | None, source_concept_column: str, ) -> str: """Get codeset join expression for SQL. 
@@ -133,7 +133,7 @@ def get_operator(op: str) -> str: raise RuntimeError(f"Unknown operator type: {op}") @staticmethod - def build_date_range_clause(sql_expression: str, date_range: Optional[DateRange]) -> Optional[str]: + def build_date_range_clause(sql_expression: str, date_range: DateRange | None) -> str | None: """Build date range clause for SQL. Java equivalent: BuilderUtils.buildDateRangeClause(String sqlExpression, DateRange range) @@ -157,9 +157,9 @@ def build_date_range_clause(sql_expression: str, date_range: Optional[DateRange] @staticmethod def build_numeric_range_clause( sql_expression: str, - numeric_range: Optional[NumericRange], - format: Optional[str] = None, - ) -> Optional[str]: + numeric_range: NumericRange | None, + format: str | None = None, + ) -> str | None: """Build numeric range clause for SQL. Java equivalent: BuilderUtils.buildNumericRangeClause(String sqlExpression, NumericRange range, String format) @@ -193,7 +193,7 @@ def build_numeric_range_clause( return f"{sql_expression} {BuilderUtils.get_operator(op)} {int(numeric_range.value)}" @staticmethod - def build_text_filter_clause(text_filter: Optional[Any], column_name: str) -> Optional[str]: + def build_text_filter_clause(text_filter: Any | None, column_name: str) -> str | None: """Build text filter clause for SQL. Java equivalent: BuilderUtils.buildTextFilterClause() diff --git a/circe/cohortdefinition/builders/visit_detail.py b/circe/cohortdefinition/builders/visit_detail.py index 8eb9356..21f29d6 100644 --- a/circe/cohortdefinition/builders/visit_detail.py +++ b/circe/cohortdefinition/builders/visit_detail.py @@ -8,7 +8,6 @@ Reference: JAVA_CLASS_MAPPINGS.md for Java equivalents. 
""" -from typing import Optional from ..criteria import VisitDetail from .base import CriteriaSqlBuilder @@ -102,7 +101,7 @@ def embed_ordinal_expression(self, query: str, criteria: VisitDetail, where_clau def resolve_select_clauses( self, criteria: VisitDetail, - options: Optional[BuilderOptions] = None, + options: BuilderOptions | None = None, ) -> list[str]: """Resolve select clauses for visit detail criteria.""" select_cols = list(self.DEFAULT_SELECT_COLUMNS) @@ -151,7 +150,7 @@ def resolve_select_clauses( def resolve_join_clauses( self, criteria: VisitDetail, - options: Optional[BuilderOptions] = None, + options: BuilderOptions | None = None, ) -> list[str]: """Resolve join clauses for visit detail criteria.""" join_clauses = [] @@ -177,7 +176,7 @@ def resolve_join_clauses( def resolve_where_clauses( self, criteria: VisitDetail, - options: Optional[BuilderOptions] = None, + options: BuilderOptions | None = None, ) -> list[str]: """Resolve where clauses for visit detail criteria.""" where_clauses = [] @@ -264,7 +263,7 @@ def add_where_clause( where_clauses: list[str], concept_set_selection, concept_column: str, - exclude: Optional[bool] = None, + exclude: bool | None = None, ): """Add where clause for concept set selection.""" is_exclusion = exclude if exclude is not None else concept_set_selection.is_exclusion diff --git a/circe/cohortdefinition/builders/visit_occurrence.py b/circe/cohortdefinition/builders/visit_occurrence.py index b68ec3f..89708d6 100644 --- a/circe/cohortdefinition/builders/visit_occurrence.py +++ b/circe/cohortdefinition/builders/visit_occurrence.py @@ -8,7 +8,6 @@ Reference: JAVA_CLASS_MAPPINGS.md for Java equivalents. 
""" -from typing import Optional from ..criteria import VisitOccurrence from .base import CriteriaSqlBuilder @@ -74,7 +73,7 @@ def embed_codeset_clause(self, query: str, criteria: VisitOccurrence) -> str: def resolve_select_clauses( self, criteria: VisitOccurrence, - options: Optional[BuilderOptions] = None, + options: BuilderOptions | None = None, ) -> list[str]: """Resolve select clauses for visit occurrence criteria.""" # Default select columns that are always returned @@ -125,7 +124,7 @@ def resolve_select_clauses( def resolve_join_clauses( self, criteria: VisitOccurrence, - options: Optional[BuilderOptions] = None, + options: BuilderOptions | None = None, ) -> list[str]: """Resolve join clauses for visit occurrence criteria.""" join_clauses = [] @@ -162,7 +161,7 @@ def resolve_join_clauses( def resolve_where_clauses( self, criteria: VisitOccurrence, - options: Optional[BuilderOptions] = None, + options: BuilderOptions | None = None, ) -> list[str]: """Resolve where clauses for visit occurrence criteria.""" where_clauses = super().resolve_where_clauses(criteria, options) diff --git a/circe/cohortdefinition/cohort.py b/circe/cohortdefinition/cohort.py index 8cd4570..b290d83 100644 --- a/circe/cohortdefinition/cohort.py +++ b/circe/cohortdefinition/cohort.py @@ -9,7 +9,7 @@ """ import contextlib -from typing import TYPE_CHECKING, Any, Optional, Union +from typing import TYPE_CHECKING, Any from pydantic import ( AliasChoices, @@ -87,38 +87,38 @@ class CohortExpression(CirceBaseModel): validation_alias=AliasChoices("ConceptSets", "conceptSets"), serialization_alias="ConceptSets", ) - qualified_limit: Optional[ResultLimit] = Field( + qualified_limit: ResultLimit | None = Field( default=None, validation_alias=AliasChoices("QualifiedLimit", "qualifiedLimit"), serialization_alias="QualifiedLimit", ) - additional_criteria: Optional[CriteriaGroup] = Field( + additional_criteria: CriteriaGroup | None = Field( default=None, 
validation_alias=AliasChoices("AdditionalCriteria", "additionalCriteria"), serialization_alias="AdditionalCriteria", ) - end_strategy: Optional[Union[EndStrategy, DateOffsetStrategy, CustomEraStrategy]] = Field( + end_strategy: EndStrategy | DateOffsetStrategy | CustomEraStrategy | None = Field( default=None, validation_alias=AliasChoices("EndStrategy", "endStrategy"), serialization_alias="EndStrategy", ) - cdm_version_range: Optional[str] = Field(default=None, alias="cdmVersionRange") - primary_criteria: Optional[PrimaryCriteria] = Field( + cdm_version_range: str | None = Field(default=None, alias="cdmVersionRange") + primary_criteria: PrimaryCriteria | None = Field( default=None, validation_alias=AliasChoices("PrimaryCriteria", "primaryCriteria"), serialization_alias="PrimaryCriteria", ) - expression_limit: Optional[ResultLimit] = Field( + expression_limit: ResultLimit | None = Field( default=None, validation_alias=AliasChoices("ExpressionLimit", "expressionLimit"), serialization_alias="ExpressionLimit", ) - collapse_settings: Optional[CollapseSettings] = Field( + collapse_settings: CollapseSettings | None = Field( default=None, validation_alias=AliasChoices("CollapseSettings", "collapseSettings"), serialization_alias="CollapseSettings", ) - title: Optional[str] = Field( + title: str | None = Field( default=None, validation_alias=AliasChoices("Title", "title"), serialization_alias="Title", @@ -128,7 +128,7 @@ class CohortExpression(CirceBaseModel): validation_alias=AliasChoices("InclusionRules", "inclusionRules"), serialization_alias="InclusionRules", ) - censor_window: Optional[Period] = Field( + censor_window: Period | None = Field( default=None, validation_alias=AliasChoices("CensorWindow", "censorWindow"), serialization_alias="CensorWindow", @@ -534,7 +534,7 @@ def has_end_strategy(self) -> bool: """ return self.end_strategy is not None - def get_end_strategy_type(self) -> Optional[str]: + def get_end_strategy_type(self) -> str | None: """Get the type of end 
strategy. Returns: @@ -573,7 +573,7 @@ def has_observation_window(self) -> bool: return self.primary_criteria.observation_window is not None - def get_primary_limit_type(self) -> Optional[str]: + def get_primary_limit_type(self) -> str | None: """Get the primary limit type. Returns: diff --git a/circe/cohortdefinition/cohort_expression_query_builder.py b/circe/cohortdefinition/cohort_expression_query_builder.py index b275ce6..254bff0 100644 --- a/circe/cohortdefinition/cohort_expression_query_builder.py +++ b/circe/cohortdefinition/cohort_expression_query_builder.py @@ -9,7 +9,7 @@ """ import json -from typing import Any, Optional, Union +from typing import Any from circe.extensions import get_registry @@ -69,12 +69,12 @@ class BuildExpressionQueryOptions: """ def __init__(self): - self.cohort_id_field_name: Optional[str] = None - self.cohort_id: Optional[int] = None - self.cdm_schema: Optional[str] = None - self.target_table: Optional[str] = None - self.result_schema: Optional[str] = None - self.vocabulary_schema: Optional[str] = None + self.cohort_id_field_name: str | None = None + self.cohort_id: int | None = None + self.cdm_schema: str | None = None + self.target_table: str | None = None + self.result_schema: str | None = None + self.vocabulary_schema: str | None = None self.generate_stats: bool = False @classmethod @@ -592,7 +592,7 @@ def get_censoring_events_query(self, censoring_criteria: list[Criteria]) -> str: def get_primary_events_query( self, primary_criteria: PrimaryCriteria, - subquery: Optional[str] = None, + subquery: str | None = None, ) -> str: """Get primary events query. @@ -652,7 +652,7 @@ def _get_primary_events_subquery(self, primary_criteria: PrimaryCriteria) -> str return query - def get_final_cohort_query(self, censor_window: Optional[Period]) -> str: + def get_final_cohort_query(self, censor_window: Period | None) -> str: """Get final cohort query. 
Java equivalent: getFinalCohortQuery() @@ -780,7 +780,7 @@ def _build_inclusion_analysis_section(self, expression: CohortExpression) -> str def build_expression_query( self, - expression: Union[str, CohortExpression], + expression: str | CohortExpression, options: BuildExpressionQueryOptions, ) -> str: """Build expression query from CohortExpression object or JSON string. @@ -1170,7 +1170,7 @@ def _get_windowed_criteria_query_internal( sql_template: str, criteria: Any, event_table: str, - options: Optional[BuilderOptions], + options: BuilderOptions | None, ) -> str: """Get windowed criteria query (internal method with all parameters). @@ -1365,7 +1365,7 @@ def get_windowed_criteria_query( self, criteria: Any, event_table: str, - options: Optional[BuilderOptions] = None, + options: BuilderOptions | None = None, ) -> str: """Get windowed criteria query. @@ -1465,7 +1465,7 @@ def get_corelated_criteria_query(self, corelated_criteria: CorelatedCriteria, ev return query - def get_criteria_sql(self, criteria: Criteria, options: Optional[BuilderOptions] = None) -> str: + def get_criteria_sql(self, criteria: Criteria, options: BuilderOptions | None = None) -> str: """Get criteria SQL for any criteria type. 
Java equivalent: Various getCriteriaSql methods @@ -1614,7 +1614,7 @@ def _get_criteria_sql_from_builder( self, builder: Any, criteria: Criteria, - options: Optional[BuilderOptions], + options: BuilderOptions | None, ) -> str: """Generic method to get criteria SQL from builder.""" query = builder.get_criteria_sql_with_options(criteria, options) @@ -1637,7 +1637,7 @@ def get_date_field_for_offset_strategy(self, date_field: str) -> str: def get_strategy_sql( self, - strategy: Union[DateOffsetStrategy, CustomEraStrategy], + strategy: DateOffsetStrategy | CustomEraStrategy, event_table: str, ) -> str: """Get strategy SQL for date offset or custom era strategy.""" diff --git a/circe/cohortdefinition/core.py b/circe/cohortdefinition/core.py index 714da87..b5dd7ec 100644 --- a/circe/cohortdefinition/core.py +++ b/circe/cohortdefinition/core.py @@ -9,7 +9,7 @@ """ from enum import Enum -from typing import Any, Optional, Union +from typing import Any from pydantic import ( AliasChoices, @@ -91,7 +91,7 @@ class ResultLimit(CirceBaseModel): Java equivalent: org.ohdsi.circe.cohortdefinition.ResultLimit """ - type: Optional[str] = Field( + type: str | None = Field( default=None, validation_alias=AliasChoices("Type", "type"), serialization_alias="Type", @@ -104,8 +104,8 @@ class Period(CirceBaseModel): Java equivalent: org.ohdsi.circe.cohortdefinition.Period """ - start_date: Optional[str] = None - end_date: Optional[str] = None + start_date: str | None = None + end_date: str | None = None model_config = ConfigDict(populate_by_name=True, alias_generator=to_pascal_alias) @@ -116,17 +116,17 @@ class DateRange(CirceBaseModel): Java equivalent: org.ohdsi.circe.cohortdefinition.DateRange """ - op: Optional[str] = Field( + op: str | None = Field( default=None, validation_alias=AliasChoices("Op", "op"), serialization_alias="Op", ) - value: Optional[Union[str, float]] = Field( + value: str | float | None = Field( default=None, validation_alias=AliasChoices("Value", "value"), 
serialization_alias="Value", ) - extent: Optional[Union[str, float]] = Field( + extent: str | float | None = Field( default=None, validation_alias=AliasChoices("Extent", "extent"), serialization_alias="Extent", @@ -139,17 +139,17 @@ class NumericRange(CirceBaseModel): Java equivalent: org.ohdsi.circe.cohortdefinition.NumericRange """ - op: Optional[str] = Field( + op: str | None = Field( default=None, validation_alias=AliasChoices("Op", "op"), serialization_alias="Op", ) - value: Optional[Union[int, float]] = Field( + value: int | float | None = Field( default=None, validation_alias=AliasChoices("Value", "value"), serialization_alias="Value", ) - extent: Optional[Union[int, float]] = Field( + extent: int | float | None = Field( default=None, validation_alias=AliasChoices("Extent", "extent"), serialization_alias="Extent", @@ -170,12 +170,12 @@ class DateAdjustment(CirceBaseModel): validation_alias=AliasChoices("endOffset", "EndOffset"), serialization_alias="endOffset", ) - start_with: Optional[DateType] = Field( + start_with: DateType | None = Field( default=DateType.START_DATE, validation_alias=AliasChoices("startWith", "StartWith"), serialization_alias="startWith", ) - end_with: Optional[DateType] = Field( + end_with: DateType | None = Field( default=DateType.END_DATE, validation_alias=AliasChoices("endWith", "EndWith"), serialization_alias="endWith", @@ -209,7 +209,7 @@ class CollapseSettings(CirceBaseModel): """ era_pad: int = Field(validation_alias=AliasChoices("EraPad", "eraPad"), serialization_alias="EraPad") - collapse_type: Optional[CollapseType] = Field( + collapse_type: CollapseType | None = Field( default=CollapseType.ERA, validation_alias=AliasChoices("CollapseType", "collapseType"), serialization_alias="CollapseType", @@ -224,7 +224,7 @@ class EndStrategy(CirceBaseModel): Java equivalent: org.ohdsi.circe.cohortdefinition.EndStrategy """ - include: Optional[str] = None # JsonTypeInfo.Id.NAME + include: str | None = None # JsonTypeInfo.Id.NAME 
@model_serializer(mode="wrap") def _serialize_polymorphic(self, serializer, info): @@ -243,7 +243,7 @@ class ConceptSetSelection(CirceBaseModel): Java equivalent: org.ohdsi.circe.cohortdefinition.ConceptSetSelection """ - codeset_id: Optional[int] = Field( + codeset_id: int | None = Field( default=None, validation_alias=AliasChoices("CodesetId", "codesetId"), serialization_alias="CodesetId", @@ -263,12 +263,12 @@ class TextFilter(CirceBaseModel): Java equivalent: org.ohdsi.circe.cohortdefinition.TextFilter """ - text: Optional[str] = Field( + text: str | None = Field( default=None, validation_alias=AliasChoices("Text", "text"), serialization_alias="Text", ) - op: Optional[str] = Field( + op: str | None = Field( default=None, validation_alias=AliasChoices("Op", "op"), serialization_alias="Op", @@ -282,7 +282,7 @@ class WindowBound(CirceBaseModel): """ coeff: int = Field(validation_alias=AliasChoices("Coeff", "coeff"), serialization_alias="Coeff") - days: Optional[int] = Field( + days: int | None = Field( default=None, validation_alias=AliasChoices("Days", "days"), serialization_alias="Days", @@ -297,22 +297,22 @@ class Window(CirceBaseModel): Java equivalent: org.ohdsi.circe.cohortdefinition.Window """ - start: Optional[WindowBound] = Field( + start: WindowBound | None = Field( default=None, validation_alias=AliasChoices("Start", "start"), serialization_alias="Start", ) - end: Optional[WindowBound] = Field( + end: WindowBound | None = Field( default=None, validation_alias=AliasChoices("End", "end"), serialization_alias="End", ) - use_event_end: Optional[bool] = Field( + use_event_end: bool | None = Field( default=None, validation_alias=AliasChoices("UseEventEnd", "useEventEnd"), serialization_alias="UseEventEnd", ) - use_index_end: Optional[bool] = Field( + use_index_end: bool | None = Field( default=None, validation_alias=AliasChoices("UseIndexEnd", "useIndexEnd"), serialization_alias="UseIndexEnd", @@ -346,7 +346,7 @@ class CustomEraStrategy(EndStrategy): Java 
equivalent: org.ohdsi.circe.cohortdefinition.CustomEraStrategy """ - drug_codeset_id: Optional[int] = Field( + drug_codeset_id: int | None = Field( default=None, validation_alias=AliasChoices("DrugCodesetId", "drugCodesetId"), serialization_alias="DrugCodesetId", @@ -361,7 +361,7 @@ class CustomEraStrategy(EndStrategy): validation_alias=AliasChoices("Offset", "offset"), serialization_alias="Offset", ) - days_supply_override: Optional[int] = Field( + days_supply_override: int | None = Field( default=None, validation_alias=AliasChoices("DaysSupplyOverride", "daysSupplyOverride"), serialization_alias="DaysSupplyOverride", diff --git a/circe/cohortdefinition/criteria.py b/circe/cohortdefinition/criteria.py index d1542b1..692b091 100644 --- a/circe/cohortdefinition/criteria.py +++ b/circe/cohortdefinition/criteria.py @@ -9,7 +9,7 @@ """ from enum import Enum -from typing import Annotated, Any, Optional, Union +from typing import Annotated, Any, Optional from pydantic import ( AliasChoices, @@ -109,7 +109,7 @@ class Occurrence(CirceBaseModel): validation_alias=AliasChoices("IsDistinct", "isDistinct"), serialization_alias="IsDistinct", ) - count_column: Optional[CriteriaColumn] = Field( + count_column: CriteriaColumn | None = Field( default=None, validation_alias=AliasChoices("CountColumn", "countColumn"), serialization_alias="CountColumn", @@ -135,12 +135,12 @@ class WindowedCriteria(CirceBaseModel): validation_alias=AliasChoices("Criteria", "criteria"), serialization_alias="Criteria", ) - start_window: Optional[Window] = Field( + start_window: Window | None = Field( default=None, validation_alias=AliasChoices("StartWindow", "startWindow"), serialization_alias="StartWindow", ) - end_window: Optional[Window] = Field( + end_window: Window | None = Field( default=None, validation_alias=AliasChoices("EndWindow", "endWindow"), serialization_alias="EndWindow", @@ -166,7 +166,7 @@ class CorelatedCriteria(WindowedCriteria): Java equivalent: 
org.ohdsi.circe.cohortdefinition.CorelatedCriteria """ - occurrence: Optional[Occurrence] = Field( + occurrence: Occurrence | None = Field( default=None, validation_alias=AliasChoices("Occurrence", "occurrence"), serialization_alias="Occurrence", @@ -180,47 +180,47 @@ class DemographicCriteria(CirceBaseModel): Java equivalent: org.ohdsi.circe.cohortdefinition.DemographicCriteria """ - gender: Optional[list[Concept]] = Field( + gender: list[Concept] | None = Field( default=None, validation_alias=AliasChoices("Gender", "gender"), serialization_alias="Gender", ) - occurrence_end_date: Optional[DateRange] = Field( + occurrence_end_date: DateRange | None = Field( default=None, validation_alias=AliasChoices("OccurrenceEndDate", "occurrenceEndDate"), serialization_alias="OccurrenceEndDate", ) - gender_cs: Optional[ConceptSetSelection] = Field( + gender_cs: ConceptSetSelection | None = Field( default=None, validation_alias=AliasChoices("GenderCS", "genderCS"), serialization_alias="GenderCS", ) - race: Optional[list[Concept]] = Field( + race: list[Concept] | None = Field( default=None, validation_alias=AliasChoices("Race", "race"), serialization_alias="Race", ) - ethnicity_cs: Optional[ConceptSetSelection] = Field( + ethnicity_cs: ConceptSetSelection | None = Field( default=None, validation_alias=AliasChoices("EthnicityCS", "ethnicityCS"), serialization_alias="EthnicityCS", ) - age: Optional[NumericRange] = Field( + age: NumericRange | None = Field( default=None, validation_alias=AliasChoices("Age", "age"), serialization_alias="Age", ) - race_cs: Optional[ConceptSetSelection] = Field( + race_cs: ConceptSetSelection | None = Field( default=None, validation_alias=AliasChoices("RaceCS", "raceCS"), serialization_alias="RaceCS", ) - ethnicity: Optional[list[Concept]] = Field( + ethnicity: list[Concept] | None = Field( default=None, validation_alias=AliasChoices("Ethnicity", "ethnicity"), serialization_alias="Ethnicity", ) - occurrence_start_date: Optional[DateRange] = Field( + 
occurrence_start_date: DateRange | None = Field( default=None, validation_alias=AliasChoices("OccurrenceStartDate", "occurrenceStartDate"), serialization_alias="OccurrenceStartDate", @@ -235,7 +235,7 @@ class Criteria(CirceBaseModel): Java equivalent: org.ohdsi.circe.cohortdefinition.Criteria """ - date_adjustment: Optional[DateAdjustment] = Field( + date_adjustment: DateAdjustment | None = Field( default=None, validation_alias=AliasChoices("DateAdjustment", "dateAdjustment"), serialization_alias="DateAdjustment", @@ -245,7 +245,7 @@ class Criteria(CirceBaseModel): validation_alias=AliasChoices("CorrelatedCriteria", "correlatedCriteria"), serialization_alias="CorrelatedCriteria", ) - include: Optional[str] = None # JsonTypeInfo.Id.NAME + include: str | None = None # JsonTypeInfo.Id.NAME @model_serializer(mode="wrap") def _serialize_polymorphic(self, serializer, info): @@ -271,7 +271,7 @@ def _serialize_polymorphic(self, serializer, info): return {self.__class__.__name__: data} - def accept(self, dispatcher: Any, options: Optional[Any] = None) -> str: + def accept(self, dispatcher: Any, options: Any | None = None) -> str: """Accept method for visitor pattern.""" return dispatcher.get_criteria_sql(self, options) @@ -287,12 +287,12 @@ class InclusionRule(CirceBaseModel): validation_alias=AliasChoices("Expression", "expression"), serialization_alias="Expression", ) - description: Optional[str] = Field( + description: str | None = Field( default=None, validation_alias=AliasChoices("Description", "description"), serialization_alias="Description", ) - name: Optional[str] = Field( + name: str | None = Field( default=None, validation_alias=AliasChoices("Name", "name"), serialization_alias="Name", @@ -310,93 +310,93 @@ class ConditionOccurrence(Criteria): Java equivalent: org.ohdsi.circe.cohortdefinition.ConditionOccurrence """ - codeset_id: Optional[int] = Field( + codeset_id: int | None = Field( default=None, validation_alias=AliasChoices("CodesetId", "codesetId"), 
serialization_alias="CodesetId", ) - first: Optional[bool] = Field( + first: bool | None = Field( default=None, validation_alias=AliasChoices("First", "first"), serialization_alias="First", ) - occurrence_start_date: Optional[DateRange] = Field( + occurrence_start_date: DateRange | None = Field( default=None, validation_alias=AliasChoices("OccurrenceStartDate", "occurrenceStartDate"), serialization_alias="OccurrenceStartDate", ) - occurrence_end_date: Optional[DateRange] = Field( + occurrence_end_date: DateRange | None = Field( default=None, validation_alias=AliasChoices("OccurrenceEndDate", "occurrenceEndDate"), serialization_alias="OccurrenceEndDate", ) - condition_type: Optional[list[Concept]] = Field( + condition_type: list[Concept] | None = Field( default=None, validation_alias=AliasChoices("ConditionType", "conditionType"), serialization_alias="ConditionType", ) - condition_type_cs: Optional[ConceptSetSelection] = Field( + condition_type_cs: ConceptSetSelection | None = Field( default=None, validation_alias=AliasChoices("ConditionTypeCS", "conditionTypeCS"), serialization_alias="ConditionTypeCS", ) - condition_type_exclude: Optional[bool] = Field( + condition_type_exclude: bool | None = Field( default=False, validation_alias=AliasChoices("ConditionTypeExclude", "conditionTypeExclude"), serialization_alias="ConditionTypeExclude", ) - stop_reason: Optional[TextFilter] = Field( + stop_reason: TextFilter | None = Field( default=None, validation_alias=AliasChoices("StopReason", "stopReason"), serialization_alias="StopReason", ) - condition_source_concept: Optional[int] = Field( + condition_source_concept: int | None = Field( default=None, validation_alias=AliasChoices("ConditionSourceConcept", "conditionSourceConcept"), serialization_alias="ConditionSourceConcept", ) - age: Optional[NumericRange] = Field( + age: NumericRange | None = Field( default=None, validation_alias=AliasChoices("Age", "age"), serialization_alias="Age", ) - gender: Optional[list[Concept]] = 
Field(default=None, serialization_alias="gender") - gender_cs: Optional[ConceptSetSelection] = Field( + gender: list[Concept] | None = Field(default=None, serialization_alias="gender") + gender_cs: ConceptSetSelection | None = Field( default=None, validation_alias=AliasChoices("GenderCS", "genderCS"), serialization_alias="GenderCS", ) - provider_specialty: Optional[list[Concept]] = Field( + provider_specialty: list[Concept] | None = Field( default=None, validation_alias=AliasChoices("ProviderSpecialty", "providerSpecialty"), serialization_alias="ProviderSpecialty", ) - provider_specialty_cs: Optional[ConceptSetSelection] = Field( + provider_specialty_cs: ConceptSetSelection | None = Field( default=None, validation_alias=AliasChoices("ProviderSpecialtyCS", "providerSpecialtyCS"), serialization_alias="ProviderSpecialtyCS", ) - visit_type: Optional[list[Concept]] = Field( + visit_type: list[Concept] | None = Field( default=None, validation_alias=AliasChoices("VisitType", "visitType"), serialization_alias="VisitType", ) - visit_type_cs: Optional[ConceptSetSelection] = Field( + visit_type_cs: ConceptSetSelection | None = Field( default=None, validation_alias=AliasChoices("VisitTypeCS", "visitTypeCS"), serialization_alias="VisitTypeCS", ) - condition_status: Optional[list[Concept]] = Field( + condition_status: list[Concept] | None = Field( default=None, validation_alias=AliasChoices("ConditionStatus", "conditionStatus"), serialization_alias="ConditionStatus", ) - condition_status_cs: Optional[ConceptSetSelection] = Field( + condition_status_cs: ConceptSetSelection | None = Field( default=None, validation_alias=AliasChoices("ConditionStatusCS", "conditionStatusCS"), serialization_alias="ConditionStatusCS", ) - date_adjustment: Optional[DateAdjustment] = Field( + date_adjustment: DateAdjustment | None = Field( default=None, validation_alias=AliasChoices("DateAdjustment", "dateAdjustment"), serialization_alias="DateAdjustment", @@ -411,33 +411,33 @@ class 
DrugExposure(Criteria): Java equivalent: org.ohdsi.circe.cohortdefinition.DrugExposure """ - gender: Optional[list[Concept]] = Field(default=None, serialization_alias="gender") - occurrence_end_date: Optional[DateRange] = Field( + gender: list[Concept] | None = Field(default=None, serialization_alias="gender") + occurrence_end_date: DateRange | None = Field( default=None, validation_alias=AliasChoices("OccurrenceEndDate", "occurrenceEndDate"), serialization_alias="OccurrenceEndDate", ) - stop_reason: Optional[TextFilter] = Field( + stop_reason: TextFilter | None = Field( default=None, validation_alias=AliasChoices("StopReason", "stopReason"), serialization_alias="StopReason", ) - drug_source_concept: Optional[int] = Field( + drug_source_concept: int | None = Field( default=None, validation_alias=AliasChoices("DrugSourceConcept", "drugSourceConcept"), serialization_alias="DrugSourceConcept", ) - gender_cs: Optional[ConceptSetSelection] = Field( + gender_cs: ConceptSetSelection | None = Field( default=None, validation_alias=AliasChoices("GenderCS", "genderCS"), serialization_alias="GenderCS", ) - drug_type: Optional[list[Concept]] = Field( + drug_type: list[Concept] | None = Field( default=None, validation_alias=AliasChoices("DrugType", "drugType"), serialization_alias="DrugType", ) - drug_type_cs: Optional[ConceptSetSelection] = Field( + drug_type_cs: ConceptSetSelection | None = Field( default=None, validation_alias=AliasChoices("DrugTypeCS", "drugTypeCS"), serialization_alias="DrugTypeCS", @@ -447,83 +447,83 @@ class DrugExposure(Criteria): validation_alias=AliasChoices("DrugTypeExclude", "drugTypeExclude"), serialization_alias="DrugTypeExclude", ) - provider_specialty_cs: Optional[ConceptSetSelection] = Field( + provider_specialty_cs: ConceptSetSelection | None = Field( default=None, validation_alias=AliasChoices("ProviderSpecialtyCS", "providerSpecialtyCS"), serialization_alias="ProviderSpecialtyCS", ) - visit_type_cs: Optional[ConceptSetSelection] = Field( + 
visit_type_cs: ConceptSetSelection | None = Field( default=None, validation_alias=AliasChoices("VisitTypeCS", "visitTypeCS"), serialization_alias="VisitTypeCS", ) - visit_type: Optional[list[Concept]] = Field( + visit_type: list[Concept] | None = Field( default=None, validation_alias=AliasChoices("VisitType", "visitType"), serialization_alias="VisitType", ) - route_concept: Optional[list[Concept]] = Field( + route_concept: list[Concept] | None = Field( default=None, validation_alias=AliasChoices("RouteConcept", "routeConcept"), serialization_alias="RouteConcept", ) - route_concept_cs: Optional[ConceptSetSelection] = Field( + route_concept_cs: ConceptSetSelection | None = Field( default=None, validation_alias=AliasChoices("RouteConceptCS", "routeConceptCS"), serialization_alias="RouteConceptCS", ) - codeset_id: Optional[int] = Field( + codeset_id: int | None = Field( default=None, validation_alias=AliasChoices("CodesetId", "codesetId"), serialization_alias="CodesetId", ) - first: Optional[bool] = Field( + first: bool | None = Field( default=None, validation_alias=AliasChoices("First", "first"), serialization_alias="First", ) - provider_specialty: Optional[list[Concept]] = Field( + provider_specialty: list[Concept] | None = Field( default=None, validation_alias=AliasChoices("ProviderSpecialty", "providerSpecialty"), serialization_alias="ProviderSpecialty", ) - age: Optional[NumericRange] = None - occurrence_start_date: Optional[DateRange] = Field( + age: NumericRange | None = None + occurrence_start_date: DateRange | None = Field( default=None, validation_alias=AliasChoices("OccurrenceStartDate", "occurrenceStartDate"), serialization_alias="OccurrenceStartDate", ) - dose_unit: Optional[list[Concept]] = Field( + dose_unit: list[Concept] | None = Field( default=None, validation_alias=AliasChoices("DoseUnit", "doseUnit"), serialization_alias="DoseUnit", ) - dose_unit_cs: Optional[ConceptSetSelection] = Field( + dose_unit_cs: ConceptSetSelection | None = Field( 
default=None, validation_alias=AliasChoices("DoseUnitCS", "doseUnitCS"), serialization_alias="DoseUnitCS", ) - lot_number: Optional[TextFilter] = Field( + lot_number: TextFilter | None = Field( default=None, validation_alias=AliasChoices("LotNumber", "lotNumber"), serialization_alias="LotNumber", ) - quantity: Optional[NumericRange] = Field( + quantity: NumericRange | None = Field( default=None, validation_alias=AliasChoices("Quantity", "quantity"), serialization_alias="Quantity", ) - days_supply: Optional[NumericRange] = Field( + days_supply: NumericRange | None = Field( default=None, validation_alias=AliasChoices("DaysSupply", "daysSupply"), serialization_alias="DaysSupply", ) - refills: Optional[NumericRange] = Field( + refills: NumericRange | None = Field( default=None, validation_alias=AliasChoices("Refills", "refills"), serialization_alias="Refills", ) - effective_drug_dose: Optional[NumericRange] = Field( + effective_drug_dose: NumericRange | None = Field( default=None, validation_alias=AliasChoices("EffectiveDrugDose", "effectiveDrugDose"), serialization_alias="EffectiveDrugDose", @@ -538,28 +538,28 @@ class ProcedureOccurrence(Criteria): Java equivalent: org.ohdsi.circe.cohortdefinition.ProcedureOccurrence """ - gender: Optional[list[Concept]] = Field(default=None, serialization_alias="gender") - occurrence_end_date: Optional[DateRange] = Field(default=None, alias="OccurrenceEndDate") - procedure_source_concept: Optional[int] = Field(default=None, alias="ProcedureSourceConcept") - gender_cs: Optional[ConceptSetSelection] = Field(default=None, alias="GenderCS") - procedure_type: Optional[list[Concept]] = Field(default=None, alias="ProcedureType") - procedure_type_cs: Optional[ConceptSetSelection] = Field(default=None, alias="ProcedureTypeCS") + gender: list[Concept] | None = Field(default=None, serialization_alias="gender") + occurrence_end_date: DateRange | None = Field(default=None, alias="OccurrenceEndDate") + procedure_source_concept: int | None = 
Field(default=None, alias="ProcedureSourceConcept") + gender_cs: ConceptSetSelection | None = Field(default=None, alias="GenderCS") + procedure_type: list[Concept] | None = Field(default=None, alias="ProcedureType") + procedure_type_cs: ConceptSetSelection | None = Field(default=None, alias="ProcedureTypeCS") procedure_type_exclude: bool = Field(default=False, alias="ProcedureTypeExclude") - provider_specialty_cs: Optional[ConceptSetSelection] = Field(default=None, alias="ProviderSpecialtyCS") - visit_type_cs: Optional[ConceptSetSelection] = Field(default=None, alias="VisitTypeCS") - visit_type: Optional[list[Concept]] = Field(default=None, alias="VisitType") - modifier: Optional[list[Concept]] = Field(default=None, alias="Modifier") - modifier_cs: Optional[ConceptSetSelection] = Field(default=None, alias="ModifierCS") - codeset_id: Optional[int] = Field(default=None, alias="CodesetId") - first: Optional[bool] = Field( + provider_specialty_cs: ConceptSetSelection | None = Field(default=None, alias="ProviderSpecialtyCS") + visit_type_cs: ConceptSetSelection | None = Field(default=None, alias="VisitTypeCS") + visit_type: list[Concept] | None = Field(default=None, alias="VisitType") + modifier: list[Concept] | None = Field(default=None, alias="Modifier") + modifier_cs: ConceptSetSelection | None = Field(default=None, alias="ModifierCS") + codeset_id: int | None = Field(default=None, alias="CodesetId") + first: bool | None = Field( default=None, validation_alias=AliasChoices("First", "first"), serialization_alias="First", ) - provider_specialty: Optional[list[Concept]] = Field(default=None, alias="ProviderSpecialty") - age: Optional[NumericRange] = None - quantity: Optional[NumericRange] = Field(default=None, alias="Quantity") - occurrence_start_date: Optional[DateRange] = Field(default=None, alias="OccurrenceStartDate") + provider_specialty: list[Concept] | None = Field(default=None, alias="ProviderSpecialty") + age: NumericRange | None = None + quantity: NumericRange 
| None = Field(default=None, alias="Quantity") + occurrence_start_date: DateRange | None = Field(default=None, alias="OccurrenceStartDate") model_config = ConfigDict(populate_by_name=True) @@ -570,23 +570,23 @@ class VisitOccurrence(Criteria): Java equivalent: org.ohdsi.circe.cohortdefinition.VisitOccurrence """ - codeset_id: Optional[int] = Field(default=None, alias="CodesetId") - first: Optional[bool] = Field(default=None, alias="First") - gender: Optional[list[Concept]] = Field(default=None, serialization_alias="gender") - occurrence_end_date: Optional[DateRange] = Field(default=None, alias="OccurrenceEndDate") - gender_cs: Optional[ConceptSetSelection] = Field(default=None, alias="GenderCS") - visit_type: Optional[list[Concept]] = Field(default=None, alias="VisitType") - visit_type_cs: Optional[ConceptSetSelection] = Field(default=None, alias="VisitTypeCS") + codeset_id: int | None = Field(default=None, alias="CodesetId") + first: bool | None = Field(default=None, alias="First") + gender: list[Concept] | None = Field(default=None, serialization_alias="gender") + occurrence_end_date: DateRange | None = Field(default=None, alias="OccurrenceEndDate") + gender_cs: ConceptSetSelection | None = Field(default=None, alias="GenderCS") + visit_type: list[Concept] | None = Field(default=None, alias="VisitType") + visit_type_cs: ConceptSetSelection | None = Field(default=None, alias="VisitTypeCS") visit_type_exclude: bool = Field(default=False, alias="VisitTypeExclude") - visit_source_concept: Optional[int] = Field(default=None, alias="VisitSourceConcept") - visit_length: Optional[NumericRange] = Field(default=None, alias="VisitLength") - provider_specialty_cs: Optional[ConceptSetSelection] = Field(default=None, alias="ProviderSpecialtyCS") - provider_specialty: Optional[list[Concept]] = Field(default=None, alias="ProviderSpecialty") - place_of_service: Optional[list[Concept]] = Field(default=None, alias="PlaceOfService") - place_of_service_cs: 
Optional[ConceptSetSelection] = Field(default=None, alias="PlaceOfServiceCS") - place_of_service_location: Optional[int] = Field(default=None, alias="PlaceOfServiceLocation") - age: Optional[NumericRange] = None - occurrence_start_date: Optional[DateRange] = Field(default=None, alias="OccurrenceStartDate") + visit_source_concept: int | None = Field(default=None, alias="VisitSourceConcept") + visit_length: NumericRange | None = Field(default=None, alias="VisitLength") + provider_specialty_cs: ConceptSetSelection | None = Field(default=None, alias="ProviderSpecialtyCS") + provider_specialty: list[Concept] | None = Field(default=None, alias="ProviderSpecialty") + place_of_service: list[Concept] | None = Field(default=None, alias="PlaceOfService") + place_of_service_cs: ConceptSetSelection | None = Field(default=None, alias="PlaceOfServiceCS") + place_of_service_location: int | None = Field(default=None, alias="PlaceOfServiceLocation") + age: NumericRange | None = None + occurrence_start_date: DateRange | None = Field(default=None, alias="OccurrenceStartDate") model_config = ConfigDict(populate_by_name=True) @@ -597,28 +597,28 @@ class Observation(Criteria): Java equivalent: org.ohdsi.circe.cohortdefinition.Observation """ - gender: Optional[list[Concept]] = Field(default=None, serialization_alias="gender") - occurrence_end_date: Optional[DateRange] = Field( + gender: list[Concept] | None = Field(default=None, serialization_alias="gender") + occurrence_end_date: DateRange | None = Field( default=None, validation_alias=AliasChoices("OccurrenceEndDate", "occurrenceEndDate"), serialization_alias="OccurrenceEndDate", ) - observation_source_concept: Optional[int] = Field( + observation_source_concept: int | None = Field( default=None, validation_alias=AliasChoices("ObservationSourceConcept", "observationSourceConcept"), serialization_alias="ObservationSourceConcept", ) - gender_cs: Optional[ConceptSetSelection] = Field( + gender_cs: ConceptSetSelection | None = Field( 
default=None, validation_alias=AliasChoices("GenderCS", "genderCS"), serialization_alias="GenderCS", ) - observation_type: Optional[list[Concept]] = Field( + observation_type: list[Concept] | None = Field( default=None, validation_alias=AliasChoices("ObservationType", "observationType"), serialization_alias="ObservationType", ) - observation_type_cs: Optional[ConceptSetSelection] = Field( + observation_type_cs: ConceptSetSelection | None = Field( default=None, validation_alias=AliasChoices("ObservationTypeCS", "observationTypeCS"), serialization_alias="ObservationTypeCS", @@ -628,78 +628,78 @@ class Observation(Criteria): validation_alias=AliasChoices("ObservationTypeExclude", "observationTypeExclude"), serialization_alias="ObservationTypeExclude", ) - provider_specialty_cs: Optional[ConceptSetSelection] = Field( + provider_specialty_cs: ConceptSetSelection | None = Field( default=None, validation_alias=AliasChoices("ProviderSpecialtyCS", "providerSpecialtyCS"), serialization_alias="ProviderSpecialtyCS", ) - visit_type_cs: Optional[ConceptSetSelection] = Field( + visit_type_cs: ConceptSetSelection | None = Field( default=None, validation_alias=AliasChoices("VisitTypeCS", "visitTypeCS"), serialization_alias="VisitTypeCS", ) - visit_type: Optional[list[Concept]] = Field( + visit_type: list[Concept] | None = Field( default=None, validation_alias=AliasChoices("VisitType", "visitType"), serialization_alias="VisitType", ) - value_as_number: Optional[NumericRange] = Field( + value_as_number: NumericRange | None = Field( default=None, validation_alias=AliasChoices("ValueAsNumber", "valueAsNumber"), serialization_alias="ValueAsNumber", ) - unit: Optional[list[Concept]] = Field( + unit: list[Concept] | None = Field( default=None, validation_alias=AliasChoices("Unit", "unit"), serialization_alias="Unit", ) - unit_cs: Optional[ConceptSetSelection] = Field( + unit_cs: ConceptSetSelection | None = Field( default=None, validation_alias=AliasChoices("UnitCS", "unitCS"), 
serialization_alias="UnitCS", ) - value_as_concept: Optional[list[Concept]] = Field( + value_as_concept: list[Concept] | None = Field( default=None, validation_alias=AliasChoices("ValueAsConcept", "valueAsConcept"), serialization_alias="ValueAsConcept", ) - value_as_concept_cs: Optional[ConceptSetSelection] = Field( + value_as_concept_cs: ConceptSetSelection | None = Field( default=None, validation_alias=AliasChoices("ValueAsConceptCS", "valueAsConceptCS"), serialization_alias="ValueAsConceptCS", ) - qualifier: Optional[list[Concept]] = Field( + qualifier: list[Concept] | None = Field( default=None, validation_alias=AliasChoices("Qualifier", "qualifier"), serialization_alias="Qualifier", ) - qualifier_cs: Optional[ConceptSetSelection] = Field( + qualifier_cs: ConceptSetSelection | None = Field( default=None, validation_alias=AliasChoices("QualifierCS", "qualifierCS"), serialization_alias="QualifierCS", ) - value_as_string: Optional[TextFilter] = Field( + value_as_string: TextFilter | None = Field( default=None, validation_alias=AliasChoices("ValueAsString", "valueAsString"), serialization_alias="ValueAsString", ) - codeset_id: Optional[int] = Field( + codeset_id: int | None = Field( default=None, validation_alias=AliasChoices("CodesetId", "codesetId"), serialization_alias="CodesetId", ) - first: Optional[bool] = Field( + first: bool | None = Field( default=None, validation_alias=AliasChoices("First", "first"), serialization_alias="First", ) - provider_specialty: Optional[list[Concept]] = Field( + provider_specialty: list[Concept] | None = Field( default=None, validation_alias=AliasChoices("ProviderSpecialty", "providerSpecialty"), serialization_alias="ProviderSpecialty", ) - age: Optional[NumericRange] = None - occurrence_start_date: Optional[DateRange] = Field( + age: NumericRange | None = None + occurrence_start_date: DateRange | None = Field( default=None, validation_alias=AliasChoices("OccurrenceStartDate", "occurrenceStartDate"), 
serialization_alias="OccurrenceStartDate", @@ -714,72 +714,72 @@ class Measurement(Criteria): Java equivalent: org.ohdsi.circe.cohortdefinition.Measurement """ - gender: Optional[list[Concept]] = Field(default=None, serialization_alias="gender") - occurrence_end_date: Optional[DateRange] = Field(default=None, alias="OccurrenceEndDate") - measurement_source_concept: Optional[int] = Field(default=None, alias="MeasurementSourceConcept") - gender_cs: Optional[ConceptSetSelection] = Field(default=None, alias="GenderCS") - measurement_type: Optional[list[Concept]] = Field(default=None, alias="MeasurementType") - measurement_type_cs: Optional[ConceptSetSelection] = Field(default=None, alias="MeasurementTypeCS") + gender: list[Concept] | None = Field(default=None, serialization_alias="gender") + occurrence_end_date: DateRange | None = Field(default=None, alias="OccurrenceEndDate") + measurement_source_concept: int | None = Field(default=None, alias="MeasurementSourceConcept") + gender_cs: ConceptSetSelection | None = Field(default=None, alias="GenderCS") + measurement_type: list[Concept] | None = Field(default=None, alias="MeasurementType") + measurement_type_cs: ConceptSetSelection | None = Field(default=None, alias="MeasurementTypeCS") measurement_type_exclude: bool = Field( default=False, validation_alias=AliasChoices("MeasurementTypeExclude", "measurementTypeExclude"), serialization_alias="MeasurementTypeExclude", ) - operator: Optional[list[Concept]] = None - operator_cs: Optional[ConceptSetSelection] = Field(default=None, alias="OperatorCS") - value_as_number: Optional[NumericRange] = Field(default=None, alias="ValueAsNumber") - value_as_string: Optional[TextFilter] = Field(default=None, alias="ValueAsString") - unit: Optional[list[Concept]] = Field(default=None, alias="Unit") - unit_cs: Optional[ConceptSetSelection] = Field(default=None, alias="UnitCS") - range_low: Optional[NumericRange] = Field(default=None, alias="RangeLow") - range_high: Optional[NumericRange] = 
Field(default=None, alias="RangeHigh") - provider_specialty_cs: Optional[ConceptSetSelection] = Field(default=None, alias="ProviderSpecialtyCS") - visit_type_cs: Optional[ConceptSetSelection] = Field(default=None, alias="VisitTypeCS") - visit_type: Optional[list[Concept]] = Field(default=None, alias="VisitType") - codeset_id: Optional[int] = Field( + operator: list[Concept] | None = None + operator_cs: ConceptSetSelection | None = Field(default=None, alias="OperatorCS") + value_as_number: NumericRange | None = Field(default=None, alias="ValueAsNumber") + value_as_string: TextFilter | None = Field(default=None, alias="ValueAsString") + unit: list[Concept] | None = Field(default=None, alias="Unit") + unit_cs: ConceptSetSelection | None = Field(default=None, alias="UnitCS") + range_low: NumericRange | None = Field(default=None, alias="RangeLow") + range_high: NumericRange | None = Field(default=None, alias="RangeHigh") + provider_specialty_cs: ConceptSetSelection | None = Field(default=None, alias="ProviderSpecialtyCS") + visit_type_cs: ConceptSetSelection | None = Field(default=None, alias="VisitTypeCS") + visit_type: list[Concept] | None = Field(default=None, alias="VisitType") + codeset_id: int | None = Field( default=None, validation_alias=AliasChoices("CodesetId", "codesetId"), serialization_alias="CodesetId", ) - value_as_concept: Optional[list[Concept]] = Field( + value_as_concept: list[Concept] | None = Field( default=None, validation_alias=AliasChoices("ValueAsConcept", "valueAsConcept"), serialization_alias="ValueAsConcept", ) - value_as_concept_cs: Optional[ConceptSetSelection] = Field( + value_as_concept_cs: ConceptSetSelection | None = Field( default=None, validation_alias=AliasChoices("ValueAsConceptCS", "valueAsConceptCS"), serialization_alias="ValueAsConceptCS", ) - abnormal: Optional[bool] = Field( + abnormal: bool | None = Field( default=None, validation_alias=AliasChoices("Abnormal", "abnormal"), serialization_alias="Abnormal", ) - range_low_ratio: 
Optional[NumericRange] = Field( + range_low_ratio: NumericRange | None = Field( default=None, validation_alias=AliasChoices("RangeLowRatio", "rangeLowRatio"), serialization_alias="RangeLowRatio", ) - range_high_ratio: Optional[NumericRange] = Field( + range_high_ratio: NumericRange | None = Field( default=None, validation_alias=AliasChoices("RangeHighRatio", "rangeHighRatio"), serialization_alias="RangeHighRatio", ) - provider_specialty: Optional[list[Concept]] = Field(default=None, alias="ProviderSpecialty") - age: Optional[NumericRange] = None - occurrence_start_date: Optional[DateRange] = Field(default=None, alias="OccurrenceStartDate") - visits: Optional[list[Concept]] = None # Placeholder if needed, but not in list - visit_type: Optional[list[Concept]] = Field(default=None, alias="VisitType") + provider_specialty: list[Concept] | None = Field(default=None, alias="ProviderSpecialty") + age: NumericRange | None = None + occurrence_start_date: DateRange | None = Field(default=None, alias="OccurrenceStartDate") + visits: list[Concept] | None = None # Placeholder if needed, but not in list + visit_type: list[Concept] | None = Field(default=None, alias="VisitType") - first: Optional[bool] = Field( + first: bool | None = Field( default=None, validation_alias=AliasChoices("First", "first"), serialization_alias="First", ) - provider_specialty: Optional[list[Concept]] = Field(default=None, alias="ProviderSpecialty") - age: Optional[NumericRange] = None - occurrence_start_date: Optional[DateRange] = Field(default=None, alias="OccurrenceStartDate") + provider_specialty: list[Concept] | None = Field(default=None, alias="ProviderSpecialty") + age: NumericRange | None = None + occurrence_start_date: DateRange | None = Field(default=None, alias="OccurrenceStartDate") model_config = ConfigDict(populate_by_name=True) @@ -790,27 +790,27 @@ class DeviceExposure(Criteria): Java equivalent: org.ohdsi.circe.cohortdefinition.DeviceExposure """ - gender: Optional[list[Concept]] = 
Field(default=None, serialization_alias="gender") - occurrence_end_date: Optional[DateRange] = Field(default=None, alias="OccurrenceEndDate") - device_source_concept: Optional[int] = Field(default=None, alias="DeviceSourceConcept") - gender_cs: Optional[ConceptSetSelection] = Field(default=None, alias="GenderCS") - device_type: Optional[list[Concept]] = Field(default=None, alias="DeviceType") - device_type_cs: Optional[ConceptSetSelection] = Field(default=None, alias="DeviceTypeCS") + gender: list[Concept] | None = Field(default=None, serialization_alias="gender") + occurrence_end_date: DateRange | None = Field(default=None, alias="OccurrenceEndDate") + device_source_concept: int | None = Field(default=None, alias="DeviceSourceConcept") + gender_cs: ConceptSetSelection | None = Field(default=None, alias="GenderCS") + device_type: list[Concept] | None = Field(default=None, alias="DeviceType") + device_type_cs: ConceptSetSelection | None = Field(default=None, alias="DeviceTypeCS") device_type_exclude: bool = Field(default=False, alias="DeviceTypeExclude") - unique_device_id: Optional[TextFilter] = Field(default=None, alias="UniqueDeviceId") - quantity: Optional[NumericRange] = None - provider_specialty_cs: Optional[ConceptSetSelection] = Field(default=None, alias="ProviderSpecialtyCS") - visit_type_cs: Optional[ConceptSetSelection] = Field(default=None, alias="VisitTypeCS") - visit_type: Optional[list[Concept]] = Field(default=None, alias="VisitType") - codeset_id: Optional[int] = Field(default=None, alias="CodesetId") - first: Optional[bool] = Field( + unique_device_id: TextFilter | None = Field(default=None, alias="UniqueDeviceId") + quantity: NumericRange | None = None + provider_specialty_cs: ConceptSetSelection | None = Field(default=None, alias="ProviderSpecialtyCS") + visit_type_cs: ConceptSetSelection | None = Field(default=None, alias="VisitTypeCS") + visit_type: list[Concept] | None = Field(default=None, alias="VisitType") + codeset_id: int | None = 
Field(default=None, alias="CodesetId") + first: bool | None = Field( default=None, validation_alias=AliasChoices("First", "first"), serialization_alias="First", ) - provider_specialty: Optional[list[Concept]] = Field(default=None, alias="ProviderSpecialty") - age: Optional[NumericRange] = Field(default=None, alias="Age") - occurrence_start_date: Optional[DateRange] = Field(default=None, alias="OccurrenceStartDate") + provider_specialty: list[Concept] | None = Field(default=None, alias="ProviderSpecialty") + age: NumericRange | None = Field(default=None, alias="Age") + occurrence_start_date: DateRange | None = Field(default=None, alias="OccurrenceStartDate") model_config = ConfigDict(populate_by_name=True) @@ -821,29 +821,29 @@ class Specimen(Criteria): Java equivalent: org.ohdsi.circe.cohortdefinition.Specimen """ - gender: Optional[list[Concept]] = Field(default=None, serialization_alias="gender") - occurrence_end_date: Optional[DateRange] = Field(default=None, alias="OccurrenceEndDate") - specimen_source_concept: Optional[int] = Field(default=None, alias="SpecimenSourceConcept") - source_id: Optional[TextFilter] = Field(default=None, alias="SourceId") - gender_cs: Optional[ConceptSetSelection] = Field(default=None, alias="GenderCS") - specimen_type: Optional[list[Concept]] = Field(default=None, alias="SpecimenType") - specimen_type_cs: Optional[ConceptSetSelection] = Field(default=None, alias="SpecimenTypeCS") + gender: list[Concept] | None = Field(default=None, serialization_alias="gender") + occurrence_end_date: DateRange | None = Field(default=None, alias="OccurrenceEndDate") + specimen_source_concept: int | None = Field(default=None, alias="SpecimenSourceConcept") + source_id: TextFilter | None = Field(default=None, alias="SourceId") + gender_cs: ConceptSetSelection | None = Field(default=None, alias="GenderCS") + specimen_type: list[Concept] | None = Field(default=None, alias="SpecimenType") + specimen_type_cs: ConceptSetSelection | None = 
Field(default=None, alias="SpecimenTypeCS") specimen_type_exclude: bool = Field(default=False, alias="SpecimenTypeExclude") - unit: Optional[list[Concept]] = None - unit_cs: Optional[ConceptSetSelection] = Field(default=None, alias="UnitCS") - anatomic_site: Optional[list[Concept]] = Field(default=None, alias="AnatomicSite") - anatomic_site_cs: Optional[ConceptSetSelection] = Field(default=None, alias="AnatomicSiteCS") - disease_status: Optional[list[Concept]] = Field(default=None, alias="DiseaseStatus") - disease_status_cs: Optional[ConceptSetSelection] = Field(default=None, alias="DiseaseStatusCS") - quantity: Optional[NumericRange] = None - codeset_id: Optional[int] = Field(default=None, alias="CodesetId") - first: Optional[bool] = Field( + unit: list[Concept] | None = None + unit_cs: ConceptSetSelection | None = Field(default=None, alias="UnitCS") + anatomic_site: list[Concept] | None = Field(default=None, alias="AnatomicSite") + anatomic_site_cs: ConceptSetSelection | None = Field(default=None, alias="AnatomicSiteCS") + disease_status: list[Concept] | None = Field(default=None, alias="DiseaseStatus") + disease_status_cs: ConceptSetSelection | None = Field(default=None, alias="DiseaseStatusCS") + quantity: NumericRange | None = None + codeset_id: int | None = Field(default=None, alias="CodesetId") + first: bool | None = Field( default=None, validation_alias=AliasChoices("First", "first"), serialization_alias="First", ) - age: Optional[NumericRange] = None - occurrence_start_date: Optional[DateRange] = Field(default=None, alias="OccurrenceStartDate") + age: NumericRange | None = None + occurrence_start_date: DateRange | None = Field(default=None, alias="OccurrenceStartDate") model_config = ConfigDict(populate_by_name=True) @@ -854,23 +854,23 @@ class Death(Criteria): Java equivalent: org.ohdsi.circe.cohortdefinition.Death """ - gender: Optional[list[Concept]] = Field(default=None, serialization_alias="gender") - occurrence_end_date: Optional[DateRange] = 
Field(default=None, alias="OccurrenceEndDate") - death_source_concept: Optional[int] = Field(default=None, alias="DeathSourceConcept") - gender_cs: Optional[ConceptSetSelection] = Field(default=None, alias="GenderCS") - death_type: Optional[list[Concept]] = Field(default=None, alias="DeathType") - death_type_cs: Optional[ConceptSetSelection] = Field(default=None, alias="DeathTypeCS") + gender: list[Concept] | None = Field(default=None, serialization_alias="gender") + occurrence_end_date: DateRange | None = Field(default=None, alias="OccurrenceEndDate") + death_source_concept: int | None = Field(default=None, alias="DeathSourceConcept") + gender_cs: ConceptSetSelection | None = Field(default=None, alias="GenderCS") + death_type: list[Concept] | None = Field(default=None, alias="DeathType") + death_type_cs: ConceptSetSelection | None = Field(default=None, alias="DeathTypeCS") death_type_exclude: bool = Field( default=False, validation_alias=AliasChoices("DeathTypeExclude", "deathTypeExclude"), serialization_alias="DeathTypeExclude", ) - cause_source_concept: Optional[int] = Field(default=None, alias="CauseSourceConcept") - cause_source_concept_cs: Optional[ConceptSetSelection] = Field(default=None, alias="CauseSourceConceptCS") - codeset_id: Optional[int] = Field(default=None, alias="CodesetId") + cause_source_concept: int | None = Field(default=None, alias="CauseSourceConcept") + cause_source_concept_cs: ConceptSetSelection | None = Field(default=None, alias="CauseSourceConceptCS") + codeset_id: int | None = Field(default=None, alias="CodesetId") - age: Optional[NumericRange] = None - occurrence_start_date: Optional[DateRange] = Field(default=None, alias="OccurrenceStartDate") + age: NumericRange | None = None + occurrence_start_date: DateRange | None = Field(default=None, alias="OccurrenceStartDate") model_config = ConfigDict(populate_by_name=True) @@ -881,25 +881,25 @@ class VisitDetail(Criteria): Java equivalent: org.ohdsi.circe.cohortdefinition.VisitDetail """ - 
codeset_id: Optional[int] = Field(default=None, alias="CodesetId") - first: Optional[bool] = Field(default=None, alias="First") - visit_detail_start_date: Optional[DateRange] = Field(default=None, alias="VisitDetailStartDate") - visit_detail_end_date: Optional[DateRange] = Field(default=None, alias="VisitDetailEndDate") - visit_detail_type: Optional[list[Concept]] = Field(default=None, alias="VisitDetailType") - visit_detail_type_cs: Optional[ConceptSetSelection] = Field(default=None, alias="VisitDetailTypeCS") + codeset_id: int | None = Field(default=None, alias="CodesetId") + first: bool | None = Field(default=None, alias="First") + visit_detail_start_date: DateRange | None = Field(default=None, alias="VisitDetailStartDate") + visit_detail_end_date: DateRange | None = Field(default=None, alias="VisitDetailEndDate") + visit_detail_type: list[Concept] | None = Field(default=None, alias="VisitDetailType") + visit_detail_type_cs: ConceptSetSelection | None = Field(default=None, alias="VisitDetailTypeCS") visit_detail_type_exclude: bool = Field(default=False, alias="VisitDetailTypeExclude") - visit_detail_source_concept: Optional[int] = Field(default=None, alias="VisitDetailSourceConcept") - visit_detail_length: Optional[NumericRange] = Field(default=None, alias="VisitDetailLength") - age: Optional[NumericRange] = Field(default=None, alias="Age") - gender: Optional[list[Concept]] = Field(default=None, serialization_alias="gender") - gender_cs: Optional[ConceptSetSelection] = Field(default=None, alias="GenderCS") - provider_specialty: Optional[list[Concept]] = Field(default=None, alias="ProviderSpecialty") - provider_specialty_cs: Optional[ConceptSetSelection] = Field(default=None, alias="ProviderSpecialtyCS") - place_of_service: Optional[list[Concept]] = Field(default=None, alias="PlaceOfService") - place_of_service_cs: Optional[ConceptSetSelection] = Field(default=None, alias="PlaceOfServiceCS") - place_of_service_location: Optional[int] = Field(default=None, 
alias="PlaceOfServiceLocation") - discharge_to: Optional[list[Concept]] = Field(default=None, alias="DischargeTo") - discharge_to_cs: Optional[ConceptSetSelection] = Field(default=None, alias="DischargeToCS") + visit_detail_source_concept: int | None = Field(default=None, alias="VisitDetailSourceConcept") + visit_detail_length: NumericRange | None = Field(default=None, alias="VisitDetailLength") + age: NumericRange | None = Field(default=None, alias="Age") + gender: list[Concept] | None = Field(default=None, serialization_alias="gender") + gender_cs: ConceptSetSelection | None = Field(default=None, alias="GenderCS") + provider_specialty: list[Concept] | None = Field(default=None, alias="ProviderSpecialty") + provider_specialty_cs: ConceptSetSelection | None = Field(default=None, alias="ProviderSpecialtyCS") + place_of_service: list[Concept] | None = Field(default=None, alias="PlaceOfService") + place_of_service_cs: ConceptSetSelection | None = Field(default=None, alias="PlaceOfServiceCS") + place_of_service_location: int | None = Field(default=None, alias="PlaceOfServiceLocation") + discharge_to: list[Concept] | None = Field(default=None, alias="DischargeTo") + discharge_to_cs: ConceptSetSelection | None = Field(default=None, alias="DischargeToCS") model_config = ConfigDict(populate_by_name=True) @@ -910,15 +910,15 @@ class ObservationPeriod(Criteria): Java equivalent: org.ohdsi.circe.cohortdefinition.ObservationPeriod """ - first: Optional[bool] = Field(default=None, alias="First") - period_start_date: Optional[DateRange] = Field(default=None, alias="PeriodStartDate") - period_end_date: Optional[DateRange] = Field(default=None, alias="PeriodEndDate") - user_defined_period: Optional[Period] = Field(default=None, alias="UserDefinedPeriod") - period_type: Optional[list[Concept]] = Field(default=None, alias="PeriodType") - period_type_cs: Optional[ConceptSetSelection] = Field(default=None, alias="PeriodTypeCS") - period_length: Optional[NumericRange] = 
Field(default=None, alias="PeriodLength") - age_at_start: Optional[NumericRange] = Field(default=None, alias="AgeAtStart") - age_at_end: Optional[NumericRange] = Field(default=None, alias="AgeAtEnd") + first: bool | None = Field(default=None, alias="First") + period_start_date: DateRange | None = Field(default=None, alias="PeriodStartDate") + period_end_date: DateRange | None = Field(default=None, alias="PeriodEndDate") + user_defined_period: Period | None = Field(default=None, alias="UserDefinedPeriod") + period_type: list[Concept] | None = Field(default=None, alias="PeriodType") + period_type_cs: ConceptSetSelection | None = Field(default=None, alias="PeriodTypeCS") + period_length: NumericRange | None = Field(default=None, alias="PeriodLength") + age_at_start: NumericRange | None = Field(default=None, alias="AgeAtStart") + age_at_end: NumericRange | None = Field(default=None, alias="AgeAtEnd") model_config = ConfigDict(populate_by_name=True) @@ -929,23 +929,23 @@ class PayerPlanPeriod(Criteria): Java equivalent: org.ohdsi.circe.cohortdefinition.PayerPlanPeriod """ - first: Optional[bool] = Field(default=None, alias="First") - period_start_date: Optional[DateRange] = Field(default=None, alias="PeriodStartDate") - period_end_date: Optional[DateRange] = Field(default=None, alias="PeriodEndDate") - user_defined_period: Optional[Period] = Field(default=None, alias="UserDefinedPeriod") - period_length: Optional[NumericRange] = Field(default=None, alias="PeriodLength") - age_at_start: Optional[NumericRange] = Field(default=None, alias="AgeAtStart") - age_at_end: Optional[NumericRange] = Field(default=None, alias="AgeAtEnd") - gender: Optional[list[Concept]] = Field(default=None, serialization_alias="gender") - gender_cs: Optional[ConceptSetSelection] = Field(default=None, alias="GenderCS") - payer_concept: Optional[int] = Field(default=None, alias="PayerConcept") - plan_concept: Optional[int] = Field(default=None, alias="PlanConcept") - sponsor_concept: Optional[int] = 
Field(default=None, alias="SponsorConcept") - stop_reason_concept: Optional[int] = Field(default=None, alias="StopReasonConcept") - payer_source_concept: Optional[int] = Field(default=None, alias="PayerSourceConcept") - plan_source_concept: Optional[int] = Field(default=None, alias="PlanSourceConcept") - sponsor_source_concept: Optional[int] = Field(default=None, alias="SponsorSourceConcept") - stop_reason_source_concept: Optional[int] = Field(default=None, alias="StopReasonSourceConcept") + first: bool | None = Field(default=None, alias="First") + period_start_date: DateRange | None = Field(default=None, alias="PeriodStartDate") + period_end_date: DateRange | None = Field(default=None, alias="PeriodEndDate") + user_defined_period: Period | None = Field(default=None, alias="UserDefinedPeriod") + period_length: NumericRange | None = Field(default=None, alias="PeriodLength") + age_at_start: NumericRange | None = Field(default=None, alias="AgeAtStart") + age_at_end: NumericRange | None = Field(default=None, alias="AgeAtEnd") + gender: list[Concept] | None = Field(default=None, serialization_alias="gender") + gender_cs: ConceptSetSelection | None = Field(default=None, alias="GenderCS") + payer_concept: int | None = Field(default=None, alias="PayerConcept") + plan_concept: int | None = Field(default=None, alias="PlanConcept") + sponsor_concept: int | None = Field(default=None, alias="SponsorConcept") + stop_reason_concept: int | None = Field(default=None, alias="StopReasonConcept") + payer_source_concept: int | None = Field(default=None, alias="PayerSourceConcept") + plan_source_concept: int | None = Field(default=None, alias="PlanSourceConcept") + sponsor_source_concept: int | None = Field(default=None, alias="SponsorSourceConcept") + stop_reason_source_concept: int | None = Field(default=None, alias="StopReasonSourceConcept") model_config = ConfigDict(populate_by_name=True) @@ -956,7 +956,7 @@ class LocationRegion(Criteria): Java equivalent: 
org.ohdsi.circe.cohortdefinition.LocationRegion """ - codeset_id: Optional[int] = Field(default=None, alias="CodesetId") + codeset_id: int | None = Field(default=None, alias="CodesetId") model_config = ConfigDict(populate_by_name=True) @@ -972,21 +972,21 @@ class ConditionEra(Criteria): Java equivalent: org.ohdsi.circe.cohortdefinition.ConditionEra """ - codeset_id: Optional[int] = Field(default=None, alias="CodesetId") - first: Optional[bool] = Field( + codeset_id: int | None = Field(default=None, alias="CodesetId") + first: bool | None = Field( default=None, validation_alias=AliasChoices("First", "first"), serialization_alias="First", ) - era_start_date: Optional[DateRange] = Field(default=None, alias="EraStartDate") - era_end_date: Optional[DateRange] = Field(default=None, alias="EraEndDate") - occurrence_count: Optional[NumericRange] = Field(default=None, alias="OccurrenceCount") - era_length: Optional[NumericRange] = Field(default=None, alias="EraLength") - age_at_start: Optional[NumericRange] = Field(default=None, alias="AgeAtStart") - age_at_end: Optional[NumericRange] = Field(default=None, alias="AgeAtEnd") - gender: Optional[list[Concept]] = Field(default=None, serialization_alias="gender") - gender_cs: Optional[ConceptSetSelection] = Field(default=None, alias="GenderCS") - date_adjustment: Optional[DateAdjustment] = Field(default=None, alias="DateAdjustment") + era_start_date: DateRange | None = Field(default=None, alias="EraStartDate") + era_end_date: DateRange | None = Field(default=None, alias="EraEndDate") + occurrence_count: NumericRange | None = Field(default=None, alias="OccurrenceCount") + era_length: NumericRange | None = Field(default=None, alias="EraLength") + age_at_start: NumericRange | None = Field(default=None, alias="AgeAtStart") + age_at_end: NumericRange | None = Field(default=None, alias="AgeAtEnd") + gender: list[Concept] | None = Field(default=None, serialization_alias="gender") + gender_cs: ConceptSetSelection | None = 
Field(default=None, alias="GenderCS") + date_adjustment: DateAdjustment | None = Field(default=None, alias="DateAdjustment") model_config = ConfigDict(populate_by_name=True) @@ -997,22 +997,22 @@ class DrugEra(Criteria): Java equivalent: org.ohdsi.circe.cohortdefinition.DrugEra """ - codeset_id: Optional[int] = Field(default=None, alias="CodesetId") - first: Optional[bool] = Field( + codeset_id: int | None = Field(default=None, alias="CodesetId") + first: bool | None = Field( default=None, validation_alias=AliasChoices("First", "first"), serialization_alias="First", ) - era_start_date: Optional[DateRange] = Field(default=None, alias="EraStartDate") - era_end_date: Optional[DateRange] = Field(default=None, alias="EraEndDate") - occurrence_count: Optional[NumericRange] = Field(default=None, alias="OccurrenceCount") - gap_days: Optional[NumericRange] = Field(default=None, alias="GapDays") - era_length: Optional[NumericRange] = Field(default=None, alias="EraLength") - age_at_start: Optional[NumericRange] = Field(default=None, alias="AgeAtStart") - age_at_end: Optional[NumericRange] = Field(default=None, alias="AgeAtEnd") - gender: Optional[list[Concept]] = Field(default=None, serialization_alias="gender") - gender_cs: Optional[ConceptSetSelection] = Field(default=None, alias="GenderCS") - date_adjustment: Optional[DateAdjustment] = Field(default=None, alias="DateAdjustment") + era_start_date: DateRange | None = Field(default=None, alias="EraStartDate") + era_end_date: DateRange | None = Field(default=None, alias="EraEndDate") + occurrence_count: NumericRange | None = Field(default=None, alias="OccurrenceCount") + gap_days: NumericRange | None = Field(default=None, alias="GapDays") + era_length: NumericRange | None = Field(default=None, alias="EraLength") + age_at_start: NumericRange | None = Field(default=None, alias="AgeAtStart") + age_at_end: NumericRange | None = Field(default=None, alias="AgeAtEnd") + gender: list[Concept] | None = Field(default=None, 
serialization_alias="gender") + gender_cs: ConceptSetSelection | None = Field(default=None, alias="GenderCS") + date_adjustment: DateAdjustment | None = Field(default=None, alias="DateAdjustment") model_config = ConfigDict(populate_by_name=True) @@ -1023,18 +1023,18 @@ class DoseEra(Criteria): Java equivalent: org.ohdsi.circe.cohortdefinition.DoseEra """ - codeset_id: Optional[int] = Field(default=None, alias="CodesetId") - first: Optional[bool] = Field(default=None, alias="First") - era_start_date: Optional[DateRange] = Field(default=None, alias="EraStartDate") - era_end_date: Optional[DateRange] = Field(default=None, alias="EraEndDate") - unit: Optional[list[Concept]] = Field(default=None, alias="Unit") - unit_cs: Optional[ConceptSetSelection] = Field(default=None, alias="UnitCS") - dose_value: Optional[NumericRange] = Field(default=None, alias="DoseValue") - era_length: Optional[NumericRange] = Field(default=None, alias="EraLength") - age_at_start: Optional[NumericRange] = Field(default=None, alias="AgeAtStart") - age_at_end: Optional[NumericRange] = Field(default=None, alias="AgeAtEnd") - gender: Optional[list[Concept]] = Field(default=None, serialization_alias="gender") - gender_cs: Optional[ConceptSetSelection] = Field(default=None, alias="GenderCS") + codeset_id: int | None = Field(default=None, alias="CodesetId") + first: bool | None = Field(default=None, alias="First") + era_start_date: DateRange | None = Field(default=None, alias="EraStartDate") + era_end_date: DateRange | None = Field(default=None, alias="EraEndDate") + unit: list[Concept] | None = Field(default=None, alias="Unit") + unit_cs: ConceptSetSelection | None = Field(default=None, alias="UnitCS") + dose_value: NumericRange | None = Field(default=None, alias="DoseValue") + era_length: NumericRange | None = Field(default=None, alias="EraLength") + age_at_start: NumericRange | None = Field(default=None, alias="AgeAtStart") + age_at_end: NumericRange | None = Field(default=None, alias="AgeAtEnd") + 
gender: list[Concept] | None = Field(default=None, serialization_alias="gender") + gender_cs: ConceptSetSelection | None = Field(default=None, alias="GenderCS") model_config = ConfigDict(populate_by_name=True) @@ -1069,7 +1069,7 @@ class CriteriaGroup(BaseModel): validation_alias=AliasChoices("CriteriaList", "criteriaList"), serialization_alias="CriteriaList", ) - count: Optional[int] = Field( + count: int | None = Field( default=None, validation_alias=AliasChoices("Count", "count"), serialization_alias="Count", @@ -1084,7 +1084,7 @@ class CriteriaGroup(BaseModel): validation_alias=AliasChoices("DemographicCriteriaList", "demographicCriteriaList"), serialization_alias="DemographicCriteriaList", ) - type: Optional[str] = Field( + type: str | None = Field( default=None, validation_alias=AliasChoices("Type", "type"), serialization_alias="Type", @@ -1372,25 +1372,25 @@ def normalize_window(window_dict: dict) -> dict: # Define CriteriaType Union for strict typing. # Criteria is last so known subtypes are tried first; it also acts as # a catch-all that accepts any registered extension subclass. 
-_CriteriaTypeUnion = Union[ - ConditionOccurrence, - DrugExposure, - ProcedureOccurrence, - VisitOccurrence, - Observation, - Measurement, - DeviceExposure, - Specimen, - Death, - VisitDetail, - ObservationPeriod, - PayerPlanPeriod, - LocationRegion, - ConditionEra, - DrugEra, - DoseEra, - Criteria, # catch-all for extension subclasses -] +_CriteriaTypeUnion = ( + ConditionOccurrence + | DrugExposure + | ProcedureOccurrence + | VisitOccurrence + | Observation + | Measurement + | DeviceExposure + | Specimen + | Death + | VisitDetail + | ObservationPeriod + | PayerPlanPeriod + | LocationRegion + | ConditionEra + | DrugEra + | DoseEra + | Criteria # catch-all for extension subclasses +) def _validate_criteria_extension(v: Any) -> Any: @@ -1443,12 +1443,12 @@ class PrimaryCriteria(BaseModel): validation_alias=AliasChoices("CriteriaList", "criteriaList"), serialization_alias="CriteriaList", ) - observation_window: Optional[ObservationFilter] = Field( + observation_window: ObservationFilter | None = Field( default=None, validation_alias=AliasChoices("ObservationWindow", "observationWindow"), serialization_alias="ObservationWindow", ) - primary_limit: Optional[ResultLimit] = Field( + primary_limit: ResultLimit | None = Field( default=None, validation_alias=AliasChoices( "PrimaryLimit", diff --git a/circe/cohortdefinition/interfaces.py b/circe/cohortdefinition/interfaces.py index 0492ab6..65d7ffa 100644 --- a/circe/cohortdefinition/interfaces.py +++ b/circe/cohortdefinition/interfaces.py @@ -10,7 +10,6 @@ """ from abc import ABC, abstractmethod -from typing import Optional, Union from .builders.utils import BuilderOptions from .core import CustomEraStrategy, DateOffsetStrategy @@ -34,24 +33,24 @@ ) # Type alias for all criteria types -Criteria = Union[ - LocationRegion, - ConditionEra, - ConditionOccurrence, - Death, - DeviceExposure, - DoseEra, - DrugEra, - DrugExposure, - Measurement, - Observation, - ObservationPeriod, - PayerPlanPeriod, - ProcedureOccurrence, - 
Specimen, - VisitOccurrence, - VisitDetail, -] +Criteria = ( + LocationRegion + | ConditionEra + | ConditionOccurrence + | Death + | DeviceExposure + | DoseEra + | DrugEra + | DrugExposure + | Measurement + | Observation + | ObservationPeriod + | PayerPlanPeriod + | ProcedureOccurrence + | Specimen + | VisitOccurrence + | VisitDetail +) class IGetCriteriaSqlDispatcher(ABC): @@ -61,7 +60,7 @@ class IGetCriteriaSqlDispatcher(ABC): """ @abstractmethod - def get_criteria_sql(self, criteria: Criteria, options: Optional[BuilderOptions] = None) -> str: + def get_criteria_sql(self, criteria: Criteria, options: BuilderOptions | None = None) -> str: """Generate SQL for various criteria types. Args: @@ -75,7 +74,7 @@ def get_criteria_sql(self, criteria: Criteria, options: Optional[BuilderOptions] # Type alias for end strategies -EndStrategy = Union[DateOffsetStrategy, CustomEraStrategy] +EndStrategy = DateOffsetStrategy | CustomEraStrategy class IGetEndStrategySqlDispatcher(ABC): diff --git a/circe/cohortdefinition/printfriendly/markdown_render.py b/circe/cohortdefinition/printfriendly/markdown_render.py index e2a1c76..48423ed 100644 --- a/circe/cohortdefinition/printfriendly/markdown_render.py +++ b/circe/cohortdefinition/printfriendly/markdown_render.py @@ -14,7 +14,6 @@ import json from datetime import datetime from pathlib import Path -from typing import Optional, Union import jinja2 @@ -34,9 +33,9 @@ class MarkdownRender: def __init__( self, - concept_sets: Optional[list[ConceptSet]] = None, + concept_sets: list[ConceptSet] | None = None, include_concept_sets: bool = False, - template_paths: Optional[list[Path]] = None, + template_paths: list[Path] | None = None, ): """Initialize the markdown renderer. 
@@ -90,9 +89,9 @@ def get_template_for_criteria(criteria): def render_cohort_expression( self, - cohort_expression: Union[CohortExpression, str], - include_concept_sets: Optional[bool] = None, - title: Optional[str] = None, + cohort_expression: CohortExpression | str, + include_concept_sets: bool | None = None, + title: str | None = None, ) -> str: """Render a cohort expression to markdown format. @@ -133,7 +132,7 @@ def render_cohort_expression( include_concept_sets=should_include, ) - def render_concept_set_list(self, concept_sets: Union[list[ConceptSet], str]) -> str: + def render_concept_set_list(self, concept_sets: list[ConceptSet] | str) -> str: """Render a list of concept sets to markdown format. Java equivalent: renderConceptSetList(ConceptSet[]) @@ -164,7 +163,7 @@ def render_concept_set_list(self, concept_sets: Union[list[ConceptSet], str]) -> return template.render(conceptSets=concept_sets) - def render_concept_set(self, concept_set: Union[ConceptSet, str]) -> str: + def render_concept_set(self, concept_set: ConceptSet | str) -> str: """Render a single concept set to markdown format. Java equivalent: renderConceptSet(ConceptSet) @@ -186,7 +185,7 @@ def render_concept_set(self, concept_set: Union[ConceptSet, str]) -> str: # Custom Filters and Functions (matching Java utils.ftl) # ========================================================================= - def _codeset_name(self, codeset_id: Optional[int], default_name: str = "any") -> str: + def _codeset_name(self, codeset_id: int | None, default_name: str = "any") -> str: """Get concept set name from codeset ID, or return default. Java equivalent: utils.codesetName() @@ -228,7 +227,7 @@ def _format_date(self, date_string: str) -> str: except (ValueError, AttributeError): return "_invalid date_" - def _format_number(self, value: Union[int, float]) -> str: + def _format_number(self, value: int | float) -> str: """Format number with thousands separators and handle integer/float logic. 
Args: diff --git a/circe/execution/_dataclass.py b/circe/execution/_dataclass.py index f7129f3..f2b4f35 100644 --- a/circe/execution/_dataclass.py +++ b/circe/execution/_dataclass.py @@ -1,8 +1,8 @@ from __future__ import annotations -import sys +from collections.abc import Callable from dataclasses import dataclass -from typing import Any, Callable, TypeVar, cast, overload +from typing import Any, TypeVar, cast, overload from typing_extensions import dataclass_transform @@ -29,10 +29,7 @@ def frozen_slots_dataclass( """ def wrap(cls: type[T]) -> type[T]: - dataclass_factory = cast(Any, dataclass) - if sys.version_info >= (3, 10): - return cast(type[T], dataclass_factory(frozen=True, slots=True, **kwargs)(cls)) - return cast(type[T], dataclass_factory(frozen=True, **kwargs)(cls)) + return cast(type[T], dataclass(frozen=True, slots=True, **kwargs)(cls)) if _cls is None: return wrap diff --git a/circe/execution/plan/events.py b/circe/execution/plan/events.py index 99652fd..efe38a4 100644 --- a/circe/execution/plan/events.py +++ b/circe/execution/plan/events.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Any, Union +from typing import Any from .._dataclass import frozen_slots_dataclass from .predicates import DateRangePredicate, NumericRangePredicate @@ -148,27 +148,27 @@ class StandardizeEventShape: end_with: str = "end_date" -PlanStep = Union[ - FilterByCodeset, - FilterByConceptSet, - FilterByDateRange, - FilterByNumericRange, - FilterByText, - JoinLocationRegion, - FilterByVisit, - FilterByVisitDetail, - FilterByProviderSpecialty, - FilterByCareSite, - FilterByCareSiteLocationRegion, - FilterByPersonAge, - FilterByPersonGender, - FilterByPersonRace, - FilterByPersonEthnicity, - KeepFirstPerPerson, - ApplyDateAdjustment, - RestrictToCorrelatedWindow, - StandardizeEventShape, -] +PlanStep = ( + FilterByCodeset + | FilterByConceptSet + | FilterByDateRange + | FilterByNumericRange + | FilterByText + | JoinLocationRegion + | FilterByVisit + | 
FilterByVisitDetail + | FilterByProviderSpecialty + | FilterByCareSite + | FilterByCareSiteLocationRegion + | FilterByPersonAge + | FilterByPersonGender + | FilterByPersonRace + | FilterByPersonEthnicity + | KeepFirstPerPerson + | ApplyDateAdjustment + | RestrictToCorrelatedWindow + | StandardizeEventShape +) @frozen_slots_dataclass diff --git a/circe/execution/typing.py b/circe/execution/typing.py index edf6ba2..be632a4 100644 --- a/circe/execution/typing.py +++ b/circe/execution/typing.py @@ -1,8 +1,6 @@ from __future__ import annotations -from typing import Any, Protocol - -from typing_extensions import TypeAlias +from typing import Any, Protocol, TypeAlias # Ibis does not currently ship usable type information for its table expressions. # Treat them as `Any` at the compatibility boundary rather than propagating diff --git a/circe/extensions/__init__.py b/circe/extensions/__init__.py index d67a624..71da52d 100644 --- a/circe/extensions/__init__.py +++ b/circe/extensions/__init__.py @@ -25,11 +25,12 @@ class WaveformOccurrenceMarkdownRenderer: ... """ +from collections.abc import Callable from pathlib import Path # Forward references to avoid circular imports # Actual imports happen inside methods or with TYPE_CHECKING -from typing import TYPE_CHECKING, Callable, Optional, Union +from typing import TYPE_CHECKING, Optional if TYPE_CHECKING: from ..cohortdefinition.builders.base import CriteriaSqlBuilder @@ -143,7 +144,7 @@ def get_lowerer(self, criteria_cls: type["Criteria"]) -> Optional["LowerFn"]: """ return self._lowerers.get(criteria_cls) - def get_normalizer(self, criteria_cls: type["Criteria"]) -> Optional[NormalizerFn]: + def get_normalizer(self, criteria_cls: type["Criteria"]) -> NormalizerFn | None: """Get the normalizer function for a criteria type. 
Args: @@ -154,7 +155,7 @@ def get_normalizer(self, criteria_cls: type["Criteria"]) -> Optional[NormalizerF """ return self._normalizers.get(criteria_cls) - def get_template(self, criteria: "Criteria") -> Optional[str]: + def get_template(self, criteria: "Criteria") -> str | None: """Get the markdown template name for a criteria instance. Args: @@ -165,7 +166,7 @@ def get_template(self, criteria: "Criteria") -> Optional[str]: """ return self._markdown_templates.get(type(criteria)) - def get_criteria_class(self, name: str) -> Optional[type["Criteria"]]: + def get_criteria_class(self, name: str) -> type["Criteria"] | None: """Get a registered criteria class by name. Args: @@ -301,7 +302,7 @@ def decorator(cls: type) -> type: return decorator -def template_path(path: Union[str, Path]) -> None: +def template_path(path: str | Path) -> None: """Register a directory as a template search path. This is a convenience function (not a decorator) that adds *path* to the diff --git a/circe/extensions/waveform/criteria.py b/circe/extensions/waveform/criteria.py index d79f9e5..35f799e 100644 --- a/circe/extensions/waveform/criteria.py +++ b/circe/extensions/waveform/criteria.py @@ -1,4 +1,3 @@ -from typing import Optional from pydantic import AliasChoices, Field @@ -20,52 +19,52 @@ class WaveformOccurrence(Criteria): """ # Core concept - type of waveform recording - waveform_occurrence_concept_id: Optional[list[Concept]] = Field( + waveform_occurrence_concept_id: list[Concept] | None = Field( default=None, validation_alias=AliasChoices("WaveformOccurrenceConceptId", "waveformOccurrenceConceptId"), serialization_alias="WaveformOccurrenceConceptId", ) # Temporal bounds - occurrence_start_datetime: Optional[DateRange] = Field( + occurrence_start_datetime: DateRange | None = Field( default=None, validation_alias=AliasChoices("OccurrenceStartDatetime", "occurrenceStartDatetime"), serialization_alias="OccurrenceStartDatetime", ) - occurrence_end_datetime: Optional[DateRange] = Field( + 
occurrence_end_datetime: DateRange | None = Field( default=None, validation_alias=AliasChoices("OccurrenceEndDatetime", "occurrenceEndDatetime"), serialization_alias="OccurrenceEndDatetime", ) # Visit context - visit_occurrence_id: Optional[NumericRange] = Field( + visit_occurrence_id: NumericRange | None = Field( default=None, validation_alias=AliasChoices("VisitOccurrenceId", "visitOccurrenceId"), serialization_alias="VisitOccurrenceId", ) - visit_detail_id: Optional[NumericRange] = Field( + visit_detail_id: NumericRange | None = Field( default=None, validation_alias=AliasChoices("VisitDetailId", "visitDetailId"), serialization_alias="VisitDetailId", ) # File metadata - num_of_files: Optional[NumericRange] = Field( + num_of_files: NumericRange | None = Field( default=None, validation_alias=AliasChoices("NumOfFiles", "numOfFiles"), serialization_alias="NumOfFiles", ) # Source identifiers - waveform_occurrence_source_value: Optional[TextFilter] = Field( + waveform_occurrence_source_value: TextFilter | None = Field( default=None, validation_alias=AliasChoices("WaveformOccurrenceSourceValue", "waveformOccurrenceSourceValue"), serialization_alias="WaveformOccurrenceSourceValue", ) # Sequence/chain filtering - preceding_waveform_occurrence_id: Optional[NumericRange] = Field( + preceding_waveform_occurrence_id: NumericRange | None = Field( default=None, validation_alias=AliasChoices("PrecedingWaveformOccurrenceId", "precedingWaveformOccurrenceId"), serialization_alias="PrecedingWaveformOccurrenceId", @@ -84,43 +83,43 @@ class WaveformRegistry(Criteria): """ # Link to parent occurrence - waveform_occurrence_id: Optional[NumericRange] = Field( + waveform_occurrence_id: NumericRange | None = Field( default=None, validation_alias=AliasChoices("WaveformOccurrenceId", "waveformOccurrenceId"), serialization_alias="WaveformOccurrenceId", ) # File temporal bounds - file_start_datetime: Optional[DateRange] = Field( + file_start_datetime: DateRange | None = Field( default=None, 
validation_alias=AliasChoices("FileStartDatetime", "fileStartDatetime"), serialization_alias="FileStartDatetime", ) - file_end_datetime: Optional[DateRange] = Field( + file_end_datetime: DateRange | None = Field( default=None, validation_alias=AliasChoices("FileEndDatetime", "fileEndDatetime"), serialization_alias="FileEndDatetime", ) # File format - file_extension_concept_id: Optional[list[Concept]] = Field( + file_extension_concept_id: list[Concept] | None = Field( default=None, validation_alias=AliasChoices("FileExtensionConceptId", "fileExtensionConceptId"), serialization_alias="FileExtensionConceptId", ) - file_extension_source_value: Optional[TextFilter] = Field( + file_extension_source_value: TextFilter | None = Field( default=None, validation_alias=AliasChoices("FileExtensionSourceValue", "fileExtensionSourceValue"), serialization_alias="FileExtensionSourceValue", ) # Visit context (denormalized for easier querying) - visit_occurrence_id: Optional[NumericRange] = Field( + visit_occurrence_id: NumericRange | None = Field( default=None, validation_alias=AliasChoices("VisitOccurrenceId", "visitOccurrenceId"), serialization_alias="VisitOccurrenceId", ) - visit_detail_id: Optional[NumericRange] = Field( + visit_detail_id: NumericRange | None = Field( default=None, validation_alias=AliasChoices("VisitDetailId", "visitDetailId"), serialization_alias="VisitDetailId", @@ -140,62 +139,62 @@ class WaveformChannelMetadata(Criteria): """ # Link to registry file - waveform_registry_id: Optional[NumericRange] = Field( + waveform_registry_id: NumericRange | None = Field( default=None, validation_alias=AliasChoices("WaveformRegistryId", "waveformRegistryId"), serialization_alias="WaveformRegistryId", ) # Channel identification - channel_concept_id: Optional[list[Concept]] = Field( + channel_concept_id: list[Concept] | None = Field( default=None, validation_alias=AliasChoices("ChannelConceptId", "channelConceptId"), serialization_alias="ChannelConceptId", ) - 
waveform_channel_source_value: Optional[TextFilter] = Field( + waveform_channel_source_value: TextFilter | None = Field( default=None, validation_alias=AliasChoices("WaveformChannelSourceValue", "waveformChannelSourceValue"), serialization_alias="WaveformChannelSourceValue", ) # Metadata type (e.g., sampling rate, gain, offset) - metadata_concept_id: Optional[list[Concept]] = Field( + metadata_concept_id: list[Concept] | None = Field( default=None, validation_alias=AliasChoices("MetadataConceptId", "metadataConceptId"), serialization_alias="MetadataConceptId", ) - metadata_source_value: Optional[TextFilter] = Field( + metadata_source_value: TextFilter | None = Field( default=None, validation_alias=AliasChoices("MetadataSourceValue", "metadataSourceValue"), serialization_alias="MetadataSourceValue", ) # Metadata values (at least one must be populated) - value_as_number: Optional[NumericRange] = Field( + value_as_number: NumericRange | None = Field( default=None, validation_alias=AliasChoices("ValueAsNumber", "valueAsNumber"), serialization_alias="ValueAsNumber", ) - value_as_concept_id: Optional[list[Concept]] = Field( + value_as_concept_id: list[Concept] | None = Field( default=None, validation_alias=AliasChoices("ValueAsConceptId", "valueAsConceptId"), serialization_alias="ValueAsConceptId", ) # Units for numeric values - unit_concept_id: Optional[list[Concept]] = Field( + unit_concept_id: list[Concept] | None = Field( default=None, validation_alias=AliasChoices("UnitConceptId", "unitConceptId"), serialization_alias="UnitConceptId", ) # Device/procedure linkage - device_exposure_id: Optional[NumericRange] = Field( + device_exposure_id: NumericRange | None = Field( default=None, validation_alias=AliasChoices("DeviceExposureId", "deviceExposureId"), serialization_alias="DeviceExposureId", ) - procedure_occurrence_id: Optional[NumericRange] = Field( + procedure_occurrence_id: NumericRange | None = Field( default=None, 
validation_alias=AliasChoices("ProcedureOccurrenceId", "procedureOccurrenceId"), serialization_alias="ProcedureOccurrenceId", @@ -215,79 +214,79 @@ class WaveformFeature(Criteria): """ # Parent links - waveform_occurrence_id: Optional[NumericRange] = Field( + waveform_occurrence_id: NumericRange | None = Field( default=None, validation_alias=AliasChoices("WaveformOccurrenceId", "waveformOccurrenceId"), serialization_alias="WaveformOccurrenceId", ) - waveform_registry_id: Optional[NumericRange] = Field( + waveform_registry_id: NumericRange | None = Field( default=None, validation_alias=AliasChoices("WaveformRegistryId", "waveformRegistryId"), serialization_alias="WaveformRegistryId", ) - waveform_channel_metadata_id: Optional[NumericRange] = Field( + waveform_channel_metadata_id: NumericRange | None = Field( default=None, validation_alias=AliasChoices("WaveformChannelMetadataId", "waveformChannelMetadataId"), serialization_alias="WaveformChannelMetadataId", ) # Feature type (e.g., heart rate, SpO2, QRS detection) - feature_concept_id: Optional[list[Concept]] = Field( + feature_concept_id: list[Concept] | None = Field( default=None, validation_alias=AliasChoices("FeatureConceptId", "featureConceptId"), serialization_alias="FeatureConceptId", ) # Algorithm used to derive feature - algorithm_concept_id: Optional[list[Concept]] = Field( + algorithm_concept_id: list[Concept] | None = Field( default=None, validation_alias=AliasChoices("AlgorithmConceptId", "algorithmConceptId"), serialization_alias="AlgorithmConceptId", ) - algorithm_source_value: Optional[TextFilter] = Field( + algorithm_source_value: TextFilter | None = Field( default=None, validation_alias=AliasChoices("AlgorithmSourceValue", "algorithmSourceValue"), serialization_alias="AlgorithmSourceValue", ) # Temporal window for feature - feature_start_timestamp: Optional[DateRange] = Field( + feature_start_timestamp: DateRange | None = Field( default=None, validation_alias=AliasChoices("FeatureStartTimestamp", 
"featureStartTimestamp"), serialization_alias="FeatureStartTimestamp", ) - feature_end_timestamp: Optional[DateRange] = Field( + feature_end_timestamp: DateRange | None = Field( default=None, validation_alias=AliasChoices("FeatureEndTimestamp", "featureEndTimestamp"), serialization_alias="FeatureEndTimestamp", ) # Feature values (at least one must be populated) - value_as_number: Optional[NumericRange] = Field( + value_as_number: NumericRange | None = Field( default=None, validation_alias=AliasChoices("ValueAsNumber", "valueAsNumber"), serialization_alias="ValueAsNumber", ) - value_as_concept_id: Optional[list[Concept]] = Field( + value_as_concept_id: list[Concept] | None = Field( default=None, validation_alias=AliasChoices("ValueAsConceptId", "valueAsConceptId"), serialization_alias="ValueAsConceptId", ) # Units for numeric values - unit_concept_id: Optional[list[Concept]] = Field( + unit_concept_id: list[Concept] | None = Field( default=None, validation_alias=AliasChoices("UnitConceptId", "unitConceptId"), serialization_alias="UnitConceptId", ) # Links to standard OMOP tables - measurement_id: Optional[NumericRange] = Field( + measurement_id: NumericRange | None = Field( default=None, validation_alias=AliasChoices("MeasurementId", "measurementId"), serialization_alias="MeasurementId", ) - observation_id: Optional[NumericRange] = Field( + observation_id: NumericRange | None = Field( default=None, validation_alias=AliasChoices("ObservationId", "observationId"), serialization_alias="ObservationId", diff --git a/circe/io.py b/circe/io.py index af2f151..98695a4 100644 --- a/circe/io.py +++ b/circe/io.py @@ -10,13 +10,13 @@ import json from collections.abc import Mapping from pathlib import Path -from typing import Any, Union +from typing import Any from .api import cohort_expression_from_json, cohort_expression_from_yaml from .cohortdefinition import CohortExpression from .cohortdefinition.yaml_utils import cohort_expression_to_snake_case -ExpressionInput = 
Union[CohortExpression, Mapping[str, Any], str, Path] +ExpressionInput = CohortExpression | Mapping[str, Any] | str | Path def load_expression(value: ExpressionInput) -> CohortExpression: diff --git a/circe/vocabulary/concept.py b/circe/vocabulary/concept.py index 6b752b6..0d6e037 100644 --- a/circe/vocabulary/concept.py +++ b/circe/vocabulary/concept.py @@ -9,7 +9,7 @@ """ from datetime import datetime -from typing import Any, Optional +from typing import Any from pydantic import AliasChoices, BaseModel, ConfigDict, Field, field_validator @@ -26,53 +26,53 @@ class Concept(BaseModel): New schema adds: validStartDate, validEndDate, invalidReason with specific formats. """ - concept_id: Optional[int] = Field( + concept_id: int | None = Field( default=None, validation_alias=AliasChoices("ConceptId", "CONCEPT_ID", "conceptId", "ConceptID"), serialization_alias="CONCEPT_ID", ) - concept_name: Optional[str] = Field( + concept_name: str | None = Field( default=None, validation_alias=AliasChoices("ConceptName", "CONCEPT_NAME", "conceptName"), serialization_alias="CONCEPT_NAME", ) - concept_code: Optional[str] = Field( + concept_code: str | None = Field( default=None, validation_alias=AliasChoices("ConceptCode", "CONCEPT_CODE", "conceptCode"), serialization_alias="CONCEPT_CODE", ) - concept_class_id: Optional[str] = Field( + concept_class_id: str | None = Field( default=None, validation_alias=AliasChoices("ConceptClassId", "CONCEPT_CLASS_ID", "conceptClassId"), serialization_alias="CONCEPT_CLASS_ID", ) - standard_concept: Optional[str] = Field( + standard_concept: str | None = Field( default=None, validation_alias=AliasChoices("StandardConcept", "STANDARD_CONCEPT", "standardConcept"), serialization_alias="STANDARD_CONCEPT", ) - invalid_reason: Optional[str] = Field( + invalid_reason: str | None = Field( default=None, validation_alias=AliasChoices("InvalidReason", "INVALID_REASON", "invalidReason"), serialization_alias="INVALID_REASON", ) - domain_id: Optional[str] = Field( 
+ domain_id: str | None = Field( default=None, validation_alias=AliasChoices("DomainId", "DOMAIN_ID", "domainId"), serialization_alias="DOMAIN_ID", ) - vocabulary_id: Optional[str] = Field( + vocabulary_id: str | None = Field( default=None, validation_alias=AliasChoices("VocabularyId", "VOCABULARY_ID", "vocabularyId"), serialization_alias="VOCABULARY_ID", ) # New schema fields - valid_start_date: Optional[str] = Field( + valid_start_date: str | None = Field( default=None, validation_alias=AliasChoices("validStartDate", "valid_start_date"), serialization_alias="validStartDate", ) - valid_end_date: Optional[str] = Field( + valid_end_date: str | None = Field( default=None, validation_alias=AliasChoices("validEndDate", "valid_end_date"), serialization_alias="validEndDate", @@ -82,7 +82,7 @@ class Concept(BaseModel): @field_validator("standard_concept") @classmethod - def validate_standard_concept(cls, v: Optional[str]) -> Optional[str]: + def validate_standard_concept(cls, v: str | None) -> str | None: """Validate standard_concept is 'S', 'C', or null (relaxed for legacy data).""" # Relaxed validation - warn but don't fail on unexpected values return v @@ -121,11 +121,11 @@ class ConceptSetExpression(BaseModel): (they're sometimes only on the items), so we provide defaults. 
""" - concept: Optional[Concept] = None + concept: Concept | None = None is_excluded: bool = Field(default=False, alias="isExcluded") include_mapped: bool = Field(default=False, alias="includeMapped") include_descendants: bool = Field(default=False, alias="includeDescendants") - items: Optional[list[ConceptExpressionItem]] = None + items: list[ConceptExpressionItem] | None = None model_config = ConfigDict(populate_by_name=True) @@ -145,7 +145,7 @@ class ConceptSet(BaseModel): description="Unique identifier for the concept set", ) - name: Optional[str] = Field( + name: str | None = Field( default=None, min_length=1, max_length=255, @@ -154,7 +154,7 @@ class ConceptSet(BaseModel): description="Human-readable name for the concept set", ) - expression: Optional[ConceptSetExpression] = Field( + expression: ConceptSetExpression | None = Field( default=None, alias="expression", validation_alias=AliasChoices("expression", "EXPRESSION"), @@ -162,62 +162,62 @@ class ConceptSet(BaseModel): ) # Optional fields for both legacy and new schema - description: Optional[str] = Field( + description: str | None = Field( default=None, max_length=4000, description="Optional detailed description of the concept set purpose and contents", ) # New schema fields (all optional for backward compatibility) - version: Optional[str] = Field( + version: str | None = Field( default=None, description="Version identifier for the concept set (semantic versioning)", ) - created_by: Optional[str] = Field( + created_by: str | None = Field( default=None, alias="createdBy", validation_alias=AliasChoices("createdBy", "created_by"), max_length=255, description="Username or identifier of the concept set creator", ) - created_date: Optional[datetime] = Field( + created_date: datetime | None = Field( default=None, alias="createdDate", validation_alias=AliasChoices("createdDate", "created_date"), description="ISO 8601 timestamp of concept set creation", ) - modified_by: Optional[str] = Field( + modified_by: str 
| None = Field( default=None, alias="modifiedBy", validation_alias=AliasChoices("modifiedBy", "modified_by"), max_length=255, description="Username or identifier of the last modifier", ) - modified_date: Optional[datetime] = Field( + modified_date: datetime | None = Field( default=None, alias="modifiedDate", validation_alias=AliasChoices("modifiedDate", "modified_date"), description="ISO 8601 timestamp of last modification", ) - created_by_tool: Optional[str] = Field( + created_by_tool: str | None = Field( default=None, alias="createdByTool", validation_alias=AliasChoices("createdByTool", "created_by_tool"), max_length=255, description="Name and version of the tool used to create the concept set", ) - modified_by_tool: Optional[str] = Field( + modified_by_tool: str | None = Field( default=None, alias="modifiedByTool", validation_alias=AliasChoices("modifiedByTool", "modified_by_tool"), max_length=255, description="Name and version of the tool used for the last modification", ) - tags: Optional[list[str]] = Field( + tags: list[str] | None = Field( default=None, description="Optional array of tags for categorization", ) - metadata: Optional[dict[str, Any]] = Field( + metadata: dict[str, Any] | None = Field( default=None, description="Optional additional metadata", ) @@ -226,14 +226,14 @@ class ConceptSet(BaseModel): @field_validator("version") @classmethod - def validate_version(cls, v: Optional[str]) -> Optional[str]: + def validate_version(cls, v: str | None) -> str | None: """Validate semantic versioning pattern if provided (relaxed for legacy compatibility).""" # Relaxed - allow any version string for backward compatibility return v @field_validator("tags") @classmethod - def validate_tags(cls, v: Optional[list[str]]) -> Optional[list[str]]: + def validate_tags(cls, v: list[str] | None) -> list[str] | None: """Validate tags if provided.""" if v is not None: for tag in v: diff --git a/tests/test_cohort_definition_set.py b/tests/test_cohort_definition_set.py index 
037e0fe..9b7e343 100644 --- a/tests/test_cohort_definition_set.py +++ b/tests/test_cohort_definition_set.py @@ -528,7 +528,7 @@ def _failing_write(*, compiled_relation, cohort_id, **kwargs): assert statuses[2] == "COMPLETE" history = conn.table(CHECKSUM_TABLE, database="main").execute() - history_statuses = dict(zip(history["cohort_definition_id"], history["status"])) + history_statuses = dict(zip(history["cohort_definition_id"], history["status"], strict=True)) assert history_statuses[1] == "FAILED" assert history_statuses[2] == "COMPLETE" diff --git a/tests/test_extension_system.py b/tests/test_extension_system.py index e76c16b..bdfedf1 100644 --- a/tests/test_extension_system.py +++ b/tests/test_extension_system.py @@ -1,5 +1,4 @@ import json -from typing import Optional from pydantic import AliasChoices, Field @@ -26,12 +25,12 @@ class WeatherCondition(Criteria): Imagine a CDM extension where weather data is linked to persons. """ - weather_concept_id: Optional[list[Concept]] = Field( + weather_concept_id: list[Concept] | None = Field( default=None, validation_alias=AliasChoices("WeatherConceptId", "weatherConceptId"), serialization_alias="WeatherConceptId", ) - temperature_celsius: Optional[float] = Field( + temperature_celsius: float | None = Field( default=None, validation_alias=AliasChoices("TemperatureCelsius", "temperatureCelsius"), serialization_alias="TemperatureCelsius", diff --git a/tests/test_real_example_cohorts.py b/tests/test_real_example_cohorts.py index cb3fae6..aaf477a 100644 --- a/tests/test_real_example_cohorts.py +++ b/tests/test_real_example_cohorts.py @@ -14,7 +14,6 @@ import textwrap from difflib import unified_diff from pathlib import Path -from typing import Optional import pytest @@ -61,7 +60,7 @@ def pytest_generate_tests(metafunc): metafunc.parametrize("cohort_name", params) -def get_reference_sql(cohort_name: str) -> Optional[str]: +def get_reference_sql(cohort_name: str) -> str | None: """Get pre-generated reference SQL from R/Java 
implementation.""" ref_file = REFERENCE_DIR / cohort_name.replace(".json", ".sql") if ref_file.exists(): @@ -69,7 +68,7 @@ def get_reference_sql(cohort_name: str) -> Optional[str]: return None -def generate_python_outputs(cohort_file: Path) -> tuple[Optional[str], Optional[str]]: +def generate_python_outputs(cohort_file: Path) -> tuple[str | None, str | None]: """ Run Python reference implementation to generate SQL. @@ -392,10 +391,10 @@ def test_sql_matches_reference(cohort_name): # ============================================================================= # Cache for generated markdown to avoid redundant work -_MARKDOWN_CACHE: dict[str, tuple[Optional[str], Optional[str]]] = {} +_MARKDOWN_CACHE: dict[str, tuple[str | None, str | None]] = {} -def get_generated_markdown(cohort_name: str) -> tuple[Optional[str], Optional[str]]: +def get_generated_markdown(cohort_name: str) -> tuple[str | None, str | None]: """ Get generated markdown for a cohort, using cache if available. """ @@ -424,7 +423,7 @@ def get_generated_markdown(cohort_name: str) -> tuple[Optional[str], Optional[st return markdown, error -def get_reference_markdown(cohort_name: str) -> Optional[str]: +def get_reference_markdown(cohort_name: str) -> str | None: """Get pre-generated reference Markdown from R/Java implementation.""" ref_file = REFERENCE_DIR / cohort_name.replace(".json", ".md") if ref_file.exists(): From d99b9c7680393b0e3af415e095444375023dd400 Mon Sep 17 00:00:00 2001 From: Jamie Gilbert Date: Wed, 17 Jun 2026 10:23:09 -0700 Subject: [PATCH 53/53] Ruff fixes --- circe/check/checkers/unused_concepts_check.py | 1 - circe/check/warnings/concept_set_warning.py | 1 - circe/cohortdefinition/builders/condition_era.py | 1 - circe/cohortdefinition/builders/condition_occurrence.py | 1 - circe/cohortdefinition/builders/death.py | 1 - circe/cohortdefinition/builders/dose_era.py | 1 - circe/cohortdefinition/builders/drug_era.py | 1 - circe/cohortdefinition/builders/drug_exposure.py | 1 - 
circe/cohortdefinition/builders/location_region.py | 1 - circe/cohortdefinition/builders/measurement.py | 1 - circe/cohortdefinition/builders/observation.py | 5 +---- circe/cohortdefinition/builders/observation_period.py | 1 - circe/cohortdefinition/builders/payer_plan_period.py | 1 - circe/cohortdefinition/builders/procedure_occurrence.py | 1 - circe/cohortdefinition/builders/specimen.py | 1 - circe/cohortdefinition/builders/visit_detail.py | 1 - circe/cohortdefinition/builders/visit_occurrence.py | 1 - circe/extensions/waveform/criteria.py | 1 - 18 files changed, 1 insertion(+), 21 deletions(-) diff --git a/circe/check/checkers/unused_concepts_check.py b/circe/check/checkers/unused_concepts_check.py index 0695791..b0da7e8 100644 --- a/circe/check/checkers/unused_concepts_check.py +++ b/circe/check/checkers/unused_concepts_check.py @@ -8,7 +8,6 @@ Reference: JAVA_CLASS_MAPPINGS.md for Java equivalents. """ - from ..warning_severity import WarningSeverity from ..warnings.concept_set_warning import ConceptSetWarning from .base_check import BaseCheck diff --git a/circe/check/warnings/concept_set_warning.py b/circe/check/warnings/concept_set_warning.py index 1f23924..d7cdc87 100644 --- a/circe/check/warnings/concept_set_warning.py +++ b/circe/check/warnings/concept_set_warning.py @@ -8,7 +8,6 @@ Reference: JAVA_CLASS_MAPPINGS.md for Java equivalents. """ - from ...vocabulary.concept import ConceptSet from ..warning_severity import WarningSeverity from .base_warning import BaseWarning diff --git a/circe/cohortdefinition/builders/condition_era.py b/circe/cohortdefinition/builders/condition_era.py index 577d51d..02bedc8 100644 --- a/circe/cohortdefinition/builders/condition_era.py +++ b/circe/cohortdefinition/builders/condition_era.py @@ -8,7 +8,6 @@ Reference: JAVA_CLASS_MAPPINGS.md for Java equivalents. 
""" - from ..criteria import ConditionEra from .base import CriteriaSqlBuilder from .utils import BuilderOptions, BuilderUtils, CriteriaColumn diff --git a/circe/cohortdefinition/builders/condition_occurrence.py b/circe/cohortdefinition/builders/condition_occurrence.py index 3399274..e937f82 100644 --- a/circe/cohortdefinition/builders/condition_occurrence.py +++ b/circe/cohortdefinition/builders/condition_occurrence.py @@ -8,7 +8,6 @@ Reference: JAVA_CLASS_MAPPINGS.md for Java equivalents. """ - from ..criteria import ConditionOccurrence from .base import CriteriaSqlBuilder from .utils import BuilderOptions, BuilderUtils, CriteriaColumn diff --git a/circe/cohortdefinition/builders/death.py b/circe/cohortdefinition/builders/death.py index e6d8748..247ed35 100644 --- a/circe/cohortdefinition/builders/death.py +++ b/circe/cohortdefinition/builders/death.py @@ -8,7 +8,6 @@ Reference: JAVA_CLASS_MAPPINGS.md for Java equivalents. """ - from ..criteria import Death from .base import CriteriaSqlBuilder from .utils import BuilderOptions, BuilderUtils, CriteriaColumn diff --git a/circe/cohortdefinition/builders/dose_era.py b/circe/cohortdefinition/builders/dose_era.py index 812eaa9..8835e1a 100644 --- a/circe/cohortdefinition/builders/dose_era.py +++ b/circe/cohortdefinition/builders/dose_era.py @@ -8,7 +8,6 @@ Reference: JAVA_CLASS_MAPPINGS.md for Java equivalents. """ - from ..criteria import DoseEra from .base import CriteriaSqlBuilder from .utils import BuilderOptions, BuilderUtils, CriteriaColumn diff --git a/circe/cohortdefinition/builders/drug_era.py b/circe/cohortdefinition/builders/drug_era.py index ff94a07..92aa718 100644 --- a/circe/cohortdefinition/builders/drug_era.py +++ b/circe/cohortdefinition/builders/drug_era.py @@ -8,7 +8,6 @@ Reference: JAVA_CLASS_MAPPINGS.md for Java equivalents. 
""" - from ..criteria import DrugEra from .base import CriteriaSqlBuilder from .utils import BuilderOptions, BuilderUtils, CriteriaColumn diff --git a/circe/cohortdefinition/builders/drug_exposure.py b/circe/cohortdefinition/builders/drug_exposure.py index d8125c3..188fdac 100644 --- a/circe/cohortdefinition/builders/drug_exposure.py +++ b/circe/cohortdefinition/builders/drug_exposure.py @@ -9,7 +9,6 @@ Reference: JAVA_CLASS_MAPPINGS.md for Java equivalents. """ - from ..criteria import DrugExposure from .base import CriteriaSqlBuilder from .utils import BuilderOptions, BuilderUtils, CriteriaColumn diff --git a/circe/cohortdefinition/builders/location_region.py b/circe/cohortdefinition/builders/location_region.py index 98bfc7d..d3aed82 100644 --- a/circe/cohortdefinition/builders/location_region.py +++ b/circe/cohortdefinition/builders/location_region.py @@ -8,7 +8,6 @@ Reference: JAVA_CLASS_MAPPINGS.md for Java equivalents. """ - from ..criteria import LocationRegion from .base import CriteriaSqlBuilder from .utils import BuilderOptions, CriteriaColumn diff --git a/circe/cohortdefinition/builders/measurement.py b/circe/cohortdefinition/builders/measurement.py index 2b6515f..3c66a17 100644 --- a/circe/cohortdefinition/builders/measurement.py +++ b/circe/cohortdefinition/builders/measurement.py @@ -8,7 +8,6 @@ Reference: JAVA_CLASS_MAPPINGS.md for Java equivalents. """ - from ..criteria import Measurement from .base import CriteriaSqlBuilder from .utils import BuilderOptions, BuilderUtils, CriteriaColumn diff --git a/circe/cohortdefinition/builders/observation.py b/circe/cohortdefinition/builders/observation.py index 281f7e6..4c274e7 100644 --- a/circe/cohortdefinition/builders/observation.py +++ b/circe/cohortdefinition/builders/observation.py @@ -8,7 +8,6 @@ Reference: JAVA_CLASS_MAPPINGS.md for Java equivalents. 
""" - from ..criteria import Observation from .base import CriteriaSqlBuilder from .utils import BuilderOptions, BuilderUtils, CriteriaColumn @@ -109,9 +108,7 @@ def resolve_select_clauses( return select_cols - def resolve_join_clauses( - self, criteria: Observation, options: BuilderOptions | None = None - ) -> list[str]: + def resolve_join_clauses(self, criteria: Observation, options: BuilderOptions | None = None) -> list[str]: """Resolve join clauses for observation criteria. Java equivalent: ObservationSqlBuilder.resolveJoinClauses() diff --git a/circe/cohortdefinition/builders/observation_period.py b/circe/cohortdefinition/builders/observation_period.py index 2b620bf..c085441 100644 --- a/circe/cohortdefinition/builders/observation_period.py +++ b/circe/cohortdefinition/builders/observation_period.py @@ -8,7 +8,6 @@ Reference: JAVA_CLASS_MAPPINGS.md for Java equivalents. """ - from ..criteria import ObservationPeriod from .base import CriteriaSqlBuilder from .utils import BuilderOptions, BuilderUtils, CriteriaColumn diff --git a/circe/cohortdefinition/builders/payer_plan_period.py b/circe/cohortdefinition/builders/payer_plan_period.py index fb25d62..1185c81 100644 --- a/circe/cohortdefinition/builders/payer_plan_period.py +++ b/circe/cohortdefinition/builders/payer_plan_period.py @@ -8,7 +8,6 @@ Reference: JAVA_CLASS_MAPPINGS.md for Java equivalents. """ - from ..criteria import PayerPlanPeriod from .base import CriteriaSqlBuilder from .utils import BuilderOptions, BuilderUtils, CriteriaColumn diff --git a/circe/cohortdefinition/builders/procedure_occurrence.py b/circe/cohortdefinition/builders/procedure_occurrence.py index 16dec87..864836f 100644 --- a/circe/cohortdefinition/builders/procedure_occurrence.py +++ b/circe/cohortdefinition/builders/procedure_occurrence.py @@ -9,7 +9,6 @@ Reference: JAVA_CLASS_MAPPINGS.md for Java equivalents. 
""" - from ..criteria import Criteria from .base import CriteriaSqlBuilder from .utils import BuilderOptions, BuilderUtils, CriteriaColumn diff --git a/circe/cohortdefinition/builders/specimen.py b/circe/cohortdefinition/builders/specimen.py index 2670cd4..a09fbca 100644 --- a/circe/cohortdefinition/builders/specimen.py +++ b/circe/cohortdefinition/builders/specimen.py @@ -8,7 +8,6 @@ Reference: JAVA_CLASS_MAPPINGS.md for Java equivalents. """ - from ..criteria import Specimen from .base import CriteriaSqlBuilder from .utils import BuilderOptions, BuilderUtils, CriteriaColumn diff --git a/circe/cohortdefinition/builders/visit_detail.py b/circe/cohortdefinition/builders/visit_detail.py index 21f29d6..813b805 100644 --- a/circe/cohortdefinition/builders/visit_detail.py +++ b/circe/cohortdefinition/builders/visit_detail.py @@ -8,7 +8,6 @@ Reference: JAVA_CLASS_MAPPINGS.md for Java equivalents. """ - from ..criteria import VisitDetail from .base import CriteriaSqlBuilder from .utils import BuilderOptions, BuilderUtils, CriteriaColumn diff --git a/circe/cohortdefinition/builders/visit_occurrence.py b/circe/cohortdefinition/builders/visit_occurrence.py index 89708d6..1620035 100644 --- a/circe/cohortdefinition/builders/visit_occurrence.py +++ b/circe/cohortdefinition/builders/visit_occurrence.py @@ -8,7 +8,6 @@ Reference: JAVA_CLASS_MAPPINGS.md for Java equivalents. """ - from ..criteria import VisitOccurrence from .base import CriteriaSqlBuilder from .utils import BuilderOptions, BuilderUtils, CriteriaColumn diff --git a/circe/extensions/waveform/criteria.py b/circe/extensions/waveform/criteria.py index 35f799e..b2d17c9 100644 --- a/circe/extensions/waveform/criteria.py +++ b/circe/extensions/waveform/criteria.py @@ -1,4 +1,3 @@ - from pydantic import AliasChoices, Field from circe.cohortdefinition.core import DateRange, NumericRange, TextFilter