diff --git a/Makefile b/Makefile index d500170a9..47ddf37e3 100644 --- a/Makefile +++ b/Makefile @@ -179,6 +179,33 @@ app-test: ## Run app backend pytest suite (K= filter, COV=1 for coverage) ##@ App deploy (require PROFILE=; most also need TARGET=) +# Minimum Databricks CLI version required to deploy. The ``postgres_roles`` +# resource in ``app/databricks.yml`` (one-button Lakebase provisioning) was +# added in CLI v1.4.0 (https://github.com/databricks/cli/pull/5467); older +# CLIs reject the unknown field at ``bundle validate`` with a confusing +# error deep inside the deploy. Fail fast here with an actionable message. +DATABRICKS_MIN_VERSION := 1.4.0 + +# Preflight: assert the CLI exists and is new enough. Used as a prerequisite +# of app-deploy / app-bind so the version gate runs before any build or +# network call. The sort -V trick: the smallest of {MIN, installed} equals +# MIN exactly when installed >= MIN (handles 1.10 > 1.4 unlike a string sort). +app-check-cli: ## Verify the Databricks CLI meets the minimum version for deploy + @command -v databricks >/dev/null 2>&1 || \ + { echo "ERROR: 'databricks' CLI not found on PATH. Install: https://docs.databricks.com/dev-tools/cli/install.html"; exit 1; } + @ver=$$(databricks --version 2>/dev/null | grep -oE '[0-9]+\.[0-9]+\.[0-9]+' | head -1); \ + if [ -z "$$ver" ]; then \ + echo "ERROR: could not parse a version from 'databricks --version'."; exit 1; \ + fi; \ + if [ "$$(printf '%s\n%s\n' "$(DATABRICKS_MIN_VERSION)" "$$ver" | sort -V | head -1)" != "$(DATABRICKS_MIN_VERSION)" ]; then \ + echo "ERROR: Databricks CLI v$$ver is too old; v$(DATABRICKS_MIN_VERSION)+ is required to deploy DQX Studio."; \ + echo " app/databricks.yml uses the 'postgres_roles' resource (one-button Lakebase),"; \ + echo " which older CLIs reject at 'bundle validate' with an unknown-field error."; \ + echo " Upgrade: brew upgrade databricks (or https://docs.databricks.com/dev-tools/cli/install.html)"; \ + exit 1; \ + fi; \ + echo "✓ Databricks CLI v$$ver (>= $(DATABRICKS_MIN_VERSION))" + # Grant Unity Catalog permissions after bundle deploy. # # Usage: make app-grant-permissions PROFILE=my-profile @@ -216,7 +243,7 @@ app-grant-permissions: ## Run post-deploy GRANTs (called by app-deploy; standalo # to ``make app-deploy`` — the bind step reads the resolved bundle # variables to know which instance/schema name to bind to, so an # override applied only at deploy time would bind the wrong resource. -app-bind: ## Adopt pre-existing UC schemas / volume / Lakebase into bundle (run ONCE per workspace) +app-bind: app-check-cli ## Adopt pre-existing UC schemas / volume / Lakebase into bundle (run ONCE per workspace) @test -n "$(PROFILE)" || (echo "Usage: make app-bind PROFILE= TARGET="; exit 1) @test -n "$(TARGET)" || (echo "Usage: make app-bind PROFILE= TARGET="; exit 1) app/scripts/bind_resources.sh -p $(PROFILE) -t $(TARGET) $(if $(BUNDLE_VARS),-- $(BUNDLE_VARS)) @@ -245,10 +272,17 @@ app-bind: ## Adopt pre-existing UC schemas / volume / Lakebase into bundle (run # after a manual delete (the deploy errors out with "Instance name is # not unique" otherwise). Per-deploy CLI overrides keep ``databricks.yml`` # clean. -app-deploy: app-build ## Build, deploy bundle, grant permissions, and start app +# +# FORCE=1 appends ``--force`` to ``bundle deploy``. Use it when the deploy +# aborts because a resource was modified in the workspace UI since the last +# deploy (e.g. "dashboard ... has been modified remotely") — ``--force`` +# overwrites the remote copy with the bundle's local definition. This DROPS +# any in-UI edits, so only set it once you've confirmed the local version is +# the one you want to ship. +app-deploy: app-check-cli app-build ## Build, deploy bundle, grant permissions, and start app (FORCE=1 to overwrite remote edits) @test -n "$(PROFILE)" || (echo "Usage: make app-deploy PROFILE= TARGET="; exit 1) @test -n "$(TARGET)" || (echo "Usage: make app-deploy PROFILE= TARGET="; exit 1) - cd app && databricks bundle deploy -p $(PROFILE) -t $(TARGET) $(BUNDLE_VARS) + cd app && databricks bundle deploy -p $(PROFILE) -t $(TARGET) $(if $(FORCE),--force) $(BUNDLE_VARS) app/scripts/post_deploy_grants.sh -p $(PROFILE) -t $(TARGET) $(if $(BUNDLE_VARS),-- $(BUNDLE_VARS)) cd app && databricks bundle run $(APP_NAME) -p $(PROFILE) -t $(TARGET) $(BUNDLE_VARS) @@ -290,4 +324,4 @@ fork-sync: ## Mirror a fork PR to a branch in the main repo for full CI (PR=`) + +Single-table rules carry a real `table_fqn`. Dataset-level rules (cross-table SQL checks and `has_valid_schema` schema-validation rules) instead use the synthetic prefix `__sql_check__/` so they all bucket under the **Cross-table rules** group in the UI catalog and edit-router. + +- **SQL checks** store their query body inline; the runner reads it from `arguments.sql_query`. +- **`has_valid_schema` checks** store the real table FQN in `user_metadata.target_table`; the runner creates the input view from *that* table while keeping the synthetic FQN for history grouping, and dispatches through the standard row-level engine (NOT the SQL fast-path). + +Both code paths live in `backend/routes/v1/dryrun.py` and `backend/services/scheduler_service.py`. If you add another dataset-level rule kind, follow the same convention and update both dispatchers in lock-step. ## Internal Storage diff --git a/app/CUSTOMER_QA.md b/app/CUSTOMER_QA.md new file mode 100644 index 000000000..225de506c --- /dev/null +++ b/app/CUSTOMER_QA.md @@ -0,0 +1,437 @@ +# DQX Studio — Customer Q&A + +A reference for the customer Q&A session. Questions are grouped by theme; answers are written to be **honest and specific** (with limitations called out) rather than marketing prose. Skip what you don't need. + +> "DQX" = the open-source [`databrickslabs/dqx`](https://github.com/databrickslabs/dqx) data-quality engine. "DQX Studio" = this app — a UI + control plane on top of DQX, deployed as a Databricks App in the customer's own workspace. + +--- + +## 1. Product positioning + +### 1.1 What is DQX Studio in one sentence? +A workspace-native UI for authoring, approving, scheduling, and reviewing DQX data-quality rules — running entirely inside the customer's Databricks account, using the customer's own compute and storage. + +### 1.2 How is it different from running DQX in a notebook? +DQX (the library) gives you the engine. DQX Studio gives you the workflow around it: + +- **Rule authoring UI** (form-based + YAML) instead of editing notebooks +- **Approval workflow** with `RULE_AUTHOR` / `RULE_APPROVER` separation +- **Schedules** that submit runs to a serverless job — no notebook orchestration to maintain +- **Run history** with status, sample bad rows, metrics, comments, and review status per run +- **Insights dashboard** — a Lakeview/AI-BI dashboard embedded as an iframe, customisable per workspace +- **Hybrid storage** (Lakebase OLTP for rules/settings, Delta for run history) so the UI feels like a real app, not a Spark query under every click + +### 1.3 Is this a Databricks product or open source? +It's an open-source companion to the `databrickslabs/dqx` library — both live under `databrickslabs/` (Databricks Labs is the field-maintained ecosystem, not the core product). You get the source, you deploy it into your own workspace via a Declarative Asset Bundle, and you own the operating model. + +### 1.4 What does it *not* do? +Stated up front so there are no surprises: + +- **No cross-workspace rules.** DQX Studio runs inside one workspace and authors rules for tables visible in that workspace's Unity Catalog metastore. If you have multiple metastores (regions, business units), you deploy one instance per metastore. +- **No vendor SaaS plane.** All metadata stays in your account. That's a feature (sovereignty) and a constraint (no cross-customer benchmarks, no external data-quality marketplace). +- **No anomaly detection / ML quality models.** Today it executes rules you (or the profiler) author. Anomaly-style "this column drifted" checks would be net-new work. +- **Not a catalog or lineage tool.** It reads from Unity Catalog and writes results to it; it does not replace Atlan, Collibra, or Purview. + +--- + +## 2. Architecture + +### 2.1 What does deployment look like? +One Databricks Asset Bundle (`databricks.yml`) provisions everything in one `make app-deploy`: + +- **Databricks App** (FastAPI + React, single process, served by the Apps runtime) +- **Serverless Job** for Spark work — the *task runner* — invoked for profiler, dry-run, and scheduled runs +- **Lakebase Postgres instance** (`database_instances.lakebase`) for OLTP state +- **Two UC schemas** (`dqx_studio`, `dqx_studio_tmp`) under a customer-supplied catalog +- **UC volume** (`wheels`) the app uses to ship DQX wheels into the job +- **SQL warehouse** — managed by the bundle, or BYO (bring-your-own) if you already have one +- **Lakeview dashboard** (`dashboards.dqx_quality_overview`) pinned to the app's *Insights* page + +All stateful resources carry `lifecycle.prevent_destroy: true`, so a stray `bundle destroy` cannot drop them and wipe state. + +### 2.2 Why a separate job for the Spark work? +Databricks Apps run in a container without Spark. We use the FastAPI process for the UI, REST API, and short OLTP queries — anything that needs Spark (profiling, dry runs, scheduled validation) is submitted to a serverless job we provision and pin to a wheels volume. The app polls job status and reads results back from Delta. + +That gives you: + +- **Predictable autoscaling** — serverless handles the spiky workload +- **Tight cost control** — the app process stays small; nothing Spark-shaped is running when no one is validating +- **Identity isolation** — Spark work runs as the task-runner SP with the user's OBO token threaded through, so UC permissions still apply + +### 2.3 Why a hybrid storage model (Lakebase + Delta)? +Data quality has two very different workloads, and we picked the right tool for each: + +| Workload | Backend | Why | +|---|---|---| +| Rules CRUD, app settings, RBAC, comments, schedules | **Lakebase Postgres** | Sub-millisecond reads, real transactions, indexes on small tables. Delta's not made for "give me one row by primary key, fast." | +| Profiling results, validation runs, metrics, quarantine rows | **Delta on UC** | High-volume append, time-series scans, dashboard JOINs, AI/BI integration. Postgres would melt under quarantine volumes. | + +The split is invisible to service code — `SqlExecutor` (Delta) and `PgExecutor` (Lakebase) share the same surface. **If a workspace can't or won't enable Lakebase, the OLTP tables fall back to Delta** via the `oltp_fallback` migration. The app still works; latency on rule CRUD is just higher. + +### 2.4 How does authentication work? +Two identities, each with a deliberate scope: + +- **User identity (OBO)** — every request from the browser carries an `X-Forwarded-Access-Token`. UC browsing (catalogs/schemas/tables), temporary view creation for profiler/dry-runs, and user-initiated job submissions all run as the user. This is how we enforce "you can only validate tables you can already read." +- **App / task-runner SP** — owns the app's own state (rules catalog, settings, OLTP migrations, wheel uploads) and runs scheduled jobs that have no user attached. Critically, the app SP is **only granted what it needs inside the DQX schemas** — it never gets blanket UC access. + +There are no admin-scoped REST calls made by the app itself. + +### 2.5 What's the data residency story? +Everything lives in the customer's Databricks account: + +- Source tables stay where they are — DQX Studio never moves them. +- Quarantine rows (potentially PII) land in Delta tables inside the customer's catalog. +- OLTP state (rules, comments, schedules) lives in the customer's Lakebase instance. +- Wheels and app code live in the customer's UC volume + workspace files. + +No data, metadata, or telemetry leaves the account. There is no DQX Studio SaaS plane to phone home to. + +--- + +## 3. Rule authoring & execution + +### 3.1 What kinds of checks does DQX support? +Three families, in increasing power: + +1. **Column-level row checks** — `is_not_null`, `is_in_range`, `regex_match`, `is_unique`, etc. Configured per column. +2. **Row-level dataset checks** — composite expressions on multiple columns of the same row. +3. **Cross-table SQL checks** — full SQL queries (referential integrity, aggregate reconciliation, business-logic joins). Whatever you can write as a `SELECT` returning the offending rows, DQX can run. + +All three are first-class in the engine, the UI, and the metrics pipeline. + +### 3.2 How do users author rules? +Two surfaces, same backing store: + +- **Form mode** — pick a table, pick columns, pick a check function, set parameters, optional labels. Good for the long tail. +- **YAML mode** — paste/edit DQX YAML directly. Good for power users and copy-paste from existing notebooks. + +Rules are versioned in `dq_quality_rules` + `dq_quality_rules_history` (audit log). They flow through statuses (`draft → proposed → approved`) gated by RBAC. + +### 3.3 What's the profiler? +A one-click way to bootstrap rules for a table. You point at a table; the profiler runs a serverless job that samples the data, infers candidates (null thresholds, value ranges, uniqueness), and writes them to `dq_profiling_results`. You then pick the ones you want and promote them to draft rules. It's not magic — it's a starting point. + +### 3.4 What's a "dry run"? +Validate a rule (or rule set) against the live table **before** approving or scheduling it. It returns: + +- Pass/fail counts (valid rows, error rows, warning rows, total rows) +- A 10-row sample of failing data inline +- A full quarantine table written to `dq_quarantine_records` (capped at 100k rows for SQL checks to bound storage) +- Per-check metrics + custom metrics into `dq_metrics` + +Dry runs use OBO, so the user can only validate tables they have `SELECT` on. + +### 3.5 How do schedules work? +Configure a per-rule-set schedule (cron or interval) in the UI. An in-process asyncio loop inside the FastAPI worker ticks every minute, picks up due schedules, and submits the same task-runner job that dry runs use — `task_type` discriminates. Scheduled runs use the **app SP**, not OBO, because there's no user at 3 AM. The schedule's *target rules* honour the rule-set the author selected. + +A `/tmp/.dqx_scheduler.lock` exclusive file lock guarantees the loop runs in exactly one app worker even if Databricks Apps autoscales to multiple replicas. + +### 3.6 What's "review status"? +A per-run label set by reviewers (the business owner, the on-call SA, whoever) after a run completes. Values are admin-configurable in **Configuration → Run review statuses**; the default catalogue ships with *Pending review*, *Acknowledged*, *Resolved*, *False positive*. The Runs History page filters on it as multi-select chips, so a steward can ask "what's still pending review?" in one click. Every change is audited in `dq_run_review_status_history`. + +It sits next to free-text comments on the run detail row — the dropdown is the structured action, the comment is the explanation. + +--- + +## 4. Permissions & governance + +### 4.1 What roles exist inside the app? +Four primary roles plus one orthogonal: + +- **`ADMIN`** — configure the app (review-status catalogue, custom metrics, labels, role mappings, retention, embedded dashboard, role↔group mapping) +- **`RULE_APPROVER`** — move rules from `proposed` to `approved` +- **`RULE_AUTHOR`** — create/edit drafts and propose rules +- **`VIEWER`** — read runs, dashboards, results +- **`RUNNER`** (orthogonal) — can execute approved rules on-demand + +Roles are resolved from **Databricks workspace-group membership** via the `dq_role_mappings` table. Customers map their existing IAM groups onto DQX roles — we never invent a new identity system. + +### 4.2 Who can do what on a table they don't own? +- **Browse a table in the UI**: the user needs `USE CATALOG` + `USE SCHEMA` + `SELECT` on the table (standard UC). The UI uses OBO, so the catalog tree only shows what the user can already see. +- **Author a rule against a table**: same — OBO read access is required to author and to dry-run. +- **Approve a rule**: the `RULE_APPROVER` role inside DQX Studio, regardless of table ownership. This is governance-by-process; the data owner does *not* need to use the app. +- **See a run's quarantine rows**: catalog-filtered server-side. The runs list and the quarantine table both gate on the user's UC-accessible catalogs. + +### 4.3 What permissions does the app's service principal need? +Scoped tight: + +- `USE CATALOG` + `USE SCHEMA` + `ALL PRIVILEGES` on the two DQX schemas (`dqx_studio`, `dqx_studio_tmp`) only +- `READ VOLUME` + `WRITE VOLUME` on the wheels volume only +- `CAN USE` on the SQL warehouse the app is bound to +- `Service Principal: User` role on the task-runner SP (so the app can submit jobs as it) + +It is **not** a workspace admin and **not** a metastore admin. If you remove the app, those grants are the only blast radius. + +### 4.4 What's the deployer's permission burden? +Documented as a table in `DEPLOYMENT.md` — about 10 line items, the bulk of which collapse if the deployer is added to a UC-admin group. The single most common failure on first deploy is missing `MANAGE` on the target catalog (the `post_deploy_grants.sh` script needs it to GRANT to the app SP). We surface that error explicitly with a fix. + +### 4.5 Is everything audited? +Yes, but the auditing is **distributed** across the platform rather than in one DQX log: + +- **Rule edits** → `dq_quality_rules_history` +- **Schedule edits** → `dq_schedule_configs_history` +- **Review-status changes** → `dq_run_review_status_history` +- **Job submissions** → Databricks Jobs UI / `system.lakeflow.jobs` +- **Table reads (OBO)** → UC audit logs (the standard ones) +- **App requests** → Databricks Apps logs + +We didn't invent a parallel audit store because UC + Jobs already audit everything we'd put in one. + +--- + +## 5. Operations + +### 5.1 What's the upgrade story? +`make app-deploy` is idempotent. Bundle changes go through `databricks bundle deploy`; schema changes go through versioned, idempotent migration runners (`MigrationRunner` for Delta, `PgMigrationRunner` for Lakebase). On every app start the wheel hash gates whether a re-upload + job-environment patch is needed. + +You re-run the same command. If you forgot to update a variable, the bundle complains; it doesn't silently mutate state. + +### 5.2 What happens if `bundle destroy` runs accidentally? +The three stateful resource keys — `schemas.main_schema`, `volumes.wheels`, `database_instances.lakebase` — carry `lifecycle.prevent_destroy: true`. The CLI refuses to drop them. To intentionally tear those down you have to edit `databricks.yml`, `unbind` the resource, and destroy it manually — a three-step opt-in. + +### 5.3 What about backup? +Three layers, all leveraging native Databricks: + +- **Delta tables** (runs, metrics, quarantine): UC's time travel + Delta retention. Standard backup story. +- **Lakebase Postgres**: managed by the Lakebase service (snapshots, PITR depending on the tier). The instance is the unit of backup. +- **Bundle source of truth**: `databricks.yml` lives in your VCS. The whole deployment is reproducible from source + a Postgres restore. + +### 5.4 What's the cost footprint? +Three line items: + +- **App compute** — Databricks Apps charges a flat hourly rate per running app. The FastAPI process is small (one or two replicas). +- **Lakebase instance** — runs continuously; size is small (the rules catalog is in the low-thousands of rows for most customers). +- **Task-runner job (serverless)** — only billed when validations run. Cost scales with row count and check complexity; profiler/dryrun is bounded, scheduled is what you make it. + +The app does **not** keep a SQL warehouse warm. The Lakeview dashboard auto-stops between uses. + +### 5.5 How do we monitor it? +- App-level logs in Databricks Apps UI +- Job-level logs in Jobs UI (each task-runner submission shows up as a run) +- UC audit logs for table access +- `dq_validation_runs` is itself a run history table — `SELECT … WHERE status = 'FAILED'` is your alert query if you want one +- The Insights dashboard ships KPIs for run health, error trends, top-failing tables + +--- + +## 6. Comparisons (FAQ-style) + +### 6.1 How does DQX Studio compare to Soda / Great Expectations? +Both are rule-driven DQ frameworks; Soda is the closer comparison (commercial product with a UI; Great Expectations is library-first). For Soda specifically, see [Section 11 — Soda deep dive](#11-soda-comparison--deep-dive) which is the answer this customer is actually here for. + +The short version: Soda is broader (multi-source, anomaly detection, alerts/incidents, SaaS UI); DQX Studio is **deeper inside Databricks** (PySpark-native, UC-aware OBO, Delta-resident quarantine, single workspace plane). Customers serving multiple warehouses often pick Soda; customers all-in on Databricks tend to land on DQX. + +### 6.2 How does it compare to Monte Carlo / Anomalo (SaaS observability)? +Those tools shine at automatic anomaly detection across many sources by indexing metadata in a vendor cloud. DQX Studio is the opposite end of the spectrum: rule-driven, single-source-of-truth-is-UC, sovereign. Customers tend to use both — DQX for *explicit* rules they care about, MC/Anomalo for *implicit* drift detection across the catalog. We do not pretend to replace anomaly detection today. + +### 6.3 How does it compare to Collibra / Atlan? +Different layer. Collibra/Atlan are governance catalogs. DQX Studio is a quality control plane. They integrate cleanly: Atlan/Collibra reference UC tables; DQX Studio writes quality scores into UC; the catalog renders them. + +### 6.4 What about SDP / DLT Expectations? +Expectations are inline with pipelines. DQX rules sit outside the pipeline and validate the *result* — useful when: + +- You don't own the pipeline (it's a vendor / partner load) +- You want post-hoc validation independent of pipeline framework +- You want a UI for non-data-engineers to author rules + +The two coexist. DLT Expectations are great inside DLT pipelines; DQX Studio is the catch-all for everything else. + +--- + +## 7. Roadmap-ish questions + +### 7.1 What's the cross-metastore story? +Today: one DQX Studio per metastore. We're not solving multi-metastore authoring in this version — the trade-off would be moving metadata out of Databricks, which kills the sovereignty story. If a customer needs a federated view across metastores, they roll up `dq_metrics` from each instance into a central Delta share and dashboard on it. + +### 7.2 Anomaly / ML quality? +On the roadmap conceptually, not committed. DQX has the metric infrastructure (`dq_metrics` is a long-format event store with `rule_set_fingerprint` provenance), so a Z-score / forecast-residual layer on top is feasible. We'd prefer the customer's appetite to drive that than ship a half-baked anomaly detector. + +### 7.3 API surface for external automation? +Already there. Everything the UI does is REST under `/api/v1/*`. The OpenAPI spec is published, the React frontend is generated from it (`orval`), so external scripts get the same typed surface. Customers have used this to scaffold rules from dbt manifests. + +### 7.4 Embedded dashboards beyond the default? +The bundle ships a starter Lakeview dashboard pinned to the *Insights* page. Customers customise it in **AI/BI Dashboards** (the iframe picks up changes immediately, no redeploy) or point Insights at a completely different dashboard ID via **Configuration → Insights dashboard**. + +--- + +## 8. Security & compliance + +### 8.1 Where's PII? +Two places, both Delta tables in the customer catalog: + +- `dq_quarantine_records` — failing rows, by definition contain the raw values that broke the rule +- `dq_profiling_results.sample_invalid_json` — a 10-row sample from profiler/dry-run for UI preview + +Both inherit UC's access control. The app filters quarantine reads by the user's UC-accessible catalogs server-side. To hide PII from the UI, restrict the `SELECT` grant on those tables; the app respects that automatically (OBO). + +### 8.2 Network egress? +None. The app's outbound calls are: + +- Databricks REST API (UC, Jobs, Apps, Secrets) over the workspace's hostname +- Lakebase Postgres (over the workspace VNet) +- That's it. There is no third-party CDN, telemetry endpoint, or vendor cloud. + +### 8.3 How are secrets managed? +The task-runner SP's OAuth client secret is the only one. It's stored in Databricks Secret Scopes, injected into the job's environment, never logged. The bundle's `make app-deploy` flow walks the deployer through minting it (or reuses an existing one). + +### 8.4 Compliance certifications? +DQX Studio inherits the workspace's certifications — there is no separate compliance scope because there is no separate plane. If your workspace is HIPAA / FedRAMP / etc., DQX Studio inherits those controls because every byte lives inside it. + +--- + +## 9. The customer's likely follow-ups + +Pre-empt these — we've heard them: + +- **"Can we run this in our own VPC?"** — Yes, automatically. It's a Databricks App in your workspace. +- **"Does it work with Unity Catalog Federation?"** — UC Federation tables (foreign tables to Snowflake/BigQuery/Postgres) show up in the catalog tree and can be authored against. Performance depends on the federation source, not on DQX. +- **"Can rules be promoted across environments (dev → prod)?"** — Today: export YAML from one deployment, commit, import to another. We don't have an automated promotion pipeline in the UI yet. +- **"What if Lakebase isn't enabled in our workspace?"** — Set `lakebase_enabled: false` (or leave `lakebase_instance_name` empty). Migrations fall back to Delta automatically. The app still works. +- **"Why a service principal instead of just our personal token?"** — Scheduled runs at 3 AM can't carry a user token. We need a stable identity. The OBO model still ensures *interactive* requests use the user. + +--- + +## 10. Things to demo (in order) + +If the customer wants a 20-minute walkthrough: + +1. **Profiler** — point at one of their actual tables, get suggestions in 30 seconds +2. **Promote → Author → Dry run** — go from suggestion to approved rule with one round-trip +3. **Schedule** — set it to run hourly; show the run hit immediately +4. **Run detail** — sample bad rows, comments, review status dropdown, label by team +5. **Insights dashboard** — the embedded AI/BI iframe, then jump out to AI/BI and edit a tile to show the live update path +6. **Configuration page** — show how every operational knob (labels, statuses, custom metrics, retention, role mappings, dashboard ID) is admin-configurable without redeploy + +--- + +## 11. Soda comparison — deep dive + +Most customers evaluating both have already piloted Soda Core / Soda Cloud and want to know: *what changes if we switch?* This section is structured around the questions Soda users actually ask. **It's written to be honest about where Soda wins**, because the customer has almost certainly already read Soda's marketing and will catch handwaving immediately. + +### 11.1 What is Soda, briefly, so we're talking about the same thing? +Three products under one brand: + +- **Soda Core** (OSS) — Python library + CLI; you author checks in `SodaCL` (their YAML DSL) and run `soda scan` against a data source. Results print or POST to Soda Cloud. +- **Soda Library** (commercial) — the same engine plus the connector to Soda Cloud and additional check types (reconciliation, anomaly detection, MLOps). +- **Soda Cloud** (SaaS) — the web UI: datasets, incidents, alerts, anomaly dashboards, Slack/PagerDuty integrations, collaboration features. + +Soda's compute model: the library translates `SodaCL` into SQL, runs it against your warehouse (Databricks via JDBC is one of ~15 connectors), and ships the **metric values + failing-row samples** back to Soda Cloud. Your raw data stays in the warehouse; the *results* live in their SaaS. + +### 11.2 What does the customer actually gain by picking Soda over DQX Studio? +Stated up front, because it's real: + +1. **Multi-source out of the box.** ~15 connectors (Snowflake, BigQuery, Redshift, Postgres, MSSQL, Spark, Databricks, …). If the customer has a heterogeneous stack — Snowflake + Databricks + Postgres — Soda monitors all three from one UI. DQX Studio is Databricks-only. +2. **Anomaly detection** (Sherlock algorithm). Soda Cloud auto-detects "row count dropped 30 % vs the last 14 days" without you writing a rule. DQX is rule-driven; you'd need to write the equivalent expectation manually (and tune it). +3. **Incident management.** Soda Cloud has first-class incidents — assign to a person, link to a Slack thread, mark resolved, see MTTR over time. DQX has comments + review status, which is enough for triage but isn't an incident workflow. +4. **Alerting.** Soda Cloud routes check failures to Slack / Teams / PagerDuty / email / webhooks. DQX has scheduled runs and a dashboard; **there is no built-in alerting today** — you'd build it from `dq_validation_runs` (e.g. a SQL alert on a Databricks dashboard, or a downstream Lakeflow job). Honest gap. +5. **Time-tested at scale.** Soda has been shipping since ~2020 with a dedicated team. DQX is younger and field-maintained (Databricks Labs). + +If the customer needs any of those five things, **Soda is the better choice** and we should say so. The next question (11.3) is where DQX usually wins back. + +### 11.3 What does the customer gain by picking DQX Studio? + +1. **No data ever leaves the workspace.** Soda Cloud is SaaS — even with the on-prem Soda Agent, metric values and failed-row samples travel to their cloud. DQX Studio is *entirely* in the customer's Databricks account; quarantine rows land in their UC, OLTP state in their Lakebase, no telemetry phones home. For regulated industries (banking, healthcare, public sector) this is often the decisive factor. +2. **OBO permission enforcement.** A user authoring or dry-running a rule against table X can only do it if **they** have `SELECT` on X. Soda runs every scan as the **connection identity** — typically one service account with broad SELECT. DQX is fine-grained; Soda is coarse. +3. **PySpark-native execution.** DQX checks run as Spark jobs in the customer's serverless compute, parallelised, with native Delta predicates. Soda Library translates checks into SQL strings shipped over JDBC — for a billion-row table this is slower and more expensive than a serverless Spark scan, especially for distinct-count / referential-integrity / cross-table checks. +4. **Quarantine is a Delta table.** When a DQX check fails, the offending rows are persisted to `dq_quarantine_records` in UC. Downstream teams can `JOIN dq_quarantine_records WITH the original source` in plain SQL to investigate, build remediation pipelines, feed bug-tracker tickets — anything you'd do with a normal Delta table. Soda stores **samples** (default 100 rows) in Soda Cloud; the full failing set is not retained. +5. **Cost.** DQX Studio is open source — the customer pays Databricks Apps + serverless job + Lakebase, all of which they already have on their bill. Soda Cloud is per-dataset / per-check / per-seat pricing on top of the warehouse compute they pay Soda's scan to consume. For Databricks-heavy estates this difference is material. +6. **Integration with the surrounding stack.** DQX writes to UC; UC feeds AI/BI dashboards; rule metadata sits next to lineage; access is governed by the same UC permissions every other tool respects. Soda is an island that integrates *with* the warehouse, not *as* part of it. +7. **UC-native authoring UX.** Browsing catalogs/schemas/tables in DQX Studio uses live UC reads as the logged-in user, so authors see *exactly* what they have access to. Soda's UI builds against the connections an admin configured — there's a layer of translation between "what's in the warehouse" and "what's in Soda." + +### 11.4 Direct feature-by-feature +The table the customer is going to ask for, side by side: + +| Capability | Soda Cloud + Library | DQX Studio | +|---|---|---| +| Rule DSL | SodaCL (YAML) | DQX YAML (compatible with [`databrickslabs/dqx`](https://github.com/databrickslabs/dqx)) | +| UI for rule authoring | Yes (Soda Cloud) | Yes (form + YAML modes in app) | +| Profiler / suggested rules | Yes (column profile) | Yes (`dq_profiling_results`) | +| Cross-table SQL rules | `failed_rows_query` | Native `sql` check type, persisted to `dq_quarantine_records` | +| Anomaly detection | Yes (Sherlock, on metric history) | **No** (rule-driven only today) | +| Reconciliation checks | Yes (Soda Library) | Yes via cross-table SQL | +| Schedules | Soda Agent + cron, or CI/CD | In-process scheduler (cron/interval) + serverless job | +| Approval workflow (author → approver) | No formal split | Yes (`RULE_AUTHOR` / `RULE_APPROVER` roles) | +| Run history | Yes (Soda Cloud retention) | Yes (Delta — retention is whatever you set, default unbounded) | +| Quarantine of failing rows | Samples (default 100 in Soda Cloud) | Full set in Delta UC table (capped at 100k for SQL checks) | +| Comments / review workflow | Incidents (first-class) | Comments + per-run review status with audit | +| Alerts / notifications | Slack / Teams / PagerDuty / email / webhook | **None built-in** (use Databricks SQL Alerts on `dq_validation_runs`) | +| Dashboards | Soda Cloud built-in | Embedded Lakeview/AI-BI; fully customisable | +| Multi-source | ~15 warehouses | Databricks (UC) only | +| Identity for scans | Single connection identity (typically a SP) | OBO per-user for interactive; SP for scheduled | +| Data residency | Soda Cloud SaaS (or Agent if licensed) | 100 % customer Databricks account | +| Audit trail | Soda Cloud audit | Distributed: `*_history` tables + UC + Jobs | +| Cost model | Per-dataset/check/seat (SaaS) + warehouse compute | Databricks compute only (open source app) | +| Open source | Soda Core only (limited) | Yes, all of it | +| Cross-workspace / cross-metastore | Yes (single Soda Cloud over many connections) | No — one instance per metastore | +| API / programmatic surface | Soda Cloud API + Python | OpenAPI under `/api/v1/*`, typed React client generated | + +### 11.5 "Can we migrate rules from Soda to DQX?" +**Mostly yes, with rewrites — there is no one-shot importer.** SodaCL and DQX YAML are structurally similar (both are check-list YAML rooted at a table) but the check function names and parameters differ. A typical migration path: + +1. Export the SodaCL checks for the Databricks-targeted datasets. +2. For each check, map it: `missing_count = 0` → DQX `is_not_null`, `invalid_count(col) using regex … = 0` → DQX `regex_match`, `failed_rows_query` → DQX `sql` check. Most map 1:1. +3. The non-mappable checks are exactly the ones DQX doesn't have today: anomaly detection, change-detection-vs-history. Those need a manual replacement (write the rule explicitly, or accept it as a gap). +4. Bulk-import the rewritten YAML through the DQX Studio rule API or the rules UI's YAML mode. + +A field engineer can usually rewrite ~50 SodaCL checks in an afternoon if they're standard shapes. For very large estates we'd consider scripting it, but there's no productised tool today. + +### 11.6 "Can we run both side-by-side during the migration?" +Yes, with no conflicts. Soda reads through JDBC; DQX runs Spark jobs. They don't share state or coordinate on rule definitions, so there's nothing to deconflict — the only consideration is double-billed warehouse / serverless compute during the overlap period. We recommend running both for one to two weeks against the same critical tables to verify equivalent rule outcomes before turning Soda off. + +### 11.7 "Soda has anomaly detection. Is the lack of it a blocker?" +Honest answer: **it depends on what they're using it for**. + +- If they rely on anomaly detection as the *primary* DQ signal ("we don't write rules, we let Soda find weirdness"), DQX is not a like-for-like swap today. They should keep an anomaly tool — Soda, Monte Carlo, Anomalo — alongside DQX. +- If they use anomaly detection as a *safety net* on top of explicit rules ("our rules catch 80 %, Sherlock catches the 20 % we forgot"), they can get most of that 20 % through good rule discipline + the DQX profiler. The metric infrastructure in DQX (`dq_metrics` with `rule_set_fingerprint` provenance) is anomaly-detection-ready — a Z-score or forecast-residual layer on top is feasible as a customer-side notebook today. + +The roadmap honest answer: anomaly detection is conceptually on our roadmap, not committed. We'd rather not promise a date. + +### 11.8 "Soda has alerts to Slack. We need that." +Real gap. The pragmatic answer in Databricks today: + +- **`dq_validation_runs`** is a normal Delta table. Run a **Databricks SQL Alert** on `SELECT count(*) FROM dq_validation_runs WHERE status = 'FAILED' AND created_at > current_timestamp() - INTERVAL 1 HOUR` and route it to Slack/Teams/email via the standard SQL Alerts destination plumbing. +- Same for "run with > N error rows": `SELECT … WHERE error_rows > `. +- For richer routing (per-table owner, per-severity), a Lakeflow job downstream of `dq_metrics` is a few hours of work. + +Native alerting *inside* DQX Studio is a fair future ask; right now it's "use the platform's alerting layer." + +### 11.9 "Soda Cloud has incidents — do you?" +DQX Studio has the two pieces that matter for triage: + +- **Comments** on every run (a `dq_comments` thread, just like comments on a rule) +- **Review status** per run with an audit trail (the new dropdown we added — see [§3.6](#36-whats-review-status)) + +That covers "acknowledge / under investigation / resolved / false positive" with attribution and history. What it does not cover is: + +- Auto-creating an external ticket (Jira / ServiceNow) — that's a webhook we don't have today. +- A separate incident object that aggregates multiple runs of the same failing rule into one open issue. + +If the customer's DQ model is incident-driven (an on-call rotation triages failures across many checks), Soda has a more polished surface today. If their model is rule-owner-driven ("the team that owns the rule fixes the rule"), DQX is sufficient. + +### 11.10 "What about Soda Agent — they say data doesn't leave our network either?" +The Soda Agent is a containerised executor a customer hosts in their own infrastructure to run scans without the data passing through Soda's cloud. **The check definitions, metric results, and failed-row samples still travel to Soda Cloud** — that's how the UI works. The Agent improves the data path; it does not eliminate the SaaS plane. + +DQX Studio has no SaaS plane at all. Every byte — definitions, results, samples, audit logs — lives in the customer's Databricks account. + +### 11.11 "Pricing — how should we model the comparison?" +Approximate, not a quote: + +- **Soda Cloud**: per-dataset, per-check, or per-seat depending on the contract. Plus the Databricks warehouse compute that runs the scans. +- **DQX Studio**: Databricks Apps (small flat hourly), one small Lakebase instance, serverless job billed only when validating. All on the customer's existing Databricks bill. + +The honest framing: for a small estate (~50 monitored tables), Soda Cloud is often cost-comparable to DQX. For a large estate (hundreds to thousands of tables), DQX scales sub-linearly because there's no per-dataset SaaS fee — only compute, which the customer already accepts. Get the customer to share approximate counts before quoting. + +### 11.12 "Is Soda integration with DQX possible?" +There's no first-party integration. If a customer really wants both — Soda's anomaly detection + alerts plus DQX's UC integration and OBO authoring — the practical pattern is: + +- Run DQX as the authoring + execution layer (its strength) +- Use Soda Cloud against the **same source tables** with the anomaly-detection check types only (its strength) +- Don't double-write the explicit rules — pick one tool per rule + +This is unusual but real for customers who can absorb both bills and want best-of-both. Most pick one. + +### 11.13 The two-line elevator answer +If the customer asks you to summarise the whole comparison in two sentences: + +> **Soda is the right answer if you need multi-source coverage, anomaly detection, or built-in alerting today — and you're comfortable with metadata in their SaaS plane.** **DQX Studio is the right answer if you're Databricks-centric, want every byte to stay in your account, want OBO-level permission enforcement, and prefer rule-driven over anomaly-driven DQ.** + +Most Databricks-all-in customers land on DQX. Most heterogeneous customers (Snowflake + Databricks + Postgres) land on Soda. Neither answer is wrong. diff --git a/app/DEPLOYMENT.md b/app/DEPLOYMENT.md index 0688cf908..d55f12ead 100644 --- a/app/DEPLOYMENT.md +++ b/app/DEPLOYMENT.md @@ -8,7 +8,7 @@ Before you start, confirm you have **all** of the items below. The single most c ### Tooling -- **Databricks CLI** v0.268+ installed and authenticated against your workspace (`databricks auth login -p `). v0.268 is the minimum that supports `lifecycle.prevent_destroy` on bundle resources. +- **Databricks CLI** v1.4.0+ installed and authenticated against your workspace (`databricks auth login -p `). `make app-deploy` and `make app-bind` enforce this via a preflight `app-check-cli` step (`databricks --version`) and abort before building if the CLI is older. v1.4.0 is required because the one-button `postgres_roles` resource (used to provision a fresh Lakebase) is only accepted by CLI ≥ 1.4.0; `lifecycle.prevent_destroy` itself needs only v0.268+. - **`jq`** (used by the post-deploy grants script and the resource-bind helper) - **`make`** (drives the one-command deploy target) - **App build toolchain** — `make app-deploy` first runs `make app-build` to produce the wheel, which needs **uv**, **Node.js 18+** (provides `npm`; `brew install node` / nvm / [nodejs.org](https://nodejs.org/en/download)), **yarn** (`npm install -g yarn`), and **bun** (`curl -fsSL https://bun.sh/install | bash`). See [DEVELOPMENT.md → Prerequisites](DEVELOPMENT.md#prerequisites). @@ -74,13 +74,15 @@ Contact your workspace admin or enable it via the workspace settings if not alre ## Step 3: Stateful storage and destroy protection -DQX Studio's stateful resources — the two schemas (`dqx_studio`, `dqx_studio_tmp`), the wheels volume, and the Lakebase instance — are all declared as bundle resources in `app/databricks.yml`. Each one carries `lifecycle.prevent_destroy: true` (Databricks CLI 0.268+), which **blocks `databricks bundle destroy` from dropping the resource** and wiping the data. Use this command line to verify: +DQX Studio's stateful resources — the two schemas (`dqx_studio`, `dqx_studio_tmp`), the wheels volume, and the Lakebase instance — are all declared with `lifecycle.prevent_destroy: true` (Databricks CLI 0.268+), which **blocks `databricks bundle destroy` from dropping the resource** and wiping the data. + +The schemas and volume are declared at the **base level** in `app/databricks.yml`. The Lakebase instance is declared **inside the target** (`database_instances.lakebase` under `targets.dqx-app-demo`) so a target that doesn't want Lakebase can simply omit the block rather than carry a stub instance. Use: ```bash grep -A1 'lifecycle:' app/databricks.yml | head ``` -You'll see one `prevent_destroy: true` for each of: `schemas.main_schema`, `schemas.tmp_schema`, `volumes.wheels`, `database_instances.lakebase`. +You'll see `prevent_destroy: true` on `schemas.main_schema`, `schemas.tmp_schema`, `volumes.wheels` (base), and `database_instances.lakebase` (under the `dqx-app-demo` target). > **The app's `dqx_studio` Postgres schema** (inside the `databricks_postgres` admin database on the Lakebase instance) is created by the app at first start. It's stateful but lives below the resource layer DABs models, so `prevent_destroy` doesn't apply to it directly. The instance-level guard above is what protects it: as long as `database_instances.lakebase` survives, the schema and its tables survive. @@ -109,35 +111,25 @@ Add or update a deploy target. The minimum required variables are: - `dqx_service_principal_application_id` - `sql_warehouse_id` — picks the warehouse mode (see [Choosing a SQL warehouse mode](#choosing-a-sql-warehouse-mode) immediately below) -Everything else has a sensible default and can be overridden per target. Below is a **Mode A** (bundle-managed warehouse) starter. For **Mode B** (reuse an existing warehouse), see the next subsection. +Everything else has a sensible default and can be overridden per target. A minimal target skeleton looks like this: ```yaml targets: - dev: + my-target: workspace: profile: variables: catalog_name: dqx_service_principal_application_id: - # Mode A: chain the bundle-managed warehouse's runtime ID back into - # the variable the app reads. See "Choosing a SQL warehouse mode". - sql_warehouse_id: ${resources.sql_warehouses.dqx_sql_warehouse.id} - resources: - sql_warehouses: - dqx_sql_warehouse: - name: ${var.sql_warehouse_name} # default: dqx-studio-sql-warehouse - cluster_size: "X-Small" - enable_serverless_compute: true - max_num_clusters: 1 - min_num_clusters: 1 - auto_stop_mins: 10 - permissions: - - group_name: "users" - level: "CAN_USE" + sql_warehouse_id: presets: trigger_pause_status: PAUSED ``` +> The repo ships a single canonical target, **`dqx-app-demo`** (Mode B, marked `default: true`). + +The resource-specific blocks (the warehouse `sql_warehouses` block, the Lakebase `database_instances` block, and any app-resources overrides) are described next in [Choosing a SQL warehouse mode](#choosing-a-sql-warehouse-mode) and assembled per use-case in [Deployment scenarios](#deployment-scenarios-warehouse-and-lakebase-combinations). + ### Choosing a SQL warehouse mode The bundle does **not** declare a SQL warehouse at the top level on purpose — each target picks one of two patterns. Pick one per target; you cannot mix. @@ -148,7 +140,7 @@ Use when you want a fresh, dedicated warehouse provisioned and owned by the bund ```yaml targets: - bdf-vo: + my-mode-a-target: workspace: host: https://.cloud.databricks.com/ variables: @@ -177,57 +169,131 @@ Pros: zero pre-existing infra needed; warehouse spec is version-controlled; `bun Use when you want the app to point at a warehouse that already exists — typically a shared workspace warehouse used by multiple apps or teams. The bundle never creates, mutates, or destroys the warehouse; it only records "the app uses warehouse ``". You provide just the ID. +This is the pattern the repo's canonical `dqx-app-demo` target uses. Because the **base** app definition already binds the warehouse via `id: ${var.sql_warehouse_id}`, switching to an existing warehouse needs **only the literal ID in `variables`** — no `apps.dqx-studio.resources` override: + ```yaml targets: - kaizen-app: + dqx-app-demo: + default: true workspace: - profile: kaizen-app + profile: dqx-app-demo variables: - catalog_name: dqx_studio_demo + catalog_name: dqx_service_principal_application_id: - sql_warehouse_id: "ddafa92ede1ef3f4" # ← just the existing warehouse's ID - resources: - apps: - dqx-studio: - # The app's ``resources`` list is wholesale-replaced on target - # override, so all three entries (warehouse, job, lakebase) must - # be repeated even though only the warehouse changes. - resources: - - name: "dqx-sql-warehouse" - description: "Shared workspace SQL warehouse (not bundle-managed)" - sql_warehouse: - id: ${var.sql_warehouse_id} - permission: "CAN_USE" - - name: "dqx-task-runner-job" - job: - id: ${resources.jobs.dqx_task_runner.id} - permission: "CAN_MANAGE" - - name: "dqx-lakebase" - database: - database_name: ${var.lakebase_database_name} - instance_name: ${resources.database_instances.lakebase.name} - permission: "CAN_CONNECT_AND_CREATE" + sql_warehouse_id: "" # ← just the existing warehouse's ID + presets: + trigger_pause_status: PAUSED ``` -Pros: safe for shared warehouses (`bundle destroy` can't touch them); no wasted compute; warehouse settings are managed by whoever owns the warehouse, not by this bundle. Cons: requires the warehouse to exist already; needs the `apps.dqx-studio.resources:` list override (entire list, see [the list-replacement gotcha](#how-permissions-get-granted-on-each-mode)). +Pros: safe for shared warehouses (`bundle destroy` can't touch them); no wasted compute; warehouse settings are managed by whoever owns the warehouse, not by this bundle; the target stays tiny (just `variables`). Cons: requires the warehouse to exist already, and `CAN_USE` is granted post-deploy by `post_deploy_grants.sh` rather than by Terraform (see [How permissions get granted](#how-permissions-get-granted-on-each-mode)). + +> **When you _do_ need the `apps.dqx-studio.resources:` override:** only when a target changes the *shape* of a binding rather than a value it reads from a variable — e.g. dropping or repointing the Lakebase binding (see [Deployment scenarios](#deployment-scenarios-warehouse-and-lakebase-combinations) below). DABs replaces that list wholesale on override, so you must repeat the remaining entries. Plain Mode B (existing warehouse) does **not** trigger this because the base binding already reads `${var.sql_warehouse_id}`. #### Side-by-side comparison | | Mode A (bundle-managed) | Mode B (external) | |---|---|---| -| **YAML in target** | `sql_warehouses.dqx_sql_warehouse: { name, cluster_size, ... permissions }` + `sql_warehouse_id: ${resources...id}` | just `sql_warehouse_id: ""` + override `apps.dqx-studio.resources:` list | +| **YAML in target** | `sql_warehouses.dqx_sql_warehouse: { name, cluster_size, ... permissions }` + `sql_warehouse_id: ${resources...id}` | just `sql_warehouse_id: ""` (the base app binding already reads this var — no list override needed) | | **Who owns the warehouse** | The bundle (Terraform state) | Whoever created it (out of band) | | **`bundle deploy` effect** | Creates / mutates the warehouse to match spec | Never touches the warehouse | | **`bundle destroy` effect** | DELETES the warehouse | Leaves the warehouse intact | | **Permissions on the warehouse** | Granted via the `permissions:` block under `sql_warehouses.dqx_sql_warehouse` (Terraform sets them) | Granted post-deploy by `post_deploy_grants.sh` calling the warehouses permissions API (the bundle doesn't own the warehouse, so `databricks_permissions` can't be used) | | **Good for** | Dev/sandbox targets, per-environment dedicated warehouses | Production with a shared warehouse, vending-machine workspaces with a pre-existing warehouse, demo workspaces where you don't want a second warehouse to appear | -| **Example target in the repo** | `bdf-vo` | `kaizen-app` | +| **Example target in the repo** | _(none — illustrative only)_ | `dqx-app-demo` (the only shipped target) | #### How permissions get granted on each mode - **Mode A**: the `permissions:` block under `sql_warehouses.dqx_sql_warehouse` is applied by Terraform during `bundle deploy`. To grant additional principals, edit the YAML and redeploy. - **Mode B**: `post_deploy_grants.sh` reads `sql_warehouse_id` from the validated bundle config and, when it resolves to a non-empty literal ID (i.e. the bundle isn't managing the warehouse), PATCHes the warehouses permissions API to add `CAN_USE` for the app SP, the job SP, and the workspace `users` group. The PATCH is additive — existing grants on the warehouse are preserved. (In Mode A the variable resolves to `${resources.sql_warehouses.dqx_sql_warehouse.id}` which the script treats as bundle-owned and skips this step, since Terraform already covered it via the `permissions:` block.) If you need different principals, edit the relevant block in `app/scripts/post_deploy_grants.sh`. +### Deployment scenarios: warehouse and Lakebase combinations + +DQX Studio depends on three workspace resources, and you can independently choose whether the **bundle provisions** each one or whether you **bring your own** (BYO) existing resource: + +- **Catalog — always BYO.** The bundle never creates a catalog; it only creates the two schemas (`dqx_studio`, `dqx_studio_tmp`) and the `wheels` volume *inside* a catalog you name. So `catalog_name` must point at an existing catalog in every scenario (you need `USE CATALOG` + `CREATE SCHEMA` on it — see [Required permissions](#required-permissions) and [The catalog must already exist](#the-catalog-must-already-exist)). +- **SQL warehouse — 2 modes.** Bundle-managed (**Mode A**) or BYO existing (**Mode B**). See [Choosing a SQL warehouse mode](#choosing-a-sql-warehouse-mode). +- **Lakebase — 3 modes.** Bundle-managed (bundle CREATEs the instance), BYO existing (adopt an existing instance), or none (OLTP falls back to Delta). + +Because the warehouse has 2 modes and Lakebase has 3, there are **6 combinations**. The catalog is the same (existing) in all of them. + +#### The 5 building blocks + +Compose a target from these fragments. The **base** `resources` block already binds the warehouse via `${var.sql_warehouse_id}` and the Lakebase via `${resources.database_instances.lakebase.name}`, so most modes are just a variable change. + +**Catalog (required in every scenario):** +```yaml + variables: + catalog_name: # must already exist + dqx_service_principal_application_id: +``` + +**Warehouse — Mode A (bundle-managed):** set `sql_warehouse_id: ${resources.sql_warehouses.dqx_sql_warehouse.id}` and add the `sql_warehouses` block — full YAML in [Mode A above](#mode-a--bundle-managed-the-bundle-creates-a-dedicated-warehouse). + +**Warehouse — Mode B (BYO existing):** set `sql_warehouse_id: ""` — no extra blocks (the base app binding reads the variable). + +**Lakebase — managed (bundle creates a fresh instance):** keep the base `database_instances.lakebase` block and name the new instance. On a fresh workspace, also provision the app SP's CREATE-schema privilege (one-button `postgres_roles` or a manual grant — see the [one-button Postgres note](#step-4-configure-databricksyml)). +```yaml + variables: + lakebase_instance_name: +``` + +**Lakebase — BYO existing (adopt via bind):** keep the base block, name the existing instance, and run `make app-bind` once so the bundle adopts it instead of trying to CREATE it. +```yaml + variables: + lakebase_instance_name: +``` + +**Lakebase — none (Delta fallback):** set the sentinel `"-"`, remove the base `database_instances.lakebase` resource, and drop the `dqx-lakebase` app binding (which means overriding the app-resources list — DABs replaces it wholesale, so repeat the remaining two entries). +```yaml + variables: + lakebase_instance_name: "-" + resources: + apps: + dqx-studio: + resources: + - name: "dqx-sql-warehouse" + sql_warehouse: { id: "${var.sql_warehouse_id}", permission: "CAN_USE" } + - name: "dqx-task-runner-job" + job: { id: "${resources.jobs.dqx_task_runner.id}", permission: "CAN_MANAGE" } + # ...and delete the database_instances.lakebase block from the base resources. +``` + +#### The 6 combinations + +| # | Warehouse | Lakebase | Variables | Extra blocks | `make app-bind` first? | +|---|---|---|---|---|---| +| 1 | Managed (A) | Managed | `sql_warehouse_id: ${resources…id}`, `lakebase_instance_name: ` | add `sql_warehouses` | No (fresh) / Yes if storage pre-exists | +| 2 | Managed (A) | BYO existing | as #1 but `lakebase_instance_name: ` | add `sql_warehouses` | **Yes** (adopts the instance) | +| 3 | Managed (A) | None | `sql_warehouse_id: ${resources…id}`, `lakebase_instance_name: "-"` | add `sql_warehouses`; drop `dqx-lakebase` binding; remove base `database_instances` | No / Yes if storage pre-exists | +| 4 | BYO (B) | Managed | `sql_warehouse_id: ""`, `lakebase_instance_name: ` | none | No (fresh) / Yes if storage pre-exists | +| 5 | BYO (B) | BYO existing | `sql_warehouse_id: ""`, `lakebase_instance_name: ` | none | **Yes** — **this is the canonical `dqx-app-demo` target** | +| 6 | BYO (B) | None | `sql_warehouse_id: ""`, `lakebase_instance_name: "-"` | drop `dqx-lakebase` binding; remove base `database_instances` | No / Yes if storage pre-exists | + +> **When is `make app-bind` needed?** Whenever any stateful resource the bundle would CREATE *already exists* (the two schemas, the wheels volume, or — for the managed/BYO Lakebase modes — the instance). Adopting an existing Lakebase instance (scenarios #2 and #5) **always** requires it; for a brand-new workspace where nothing exists yet, skip bind and let `make app-deploy` create everything. `make app-bind` is idempotent, so running it when nothing needs adopting is a safe no-op. See [Migrating an existing workspace](#migrating-an-existing-workspace). + +#### Worked example: the canonical `dqx-app-demo` target + +**Scenario #5 — BYO warehouse + BYO Lakebase.** Everything already exists; the target is just `variables` (compose the other scenarios from the [building blocks](#the-5-building-blocks) above and the [combination table](#the-6-combinations)): + +```yaml +targets: + dqx-app-demo: + default: true + workspace: + profile: + variables: + catalog_name: + dqx_service_principal_application_id: + sql_warehouse_id: "" + lakebase_instance_name: + presets: + trigger_pause_status: PAUSED +``` +```bash +make app-bind PROFILE= TARGET=dqx-app-demo # one-time adoption +make app-deploy PROFILE= TARGET=dqx-app-demo +``` + ### Variable reference All target-level variables, their defaults, and what they control: @@ -236,7 +302,7 @@ All target-level variables, their defaults, and what they control: |---|---|---|---| | `catalog_name` | `dqx` | **Yes** | Unity Catalog catalog where schemas and the wheels volume are created. **Must already exist** — the bundle does not create the catalog itself. | | `dqx_service_principal_application_id` | `00000000-…` | **Yes** | Application ID of the service principal that runs the task-runner job. Created in [Step 1](#step-1-create-a-service-principal). The placeholder default fails validation. | -| `admin_group` | `admins` | No | Workspace group whose members get the in-app `ADMIN` role unconditionally (bootstrap admin path). The default `admins` is the built-in workspace admins group — every workspace admin becomes a DQX admin automatically. Override with a dedicated group (e.g. `dqx-admins-prod`) for narrower bootstrap access. Additional roles are assigned at runtime via the in-app Role Management UI. | +| `admin_group` | `proj_dbw_dev_dg_admins-data_ug` (bundle) / unset (local Python) | Yes for prod | Workspace group whose members get the in-app `ADMIN` role unconditionally (bootstrap admin path). The bundle ships with a non-production placeholder — override per target with your real admin group (e.g. `dqx-admins-prod`). Locally, `AppConfig` defaults to `None` and skips bootstrap admin assignment; set `DQX_ADMIN_GROUP` in your shell if you need it for local testing. Additional roles are assigned at runtime via the in-app Role Management UI. | | `app_name` | `dqx-studio` | No | Deployed Databricks App name. Override per target (e.g. `dqx-studio-dev`, `dqx-studio-prod`) when deploying multiple targets to the same workspace, or for personal sandboxes. | | `sql_warehouse_id` | `""` (empty) | **Yes** (every target must set it) | ID of the SQL warehouse the app queries. Two ways to populate it: chain it to a bundle-managed warehouse (Mode A: `sql_warehouse_id: ${resources.sql_warehouses.dqx_sql_warehouse.id}`) or point at an existing warehouse (Mode B: `sql_warehouse_id: ""`). See [Choosing a SQL warehouse mode](#choosing-a-sql-warehouse-mode). | | `sql_warehouse_name` | `dqx-studio-sql-warehouse` | Mode A only | Name passed to Terraform when CREATING the bundle-managed warehouse. **Ignored in Mode B** (the warehouse already exists with its own name). Override per target to avoid duplicates in shared workspaces. | @@ -334,6 +400,24 @@ GRANT USE SCHEMA, CREATE TABLE ON SCHEMA .dqx_studio_tmp TO `account us ``` > **Lakebase grants are handled differently.** When Lakebase is enabled, the bundle binds the database to the app via a `database` resource block (`permission: CAN_CONNECT_AND_CREATE`). DABs translates that into the equivalent Postgres role grants automatically — there is no separate SQL to run. The first time the app connects, `PgMigrationRunner` creates its own schema and tables inside the Lakebase database. +> +> **One-button Postgres (CLI ≥ 1.4.0).** `PgMigrationRunner` does `CREATE SCHEMA dqx_studio` inside the system `databricks_postgres` database, and `CREATE` there is owned by `databricks_superuser`. On some workspaces the `CAN_CONNECT_AND_CREATE` binding alone doesn't confer that, and the schema-create fails with a Postgres permission error until someone grants it by hand. For a **fresh workspace**, add a [`postgres_roles`](https://github.com/databricks/cli/pull/5467) block to your target that makes the app SP a member of `DATABRICKS_SUPERUSER`, so a single `bundle deploy` provisions the instance, the app, *and* the CREATE-schema privilege — no follow-up `psql`/console step: + +```yaml +targets: + my-fresh-target: + postgres_roles: + app_sp: + parent: "projects//branches/" + role_id: "sp-${resources.apps.dqx-studio.service_principal_client_id}" + postgres_role: ${resources.apps.dqx-studio.service_principal_client_id} + identity_type: SERVICE_PRINCIPAL + auth_method: LAKEBASE_OAUTH_V1 + membership_roles: + - DATABRICKS_SUPERUSER +``` + +This requires **Databricks CLI v1.4.0 or newer** (`databricks --version`); on older CLIs the `postgres_roles` field is rejected at `bundle validate`, so either upgrade the CLI or delete the block and grant `CREATE ON DATABASE databricks_postgres` to the app SP once manually. The shipped `dqx-app-demo` target intentionally **omits** this block because its app SP's Postgres role already exists (created out of band on an earlier deploy), so a declarative create would conflict; the pre-existing role already confers the needed CREATE-schema access. To grant app access to end users, go to **Apps → `` → Permissions** and assign `Can Use`. Replace `` with the value of `app_name` configured for your target (default `dqx-studio`). @@ -472,8 +556,9 @@ The app deliberately refuses to start when Lakebase is configured (`DQX_LAKEBASE ``` If the instance is missing, re-run `databricks bundle deploy`. If the instance is there but the state is `STARTING` / `UPDATING`, wait for it to reach `AVAILABLE` and the next restart will succeed. 2. Confirm the app SP has `CAN_CONNECT_AND_CREATE` on the bound logical database. Check the bundle's `database` resource block under `resources.apps.dqx-studio.resources` (it should bind `database_name: ${var.lakebase_database_name}`) and redeploy. -3. Confirm OAuth token issuance is healthy — Lakebase tokens currently expire after one hour; a misconfigured OAuth integration or revoked SP credential will surface here. -4. If you intentionally want to run on Delta only (no Lakebase), redeploy with `DQX_LAKEBASE_INSTANCE_NAME=""` (or remove the override from your target in `databricks.yml`). The app will start in legacy UC-only mode and OLTP tables will live on Delta. +3. If the failure is specifically a Postgres `permission denied for database databricks_postgres` (or `permission denied to create schema`), the app SP can connect but lacks `CREATE` on the system `databricks_postgres` database — that privilege is owned by `databricks_superuser`. Either deploy the `postgres_roles` block (CLI ≥ 1.4.0; see the [Lakebase grants note](#step-4-configure-databricksyml) above) so the bundle grants `DATABRICKS_SUPERUSER` membership, or run a one-time `GRANT CREATE ON DATABASE databricks_postgres TO ""` against the Lakebase instance. +4. Confirm OAuth token issuance is healthy — Lakebase tokens currently expire after one hour; a misconfigured OAuth integration or revoked SP credential will surface here. +5. If you intentionally want to run on Delta only (no Lakebase), redeploy with `DQX_LAKEBASE_INSTANCE_NAME=""` (or remove the override from your target in `databricks.yml`). The app will start in legacy UC-only mode and OLTP tables will live on Delta. **`databricks bundle deploy` fails with `"already exists"` / `"Instance name is not unique"` on the first deploy of a target:** The schemas, volume, or Lakebase instance were created out-of-band before this version of the bundle (e.g. by the older bootstrap script). Run the bind step once per target to adopt them into bundle management: @@ -490,3 +575,27 @@ This is the safety guard doing its job — see [Step 3](#step-3-stateful-storage **Lakebase queries time out / app logs show pool exhaustion:** Bump `lakebase_capacity` from `CU_1` to `CU_2` (or higher) in `databricks.yml` and redeploy. You can also raise `DQX_LAKEBASE_POOL_MAX_SIZE` (default 10) on the app's environment if many concurrent requests are hitting the OLTP path. + +## Insights dashboard + +The bundle ships a starter AI/BI dashboard (`dashboards/dqx_quality_overview.lvdash.json`) declared as `resources.dashboards.dqx_quality_overview` in `databricks.yml`. It's automatically created on deploy and pinned to the app's **Insights** page via the `DQX_DEFAULT_DASHBOARD_ID` env var, so the page works out-of-the-box. + +**What you get**: a four-row layout with KPI counters (total runs, monitored tables, total errors, pass rate), trend charts (runs over time by status; errors & warnings over time), drilldowns (top failing tables; quarantined rows over time), and a recent-runs table. + +**Customising the starter**: open it in **Databricks → AI/BI Dashboards**, add or change widgets, and save. The iframe inside DQX Studio picks up changes immediately — no redeploy needed. You can also point the Insights page at a completely different dashboard via **Configuration → Insights dashboard**; clearing that override reverts to the starter. + +**Query identity**: the dashboard is configured with `embed_credentials: true`, so queries run as the bundle deployer rather than the iframe viewer. This is deliberate — the bundle only grants `USE CATALOG` (not table-level `SELECT`) to `account users`, keeping `dq_quarantine_records` (potentially PII row payloads) off the workspace UC surface. The widgets in the starter only expose aggregated counts and run metadata, so deployer-credentialed queries don't leak anything a viewer couldn't already see in the Runs History page. To switch to viewer-credentials, flip `embed_credentials` to `false` and grant `SELECT` on the DQX tables to the audience you want to expose. + +`scripts/post_deploy_grants.sh` automatically grants the deployer `USE SCHEMA` + `SELECT ON SCHEMA .` after every `make app-deploy`, so the dashboard works end-to-end without manual UC plumbing. It resolves the deployer identity from `databricks current-user me` so it works for both human deploys (grants the email) and SP-based deploys (grants the application ID). + +**One operational caveat**: the "deployer" identity is whoever ran `databricks bundle deploy` (the human or service principal authenticated to the workspace at deploy time). If that identity later loses access to the DQX tables, dashboard tiles will fail to render until someone with access redeploys. For production, deploy the bundle as a stable service principal so the dashboard identity doesn't follow individual humans. + +## Run review status + +DQX Studio lets reviewers attach a per-run **review status** (e.g. *Pending review*, *Acknowledged*, *Resolved*, *False positive*) to each validation run from the expanded row on the **Runs History** page. The same value is filterable from the toolbar so a business owner can ask "what's still pending?" in one click. + +- **Configurable catalogue** — admins manage the list of allowed values (label, description, colour) under **Configuration → Run review statuses**. Exactly one entry must be marked **Default**; that value is what unreviewed runs surface virtually (no row is written until someone explicitly reviews). The backend enforces the single-default invariant on save. +- **Audit trail** — every change appends to `dq_run_review_status_history`, surfaced as an "Activity" timeline inside the review-status panel. The current value lives in `dq_run_review_status` (one row per reviewed run). +- **Storage** — both tables are OLTP-shaped (single-key lookups, frequent mutation), so they live in Lakebase when it's enabled and fall back to Delta otherwise via the same `oltp_fallback` plumbing used by comments and role mappings. No extra deployment configuration is needed. +- **Permissions** — any authenticated app user can set or change a review status, mirroring how comments work. Only admins can edit the catalogue itself. + diff --git a/app/DEVELOPMENT.md b/app/DEVELOPMENT.md index 2e314d90e..2943c3210 100644 --- a/app/DEVELOPMENT.md +++ b/app/DEVELOPMENT.md @@ -49,7 +49,7 @@ DQX_CATALOG=dqx # Unity Catalog catalog name DQX_SCHEMA=dqx_studio # schema inside the catalog DQX_JOB_ID= # required for profiler/dry-run DQX_WHEELS_VOLUME=/Volumes/dqx/dqx_studio/wheels # UC volume path; auto-set by DABs in production -DQX_ADMIN_GROUP=admins # workspace group granted bootstrap Admin access +DQX_ADMIN_GROUP=admins # workspace group granted bootstrap Admin access; AppConfig default is unset (no bootstrap admin locally) — set this to your test admin group # Lakebase (optional — leave DQX_LAKEBASE_INSTANCE_NAME empty to run OLTP tables on Delta locally) DQX_LAKEBASE_INSTANCE_NAME= # e.g. dqx-studio-lakebase; empty = Delta-only mode @@ -70,6 +70,18 @@ DQX_LAKEBASE_TOKEN_REFRESH_MINUTES=50 # OAuth token refresh cadence (token > **Lakebase locally:** The same OAuth token-refresh logic that runs in production also runs locally. The app authenticates as your CLI user (via `databricks-sdk` default auth chain), so your CLI principal must have `CAN_CONNECT_AND_CREATE` on the Lakebase database. Easiest path: run the bundle once against your dev workspace so the bundle's `database` resource binds the permissions, then point `DQX_LAKEBASE_INSTANCE_NAME` at the deployed instance. +### Bring your own SQL warehouse, catalog, and Lakebase + +Local dev **never provisions** anything — it always points at resources that already exist (created by a `bundle deploy` or by hand). So the production "bundle-managed vs. bring-your-own" choice collapses locally to "which value do I put in `app/.env`": + +| Resource | Local knob | Notes | +|---|---|---| +| **SQL warehouse** | `DATABRICKS_WAREHOUSE_ID=` | Any warehouse you have `CAN_USE` on. Required for queries, profiling, and dry-runs. | +| **Catalog** | `DQX_CATALOG=` (+ `DQX_SCHEMA`, `DQX_TMP_SCHEMA`) | The catalog and schemas must already exist; local dev does **not** create them. You need `USE CATALOG` + `USE SCHEMA` (+ `SELECT` to profile tables). | +| **Lakebase** | `DQX_LAKEBASE_INSTANCE_NAME=` | Point at an existing instance you can `CAN_CONNECT_AND_CREATE` on. **Leave it empty** to skip Lakebase and run OLTP tables on Delta — the simplest local setup. | + +In production these are configurable per deploy target — the bundle can provision the warehouse and Lakebase for you, or you can bring existing ones, giving **6 warehouse × Lakebase combinations** (the catalog is always pre-existing). See [DEPLOYMENT.md → Deployment scenarios: warehouse and Lakebase combinations](DEPLOYMENT.md#deployment-scenarios-warehouse-and-lakebase-combinations). The fastest way to get a matching warehouse + catalog + Lakebase for local dev is to run `make app-deploy` once against a dev workspace, then copy the resulting IDs/names into `app/.env`. + ## 2. Install Dependencies From the **project root**: diff --git a/app/README.md b/app/README.md index d66d83e21..e4f2211b9 100644 --- a/app/README.md +++ b/app/README.md @@ -137,7 +137,7 @@ The app aligns with the [DQX Summary Metrics spec](https://github.com/databricks **Read path.** `GET /api/v1/metrics/{table_fqn}` joins `dq_metrics` to `dq_validation_runs` on `run_id` and pivots the long-format rows back into the wide-format `MetricSnapshotOut` the existing UI consumes — the chart and table components keep working unchanged. New optional fields (`check_metrics`, `custom_metrics`, `rule_set_fingerprint`, `error_row_count`, `warning_row_count`) are exposed for future UI surfaces. -**Quarantine for SQL / cross-table rules.** Cross-table SQL checks now persist their full violation set to `dq_quarantine_records` (with a `{: "SQL check violation"}` synthetic `errors` payload to match the shape DQX produces for column checks), capped at `_SQL_QUARANTINE_MAX_ROWS=100_000` to bound storage on runaway rules whose violation set is the entire joined dataset. The true violation count remains accurate in `dq_metrics.error_row_count` even when truncation kicks in. The runs UI's full CSV/Excel export (which reads from `dq_quarantine_records`) now works for SQL checks; previously they only had the 10-row `sample_invalid_json` fallback and 99 %+ of violations were lost. +**Quarantine for SQL / cross-table rules.** Cross-table SQL checks now persist their full violation set to `dq_quarantine_records` (with a `[{"name": "", "message": "SQL check violation"}]` synthetic `errors` payload that mirrors DQX's public `dq_result_item_schema` — list-of-structs, same shape row-level checks produce — so the Pydantic `QuarantineRecordOut.errors: list[Any]` validates cleanly and the UI's per-row Errors column renders without special-casing SQL checks), capped at `_SQL_QUARANTINE_MAX_ROWS=100_000` to bound storage on runaway rules whose violation set is the entire joined dataset. The true violation count remains accurate in `dq_metrics.error_row_count` even when truncation kicks in. The runs UI's full CSV/Excel export (which reads from `dq_quarantine_records`) now works for SQL checks; previously they only had the 10-row `sample_invalid_json` fallback and 99 %+ of violations were lost. Legacy rows written with the old `{: }` dict shape are coerced to the new list shape by `quarantine._row_to_record` so historical data continues to display. ## Stack diff --git a/app/app.yml b/app/app.yml index 1f0334b18..87b40cc09 100644 --- a/app/app.yml +++ b/app/app.yml @@ -1,2 +1,2 @@ command: - ["uvicorn", "databricks_labs_dqx_app.backend.app:app", "--workers", "2"] + ["uvicorn", "databricks_labs_dqx_app.backend.app:app", "--workers", "1"] diff --git a/app/dashboards/dqx_quality_overview.lvdash.json b/app/dashboards/dqx_quality_overview.lvdash.json new file mode 100644 index 000000000..648de0c61 --- /dev/null +++ b/app/dashboards/dqx_quality_overview.lvdash.json @@ -0,0 +1,625 @@ +{ + "datasets": [ + { + "name": "ds_runs", + "displayName": "validation_runs (30 days)", + "queryLines": [ + "SELECT\n", + " run_id,\n", + " source_table_fqn,\n", + " status,\n", + " run_type,\n", + " total_rows,\n", + " valid_rows,\n", + " invalid_rows,\n", + " error_rows,\n", + " warning_rows,\n", + " requesting_user,\n", + " created_at\n", + "FROM dq_validation_runs\n", + "WHERE created_at >= current_date() - INTERVAL 30 DAYS" + ] + }, + { + "name": "ds_quarantine", + "displayName": "quarantine_records (30 days)", + "queryLines": [ + "SELECT\n", + " quarantine_id,\n", + " run_id,\n", + " source_table_fqn,\n", + " requesting_user,\n", + " created_at\n", + "FROM dq_quarantine_records\n", + "WHERE created_at >= current_date() - INTERVAL 30 DAYS" + ] + } + ], + "pages": [ + { + "name": "overview", + "displayName": "Overview", + "layout": [ + { + "widget": { + "name": "kpi_total_runs", + "queries": [ + { + "name": "main_query", + "query": { + "datasetName": "ds_runs", + "fields": [ + { + "name": "total_runs", + "expression": "COUNT(`run_id`)" + } + ], + "disaggregated": false + } + } + ], + "spec": { + "version": 2, + "widgetType": "counter", + "encodings": { + "value": { + "fieldName": "total_runs", + "displayName": "Total runs" + } + }, + "frame": { + "showTitle": true, + "title": "Total runs (30d)" + } + } + }, + "position": { + "x": 0, + "y": 0, + "width": 3, + "height": 3 + } + }, + { + "widget": { + "name": "kpi_monitored_tables", + "queries": [ + { + "name": "main_query", + "query": { + "datasetName": "ds_runs", + "fields": [ + { + "name": "monitored_tables", + "expression": "COUNT(DISTINCT `source_table_fqn`)" + } + ], + "disaggregated": false + } + } + ], + "spec": { + "version": 2, + "widgetType": "counter", + "encodings": { + "value": { + "fieldName": "monitored_tables", + "displayName": "Tables" + } + }, + "frame": { + "showTitle": true, + "title": "Tables under monitoring" + } + } + }, + "position": { + "x": 3, + "y": 0, + "width": 3, + "height": 3 + } + }, + { + "widget": { + "name": "kpi_total_errors", + "queries": [ + { + "name": "main_query", + "query": { + "datasetName": "ds_runs", + "fields": [ + { + "name": "total_errors", + "expression": "COALESCE(SUM(`error_rows`), 0)" + } + ], + "disaggregated": false + } + } + ], + "spec": { + "version": 2, + "widgetType": "counter", + "encodings": { + "value": { + "fieldName": "total_errors", + "displayName": "Error rows" + } + }, + "frame": { + "showTitle": true, + "title": "Total error rows (30d)" + } + } + }, + "position": { + "x": 6, + "y": 0, + "width": 3, + "height": 3 + } + }, + { + "widget": { + "name": "kpi_pass_rate", + "queries": [ + { + "name": "main_query", + "query": { + "datasetName": "ds_runs", + "fields": [ + { + "name": "pass_rate_pct", + "expression": "100.0 * SUM(`valid_rows`) / NULLIF(SUM(`total_rows`), 0)" + } + ], + "disaggregated": false + } + } + ], + "spec": { + "version": 2, + "widgetType": "counter", + "encodings": { + "value": { + "fieldName": "pass_rate_pct", + "displayName": "Pass rate", + "format": { + "type": "number-plain", + "decimalPlaces": { + "type": "exact", + "places": 1 + } + } + } + }, + "frame": { + "showTitle": true, + "title": "Average pass rate (%, 30d)" + } + } + }, + "position": { + "x": 9, + "y": 0, + "width": 3, + "height": 3 + } + }, + { + "widget": { + "name": "runs_over_time", + "queries": [ + { + "name": "main_query", + "query": { + "datasetName": "ds_runs", + "fields": [ + { + "name": "run_date", + "expression": "DATE_TRUNC('DAY', `created_at`)" + }, + { + "name": "status", + "expression": "`status`" + }, + { + "name": "run_count", + "expression": "COUNT(`run_id`)" + } + ], + "disaggregated": false + } + } + ], + "spec": { + "version": 3, + "widgetType": "bar", + "encodings": { + "x": { + "fieldName": "run_date", + "scale": { + "type": "temporal" + }, + "displayName": "Day", + "axis": { + "title": "Day" + } + }, + "y": { + "fieldName": "run_count", + "scale": { + "type": "quantitative" + }, + "displayName": "Runs", + "axis": { + "title": "Runs" + } + }, + "color": { + "fieldName": "status", + "scale": { + "type": "categorical" + }, + "displayName": "Status", + "legend": { + "title": "Status" + } + } + }, + "frame": { + "showTitle": true, + "title": "Runs over time by status" + } + } + }, + "position": { + "x": 0, + "y": 3, + "width": 6, + "height": 5 + } + }, + { + "widget": { + "name": "errors_warnings_over_time", + "queries": [ + { + "name": "main_query", + "query": { + "datasetName": "ds_runs", + "fields": [ + { + "name": "run_date", + "expression": "DATE_TRUNC('DAY', `created_at`)" + }, + { + "name": "error_rows_sum", + "expression": "COALESCE(SUM(`error_rows`), 0)" + }, + { + "name": "warning_rows_sum", + "expression": "COALESCE(SUM(`warning_rows`), 0)" + } + ], + "disaggregated": false + } + } + ], + "spec": { + "version": 3, + "widgetType": "line", + "encodings": { + "x": { + "fieldName": "run_date", + "scale": { + "type": "temporal" + }, + "displayName": "Day", + "axis": { + "title": "Day" + } + }, + "y": { + "fields": [ + { + "fieldName": "error_rows_sum", + "displayName": "Errors" + }, + { + "fieldName": "warning_rows_sum", + "displayName": "Warnings" + } + ], + "scale": { + "type": "quantitative" + }, + "axis": { + "title": "Rows" + } + } + }, + "frame": { + "showTitle": true, + "title": "Error & warning rows over time" + } + } + }, + "position": { + "x": 6, + "y": 3, + "width": 6, + "height": 5 + } + }, + { + "widget": { + "name": "top_failing_tables", + "queries": [ + { + "name": "main_query", + "query": { + "datasetName": "ds_runs", + "fields": [ + { + "name": "source_table_fqn", + "expression": "`source_table_fqn`" + }, + { + "name": "total_errors", + "expression": "COALESCE(SUM(`error_rows`), 0)" + } + ], + "disaggregated": false + } + } + ], + "spec": { + "version": 3, + "widgetType": "bar", + "encodings": { + "x": { + "fieldName": "total_errors", + "scale": { + "type": "quantitative" + }, + "axis": { + "title": "Error rows" + } + }, + "y": { + "fieldName": "source_table_fqn", + "scale": { + "type": "categorical", + "sort": { + "by": "x-reversed" + } + }, + "axis": { + "title": "Table" + } + } + }, + "frame": { + "showTitle": true, + "title": "Top failing tables (30d)" + } + } + }, + "position": { + "x": 0, + "y": 8, + "width": 6, + "height": 5 + } + }, + { + "widget": { + "name": "quarantine_volume", + "queries": [ + { + "name": "main_query", + "query": { + "datasetName": "ds_quarantine", + "fields": [ + { + "name": "quarantine_date", + "expression": "DATE_TRUNC('DAY', `created_at`)" + }, + { + "name": "quarantine_count", + "expression": "COUNT(`quarantine_id`)" + } + ], + "disaggregated": false + } + } + ], + "spec": { + "version": 3, + "widgetType": "area", + "encodings": { + "x": { + "fieldName": "quarantine_date", + "scale": { + "type": "temporal" + }, + "axis": { + "title": "Day" + } + }, + "y": { + "fieldName": "quarantine_count", + "scale": { + "type": "quantitative" + }, + "axis": { + "title": "Quarantined rows" + } + } + }, + "frame": { + "showTitle": true, + "title": "Quarantined rows over time" + } + } + }, + "position": { + "x": 6, + "y": 8, + "width": 6, + "height": 5 + } + }, + { + "widget": { + "name": "recent_runs_table", + "queries": [ + { + "name": "main_query", + "query": { + "datasetName": "ds_runs", + "fields": [ + { + "name": "created_at", + "expression": "`created_at`" + }, + { + "name": "source_table_fqn", + "expression": "`source_table_fqn`" + }, + { + "name": "status", + "expression": "`status`" + }, + { + "name": "run_type", + "expression": "`run_type`" + }, + { + "name": "total_rows", + "expression": "`total_rows`" + }, + { + "name": "error_rows", + "expression": "`error_rows`" + }, + { + "name": "warning_rows", + "expression": "`warning_rows`" + }, + { + "name": "requesting_user", + "expression": "`requesting_user`" + }, + { + "name": "run_id", + "expression": "`run_id`" + } + ], + "disaggregated": true + } + } + ], + "spec": { + "version": 1, + "widgetType": "table", + "encodings": { + "columns": [ + { + "fieldName": "created_at", + "displayName": "Time", + "type": "datetime", + "dateTimeFormat": "ll LTS (z)", + "order": 100 + }, + { + "fieldName": "source_table_fqn", + "displayName": "Table", + "order": 200 + }, + { + "fieldName": "status", + "displayName": "Status", + "order": 300 + }, + { + "fieldName": "run_type", + "displayName": "Type", + "order": 400 + }, + { + "fieldName": "total_rows", + "displayName": "Total", + "type": "integer", + "order": 500 + }, + { + "fieldName": "error_rows", + "displayName": "Errors", + "type": "integer", + "order": 600 + }, + { + "fieldName": "warning_rows", + "displayName": "Warnings", + "type": "integer", + "order": 700 + }, + { + "fieldName": "requesting_user", + "displayName": "Requested by", + "order": 800 + }, + { + "fieldName": "run_id", + "displayName": "Run ID", + "order": 900, + "visible": false + } + ] + }, + "invisibleColumns": [], + "allowHTMLByDefault": false, + "itemsPerPage": 25, + "paginationSize": "default", + "condensed": true, + "withRowNumber": false, + "frame": { + "showTitle": true, + "title": "Recent runs" + } + } + }, + "position": { + "x": 0, + "y": 13, + "width": 12, + "height": 6 + } + }, + { + "widget": { + "name": "footer_notes", + "spec": { + "version": 1, + "widgetType": "markdown", + "encodings": {}, + "frame": { + "showTitle": false + }, + "text": "### Customise this dashboard\n\nThis starter ships with DQX Studio and is fully editable. Open the dashboard in **Databricks → AI/BI Dashboards** to add widgets, change filters, or drill into specific tables — your changes appear here immediately on the **Insights** page.\n\n**Underlying tables**\n\n- `dq_validation_runs` — one row per validation run (status, counts, requester, timing)\n- `dq_quarantine_records` — per-row failures (full row payload, errors, warnings)\n- `dq_metrics` — long-format DQX observer metrics (pivot on `metric_name`)\n- `dq_profiling_results` — schema profiles for tables under monitoring\n\nAll four live in the schema configured at deploy time (`DQX_CATALOG` / `DQX_SCHEMA`)." + } + }, + "position": { + "x": 0, + "y": 19, + "width": 12, + "height": 3 + } + } + ] + } + ] +} diff --git a/app/databricks.yml b/app/databricks.yml index bf3292ec1..c1120d523 100644 --- a/app/databricks.yml +++ b/app/databricks.yml @@ -4,101 +4,63 @@ bundle: variables: catalog_name: - description: "Name of the catalog for DQX Studio storage" + description: "Catalog for DQX Studio storage" default: "dqx" schema_name: - description: "Name of the schema for DQX Studio storage" + description: "Main schema for DQX Studio storage" default: "dqx_studio" tmp_schema_name: - description: "Name of the schema for temporary views and intermediate processing" + description: "Temp schema for per-user dry-run views" default: "dqx_studio_tmp" admin_group: - description: "Databricks workspace group name for bootstrap Admin access" + description: "Workspace group bootstrapped with the Admin role" default: "admins" app_name: - description: "Name of the deployed Databricks App (must be unique per workspace)" + description: "Databricks App name (unique per workspace)" default: "dqx-studio" sql_warehouse_name: - description: "Name of the deployed SQL warehouse for DQX Studio queries (must be unique per workspace)" + description: "Warehouse name to create when a target opts into a bundle-managed warehouse (Mode A only)." default: "dqx-studio-sql-warehouse" sql_warehouse_id: description: > - ID of the SQL warehouse the app queries. Every target MUST set - this. Two ways to populate it; see DEPLOYMENT.md - "Choosing a SQL warehouse mode" for the side-by-side comparison. - - Mode A (bundle-managed, see ``bdf-vo``): set to - ``${resources.sql_warehouses.dqx_sql_warehouse.id}`` AND declare a - ``sql_warehouses.dqx_sql_warehouse`` block under the target's - ``resources:``. The bundle CREATES the warehouse; ``bundle destroy`` - DELETES it. - - Mode B (external / reuse, see ``kaizen-app``): set to a literal - warehouse ID. The bundle never creates, mutates, or destroys the - warehouse. ``post_deploy_grants.sh`` GRANTs ``CAN_USE`` to the app - SP, job SP, and the ``users`` workspace group via the warehouses - permissions API (the bundle doesn't own the warehouse, so - ``databricks_permissions`` can't be used). - - Default is intentionally empty so a target that forgets to set it - fails fast at deploy time rather than silently picking the wrong - warehouse. + ID of the SQL warehouse the app uses. Every target MUST set this. + Mode A (bundle-managed): ``${resources.sql_warehouses.dqx_sql_warehouse.id}`` AND + declare ``sql_warehouses.dqx_sql_warehouse`` locally (illustrative only — no + in-repo target uses this). + Mode B (external / reuse): literal warehouse ID, no resource block; CAN_USE is + granted by ``post_deploy_grants.sh`` (worked example: the ``dqx-app-demo`` target). + Default is empty so a target that forgets to set it fails fast at deploy time. + See DEPLOYMENT.md → "Choosing a SQL warehouse mode" for the full comparison. default: "" wheels_volume_name: - description: "Name of the UC volume for storing DQX wheel files" + description: "UC volume for DQX wheel files" default: "wheels" dqx_service_principal_application_id: - description: "Application ID of the service principal for DQX tasks" + description: "Application ID of the service principal that runs the task-runner job" default: "00000000-0000-0000-0000-000000000000" - # ------------------------------------------------------------------ - # Lakebase backend - # ------------------------------------------------------------------ - # The app stores its OLTP state (rules catalog, app settings, RBAC, - # comments, schedule configs, scheduler bookkeeping) in a Lakebase - # Postgres instance for sub-millisecond reads and to avoid SQL - # warehouse cold starts. Append-mostly observability tables - # (dq_validation_runs, dq_profiling_results, dq_metrics, - # dq_quarantine_records) stay in Delta — those are written by the - # Spark task runner and consumed by AI/BI dashboards. - # - # The Lakebase instance is declared as a bundle resource below with - # ``lifecycle.prevent_destroy: true``. New Lakebase instances are - # autoscaling by default (see - # https://docs.databricks.com/aws/en/oltp/upgrade-to-autoscaling). - # - # Connection target: ``databricks_postgres`` — the always-present - # admin database every Lakebase instance ships with. We don't create - # a per-app logical Postgres database because the only DAB resource - # that creates one (``database_catalogs``) also creates a Unity - # Catalog catalog and therefore requires ``CREATE CATALOG`` on the - # metastore — a permission most app deployers don't hold. Using - # ``databricks_postgres`` keeps the bundle fully declarative with no - # external bootstrap steps, and we still get per-app isolation via a - # dedicated Postgres schema (``DQX_LAKEBASE_SCHEMA_NAME`` in the app - # config, defaults to ``dqx_studio``). + # Lakebase backend holds OLTP state (rules, settings, RBAC, comments, schedules, + # scheduler bookkeeping). Append-mostly observability tables (validation runs, + # profiling, metrics, quarantine) stay in Delta. New Lakebase instances autoscale + # (https://docs.databricks.com/aws/en/oltp/upgrade-to-autoscaling). We connect to + # the always-present ``databricks_postgres`` admin DB and isolate per-app via a + # dedicated Postgres schema (``DQX_LAKEBASE_SCHEMA_NAME``, default ``dqx_studio``), + # which avoids needing CREATE CATALOG on the metastore. See DEPLOYMENT.md. lakebase_instance_name: - description: "Name of the Lakebase Postgres instance for OLTP state." + description: "Lakebase Postgres instance for OLTP state." default: "dqx-studio-lakebase" lakebase_database_name: - description: > - Logical Postgres database the app connects to inside the Lakebase - instance. Defaults to ``databricks_postgres`` (always present, no - creation required). All DQX tables live inside a dedicated - ``dqx_studio`` schema within this database for isolation, so - multiple apps can safely share ``databricks_postgres`` on the - same Lakebase instance. + description: "Postgres database the app connects to inside the Lakebase instance." default: "databricks_postgres" lakebase_capacity: - description: "Lakebase compute capacity (CU_1, CU_2, CU_4, CU_8)" + description: "Lakebase compute capacity (CU_1 / CU_2 / CU_4 / CU_8)" default: "CU_1" app_config: type: complex - description: "Configuration for DQX Studio" + description: "Databricks Apps config block" default: - # Multiple workers improve HTTP throughput. The in-process SchedulerService - # is guarded by a file lock so only one worker runs the scheduler loop. - # The in-memory CacheFactory (TTL-based, non-authoritative) warms - # independently per worker, which is acceptable for its use case. + # Single worker so the in-process SchedulerService (gated by a file lock) + # doesn't fight itself across workers. The in-memory CacheFactory is per + # worker (TTL, non-authoritative) so adding workers is otherwise safe. command: - "uvicorn" - "databricks_labs_dqx_app.backend.app:app" @@ -113,22 +75,21 @@ variables: value: "${var.schema_name}" - name: "DQX_JOB_ID" value_from: "dqx-task-runner-job" + - name: "DQX_REQUIRE_TASK_RUNNER" + value: "1" - name: "DQX_TMP_SCHEMA" value: "${var.tmp_schema_name}" - name: "DQX_ADMIN_GROUP" value: "${var.admin_group}" - name: "DQX_WHEELS_VOLUME" value: "/Volumes/${var.catalog_name}/${var.schema_name}/${var.wheels_volume_name}" - # Lakebase wiring. The instance is declared as a bundle - # resource further down (with ``lifecycle.prevent_destroy: true`` - # so ``bundle destroy`` cannot drop it). The logical database - # defaults to the always-present ``databricks_postgres``; the - # app creates its ``dqx_studio`` Postgres schema inside it on - # first connection. - name: "DQX_LAKEBASE_INSTANCE_NAME" value: "${var.lakebase_instance_name}" - name: "DQX_LAKEBASE_DATABASE_NAME" value: "${var.lakebase_database_name}" + # Starter Insights dashboard; admins can override at runtime via Configuration. + - name: "DQX_DEFAULT_DASHBOARD_ID" + value: "${resources.dashboards.dqx_quality_overview.id}" sync: include: @@ -136,22 +97,13 @@ sync: artifacts: default: - # ``scripts/build_app.py`` builds the application wheel with a build-tag - # local-version segment (e.g. ``.b20260530t012345``) baked into BOTH the - # wheel filename and the internal METADATA, so pip in Databricks Apps' - # persistent venv reinstalls fresh code on every redeploy. See - # ``[tool.uv-dynamic-versioning]`` in pyproject.toml for the - # ``DQX_APP_BUILD_TAG`` env var contract. - # - # ``scripts/build_app.py`` also writes ``.build/requirements.txt`` - # with the freshly-built app wheel; the trailing shell snippet - # prepends the parent DQX library wheel filename so Databricks - # Apps' pip install picks up both, in dependency order. build: > uv run python scripts/build_app.py && + rm -f .build/databricks_labs_dqx-*.whl && + rm -rf .build/tasks && uv build ../ --wheel --out-dir .build/ && uv build tasks/ --wheel --out-dir .build/tasks/ && - dqx_whl=$(ls -1 .build/databricks_labs_dqx-*.whl | head -1 | xargs basename) && + dqx_whl=$(ls -1t .build/databricks_labs_dqx-*.whl | head -1 | xargs basename) && echo "$dqx_whl" | cat - .build/requirements.txt > .build/requirements.txt.tmp && mv .build/requirements.txt.tmp .build/requirements.txt @@ -163,17 +115,6 @@ resources: source_code_path: ".build" config: ${var.app_config} - # These scopes are NOT sufficient for all app features. After deploying, - # enable all-apis via the account-level CLI: - # databricks auth login --host https://accounts.cloud.databricks.com \ - # --account-id --profile - # databricks account custom-app-integration update '' \ - # --json '{"scopes": ["openid", "profile", "email", "all-apis", "offline_access", "iam.current-user"]}' - # is in the Apps UI under User authorization, - # or run: databricks account custom-app-integration list - # - # The partial list below is the DABs-managed baseline; it does NOT - # replace the all-apis scope requirement above. user_api_scopes: - sql - catalog.catalogs:read @@ -181,18 +122,10 @@ resources: - catalog.tables:read - serving.serving-endpoints - resources: - name: "dqx-sql-warehouse" description: "SQL Warehouse for DQX Studio queries" sql_warehouse: - # ``sql_warehouse_id`` is set per-target. For bundle-managed - # warehouses (e.g. bdf-vo) the target sets it to - # ``${resources.sql_warehouses.dqx_sql_warehouse.id}`` and - # also declares the ``sql_warehouses.dqx_sql_warehouse`` - # block locally. For external warehouses (e.g. kaizen-app) - # the target sets it to a literal ID and the bundle never - # creates a warehouse for that target. id: ${var.sql_warehouse_id} permission: "CAN_USE" - name: "dqx-task-runner-job" @@ -200,15 +133,9 @@ resources: job: id: ${resources.jobs.dqx_task_runner.id} permission: "CAN_MANAGE" - # Lakebase Postgres for OLTP state (rules catalog, app - # settings, RBAC, comments, schedule configs, scheduler - # bookkeeping). The instance is declared as a bundle resource - # below; the database defaults to the always-present - # ``databricks_postgres`` admin DB. This entry binds the app - # to the (instance, database) pair so DABs configures - # ``CAN_CONNECT_AND_CREATE`` automatically — that lets the app - # SP create its ``dqx_studio`` schema inside the bound DB on - # first connection. + # ``database:`` binding (instance + database). ``instance_name`` points + # at ``resources.database_instances.lakebase``, which the target + # declares; the base binding resolves it after the per-target merge. - name: "dqx-lakebase" description: "Lakebase Postgres backend for app OLTP state" database: @@ -220,28 +147,13 @@ resources: - group_name: "account users" level: "CAN_USE" - # ``sql_warehouses.dqx_sql_warehouse`` is intentionally NOT declared at - # top-level. Targets that want a bundle-managed warehouse declare it - # inside their own ``resources.sql_warehouses`` block (see bdf-vo). - # Targets that point at an existing warehouse just set - # ``sql_warehouse_id`` to the literal ID (see kaizen-app). This keeps - # ``bundle destroy`` honest — it can't accidentally delete a shared - # warehouse the bundle never created. - - # ------------------------------------------------------------------ - # Stateful storage (schemas, volume, Lakebase instance + database). - # - # Every entry below is tagged ``lifecycle.prevent_destroy: true`` so - # that ``databricks bundle destroy`` (or a deploy that would - # otherwise replace the resource) fails fast instead of dropping the - # backing data. To intentionally tear something down: remove the - # flag, ``databricks bundle deployment unbind ``, then destroy. - # - # For an existing workspace whose resources were provisioned - # out-of-band (e.g. from the previous bootstrap script), one-time - # binding is required before the first deploy — see ``make app-bind`` - # and DEPLOYMENT.md → "Migrating an existing workspace". - # ------------------------------------------------------------------ + # Stateful storage (schemas, volume, Lakebase instance). Every entry below is + # tagged ``lifecycle.prevent_destroy: true`` so ``bundle destroy`` (or any deploy + # that would replace the resource) fails fast instead of dropping data. To + # intentionally tear one down: remove the flag, ``databricks bundle deployment + # unbind ``, then destroy. For pre-existing workspaces whose resources were + # provisioned out-of-band, run ``make app-bind`` once before the first deploy + # (see DEPLOYMENT.md → "Migrating an existing workspace"). schemas: main_schema: catalog_name: ${var.catalog_name} @@ -249,7 +161,6 @@ resources: comment: "DQX Studio main schema (rules, run history, profiling, metrics, quarantine, OLTP fallback)" lifecycle: prevent_destroy: true - tmp_schema: catalog_name: ${var.catalog_name} name: ${var.tmp_schema_name} @@ -267,6 +178,11 @@ resources: lifecycle: prevent_destroy: true + # Lakebase Postgres instance backing the app's OLTP state. Name/capacity read + # from variables so a target can resize or rename it. The ``dqx-lakebase`` app + # binding above references ``${resources.database_instances.lakebase.name}``. + # To run without Lakebase, remove this block and the ``dqx-lakebase`` binding, + # then set ``lakebase_instance_name: "-"`` (OLTP falls back to Delta). database_instances: lakebase: name: ${var.lakebase_instance_name} @@ -274,6 +190,18 @@ resources: lifecycle: prevent_destroy: true + dashboards: + dqx_quality_overview: + display_name: "[${bundle.target}] DQX Quality Overview" + file_path: ./dashboards/dqx_quality_overview.lvdash.json + warehouse_id: ${var.sql_warehouse_id} + dataset_catalog: ${var.catalog_name} + dataset_schema: ${var.schema_name} + embed_credentials: true + permissions: + - level: CAN_READ + group_name: "account users" + jobs: dqx_task_runner: name: "dqx-studio-task-runner" @@ -326,60 +254,38 @@ resources: - name: "requesting_user" default: "unknown" +# Targets: at minimum each one sets ``catalog_name``, +# ``dqx_service_principal_application_id``, and ``sql_warehouse_id``. +# Every other variable defaults sensibly; override per target as needed. +# +# This bundle ships a SINGLE canonical target, ``dqx-app-demo``. It uses Mode B +# (reuse an existing warehouse): a literal ``sql_warehouse_id`` and no +# ``sql_warehouses`` block. To add a Mode A (bundle-managed warehouse) target +# instead, declare a local ``sql_warehouses.dqx_sql_warehouse`` block and point +# ``sql_warehouse_id`` at ``${resources.sql_warehouses.dqx_sql_warehouse.id}``. +# See DEPLOYMENT.md → "Choosing a SQL warehouse mode" for the full comparison. +# +# To disable Lakebase, remove the ``database_instances.lakebase`` block and the +# ``dqx-lakebase`` app resource binding, then set ``lakebase_instance_name: "-"`` +# (OLTP tables fall back to Delta via the ``MigrationRunner`` path). +# +# One-button Postgres: a ``postgres_roles`` block makes the app SP's +# CREATE-schema privilege on ``databricks_postgres`` bundle-managed. It requires +# Databricks CLI >= 1.4.0 and is for FRESH workspaces; ``dqx-app-demo`` omits it +# because its app SP's Postgres role already exists (see the note on the target +# below). The full block is documented in DEPLOYMENT.md. targets: - # Template for adding a new target. Required variables are: - # - ``catalog_name`` - # - ``dqx_service_principal_application_id`` - # - ``sql_warehouse_id`` (chooses warehouse mode; see DEPLOYMENT.md - # "Choosing a SQL warehouse mode" and the live ``bdf-vo`` / - # ``kaizen-app`` targets below for worked Mode A / Mode B examples) - # Everything else has a sensible default in the ``variables:`` block - # at the top of this file and can be overridden per target below. - # - # example-target-mode-a: - # workspace: - # profile: - # variables: - # catalog_name: - # dqx_service_principal_application_id: - # sql_warehouse_id: ${resources.sql_warehouses.dqx_sql_warehouse.id} - # # Optional overrides: - # # admin_group: - # # app_name: - # # sql_warehouse_name: # Mode A only - # # schema_name: - # # tmp_schema_name: - # # wheels_volume_name: - # # lakebase_instance_name: - # # lakebase_database_name: - # # lakebase_capacity: CU_1 | CU_2 | CU_4 | CU_8 - # resources: - # sql_warehouses: - # dqx_sql_warehouse: - # name: ${var.sql_warehouse_name} - # cluster_size: "X-Small" - # enable_serverless_compute: true - # max_num_clusters: 1 - # min_num_clusters: 1 - # auto_stop_mins: 10 - # permissions: - # - group_name: "users" - # level: "CAN_USE" - # - # example-target-mode-b: - # workspace: - # profile: - # variables: - # catalog_name: - # dqx_service_principal_application_id: - # sql_warehouse_id: "" # just the ID - # # No resources.sql_warehouses block. post_deploy_grants.sh grants - # # CAN_USE to app SP / job SP / users via the warehouses API. - - # Mode B: reuses the pre-existing ``dqx-studio-sql-warehouse`` - # (X-Small serverless) and the pre-existing - # ``dqx-studio-lakebase-v2`` instance that an earlier deploy created in - # this workspace. Point at them by ID/name instead of having the bundle - # create new copies, so this target is a straight redeploy on top of the - # existing demo state. ``post_deploy_grants.sh`` handles ``CAN_USE`` on the - # external warehouse via the warehouses permissions API. + # The single canonical target for this workspace. Marked default so a bare + # ``bundle deploy`` (no ``-t``) resolves to it. + dev: + default: true + workspace: + profile: + variables: + catalog_name: + dqx_service_principal_application_id: + # Mode B: literal ID of an existing warehouse (bundle never manages it). + sql_warehouse_id: "" + lakebase_instance_name: + presets: + trigger_pause_status: PAUSED diff --git a/app/pyproject.toml b/app/pyproject.toml index 35a457b48..bb2a6050e 100644 --- a/app/pyproject.toml +++ b/app/pyproject.toml @@ -8,7 +8,7 @@ dependencies = [ "fastapi~=0.119", "pydantic-settings~=2.11", "uvicorn~=0.37", - "databricks-labs-dqx[llm]", + "databricks-labs-dqx[llm,datacontract]", "databricks-sdk~=0.73", "databricks-sql-connector[pyarrow]==4.2.5", "openpyxl>=3.1", diff --git a/app/scripts/_align_wheel_version.py b/app/scripts/_align_wheel_version.py new file mode 100644 index 000000000..e666a73bd --- /dev/null +++ b/app/scripts/_align_wheel_version.py @@ -0,0 +1,169 @@ +"""Re-align a wheel's METADATA / RECORD / dist-info dir to its filename. + +Background +---------- +A build step that injects a build timestamp into the wheel **filename** +(e.g. renaming +``databricks_labs_dqx_app-0.13.0.post38.dev0+65c9602-py3-none-any.whl`` +to +``databricks_labs_dqx_app-0.13.0.post38.dev0+65c9602.post20260506102508-py3-none-any.whl``) +without also updating the wheel's internal ``METADATA``, ``RECORD`` or +``*.dist-info`` directory name produces a mismatched wheel. + +The result is a wheel whose filename advertises a new version each +build but whose package metadata still reports the unchanged +git-derived version. ``pip`` consults the metadata, sees the version +is already installed, and silently skips reinstall — so a Databricks +App container's persistent venv keeps running stale code on every +redeploy. + +This script repairs such a wheel by: + +1. parsing the version segment out of the filename +2. unpacking the wheel +3. renaming the ``*.dist-info`` directory to use that version +4. rewriting ``METADATA`` so ``Version:`` matches +5. regenerating ``RECORD`` checksums +6. repacking the wheel in place + +After running this each rebuild produces a wheel with a unique +metadata version, forcing pip to install fresh code on every deploy. + +Usage +----- + python _align_wheel_version.py path/to/wheel.whl + +The script overwrites the input wheel; safe to run idempotently. +""" + +from __future__ import annotations + +import base64 +import hashlib +import re +import shutil +import sys +import tempfile +import zipfile +from pathlib import Path + +# Wheel filename format (PEP 427): +# {distribution}-{version}(-{build tag})?-{python tag}-{abi tag}-{platform tag}.whl +# Distribution and version may both contain dots; the version segment +# starts right after the leading "{distribution}-" and ends at the +# next "-py" / "-cp" / "-pp" tag. Everything after that until the +# trailing ``.whl`` is the compatibility tag triple. +_WHEEL_FILENAME_RE = re.compile( + r"^(?P[A-Za-z0-9_]+)-(?P[^-]+(?:\+[^-]+)?)-(?P(?:py|cp|pp|ip)\d.*?)\.whl$" +) + + +def _filename_version(wheel: Path) -> tuple[str, str]: + """Return ``(distribution, version)`` parsed from the wheel filename. + + Uses a regex rather than ``packaging.utils.parse_wheel_filename`` + because the latter normalises the distribution name and rejects + local versions that contain dots — both of which can be present in + such wheels. + """ + m = _WHEEL_FILENAME_RE.match(wheel.name) + if not m: + raise SystemExit(f"unrecognised wheel filename: {wheel.name}") + return m.group("dist"), m.group("version") + + +def _record_entry(rel_path: str, data: bytes) -> str: + digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=").decode() + return f"{rel_path},sha256={digest},{len(data)}" + + +def align(wheel_path: Path) -> None: + """Rewrite ``wheel_path`` so its metadata version matches its filename.""" + dist, filename_version = _filename_version(wheel_path) + + with tempfile.TemporaryDirectory() as raw_tmp: + tmp = Path(raw_tmp) + with zipfile.ZipFile(wheel_path, "r") as zf: + zf.extractall(tmp) + + old_dist_infos = [p for p in tmp.iterdir() if p.is_dir() and p.name.endswith(".dist-info")] + if not old_dist_infos: + raise SystemExit(f"no .dist-info dir in {wheel_path.name}") + if len(old_dist_infos) > 1: + raise SystemExit(f"multiple .dist-info dirs in {wheel_path.name}: {old_dist_infos}") + old_dist_info = old_dist_infos[0] + + # Hatchling preserves underscores in the distribution prefix + # of the dist-info directory, so reuse that prefix verbatim + # rather than recomputing from ``dist`` (which packaging.utils + # would normalise to dashes). The directory name is + # ``{prefix}-{version}.dist-info``; strip the literal suffix + # then split off the version. + stem = old_dist_info.name[: -len(".dist-info")] + existing_prefix = stem.rsplit("-", 1)[0] + new_dist_info = tmp / f"{existing_prefix}-{filename_version}.dist-info" + # Skip the rename when the dist-info dir is already correctly + # named (e.g. no build tag was injected on this rebuild so the + # filename version matches what's already inside the + # wheel). Without this guard, the ``rmtree`` below would delete + # the source directory — both paths point at the same dir — and + # the subsequent ``rename`` would crash with FileNotFoundError. + # The METADATA/RECORD rewrites further down are themselves + # idempotent, so we still run them to repair any drift. + if new_dist_info != old_dist_info: + if new_dist_info.exists(): + shutil.rmtree(new_dist_info) + old_dist_info.rename(new_dist_info) + + metadata = new_dist_info / "METADATA" + if not metadata.exists(): + raise SystemExit(f"missing METADATA in {wheel_path.name}") + text = metadata.read_text(encoding="utf-8") + new_text, n = re.subn( + r"^Version: .*$", + f"Version: {filename_version}", + text, + count=1, + flags=re.MULTILINE, + ) + if n == 0: + raise SystemExit(f"no Version: line in METADATA of {wheel_path.name}") + metadata.write_text(new_text, encoding="utf-8") + + record = new_dist_info / "RECORD" + record_rel = record.relative_to(tmp).as_posix() + entries: list[str] = [] + for f in sorted(tmp.rglob("*")): + if not f.is_file(): + continue + rel = f.relative_to(tmp).as_posix() + if rel == record_rel: + continue + entries.append(_record_entry(rel, f.read_bytes())) + entries.append(f"{record_rel},,") + record.write_text("\n".join(entries) + "\n", encoding="utf-8") + + tmp_wheel = wheel_path.with_suffix(wheel_path.suffix + ".tmp") + with zipfile.ZipFile(tmp_wheel, "w", zipfile.ZIP_DEFLATED) as zf: + for f in sorted(tmp.rglob("*")): + if f.is_file(): + zf.write(f, f.relative_to(tmp).as_posix()) + + tmp_wheel.replace(wheel_path) + + +def main(argv: list[str]) -> int: + if len(argv) != 2: + print(f"usage: {Path(argv[0]).name} ", file=sys.stderr) + return 2 + wheel = Path(argv[1]) + if not wheel.is_file(): + print(f"not a file: {wheel}", file=sys.stderr) + return 1 + align(wheel) + print(f" aligned wheel metadata to filename version: {wheel.name}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main(sys.argv)) diff --git a/app/scripts/build_app.py b/app/scripts/build_app.py index c5120974d..206c55473 100644 --- a/app/scripts/build_app.py +++ b/app/scripts/build_app.py @@ -105,12 +105,18 @@ def _write_metadata_module(meta: dict[str, str]) -> None: def _write_app_yml(meta: dict[str, str]) -> None: """Write the Databricks Apps launch manifest. - Two-worker uvicorn. The ``app-module`` value comes from - ``pyproject.toml`` so changing the FastAPI app's import path is - a one-line edit there. + Single-worker uvicorn. The in-process ``SchedulerService`` is gated by + a file lock (``/tmp/.dqx_scheduler.lock``) that only excludes other + workers in the SAME container, so multi-worker would mean two + schedulers fighting over each due schedule. The ``app-module`` value + comes from ``pyproject.toml`` so changing the FastAPI app's import + path is a one-line edit there. Production DABs deploys override the + command via ``var.app_config.command`` in ``databricks.yml``; this + manifest is the fallback for paths that don't go through DABs (e.g. + direct ``databricks apps`` CLI deploys). """ (BUILD_DIR / "app.yml").write_text( - f'command: ["uvicorn", "{meta["app-module"]}", "--workers", "2"]\n', + f'command: ["uvicorn", "{meta["app-module"]}", "--workers", "1"]\n', encoding="utf-8", ) diff --git a/app/scripts/post_deploy_grants.sh b/app/scripts/post_deploy_grants.sh index 351216dc0..06bd2bdbe 100755 --- a/app/scripts/post_deploy_grants.sh +++ b/app/scripts/post_deploy_grants.sh @@ -220,14 +220,14 @@ run_sql() { } echo "" -echo "==> Granting permissions to App SP ($APP_SP_ID)..." +echo "==> Granting UC permissions to App SP ($APP_SP_ID)..." run_sql "GRANT USE CATALOG ON CATALOG \`$CATALOG\` TO \`$APP_SP_ID\`" run_sql "GRANT ALL PRIVILEGES ON SCHEMA \`$CATALOG\`.\`$SCHEMA\` TO \`$APP_SP_ID\`" run_sql "GRANT ALL PRIVILEGES ON SCHEMA \`$CATALOG\`.\`$TMP_SCHEMA\` TO \`$APP_SP_ID\`" run_sql "GRANT ALL PRIVILEGES ON VOLUME \`$CATALOG\`.\`$SCHEMA\`.\`$VOLUME\` TO \`$APP_SP_ID\`" echo "" -echo "==> Granting permissions to Job SP ($JOB_SP)..." +echo "==> Granting UC permissions to Job SP ($JOB_SP)..." run_sql "GRANT USE CATALOG ON CATALOG \`$CATALOG\` TO \`$JOB_SP\`" run_sql "GRANT ALL PRIVILEGES ON SCHEMA \`$CATALOG\`.\`$SCHEMA\` TO \`$JOB_SP\`" run_sql "GRANT ALL PRIVILEGES ON SCHEMA \`$CATALOG\`.\`$TMP_SCHEMA\` TO \`$JOB_SP\`" @@ -284,6 +284,18 @@ if [[ -n "$EXTERNAL_WH_ID" && "$EXTERNAL_WH_ID" != \$* ]]; then fi fi +# The starter Insights dashboard is configured with ``embed_credentials: true``, +# so AI/BI runs every widget query under the bundle's deployer identity (the +# principal authenticated to the CLI profile that ran ``databricks bundle deploy``). +# That principal doesn't automatically inherit SELECT on the DQX tables: DABs +# makes them schema-owner, but UC table-level reads need an explicit grant. +# Without it the Insights page renders, then every tile shows +# INSUFFICIENT_PERMISSIONS. Granting at the schema level covers existing and +# future tables created by the migration runner. +# +# For SP-based deployers (CI/CD) ``current-user me`` returns the SP and +# ``userName`` is the application ID — which is what UC expects in a GRANT. +# For human deployers it's the email. Either way the GRANT line is identical. echo "" echo "==> Granting end-user permissions for tmp view creation..." # The app's dry-run / preview feature creates temp views via the user's @@ -296,6 +308,33 @@ echo "==> Granting end-user permissions for tmp view creation..." run_sql "GRANT USE CATALOG ON CATALOG \`$CATALOG\` TO \`account users\`" run_sql "GRANT USE SCHEMA, CREATE TABLE ON SCHEMA \`$CATALOG\`.\`$TMP_SCHEMA\` TO \`account users\`" +# The starter Insights dashboard is configured with ``embed_credentials: true``, +# so AI/BI runs every widget query under the bundle's deployer identity (the +# principal authenticated to the CLI profile that ran ``databricks bundle deploy``). +# That principal doesn't automatically inherit SELECT on the DQX tables: DABs +# makes them schema-owner, but UC table-level reads need an explicit grant. +# Without it the Insights page renders, then every tile shows +# INSUFFICIENT_PERMISSIONS. Granting at the schema level covers existing and +# future tables created by the migration runner. +# +# For SP-based deployers (CI/CD) ``current-user me`` returns the SP and +# ``userName`` is the application ID — which is what UC expects in a GRANT. +# For human deployers it's the email. Either way the GRANT line is identical. +echo "" +echo "==> Granting SELECT to deployer (for Insights dashboard queries)..." +DEPLOYER_JSON=$($CLI current-user me -o json 2>/dev/null || echo "") +DEPLOYER=$(echo "$DEPLOYER_JSON" | jq -r '.userName // .applicationId // empty' 2>/dev/null || echo "") +if [[ -n "$DEPLOYER" ]]; then + echo " Deployer: $DEPLOYER" + run_sql "GRANT USE SCHEMA ON SCHEMA \`$CATALOG\`.\`$SCHEMA\` TO \`$DEPLOYER\`" + run_sql "GRANT SELECT ON SCHEMA \`$CATALOG\`.\`$SCHEMA\` TO \`$DEPLOYER\`" +else + echo " WARNING: Could not resolve deployer identity from profile '$PROFILE'." + echo " The Insights dashboard will show INSUFFICIENT_PERMISSIONS" + echo " until you GRANT USE SCHEMA + SELECT ON SCHEMA" + echo " \`$CATALOG\`.\`$SCHEMA\` to the bundle deployer." +fi + echo "" if (( FAILURES > 0 )); then # Exiting non-zero is what makes ``make app-deploy`` (and any CI @@ -306,4 +345,5 @@ if (( FAILURES > 0 )); then echo "==> FAILED: $FAILURES grant(s) did not succeed — see warnings above." >&2 exit 1 fi -echo "==> Done. All grants applied." +echo "==> Done. All grants applied. Re-running this script is safe — every" +echo " grant above is idempotent." diff --git a/app/src/databricks_labs_dqx_app/backend/CLAUDE.md b/app/src/databricks_labs_dqx_app/backend/CLAUDE.md index 7b6b46fe7..72caf6098 100644 --- a/app/src/databricks_labs_dqx_app/backend/CLAUDE.md +++ b/app/src/databricks_labs_dqx_app/backend/CLAUDE.md @@ -169,6 +169,10 @@ uv run uvicorn databricks_labs_dqx_app.backend.app:app --reload # Dev server - **Scheduler:** runs in-process as an asyncio task, gated by an exclusive file lock (`/tmp/.dqx_scheduler.lock`) so only one uvicorn worker drives it. Disable with `DQX_SCHEDULER_DISABLED=1`. - **Caches:** `app_cache` (`cache.py`) is per-process in-memory with TTL. SP `WorkspaceClient`, OBO `WorkspaceClient`, and per-user catalog list are all cached. Use the `MISS` sentinel — never `is None` — to detect cache absence. - **SPA static files:** `spa_static.py` falls through to `index.html` only for non-asset paths (positive allowlist of asset extensions), so SPA routes containing dots still work. +- **Dataset-level rule dispatch (`__sql_check__/`):** rules whose `table_fqn` starts with `__sql_check__/` are dataset-level. Two flavours exist — keep both dispatch paths in `routes/v1/dryrun.py` (manual / batch) and `services/scheduler_service.py` (scheduled) in sync: + - **SQL check** — `arguments.sql_query` is set. Build the input view with `view_svc.create_view_from_sql(...)` and set `is_sql_check=True` in the runner config. + - **`has_valid_schema`** — `arguments.sql_query` is absent; the real target table lives in `user_metadata.target_table`. Build the input view from *that* table with `view_svc.create_view(...)` but keep `source_table_fqn` set to the synthetic `__sql_check__/` value so the run groups correctly in history. Set `is_sql_check=False` — schema validation runs through the standard row-level DQX engine, not the SQL fast-path. Per-table errors raised during dispatch are surfaced to the UI via the run-submission response payload (consumed in `ui/routes/_sidebar/runs.tsx`). +- **Lakebase `ON CONFLICT DO UPDATE SET` column references:** PostgreSQL refuses bare column references on the RHS of `DO UPDATE SET` (`column reference "version" is ambiguous`), and a *schema-qualified* reference (`"dq"."tbl"."version"`) is **not** a valid existing-row reference there either — Postgres treats it as a FROM-clause entry and errors with `invalid reference to FROM-clause entry for table "dq"` on the *first* save, not just on conflict. `PgExecutor.upsert_with_audit` therefore aliases the conflict target (`INSERT INTO AS "dqx_upsert_target"`) and qualifies `increment_on_update` references against the alias (`"{qcol} = "dqx_upsert_target".{qcol} + 1"`). The regression test in `tests/test_pg_executor.py` asserts the alias form *and* the absence of both the bare and schema-qualified forms — do not relax it. ## Hybrid Storage Backend (Delta + Lakebase) diff --git a/app/src/databricks_labs_dqx_app/backend/app.py b/app/src/databricks_labs_dqx_app/backend/app.py index 037916a5b..f56ed68f7 100644 --- a/app/src/databricks_labs_dqx_app/backend/app.py +++ b/app/src/databricks_labs_dqx_app/backend/app.py @@ -47,6 +47,21 @@ def _try_acquire_scheduler_lease() -> bool: The lock file is held for the lifetime of the process; when the worker exits the OS releases it automatically. + + .. WARNING:: + This lease is **process-local**: ``fcntl.flock`` only excludes other + processes on the same host. If DQX Studio is ever scaled to more + than one Databricks App replica/container, every replica will + acquire its own lock and run ``_tick()`` independently — the same + due schedule will be submitted N times. The current deployment + relies on Databricks Apps running a single container; if that ever + changes, replace this with a cross-replica lease (e.g. a Postgres + advisory lock on ``DQX_LAKEBASE_SCHEMA_NAME`` keyed by + ``hashtext('dqx-scheduler')`` with ``pg_try_advisory_lock``, or a + lease row in ``dq_app_settings`` with TTL + ``FOR UPDATE + SKIP LOCKED``). See backend audit finding B1. The file lock is + still needed as a fallback for multi-worker uvicorn within one + container. """ import fcntl @@ -322,7 +337,21 @@ async def lifespan(app: FastAPI): ) if not (conf.wheels_volume and conf.job_id): - logger.warning("DQX_WHEELS_VOLUME or DQX_JOB_ID not set — task-runner job wheels will not be synced") + msg = ( + "DQX_WHEELS_VOLUME or DQX_JOB_ID is not set — profiler, dry-run, " + "and scheduler features will be unavailable" + ) + if conf.require_task_runner: + # Production-style deploy (bundle sets DQX_REQUIRE_TASK_RUNNER=1): + # treat a missing binding as a fatal misconfiguration so we + # crash loop with an actionable error instead of silently + # serving a half-broken app. + raise RuntimeError( + f"{msg}. Both env vars are required when DQX_REQUIRE_TASK_RUNNER=1. " + "Check the bundle resource bindings (dqx-task-runner-job, dqx-wheels) " + "and re-run `databricks bundle deploy`." + ) + logger.warning("%s (set DQX_REQUIRE_TASK_RUNNER=1 in production to fail fast)", msg) else: wheel_volume_paths: list[str] = [] try: diff --git a/app/src/databricks_labs_dqx_app/backend/config.py b/app/src/databricks_labs_dqx_app/backend/config.py index 03aa9b2c9..812b1a53a 100644 --- a/app/src/databricks_labs_dqx_app/backend/config.py +++ b/app/src/databricks_labs_dqx_app/backend/config.py @@ -31,7 +31,28 @@ class AppConfig(BaseSettings): tmp_schema_name: str = Field(default="dqx_studio_tmp", validation_alias="DQX_TMP_SCHEMA") job_id: str = Field(default="", validation_alias="DQX_JOB_ID") wheels_volume: str = Field(default="", validation_alias="DQX_WHEELS_VOLUME") + # Production deploys bind ``job_id`` and ``wheels_volume`` from + # bundle resources, so missing values there indicate a misconfigured + # deploy that would otherwise silently break profiler / dry-run / + # schedules at first use. Setting ``DQX_REQUIRE_TASK_RUNNER=1`` + # promotes the missing-env warnings in :func:`backend.app.lifespan` + # to fail-fast startup errors. Leave unset (default) for local dev + # so the API can still boot without the task-runner wheels. + require_task_runner: bool = Field( + default=False, + validation_alias="DQX_REQUIRE_TASK_RUNNER", + description="Require DQX_JOB_ID and DQX_WHEELS_VOLUME at startup (production deploys).", + ) llm_endpoint: str = Field(default="databricks-claude-sonnet-4-5", validation_alias="DQX_LLM_ENDPOINT") + # Hard cap on tokens generated per LLM call. Bounds cost/latency and + # mitigates LLM denial-of-service from pathological prompts (OWASP LLM04); + # rule-generation responses are small JSON payloads, so this is generous. + llm_max_tokens: int = Field( + default=4096, + validation_alias="DQX_LLM_MAX_TOKENS", + gt=0, + description="Maximum output tokens per ChatDatabricks call (LLM budget cap).", + ) admin_group: str | None = Field( default=None, validation_alias="DQX_ADMIN_GROUP", @@ -42,6 +63,20 @@ class AppConfig(BaseSettings): dryrun_max_sample_size: int = Field(default=10_000) dryrun_default_sample_size: int = Field(default=1_000) + # ------------------------------------------------------------------ + # Embedded dashboard (Insights page) + # ------------------------------------------------------------------ + # The Insights page renders a Databricks AI/BI dashboard inside an + # iframe. Admins set the dashboard ID via the Configuration page, + # which writes to ``dq_app_settings`` and overrides this default. + # When unset, this env var lets the bundle ship a starter + # dashboard ID so the page works out-of-the-box. + default_dashboard_id: str = Field( + default="", + validation_alias="DQX_DEFAULT_DASHBOARD_ID", + description="Fallback dashboard ID for the Insights page when no admin override is set.", + ) + # ------------------------------------------------------------------ # Lakebase (Postgres) backend # ------------------------------------------------------------------ @@ -58,7 +93,15 @@ class AppConfig(BaseSettings): lakebase_instance_name: str = Field( default="", validation_alias="DQX_LAKEBASE_INSTANCE_NAME", - description="Lakebase instance name. Empty disables Lakebase routing.", + description=( + "Lakebase instance name. Empty — or any of the sentinel " + "values ``-`` / ``disabled`` / ``off`` / ``none`` " + "(case-insensitive) — disables Lakebase routing. The " + "sentinel form exists because Databricks Apps rejects " + "env vars with an empty ``value`` string, so deployments " + "that want to disable Lakebase must pass a non-empty " + "placeholder." + ), ) # Default must match the ``lakebase_database_name`` bundle var in # ``app/databricks.yml`` (``databricks_postgres`` — the always-present @@ -135,15 +178,22 @@ class AppConfig(BaseSettings): def static_assets_path(self) -> Path: return Path(str(resources.files(app_slug))).joinpath("__dist__") + # Sentinel values that explicitly disable Lakebase routing even + # though the env var has to be non-empty (Databricks Apps rejects + # ``value: ""``). Comparison is case-insensitive after stripping. + _LAKEBASE_DISABLED_SENTINELS = frozenset({"", "-", "disabled", "off", "none"}) + @property def lakebase_enabled(self) -> bool: """``True`` when the deployment was provisioned with Lakebase. Falls back to ``False`` (legacy UC-only mode) when the - instance name is empty so existing tests and dev setups keep - working with no Postgres dependency. + instance name is empty or set to a recognised "disabled" + sentinel so existing tests, dev setups, and Lakebase-less + Databricks Apps deployments keep working with no Postgres + dependency. """ - return bool(self.lakebase_instance_name.strip()) + return self.lakebase_instance_name.strip().lower() not in self._LAKEBASE_DISABLED_SENTINELS conf = AppConfig() diff --git a/app/src/databricks_labs_dqx_app/backend/dependencies.py b/app/src/databricks_labs_dqx_app/backend/dependencies.py index 7be9e29e7..8757aeed3 100644 --- a/app/src/databricks_labs_dqx_app/backend/dependencies.py +++ b/app/src/databricks_labs_dqx_app/backend/dependencies.py @@ -22,11 +22,13 @@ from .runtime import rt from .services.ai_rules_service import AiRulesService from .services.app_settings_service import AppSettingsService +from .services.contract_rules_service import ContractRulesService from .services.discovery import DiscoveryService from .services.job_service import JobService from .services.role_service import RoleService from .services.rules_catalog_service import RulesCatalogService from .services.comments_service import CommentsService +from .services.review_status_service import ReviewStatusService from .services.schedule_config_service import ScheduleConfigService from .services.view_service import ViewService from .sql_executor import OltpExecutorProtocol, SqlExecutor @@ -230,6 +232,22 @@ async def get_ai_rules_service( return AiRulesService(obo_ws=obo_ws, sp_ws=sp_ws) +async def get_contract_rules_service( + sp_ws: Annotated[WorkspaceClient, Depends(get_sp_ws)], + ai_service: Annotated[AiRulesService, Depends(get_ai_rules_service)], +) -> ContractRulesService: + """Create a ContractRulesService. + + Contract parsing is local and the generator doesn't touch UC, so we + use the SP client — same pattern as the AI generator's LLM call leg. + The AI service is injected so natural-language (``type: text``) quality + expectations can be converted through the same ChatDatabricks leg the + AI-Assisted Generation page uses (DQX's own text path needs dspy + Spark, + which the app container lacks). + """ + return ContractRulesService(sp_ws=sp_ws, ai_service=ai_service) + + async def get_rules_catalog_service( sql: Annotated[OltpExecutorProtocol, Depends(get_sp_oltp_executor)], ) -> RulesCatalogService: @@ -266,6 +284,19 @@ async def get_comments_service( return CommentsService(sql=sql) +async def get_review_status_service( + sql: Annotated[OltpExecutorProtocol, Depends(get_sp_oltp_executor)], + settings: Annotated[AppSettingsService, Depends(get_app_settings_service)], +) -> ReviewStatusService: + """Create a ReviewStatusService routed at the OLTP executor. + + Takes the same ``AppSettingsService`` we use everywhere else as a + transitive dep so the catalogue of allowed status values comes from + the same singleton (and same cache) as the Configuration page reads. + """ + return ReviewStatusService(sql=sql, settings=settings) + + async def get_schedule_config_service( sql: Annotated[OltpExecutorProtocol, Depends(get_sp_oltp_executor)], ) -> ScheduleConfigService: @@ -457,6 +488,7 @@ async def get_user_catalog_names( "get_app_settings_service", "get_role_service", "get_ai_rules_service", + "get_contract_rules_service", "get_rules_catalog_service", "get_discovery_service", "get_view_service", @@ -464,6 +496,7 @@ async def get_user_catalog_names( "get_sql_connector", "get_user_role", "get_comments_service", + "get_review_status_service", "get_schedule_config_service", "require_role", "require_runner", diff --git a/app/src/databricks_labs_dqx_app/backend/migrations/__init__.py b/app/src/databricks_labs_dqx_app/backend/migrations/__init__.py index f14f6ca80..0d79f1eca 100644 --- a/app/src/databricks_labs_dqx_app/backend/migrations/__init__.py +++ b/app/src/databricks_labs_dqx_app/backend/migrations/__init__.py @@ -516,6 +516,72 @@ class Migration: _V5_VALIDATION_RUNS_ERROR_ROWS = f"ALTER TABLE {_PLACEHOLDER}.dq_validation_runs " f" ADD COLUMN error_rows INT" +# Run review status — per-run review label set by business / SA reviewers +# from the Runs detail page. The allowed value list is admin-managed in +# ``dq_app_settings.run_review_statuses`` so there's no CHECK constraint +# on ``status``; the service validates against the live list before INSERT. +# +# Two tables intentionally: +# - ``dq_run_review_status`` is mutable (one row per run that has been +# reviewed; absent rows surface the configured default virtually). +# - ``dq_run_review_status_history`` is append-only so we can show +# "X changed status from Pending to Acknowledged on Tue" on the run +# detail page and answer compliance questions. Same shape as +# ``dq_quality_rules_history`` — no PK column on Delta (rows are +# ordered by ``changed_at`` for display). +# +# Marked ``oltp_fallback=True`` because both tables are OLTP-shaped +# (single-key lookup, frequent mutation) and live in Lakebase when +# enabled; this migration only runs against Delta when Lakebase is off. +_V6_RUN_REVIEW_STATUS = ( + f"CREATE TABLE IF NOT EXISTS {_PLACEHOLDER}.dq_run_review_status (" + " run_id STRING NOT NULL," + " status STRING NOT NULL," + " updated_by STRING," + " updated_at TIMESTAMP," + " CONSTRAINT pk_dq_run_review_status PRIMARY KEY (run_id) RELY" + ") CLUSTER BY (run_id);" + f"CREATE TABLE IF NOT EXISTS {_PLACEHOLDER}.dq_run_review_status_history (" + " run_id STRING NOT NULL," + " status STRING NOT NULL," + " previous_status STRING," + " changed_by STRING NOT NULL," + " changed_at TIMESTAMP NOT NULL" + ") CLUSTER BY (run_id, changed_at)" +) + + +# Append-only audit trail for role-to-group mapping changes. Mirrors +# ``dq_quality_rules_history`` / ``dq_schedule_configs_history`` / +# ``dq_run_review_status_history`` — the table only retains the *current* +# set of (role, group) pairs in ``dq_role_mappings``, so without this +# history table there is no way to answer "when was Approver→ +# dqx_app_approver added?" or "who removed Viewer→dqx_app_viewer last +# Friday?". +# +# Same Delta shape conventions as the other history tables: no PK column +# (BIGSERIAL is Postgres-only; Delta rows are ordered by ``changed_at`` +# for display), ``action`` is a free-form enum-by-convention ('create' | +# 'delete' — there is no 'update' because the row has no mutable value +# columns), and ``changed_by`` / ``changed_at`` carry the audit timestamp +# pair. +# +# Marked ``oltp_fallback=True`` because the live mapping table is OLTP- +# shaped (small, single-key lookups, frequent mutation) and lives on +# Lakebase when enabled; this migration only runs against Delta when +# Lakebase is off. The Postgres mirror lives in +# :mod:`backend.migrations.postgres` (v3). +_V7_ROLE_MAPPINGS_HISTORY = ( + f"CREATE TABLE IF NOT EXISTS {_PLACEHOLDER}.dq_role_mappings_history (" + " role STRING NOT NULL," + " group_name STRING NOT NULL," + " action STRING NOT NULL," + " changed_by STRING," + " changed_at TIMESTAMP NOT NULL" + ") CLUSTER BY (role, group_name, changed_at)" +) + + # OLTP fallback migration is identified by ``oltp_fallback=True`` so # the runner can skip it when Lakebase is enabled. Keeping the flag on # the migration itself (rather than e.g. a hard-coded version number) @@ -564,6 +630,18 @@ class DeltaMigration(Migration): sql_template=_V5_VALIDATION_RUNS_ERROR_ROWS, oltp_fallback=False, ), + DeltaMigration( + version=6, + description="Run review status (per-run review label + audit history) — used only when Lakebase is disabled", + sql_template=_V6_RUN_REVIEW_STATUS, + oltp_fallback=True, + ), + DeltaMigration( + version=7, + description="Role mappings audit history (dq_role_mappings_history) — used only when Lakebase is disabled", + sql_template=_V7_ROLE_MAPPINGS_HISTORY, + oltp_fallback=True, + ), ] diff --git a/app/src/databricks_labs_dqx_app/backend/migrations/postgres.py b/app/src/databricks_labs_dqx_app/backend/migrations/postgres.py index 9a3ff8f04..36a1feeae 100644 --- a/app/src/databricks_labs_dqx_app/backend/migrations/postgres.py +++ b/app/src/databricks_labs_dqx_app/backend/migrations/postgres.py @@ -230,6 +230,78 @@ class PgMigration: f" ON {_S}.dq_schedule_configs_history (schedule_name, changed_at DESC);" ), ), + PgMigration( + version=2, + description="Run review status (per-run review label + audit history)", + sql=( + # ---------------------------------------------------------- + # dq_run_review_status — one mutable row per run that has + # been explicitly reviewed. Runs without a row surface the + # configured default virtually at read-time (see + # ReviewStatusService.get_effective). + # ---------------------------------------------------------- + f"CREATE TABLE IF NOT EXISTS {_S}.dq_run_review_status (" + " run_id TEXT PRIMARY KEY," + " status TEXT NOT NULL," + " updated_by TEXT," + " updated_at TIMESTAMPTZ" + ");" + # The Runs History page filters by status across the whole + # list, so an index on status keeps that scan cheap as the + # review-status table grows alongside the run history. + f"CREATE INDEX IF NOT EXISTS idx_dq_run_review_status_status " + f" ON {_S}.dq_run_review_status (status);" + # ---------------------------------------------------------- + # dq_run_review_status_history — append-only audit log. + # BIGSERIAL gives us a stable display order even if two + # changes land on the same TIMESTAMPTZ (rare but possible + # with millisecond resolution + bulk admin tooling). + # ---------------------------------------------------------- + f"CREATE TABLE IF NOT EXISTS {_S}.dq_run_review_status_history (" + " history_id BIGSERIAL PRIMARY KEY," + " run_id TEXT NOT NULL," + " status TEXT NOT NULL," + " previous_status TEXT," + " changed_by TEXT NOT NULL," + " changed_at TIMESTAMPTZ NOT NULL" + ");" + f"CREATE INDEX IF NOT EXISTS idx_dq_run_review_status_history_run_changed_at " + f" ON {_S}.dq_run_review_status_history (run_id, changed_at DESC);" + ), + ), + PgMigration( + version=3, + description="Role mappings audit history (dq_role_mappings_history)", + sql=( + # ---------------------------------------------------------- + # dq_role_mappings_history — append-only audit log for + # changes to dq_role_mappings. Mirrors the Delta v7 shape; + # see the corresponding _V7_ROLE_MAPPINGS_HISTORY comment in + # ``backend.migrations.__init__`` for the rationale. + # + # BIGSERIAL gives us a stable display order even if two + # admin actions land on the same TIMESTAMPTZ (rare but + # possible with millisecond resolution + bulk tooling). + # ---------------------------------------------------------- + f"CREATE TABLE IF NOT EXISTS {_S}.dq_role_mappings_history (" + " history_id BIGSERIAL PRIMARY KEY," + " role TEXT NOT NULL," + " group_name TEXT NOT NULL," + " action TEXT NOT NULL," + " changed_by TEXT," + " changed_at TIMESTAMPTZ NOT NULL" + ");" + # Two read patterns: full-history-for-mapping (compliance + # answer "show me every change to Approver→group_x") and + # recent-activity (admin Settings page "last 50 changes"). A + # single composite covers the first; the second is served by + # the second index on changed_at alone. + f"CREATE INDEX IF NOT EXISTS idx_dq_role_mappings_history_role_group_changed_at " + f" ON {_S}.dq_role_mappings_history (role, group_name, changed_at DESC);" + f"CREATE INDEX IF NOT EXISTS idx_dq_role_mappings_history_changed_at " + f" ON {_S}.dq_role_mappings_history (changed_at DESC);" + ), + ), ] diff --git a/app/src/databricks_labs_dqx_app/backend/models.py b/app/src/databricks_labs_dqx_app/backend/models.py index 8b4d02920..00b2256e5 100644 --- a/app/src/databricks_labs_dqx_app/backend/models.py +++ b/app/src/databricks_labs_dqx_app/backend/models.py @@ -56,6 +56,83 @@ class GenerateChecksOut(BaseModel): validation_errors: list[str] = Field(default_factory=list, description="Validation errors if any") +class GenerateRulesFromContractIn(BaseModel): + """Request body for generating DQX rules from an ODCS v3.x contract.""" + + # Bound the raw payload so a single request can't carry a pathologically + # large contract. 1 MiB is far larger than any realistic ODCS contract + # while still capping parse cost and the upstream LLM fan-out from + # ``type: text`` expectations (OWASP LLM04 — see AGENTS.md). + contract_text: str = Field( + max_length=1_048_576, + description="Raw ODCS contract YAML or JSON content", + ) + generate_predefined_rules: bool = Field( + default=True, + description="Generate rules from schema property constraints (required, pattern, min/max, etc.)", + ) + process_text_rules: bool = Field( + default=False, + description="Process natural-language quality expectations via LLM (requires [llm] extras)", + ) + generate_schema_validation: bool = Field( + default=True, + description="Emit a has_valid_schema dataset rule per ODCS schema", + ) + strict_schema_validation: bool = Field( + default=True, + description="If true, schema check requires exact columns/order/types", + ) + default_criticality: str = Field( + default="error", + description="Default criticality for predefined rules: 'error' or 'warn'", + ) + + +class ContractMetadataOut(BaseModel): + """Top-level metadata extracted from the source contract for UI display.""" + + contract_id: str | None = None + name: str | None = None + version: str | None = None + odcs_api_version: str | None = None + status: str | None = None + owner: str | None = None + domain: str | None = None + description: str | None = None + + +class ContractSchemaRulesOut(BaseModel): + """Generated rules for one ODCS schema, plus the suggested UC target if any.""" + + schema_name: str = Field(description="ODCS schema name (typically the logical table name)") + physical_name: str | None = Field( + default=None, + description="physicalName from the contract, if present — a suggested UC target", + ) + property_count: int = Field(default=0, description="Number of columns/properties in the schema") + rules: list[dict[str, Any]] = Field(description="Generated DQX rules belonging to this schema") + + +class GenerateRulesFromContractOut(BaseModel): + metadata: ContractMetadataOut + schemas: list[ContractSchemaRulesOut] + unassigned_rules: list[dict[str, Any]] = Field( + default_factory=list, + description="Rules with no schema metadata (rare; surfaced rather than dropped)", + ) + total_rules: int = 0 + warnings: list[str] = Field(default_factory=list) + validation_errors: list[str] = Field( + default_factory=list, + description=( + "DQEngine.validate_checks errors for the generated rules. Non-blocking — " + "surfaced so the UI can flag rules that would fail at execution time " + "(mirrors the AI-assisted generation endpoint)." + ), + ) + + class RuleCatalogEntryOut(BaseModel): table_fqn: str display_name: str = "" @@ -106,6 +183,19 @@ class CheckDuplicatesOut(BaseModel): duplicates: list[dict[str, Any]] = Field(description="Checks that already exist for this table") +class TableSchemaDdlOut(BaseModel): + """Spark-DDL snapshot of a UC table's current column list. + + Used as a starting point for the schema-validation rule builder so + the user can fine-tune (drop extra columns, widen types) rather than + type everything from scratch. + """ + + table_fqn: str = Field(description="Fully qualified UC table name") + ddl: str = Field(description="Comma-separated 'col TYPE' string suitable for has_valid_schema") + column_count: int = Field(default=0, description="Number of columns included in the DDL") + + class FilterTablesByColumnsIn(BaseModel): required_columns: list[str] = Field(description="Column names that must exist in the table") table_fqns: list[str] = Field(description="Fully qualified table names to check") @@ -290,6 +380,17 @@ class ValidationRunSummaryOut(BaseModel): created_at: str | None = None error_message: str | None = None checks: list[dict[str, Any]] = Field(default_factory=list) + # Per-run review status — set by reviewers on the Runs detail page, + # filterable on the Runs History page. ``review_status`` is the + # effective value (catalogue default for unreviewed runs, persisted + # value otherwise); ``review_status_is_default`` lets the History + # table render unreviewed rows distinctly (e.g. lighter badge) so + # they're not visually indistinguishable from rows where someone + # explicitly selected "Pending review". + review_status: str | None = None + review_status_is_default: bool = False + review_status_updated_by: str | None = None + review_status_updated_at: str | None = None # --------------------------------------------------------------------------- @@ -462,6 +563,30 @@ class CreateRoleMappingIn(BaseModel): group_name: str = Field(description="Databricks workspace group name") +class RoleMappingHistoryOut(BaseModel): + """One row from the role-mapping audit log. + + Read-only — populated only by service-side ``RoleService`` mutations + against ``dq_role_mappings_history``. Returned by ``GET /v1/roles/history`` + (Admin only) for the admin Settings page audit panel. + """ + + role: str = Field(description="Role name affected by the change") + group_name: str = Field(description="Databricks workspace group name affected by the change") + action: str = Field(description="Mutation kind: 'create' or 'delete'") + changed_by: str | None = Field( + default=None, + description=( + "Email of the admin who performed the change. ``null`` for legacy " + "delete rows recorded before the deleter email was wired through." + ), + ) + changed_at: str | None = Field( + default=None, + description="ISO-8601 timestamp of when the change was recorded.", + ) + + class GroupOut(BaseModel): display_name: str = Field(description="Group display name") id: str | None = Field(default=None, description="Group ID") diff --git a/app/src/databricks_labs_dqx_app/backend/pg_executor.py b/app/src/databricks_labs_dqx_app/backend/pg_executor.py index 056af9a4a..7d57c18ff 100644 --- a/app/src/databricks_labs_dqx_app/backend/pg_executor.py +++ b/app/src/databricks_labs_dqx_app/backend/pg_executor.py @@ -164,6 +164,12 @@ class PgExecutor: dialect: str = "postgres" + # Alias applied to the conflict target in :meth:`upsert_with_audit` so a + # self-referencing increment can address the existing row without a + # schema-qualified reference (which Postgres rejects in DO UPDATE SET). + # Deliberately prefixed to avoid colliding with any real relation name. + _UPSERT_TARGET_ALIAS: str = "dqx_upsert_target" + def __init__( self, *, @@ -515,9 +521,19 @@ def upsert_with_audit( """Postgres ``INSERT ... ON CONFLICT ... DO UPDATE`` with audit semantics. See :meth:`backend.sql_executor.OltpExecutorProtocol.upsert_with_audit` - for the contract. The increment self-reference uses the bare - column form (``"col" + 1``) which Postgres resolves against - the existing row inside DO UPDATE — no table prefix needed. + for the contract. The increment self-reference must point at the + *existing* row (not ``EXCLUDED``, which carries the proposed + value). A bare column name (``"col" = "col" + 1``) is ambiguous + when the same column also appears in ``EXCLUDED``, but a + schema-qualified reference (``"dq"."tbl"."col"``) is **not** a + valid existing-row reference inside ``DO UPDATE SET`` — Postgres + resolves it as a FROM-clause entry and errors with "invalid + reference to FROM-clause entry for table ...". We therefore alias + the conflict target (``INSERT INTO AS ``, supported + since the ON CONFLICT feature shipped in PG 9.5) and reference the + alias, which resolves unambiguously to the existing row. Aliasing + also sidesteps having to recover the bare relation name from the + already-quoted FQN string. """ if not key_cols: raise ValueError("upsert_with_audit requires at least one key column") @@ -531,19 +547,25 @@ def upsert_with_audit( all_vals = [_pg_render_value(v) for v in list(key_cols.values()) + list(value_cols.values())] quoted_cols = [self.q(c) for c in all_cols] quoted_keys = [self.q(c) for c in key_cols] + alias = self.q(self._UPSERT_TARGET_ALIAS) # DO UPDATE SET excludes created_* columns when preserve_created; - # the increment column (if any) gets the bare-column - # self-reference form Postgres resolves inside ON CONFLICT DO - # UPDATE — no table prefix needed. + # the increment column (if any) references the aliased conflict + # target so it resolves to the existing row rather than EXCLUDED. update_pairs: list[str] = [] + needs_alias = False for col, val in value_cols.items(): if preserve_created and col.startswith("created_"): continue qcol = self.q(col) if col == increment_on_update: - # Bare column reference resolves to the existing row. - update_pairs.append(f"{qcol} = {qcol} + 1") + # Alias-qualified reference resolves unambiguously to the + # existing row — even when ``EXCLUDED`` also exposes the + # same column name (which is always the case for + # ``increment_on_update`` since it has to be in + # ``value_cols``). + update_pairs.append(f"{qcol} = {alias}.{qcol} + 1") + needs_alias = True else: update_pairs.append(f"{qcol} = {_pg_render_value(val)}") @@ -555,7 +577,12 @@ def upsert_with_audit( # "create-if-missing" audit row). conflict_clause = f"ON CONFLICT ({', '.join(quoted_keys)}) DO NOTHING" - sql = f"INSERT INTO {table} ({', '.join(quoted_cols)}) " f"VALUES ({', '.join(all_vals)}) " f"{conflict_clause}" + # Only alias the target when a self-reference needs it, so the + # non-increment path keeps its existing rendered shape. + target = f"{table} AS {alias}" if needs_alias else table + sql = ( + f"INSERT INTO {target} ({', '.join(quoted_cols)}) " f"VALUES ({', '.join(all_vals)}) " f"{conflict_clause}" + ) self.execute(sql, timeout_seconds=timeout_seconds) def select_json_text(self, col: str) -> str: diff --git a/app/src/databricks_labs_dqx_app/backend/routes/v1/__init__.py b/app/src/databricks_labs_dqx_app/backend/routes/v1/__init__.py index 48832e9ba..269862ef3 100644 --- a/app/src/databricks_labs_dqx_app/backend/routes/v1/__init__.py +++ b/app/src/databricks_labs_dqx_app/backend/routes/v1/__init__.py @@ -3,6 +3,7 @@ from .me import router as me_router from .check_functions import router as check_functions_router from .config import router as config_router +from .contract import router as contract_router from .discovery import router as discovery_router from .generate import router as generate_router from .rules import router as rules_router @@ -14,6 +15,7 @@ from .comments import router as comments_router from .quarantine import router as quarantine_router from .metrics import router as metrics_router +from .review_status import router as review_status_router from .schedules import router as schedules_router v1_router = APIRouter() @@ -23,6 +25,7 @@ v1_router.include_router(roles_router, prefix="/roles", tags=["roles"]) v1_router.include_router(discovery_router, prefix="/discovery", tags=["discovery"]) v1_router.include_router(generate_router, prefix="/ai", tags=["ai"]) +v1_router.include_router(contract_router, prefix="/contract", tags=["contract"]) v1_router.include_router(rules_router, prefix="/rules", tags=["rules"]) v1_router.include_router(import_rules_router, prefix="/rules", tags=["rules"]) v1_router.include_router(check_functions_router, prefix="/check-functions", tags=["check-functions"]) @@ -32,3 +35,4 @@ v1_router.include_router(comments_router, prefix="/comments", tags=["comments"]) v1_router.include_router(quarantine_router, prefix="/quarantine", tags=["quarantine"]) v1_router.include_router(metrics_router, prefix="/metrics", tags=["metrics"]) +v1_router.include_router(review_status_router, prefix="/runs", tags=["review-status"]) diff --git a/app/src/databricks_labs_dqx_app/backend/routes/v1/check_functions.py b/app/src/databricks_labs_dqx_app/backend/routes/v1/check_functions.py index 195d5a686..ffea9ebef 100644 --- a/app/src/databricks_labs_dqx_app/backend/routes/v1/check_functions.py +++ b/app/src/databricks_labs_dqx_app/backend/routes/v1/check_functions.py @@ -8,10 +8,15 @@ The list is built once per process by introspecting :data:`databricks.labs.dqx.rule.CHECK_FUNC_REGISTRY` plus -:func:`inspect.signature` on each registered callable. We deliberately -omit cross-table dataset checks (``foreign_key``, ``compare_datasets``, -``sql_query``) because they require a reference dataset and therefore -belong to the cross-table-rules editor, not the single-table editor. +:func:`inspect.signature` on each registered callable. + +Reference-table checks that still validate a *single* target table +(``foreign_key``, ``has_valid_schema``) ARE surfaced here: from the +user's point of view the reference table is just another argument, so +they live in the single-table editor alongside row checks. Only the +genuinely multi-dataset checks (``compare_datasets``) and the raw SQL +editor's ``sql_query`` are omitted — those belong to the cross-table +editor. """ from __future__ import annotations @@ -39,16 +44,25 @@ # Functions that are intentionally hidden from the single-table-rules editor. -# All three need a *reference* dataset (or are handled via the cross-table SQL -# editor) and therefore don't fit the single-table flow. +# ``compare_datasets`` genuinely compares two datasets row-for-row, and +# ``sql_query`` is authored in the dedicated cross-table SQL editor — neither +# fits the single-table flow. Reference-table checks that still target one +# table (``foreign_key``, ``has_valid_schema``) are NOT hidden: the reference +# table is surfaced as a normal argument in the single-table editor. _HIDDEN_FROM_SINGLE_TABLE: frozenset[str] = frozenset( { - "foreign_key", "compare_datasets", "sql_query", } ) +# Parameters the single-table editor can't populate and therefore drops: +# * ``row_filter`` — injected by the engine, not user-facing. +# * ``ref_df_name`` — names an in-memory reference DataFrame, which the +# stateless app never has; reference data is always addressed by +# ``ref_table`` instead. +_HIDDEN_PARAMS: frozenset[str] = frozenset({"row_filter", "ref_df_name"}) + # Hand-curated mapping from function name to UX category. New DQX checks # fall back to ``"Other"`` until added here — that's intentional: the UI @@ -106,6 +120,8 @@ "is_unique": "Uniqueness", # Schema "has_valid_schema": "Schema", + # Reference table + "foreign_key": "Reference Table", # Custom SQL "sql_expression": "Custom SQL", # Anomaly Detection (optional module) @@ -160,6 +176,13 @@ def _classify_param_kind(name: str, annotation: object) -> str: has_list = "list" in lowered has_column = "column" in lowered # captures pyspark Column + # Reference-table arguments (foreign_key / has_valid_schema). The UI + # renders ``ref_table`` as a catalog/table picker and ``ref_columns`` as a + # column CSV, so they get dedicated kinds rather than the generic ones. + if name == "ref_table": + return "ref_table" + if name == "ref_columns": + return "ref_columns" # Composite-key column input (e.g. ``is_unique(columns: list[str | Column])``). if name == "columns" and has_list and has_column: return "columns" @@ -295,22 +318,13 @@ def _introspect_check_functions() -> tuple[CheckFunctionDef, ...]: logger.warning("inspect.signature failed for %r: %s", name, exc) continue params: list[CheckFunctionParam] = [] - skip_function = False for param_name, param in sig.parameters.items(): - # Filter out internal-only parameters that the engine injects - # (the rule editor doesn't know how to populate them). - if param_name in {"row_filter"}: + # Drop parameters the editor can't populate (engine-injected or + # in-memory-DataFrame references). Reference *tables* are kept and + # rendered as a picker — see ``_HIDDEN_PARAMS``. + if param_name in _HIDDEN_PARAMS: continue params.append(_build_param(param)) - # Cross-table dataset checks always carry a ref-table parameter; - # using the parameter name as a defensive escape hatch lets us - # auto-hide future ``foreign_key``-style additions even if we - # forget to add them to ``_HIDDEN_FROM_SINGLE_TABLE``. - if param_name.startswith("ref_") and param_name not in {"ref_columns"}: - skip_function = True - break - if skip_function: - continue out.append( CheckFunctionDef( name=name, diff --git a/app/src/databricks_labs_dqx_app/backend/routes/v1/config.py b/app/src/databricks_labs_dqx_app/backend/routes/v1/config.py index 78a8e5d4e..2cdb1f2c1 100644 --- a/app/src/databricks_labs_dqx_app/backend/routes/v1/config.py +++ b/app/src/databricks_labs_dqx_app/backend/routes/v1/config.py @@ -1,10 +1,12 @@ import json +import os import re from typing import Annotated from fastapi import APIRouter, Depends, HTTPException from databricks_labs_dqx_app.backend.common.authorization import UserRole, get_user_email +from databricks_labs_dqx_app.backend.config import conf from databricks_labs_dqx_app.backend.dependencies import get_app_settings_service, require_role from databricks_labs_dqx_app.backend.logger import logger from pydantic import BaseModel, Field @@ -87,7 +89,7 @@ def _notify_scheduler() -> None: @router.get( "", response_model=ConfigOut, - operation_id="config", + operation_id="getConfig", dependencies=[require_role(UserRole.ADMIN)], ) def get_config( @@ -537,3 +539,293 @@ def save_custom_metrics( saved = svc.save_custom_metrics(cleaned, user_email=email) logger.info("Saved %d custom metric expression(s)", len(saved)) return CustomMetricsOut(metrics=saved) + + +# ---------------------------------------------------------------------- +# Embedded dashboard — the Insights page renders a Databricks AI/BI +# dashboard inside an iframe. Admins set the dashboard ID (and an +# optional display title) here; the GET endpoint falls back to the env +# default (``conf.default_dashboard_id`` from ``DQX_DEFAULT_DASHBOARD_ID``) +# so the bundle can ship a starter dashboard without preventing +# customer overrides. The workspace host is read from +# ``DATABRICKS_HOST`` (always set inside a Databricks App container) +# and included in the response so the frontend can build the embed +# URL without a second roundtrip. +# ---------------------------------------------------------------------- + +# Conservative ID validation: Databricks AI/BI dashboard IDs are +# UUIDs or shorter slugs, so we accept letters, digits, hyphens, and +# underscores. We deliberately reject anything that could be a URL +# fragment or path traversal so admins can't accidentally paste a full +# URL and break iframe rendering downstream. +_DASHBOARD_ID_RE = re.compile(r"^[A-Za-z0-9_-]{1,128}$") + + +class EmbeddedDashboardOut(BaseModel): + """Current embedded-dashboard configuration + the bits the UI needs to render the iframe.""" + + dashboard_id: str = Field( + default="", + description="Effective dashboard ID. Empty string means 'nothing configured'.", + ) + title: str | None = Field( + default=None, + description="Optional admin-provided display title. The UI falls back to a generic label when null.", + ) + workspace_host: str = Field( + default="", + description="Workspace host (e.g. 'https://e2-...cloud.databricks.com') used to build the iframe URL.", + ) + is_set: bool = Field( + default=False, + description="True when the admin has saved an explicit setting (independent of the env default).", + ) + is_default: bool = Field( + default=False, + description="True when the response is serving the env-provided default rather than an admin override.", + ) + + +class EmbeddedDashboardIn(BaseModel): + """Update payload — admins write the dashboard ID and optionally a display title.""" + + dashboard_id: str + title: str | None = None + + +def _workspace_host() -> str: + """Read the workspace host from the env Databricks Apps populates at runtime. + + Returns an empty string if unset (e.g. local dev without DATABRICKS_HOST); + the UI handles this by showing a config-required message rather than a + broken iframe. + """ + host = (os.environ.get("DATABRICKS_HOST") or "").strip() + if host and not host.startswith(("http://", "https://")): + host = f"https://{host}" + return host.rstrip("/") + + +@router.get( + "/embedded-dashboard", + response_model=EmbeddedDashboardOut, + operation_id="getEmbeddedDashboard", +) +def get_embedded_dashboard( + svc: Annotated[AppSettingsService, Depends(get_app_settings_service)], +) -> EmbeddedDashboardOut: + """Return the current embedded-dashboard config. + + Available to any authenticated user — the Insights page is read-only + and the underlying dashboard enforces UC permissions on the data, + so we don't gate visibility here. + """ + saved = svc.get_embedded_dashboard() + workspace_host = _workspace_host() + if saved: + return EmbeddedDashboardOut( + dashboard_id=saved["dashboard_id"], + title=saved.get("title"), + workspace_host=workspace_host, + is_set=True, + is_default=False, + ) + env_default = (conf.default_dashboard_id or "").strip() + return EmbeddedDashboardOut( + dashboard_id=env_default, + title=None, + workspace_host=workspace_host, + is_set=False, + is_default=bool(env_default), + ) + + +@router.put( + "/embedded-dashboard", + response_model=EmbeddedDashboardOut, + operation_id="saveEmbeddedDashboard", + dependencies=[require_role(UserRole.ADMIN)], +) +def save_embedded_dashboard( + body: EmbeddedDashboardIn, + svc: Annotated[AppSettingsService, Depends(get_app_settings_service)], + email: Annotated[str, Depends(get_user_email)], +) -> EmbeddedDashboardOut: + """Save the embedded-dashboard configuration (admin only).""" + dashboard_id = (body.dashboard_id or "").strip() + if not dashboard_id: + raise HTTPException(status_code=400, detail="dashboard_id is required.") + if not _DASHBOARD_ID_RE.match(dashboard_id): + raise HTTPException( + status_code=400, + detail=( + "Invalid dashboard_id. Paste the ID portion only " + "(letters, digits, hyphens, underscores; up to 128 chars) — " + "not a full dashboard URL." + ), + ) + title = (body.title or "").strip() or None + if title and len(title) > 200: + raise HTTPException(status_code=400, detail="title must be 200 characters or fewer.") + + svc.save_embedded_dashboard(dashboard_id, title, user_email=email) + logger.info("Saved embedded dashboard id=%s title=%r (by=%s)", dashboard_id, title, email) + return EmbeddedDashboardOut( + dashboard_id=dashboard_id, + title=title, + workspace_host=_workspace_host(), + is_set=True, + is_default=False, + ) + + +@router.delete( + "/embedded-dashboard", + response_model=EmbeddedDashboardOut, + operation_id="deleteEmbeddedDashboard", + dependencies=[require_role(UserRole.ADMIN)], +) +def delete_embedded_dashboard( + svc: Annotated[AppSettingsService, Depends(get_app_settings_service)], + email: Annotated[str, Depends(get_user_email)], +) -> EmbeddedDashboardOut: + """Clear the admin override (admin only). + + The env-provided default — if any — takes over again. Useful when + the bundle ships a starter dashboard and the admin wants to revert + to it after a botched custom ID. + """ + svc.delete_embedded_dashboard(user_email=email) + logger.info("Cleared embedded dashboard override (by=%s)", email) + return get_embedded_dashboard(svc) + + +# ---------------------------------------------------------------------- +# Run review statuses — admin-managed catalogue of values for the per-run +# review label shown on the Runs detail page and filterable on the Runs +# History page. Stored as a JSON list under ``run_review_statuses_v1`` +# (see :meth:`AppSettingsService.get_run_review_statuses`). The catalogue +# always includes exactly one entry marked ``is_default``; the service +# enforces that invariant on save so the Runs detail dropdown is never +# empty and listing endpoints always have something to surface for +# unreviewed runs. +# ---------------------------------------------------------------------- + +# Conservative value validation: the catalogue values become filter +# chips on the History page and audit-log strings, so we want them +# printable and short. Letters/digits/spaces/hyphens/underscores covers +# "Pending review", "Acknowledged", "False positive", custom domain +# terms, while rejecting newlines, control chars, and quote characters +# that would break our raw-SQL escape path on the history table. +_REVIEW_STATUS_VALUE_RE = re.compile(r"^[A-Za-z0-9][A-Za-z0-9 _\-/.]{0,79}$") + + +class RunReviewStatusOption(BaseModel): + """One catalogue entry. ``is_default`` flags the value auto-surfaced for unreviewed runs.""" + + value: str + description: str = "" + color: str = "gray" + is_default: bool = False + + +class RunReviewStatusesOut(BaseModel): + statuses: list[RunReviewStatusOption] + + +class RunReviewStatusesIn(BaseModel): + statuses: list[RunReviewStatusOption] + + +def _statuses_to_out(entries: list[dict]) -> RunReviewStatusesOut: + """Coerce the service's dict-shaped entries into the pydantic out model.""" + return RunReviewStatusesOut( + statuses=[ + RunReviewStatusOption( + value=e.get("value") or "", + description=e.get("description") or "", + color=e.get("color") or "gray", + is_default=bool(e.get("is_default")), + ) + for e in entries + ] + ) + + +@router.get( + "/run-review-statuses", + response_model=RunReviewStatusesOut, + operation_id="getRunReviewStatuses", +) +def get_run_review_statuses( + svc: Annotated[AppSettingsService, Depends(get_app_settings_service)], +) -> RunReviewStatusesOut: + """Return the admin-managed list of run review status values. + + Visible to any authenticated user — both the Runs detail dropdown + and the Runs History filter need this list, and neither is + admin-gated. The list is seeded on first read so the UI never + sees an empty dropdown on a fresh deploy. + """ + return _statuses_to_out(svc.get_run_review_statuses()) + + +@router.put( + "/run-review-statuses", + response_model=RunReviewStatusesOut, + operation_id="saveRunReviewStatuses", + dependencies=[require_role(UserRole.ADMIN)], +) +def save_run_review_statuses( + body: RunReviewStatusesIn, + svc: Annotated[AppSettingsService, Depends(get_app_settings_service)], + email: Annotated[str, Depends(get_user_email)], +) -> RunReviewStatusesOut: + """Replace the full catalogue (admin only). + + Each value is validated against ``_REVIEW_STATUS_VALUE_RE`` (printable + short strings, no quotes/control chars), descriptions are trimmed, + duplicates are rejected, and exactly one ``is_default`` is enforced + by :meth:`AppSettingsService.save_run_review_statuses`. + + NOTE: renaming an existing value does *not* update historical + references in ``dq_run_review_status`` / ``dq_run_review_status_history``; + those rows keep the original string so the audit trail stays + accurate. The UI surfaces orphaned historical values as-is. If a + workspace operator wants to retire a value cleanly they should + either keep it in the list (without ``is_default``) until the + affected runs age out, or do a one-off UPDATE through the SQL + warehouse. + """ + cleaned_payload: list[dict] = [] + for option in body.statuses or []: + value = (option.value or "").strip() + if not value: + raise HTTPException(status_code=400, detail="Run review status 'value' cannot be blank.") + if not _REVIEW_STATUS_VALUE_RE.match(value): + raise HTTPException( + status_code=400, + detail=( + f"Invalid review status value {value!r}. Use printable ASCII " + "(letters, digits, spaces, hyphens, underscores, dots, slashes), " + "1–80 characters, starting with a letter or digit." + ), + ) + cleaned_payload.append( + { + "value": value, + "description": (option.description or "").strip(), + "color": (option.color or "gray").strip() or "gray", + "is_default": bool(option.is_default), + } + ) + + try: + saved = svc.save_run_review_statuses(cleaned_payload, user_email=email) + except ValueError as e: + # The service's invariants (at-least-one entry, unique values, + # exactly one default) surface as ValueError. Route surface + # them as 400 so the UI can show the human-readable message. + raise HTTPException(status_code=400, detail=str(e)) + logger.info("Saved %d run review status(es)", len(saved)) + return _statuses_to_out(saved) diff --git a/app/src/databricks_labs_dqx_app/backend/routes/v1/contract.py b/app/src/databricks_labs_dqx_app/backend/routes/v1/contract.py new file mode 100644 index 000000000..460294181 --- /dev/null +++ b/app/src/databricks_labs_dqx_app/backend/routes/v1/contract.py @@ -0,0 +1,99 @@ +"""ODCS data-contract → DQX rule generation endpoint. + +Mirrors the AI-assisted generation router (`/v1/ai`) shape but for +contract-based generation. Read-only — the user picks a UC target per +schema in the UI and then saves through the existing +``POST /v1/rules`` flow. +""" + +from typing import Annotated + +from fastapi import APIRouter, Depends, HTTPException + +from databricks_labs_dqx_app.backend.common.authorization import UserRole +from databricks_labs_dqx_app.backend.dependencies import ( + get_contract_rules_service, + require_role, +) +from databricks_labs_dqx_app.backend.logger import logger +from databricks_labs_dqx_app.backend.models import ( + ContractMetadataOut, + ContractSchemaRulesOut, + GenerateRulesFromContractIn, + GenerateRulesFromContractOut, +) +from databricks_labs_dqx_app.backend.services.contract_rules_service import ( + ContractGenerationResult, + ContractRulesService, +) + +router = APIRouter() + +_AUTHORS_AND_ABOVE = [UserRole.ADMIN, UserRole.RULE_APPROVER, UserRole.RULE_AUTHOR] + + +@router.post( + "/generate-rules", + response_model=GenerateRulesFromContractOut, + operation_id="generateRulesFromContract", + dependencies=[require_role(*_AUTHORS_AND_ABOVE)], +) +def generate_rules_from_contract( + body: GenerateRulesFromContractIn, + service: Annotated[ContractRulesService, Depends(get_contract_rules_service)], +) -> GenerateRulesFromContractOut: + """Parse an ODCS v3.x contract and return generated DQX rules grouped per schema.""" + if body.default_criticality not in ("error", "warn"): + raise HTTPException( + status_code=400, + detail="default_criticality must be 'error' or 'warn'", + ) + try: + result = service.generate( + contract_text=body.contract_text, + generate_predefined_rules=body.generate_predefined_rules, + process_text_rules=body.process_text_rules, + generate_schema_validation=body.generate_schema_validation, + strict_schema_validation=body.strict_schema_validation, + default_criticality=body.default_criticality, + ) + return _to_out(result) + except ValueError as e: + raise HTTPException(status_code=400, detail=str(e)) + except RuntimeError as e: + # Missing extras or generator runtime errors — surface as 500. + logger.error("Contract generation runtime error: %s", e, exc_info=True) + raise HTTPException(status_code=500, detail=str(e)) + except Exception as e: # pragma: no cover - defensive guard + # Don't relay the raw exception text — it may carry LLM/SDK detail or + # internal structure (AGENTS.md LLM06 / CWE-209). Log it server-side. + logger.error("Failed to generate rules from contract: %s", e, exc_info=True) + raise HTTPException(status_code=500, detail="Failed to generate rules from contract.") + + +def _to_out(result: ContractGenerationResult) -> GenerateRulesFromContractOut: + return GenerateRulesFromContractOut( + metadata=ContractMetadataOut( + contract_id=result.metadata.contract_id, + name=result.metadata.name, + version=result.metadata.version, + odcs_api_version=result.metadata.odcs_api_version, + status=result.metadata.status, + owner=result.metadata.owner, + domain=result.metadata.domain, + description=result.metadata.description, + ), + schemas=[ + ContractSchemaRulesOut( + schema_name=s.schema_name, + physical_name=s.physical_name, + property_count=s.property_count, + rules=s.rules, + ) + for s in result.schemas + ], + unassigned_rules=result.unassigned_rules, + total_rules=result.total_rules, + warnings=result.warnings, + validation_errors=result.validation_errors, + ) diff --git a/app/src/databricks_labs_dqx_app/backend/routes/v1/discovery.py b/app/src/databricks_labs_dqx_app/backend/routes/v1/discovery.py index e518d7daf..616c7e4aa 100644 --- a/app/src/databricks_labs_dqx_app/backend/routes/v1/discovery.py +++ b/app/src/databricks_labs_dqx_app/backend/routes/v1/discovery.py @@ -1,7 +1,7 @@ import asyncio from typing import Annotated -from fastapi import APIRouter, Depends, HTTPException +from fastapi import APIRouter, Depends, HTTPException, Query from databricks_labs_dqx_app.backend.dependencies import get_discovery_service from databricks_labs_dqx_app.backend.logger import logger @@ -12,6 +12,7 @@ FilterTablesByColumnsOut, SchemaOut, TableOut, + TableSchemaDdlOut, TableTagsOut, ) from databricks_labs_dqx_app.backend.services.discovery import DiscoveryService @@ -22,7 +23,7 @@ router = APIRouter() -@router.get("/catalogs", response_model=list[CatalogOut], operation_id="list_catalogs") +@router.get("/catalogs", response_model=list[CatalogOut], operation_id="listCatalogs") async def list_catalogs( discovery: Annotated[DiscoveryService, Depends(get_discovery_service)], ) -> list[CatalogOut]: @@ -37,7 +38,7 @@ async def list_catalogs( @router.get( "/catalogs/{catalog}/schemas", response_model=list[SchemaOut], - operation_id="list_schemas", + operation_id="listSchemas", ) async def list_schemas( catalog: str, @@ -56,7 +57,7 @@ async def list_schemas( @router.get( "/catalogs/{catalog}/schemas/{schema}/tables", response_model=list[TableOut], - operation_id="list_tables", + operation_id="listTables", ) async def list_tables( catalog: str, @@ -83,7 +84,7 @@ async def list_tables( @router.get( "/catalogs/{catalog}/schemas/{schema}/all-table-fqns", response_model=list[str], - operation_id="list_all_table_fqns", + operation_id="listAllTableFqns", ) async def list_all_table_fqns( catalog: str, @@ -102,7 +103,7 @@ async def list_all_table_fqns( @router.get( "/catalogs/{catalog}/schemas/{schema}/tables/{table}/columns", response_model=list[ColumnOut], - operation_id="get_table_columns", + operation_id="getTableColumns", ) async def get_table_columns( catalog: str, @@ -130,7 +131,7 @@ async def get_table_columns( @router.get( "/catalogs/{catalog}/schemas/{schema}/tables/{table}/tags", response_model=TableTagsOut, - operation_id="get_table_tags", + operation_id="getTableTags", ) async def get_table_tags( catalog: str, @@ -151,10 +152,40 @@ async def get_table_tags( raise HTTPException(status_code=500, detail=f"Failed to get table tags: {e}") +@router.get( + "/schema-ddl", + response_model=TableSchemaDdlOut, + operation_id="getTableSchemaDdl", +) +async def get_table_schema_ddl( + discovery: Annotated[DiscoveryService, Depends(get_discovery_service)], + table_fqn: Annotated[str, Query(description="Fully qualified table name (catalog.schema.table)")], +) -> TableSchemaDdlOut: + """Snapshot the current schema of a UC table as a Spark DDL string. + + Powers the "snapshot from table" affordance in the schema-validation + rule builder. Returned shape is the canonical + ``has_valid_schema`` ``expected_schema`` argument. + """ + fqn = (table_fqn or "").strip() + if fqn.count(".") != 2 or not all(p.strip() for p in fqn.split(".")): + raise HTTPException( + status_code=400, + detail="table_fqn must be a three-part name (catalog.schema.table)", + ) + try: + ddl = await discovery.get_table_schema_ddl_async(fqn) + except Exception as e: + logger.error("Failed to snapshot schema DDL for %s: %s", fqn, e, exc_info=True) + raise HTTPException(status_code=500, detail=f"Failed to snapshot schema: {e}") + column_count = sum(1 for piece in ddl.split(",") if piece.strip()) + return TableSchemaDdlOut(table_fqn=fqn, ddl=ddl, column_count=column_count) + + @router.post( "/filter-tables-by-columns", response_model=FilterTablesByColumnsOut, - operation_id="filter_tables_by_columns", + operation_id="filterTablesByColumns", ) async def filter_tables_by_columns( body: FilterTablesByColumnsIn, diff --git a/app/src/databricks_labs_dqx_app/backend/routes/v1/dryrun.py b/app/src/databricks_labs_dqx_app/backend/routes/v1/dryrun.py index f98ce3856..c10ef6d38 100644 --- a/app/src/databricks_labs_dqx_app/backend/routes/v1/dryrun.py +++ b/app/src/databricks_labs_dqx_app/backend/routes/v1/dryrun.py @@ -19,6 +19,7 @@ get_conf, get_job_service, get_obo_ws, + get_review_status_service, get_rules_catalog_service, get_sp_sql_executor, get_user_catalog_names, @@ -40,6 +41,7 @@ ) from databricks_labs_dqx_app.backend.services.job_service import JobService from databricks_labs_dqx_app.backend.run_status_manager import get_run_metadata, has_terminal_result, update_run_status +from databricks_labs_dqx_app.backend.services.review_status_service import ReviewStatusService from databricks_labs_dqx_app.backend.services.rules_catalog_service import RulesCatalogService from databricks_labs_dqx_app.backend.services.view_service import ViewService @@ -60,6 +62,27 @@ def _extract_sql_query(checks: list[dict[str, Any]]) -> str | None: return None +def _extract_schema_check_target(checks: list[dict[str, Any]]) -> str | None: + """Return the real Unity Catalog target table for a ``has_valid_schema`` rule. + + Schema-validation rules are dataset-level, so they're persisted under + the same ``__sql_check__/`` synthetic table_fqn as cross-table + SQL rules. The actual table the check should run against is carried + on ``user_metadata.target_table`` (stamped by the schema-rule + builder). This helper returns the first such target so the runner + knows which table to materialise a view from. + """ + for check in checks: + fn = (check.get("check") or {}).get("function", "") + if fn != "has_valid_schema": + continue + meta = check.get("user_metadata") or {} + target = meta.get("target_table") + if isinstance(target, str) and target.strip(): + return target.strip() + return None + + def _catalog_of(fqn: str) -> str: """Extract the catalog part from a fully qualified table name.""" if fqn.startswith(_SQL_CHECK_PREFIX): @@ -76,18 +99,76 @@ def _catalog_of(fqn: str) -> str: ) async def list_validation_runs( job_svc: Annotated[JobService, Depends(get_job_service)], + review_svc: Annotated[ReviewStatusService, Depends(get_review_status_service)], app_conf: Annotated[AppConfig, Depends(get_conf)], user_catalogs: Annotated[frozenset[str], Depends(get_user_catalog_names)], + review_status: Annotated[ + list[str] | None, + Query( + description=( + "Filter to runs whose effective review status matches one of " + "the supplied values. Repeat the param for multi-select " + "(e.g. ?review_status=Acknowledged&review_status=Resolved). " + "Match is on the effective value, so passing the catalogue " + "default also catches unreviewed runs." + ), + ), + ] = None, ) -> list[ValidationRunSummaryOut]: """Return validation (dry-run) history filtered to user-accessible catalogs.""" try: table = f"{app_conf.catalog}.{app_conf.schema_name}.dq_validation_runs" rows = job_svc.list_dryrun_rows(table) - results: list[ValidationRunSummaryOut] = [] + + # First-pass filter on UC visibility — we don't want to bulk-fetch + # review statuses for runs the caller can't see anyway. Build the + # candidate list in the same order so the final response stays + # sorted by ``created_at DESC`` (already applied in the SQL). + candidates: list[dict[str, str | None]] = [] for row in rows: fqn = row.get("source_table_fqn") or "" if not fqn.startswith(_SQL_CHECK_PREFIX) and _catalog_of(fqn) not in user_catalogs: continue + candidates.append(row) + + # Bulk-fetch review statuses for every visible run in one query. + # Lakebase/Delta-OLTP and dq_validation_runs may live on different + # backends, so we can't JOIN at the SQL layer — merging in Python + # is what keeps this dialect-portable. + candidate_run_ids = [row.get("run_id") or "" for row in candidates if row.get("run_id")] + try: + review_map = review_svc.bulk_get_effective(candidate_run_ids) + except Exception as exc: + logger.warning( + "Failed to bulk-fetch review statuses (rendering without): %s", + exc, + exc_info=True, + ) + review_map = {} + + # Normalise the optional multi-select filter into a set for O(1) + # checks. Empty strings (e.g. trailing ``?review_status=``) are + # dropped so a forgotten chip never accidentally filters + # everything out. + review_filter: set[str] | None + if review_status: + review_filter = {s.strip() for s in review_status if s and s.strip()} + if not review_filter: + review_filter = None + else: + review_filter = None + + results: list[ValidationRunSummaryOut] = [] + for row in candidates: + fqn = row.get("source_table_fqn") or "" + run_id = row.get("run_id") or "" + review = review_map.get(run_id) + review_value = review.status if review else None + + if review_filter is not None: + if not review_value or review_value not in review_filter: + continue + checks: list[dict[str, Any]] = [] raw = row.get("checks_json") if raw: @@ -99,7 +180,7 @@ async def list_validation_runs( pass results.append( ValidationRunSummaryOut( - run_id=row.get("run_id") or "", + run_id=run_id, source_table_fqn=fqn, status=row.get("status"), requesting_user=row.get("requesting_user"), @@ -117,6 +198,10 @@ async def list_validation_runs( run_type=row.get("run_type"), error_message=row.get("error_message"), checks=checks, + review_status=review_value, + review_status_is_default=bool(review.is_default) if review else False, + review_status_updated_by=review.updated_by if review else None, + review_status_updated_at=review.updated_at if review else None, ) ) return results @@ -163,43 +248,82 @@ def batch_run_from_catalog( continue run_id = uuid4().hex[:16] - is_sql_check = table_fqn.startswith(_SQL_CHECK_PREFIX) - - if is_sql_check: + # Rules under ``__sql_check__/`` are dataset-level, but + # the *kind* of dataset rule decides how we materialise the + # input view: + # * ``sql_query`` → run the embedded query and use + # the result rows as violations (existing path). + # * ``has_valid_schema`` → the rule runs against a real UC + # table. We snapshot the target table into a temp view + # and execute the standard row-level engine (which + # handles ``has_valid_schema`` natively as a dataset + # check). ``is_sql_check`` stays ``False`` so the runner + # uses the engine path, not the SQL-violation shortcut. + # ``source_table_fqn`` keeps the synthetic key so the rule's + # run history still groups by the catalog entry. + is_synthetic = table_fqn.startswith(_SQL_CHECK_PREFIX) + sql_query: str | None = None + schema_target: str | None = None + if is_synthetic: sql_query = _extract_sql_query(approved_checks) - if not sql_query: - errors.append(f"{table_fqn}: SQL check has no query") - continue - view_fqn = view_svc.create_view_from_sql(sql_query) + if sql_query: + view_fqn = view_svc.create_view_from_sql(sql_query) + else: + schema_target = _extract_schema_check_target(approved_checks) + if not schema_target: + errors.append( + f"{table_fqn}: dataset-level rule must provide either a sql_query " + "or a has_valid_schema check with user_metadata.target_table" + ) + continue + view_fqn = view_svc.create_view(schema_target) else: view_fqn = view_svc.create_view(table_fqn) - config: dict[str, Any] = { - "checks": approved_checks, - "sample_size": body.sample_size, - "source_table_fqn": table_fqn, - "is_sql_check": is_sql_check, - } - if custom_metrics: - config["custom_metrics"] = custom_metrics - job_run_id = job_svc.submit_run( - task_type="dryrun", - view_fqn=view_fqn, - config=config, - run_id=run_id, - requesting_user=requesting_user, - ) - submitted.append(DryRunSubmitOut(run_id=run_id, job_run_id=job_run_id, view_fqn=view_fqn)) + # See submit_dry_run: anything past this point must clean up + # ``view_fqn`` on failure or we leak a temp view. + try: + config: dict[str, Any] = { + "checks": approved_checks, + "sample_size": body.sample_size, + "source_table_fqn": table_fqn, + # Only true SQL queries take the SQL fast-path in the + # runner — schema-validation rules go through the normal + # row-level engine even though they share the synthetic + # ``__sql_check__/`` prefix. + "is_sql_check": sql_query is not None, + } + if custom_metrics: + config["custom_metrics"] = custom_metrics + job_run_id = job_svc.submit_run( + task_type="dryrun", + view_fqn=view_fqn, + config=config, + run_id=run_id, + requesting_user=requesting_user, + ) + submitted.append(DryRunSubmitOut(run_id=run_id, job_run_id=job_run_id, view_fqn=view_fqn)) - job_svc.record_dryrun_started( - table=runs_table, - run_id=run_id, - requesting_user=requesting_user, - source_table_fqn=table_fqn, - view_fqn=view_fqn, - sample_size=body.sample_size, - job_run_id=job_run_id, - ) + job_svc.record_dryrun_started( + table=runs_table, + run_id=run_id, + requesting_user=requesting_user, + source_table_fqn=table_fqn, + view_fqn=view_fqn, + sample_size=body.sample_size, + job_run_id=job_run_id, + ) + except Exception: + try: + view_svc.drop_view(view_fqn) + except Exception as cleanup_err: + logger.warning( + "Failed to drop temp view %s after submit failure for %s: %s", + view_fqn, + table_fqn, + cleanup_err, + ) + raise except Exception as e: logger.error("Failed to submit run for %s: %s", table_fqn, e, exc_info=True) errors.append(f"{table_fqn}: {e}") @@ -238,52 +362,87 @@ def submit_dry_run( user = obo_ws.current_user.me() requesting_user = user.user_name or "unknown" - # Create view using OBO token — inherits user's table permissions - is_sql_check = body.table_fqn.startswith(_SQL_CHECK_PREFIX) - if is_sql_check: + # Create view using OBO token — inherits user's table permissions. + # For the synthetic ``__sql_check__/`` namespace we accept + # both real cross-table SQL checks (build a view from the SQL) + # and schema-validation checks (build a view from the real + # ``user_metadata.target_table`` so the row-level engine can run + # ``has_valid_schema``). See ``batch_run_from_catalog`` for the + # mirroring scheduled-run path. + is_synthetic = body.table_fqn.startswith(_SQL_CHECK_PREFIX) + sql_query: str | None = None + if is_synthetic: sql_query = _extract_sql_query(body.checks) - if not sql_query: - raise HTTPException(status_code=400, detail="SQL check has no query") - view_fqn = view_svc.create_view_from_sql(sql_query) + if sql_query: + view_fqn = view_svc.create_view_from_sql(sql_query) + else: + schema_target = _extract_schema_check_target(body.checks) + if not schema_target: + raise HTTPException( + status_code=400, + detail=( + "Dataset-level rule must provide either a sql_query or a " + "has_valid_schema check with user_metadata.target_table" + ), + ) + view_fqn = view_svc.create_view(schema_target) else: view_fqn = view_svc.create_view(body.table_fqn) - # Submit job using SP credentials - config: dict[str, Any] = { - "checks": body.checks, - "sample_size": body.sample_size, - "source_table_fqn": body.table_fqn, - "is_sql_check": is_sql_check, - } - if body.skip_history: - config["skip_history"] = True - # Skip custom metrics for preview runs — they're scoped to small - # samples and have no downstream dashboard consumers, so the - # extra observer columns just add noise. - if not body.skip_history: - custom_metrics = settings_svc.get_custom_metrics() - if custom_metrics: - config["custom_metrics"] = custom_metrics - job_run_id = job_svc.submit_run( - task_type="dryrun", - view_fqn=view_fqn, - config=config, - run_id=run_id, - requesting_user=requesting_user, - ) - - if not body.skip_history: - runs_table = f"{app_conf.catalog}.{app_conf.schema_name}.dq_validation_runs" - job_svc.record_dryrun_started( - table=runs_table, + # From here on, ``view_fqn`` is a side-effect on Unity Catalog + # that we own — if anything below raises, we MUST drop the view + # before propagating so we don't leak temp views in ``dqx_studio_tmp``. + # The terminal-status path in :func:`get_dry_run_status` handles + # cleanup for the happy case once the job finishes. + try: + config: dict[str, Any] = { + "checks": body.checks, + "sample_size": body.sample_size, + "source_table_fqn": body.table_fqn, + # SQL fast-path only fires for actual sql_query checks — + # schema-validation rules under the synthetic prefix still + # use the row-level engine. + "is_sql_check": sql_query is not None, + } + if body.skip_history: + config["skip_history"] = True + # Skip custom metrics for preview runs — they're scoped to small + # samples and have no downstream dashboard consumers, so the + # extra observer columns just add noise. + if not body.skip_history: + custom_metrics = settings_svc.get_custom_metrics() + if custom_metrics: + config["custom_metrics"] = custom_metrics + job_run_id = job_svc.submit_run( + task_type="dryrun", + view_fqn=view_fqn, + config=config, run_id=run_id, requesting_user=requesting_user, - source_table_fqn=body.table_fqn, - view_fqn=view_fqn, - sample_size=body.sample_size, - job_run_id=job_run_id, ) + if not body.skip_history: + runs_table = f"{app_conf.catalog}.{app_conf.schema_name}.dq_validation_runs" + job_svc.record_dryrun_started( + table=runs_table, + run_id=run_id, + requesting_user=requesting_user, + source_table_fqn=body.table_fqn, + view_fqn=view_fqn, + sample_size=body.sample_size, + job_run_id=job_run_id, + ) + except Exception: + try: + view_svc.drop_view(view_fqn) + except Exception as cleanup_err: + logger.warning( + "Failed to drop temp view %s after submit failure: %s", + view_fqn, + cleanup_err, + ) + raise + return DryRunSubmitOut(run_id=run_id, job_run_id=job_run_id, view_fqn=view_fqn) except HTTPException: raise @@ -315,14 +474,19 @@ def get_dry_run_status( for validation dry runs that are not recorded in the history table. Ownership of the *job_run_id* is verified against the OBO caller via the Databricks Jobs API so a client cannot use a guessed *view_fqn* to drop - another user's temporary view. + another user's temporary view. We **fail closed**: if the job's + requesting-user attribution is missing (older run, SDK shape drift), + only admins/approvers may proceed. """ try: if job_run_id_param is not None: requesting_user = obo_ws.current_user.me().user_name or "unknown" run_owner = job_svc.get_run_creator(job_run_id_param) - if run_owner and run_owner != requesting_user: - raise HTTPException(status_code=403, detail="You can only check status of your own runs") + if run_owner != requesting_user: + raise HTTPException( + status_code=403, + detail="You can only check status of your own runs", + ) resolved_job_run_id = job_run_id_param resolved_view_fqn = view_fqn_param has_history_row = False @@ -367,13 +531,18 @@ def get_dry_run_status( logger.warning("Failed to clean up view %s: %s", resolved_view_fqn, cleanup_err) if has_history_row and is_terminal and status.state != "TERMINATED": + # INTERNAL_ERROR / SKIPPED are terminal lifecycle states that fall + # outside the run-history status vocabulary (SUCCESS/FAILED/CANCELED/ + # RUNNING). Writing them verbatim leaves rows downstream UI/queries + # don't recognise, so map them to FAILED (the synthesized-terminal + # path above already normalises to SUCCESS/FAILED). update_run_status( sql, app_conf, _DRYRUN_TABLE, run_id, - status=status.state, - error_message=status.message, + status="FAILED", + error_message=status.message or f"Run finished with state: {status.state}", ) elif has_history_row and is_terminal and status.result_state and status.result_state == "CANCELED": update_run_status( @@ -440,13 +609,20 @@ def cancel_dry_run( if job_run_id_param is not None: run_owner = job_svc.get_run_creator(job_run_id_param) - if run_owner and run_owner != canceling_user and not can_cancel_others: + # Fail closed: missing ``run_owner`` (older run, SDK drift) is + # treated the same as a non-match — only admins/approvers may + # cancel runs we cannot positively attribute to the caller. + is_owner = run_owner is not None and run_owner == canceling_user + if not is_owner and not can_cancel_others: raise HTTPException(status_code=403, detail="You can only cancel your own runs") job_svc.cancel_run(job_run_id_param) return {"status": "canceled", "run_id": run_id} meta = get_run_metadata(sql, app_conf, _DRYRUN_TABLE, run_id) - is_owner = not meta.requesting_user or meta.requesting_user == canceling_user + # Fail closed: an empty ``requesting_user`` (legacy row written + # before attribution was tracked) is treated as "not the caller", + # so admins/approvers are needed to cancel. + is_owner = bool(meta.requesting_user) and meta.requesting_user == canceling_user if not is_owner and not can_cancel_others: raise HTTPException(status_code=403, detail="You can only cancel your own runs") if meta.job_run_id is None: diff --git a/app/src/databricks_labs_dqx_app/backend/routes/v1/generate.py b/app/src/databricks_labs_dqx_app/backend/routes/v1/generate.py index a30fdc786..0747303dd 100644 --- a/app/src/databricks_labs_dqx_app/backend/routes/v1/generate.py +++ b/app/src/databricks_labs_dqx_app/backend/routes/v1/generate.py @@ -18,7 +18,7 @@ @router.post( "/generate-checks", response_model=GenerateChecksOut, - operation_id="ai_assisted_checks_generation", + operation_id="aiAssistedChecksGeneration", dependencies=[require_role(*_AUTHORS_AND_ABOVE)], ) def ai_generate_checks( diff --git a/app/src/databricks_labs_dqx_app/backend/routes/v1/me.py b/app/src/databricks_labs_dqx_app/backend/routes/v1/me.py index 74cbab4e8..bc766c2a7 100644 --- a/app/src/databricks_labs_dqx_app/backend/routes/v1/me.py +++ b/app/src/databricks_labs_dqx_app/backend/routes/v1/me.py @@ -19,7 +19,7 @@ router = APIRouter() -@router.get("/version", response_model=VersionOut, operation_id="version") +@router.get("/version", response_model=VersionOut, operation_id="getVersion") async def version(): return VersionOut.from_metadata() diff --git a/app/src/databricks_labs_dqx_app/backend/routes/v1/profiler.py b/app/src/databricks_labs_dqx_app/backend/routes/v1/profiler.py index 331c22b6a..aa74d500a 100644 --- a/app/src/databricks_labs_dqx_app/backend/routes/v1/profiler.py +++ b/app/src/databricks_labs_dqx_app/backend/routes/v1/profiler.py @@ -145,32 +145,44 @@ def submit_profile_run( # Create view using OBO token — inherits user's table permissions view_fqn = view_svc.create_view(body.table_fqn, sample_limit=body.sample_limit) - # Submit job using SP credentials - config = { - "sample_limit": body.sample_limit, - "source_table_fqn": body.table_fqn, - "columns": body.columns, - "profile_options": body.profile_options, - } - job_run_id = job_svc.submit_run( - task_type="profile", - view_fqn=view_fqn, - config=config, - run_id=run_id, - requesting_user=requesting_user, - ) + # From here on ``view_fqn`` is a UC side-effect we own. Any failure + # before we return MUST drop the view, otherwise a half-submitted + # run leaks a temp view in ``dqx_studio_tmp``. + try: + config = { + "sample_limit": body.sample_limit, + "source_table_fqn": body.table_fqn, + "columns": body.columns, + "profile_options": body.profile_options, + } + job_run_id = job_svc.submit_run( + task_type="profile", + view_fqn=view_fqn, + config=config, + run_id=run_id, + requesting_user=requesting_user, + ) - # Write a RUNNING placeholder row immediately so the job appears in history - results_table = f"{app_conf.catalog}.{app_conf.schema_name}.dq_profiling_results" - job_svc.record_run_started( - table=results_table, - run_id=run_id, - requesting_user=requesting_user, - source_table_fqn=body.table_fqn, - view_fqn=view_fqn, - sample_limit=body.sample_limit, - job_run_id=job_run_id, - ) + results_table = f"{app_conf.catalog}.{app_conf.schema_name}.dq_profiling_results" + job_svc.record_run_started( + table=results_table, + run_id=run_id, + requesting_user=requesting_user, + source_table_fqn=body.table_fqn, + view_fqn=view_fqn, + sample_limit=body.sample_limit, + job_run_id=job_run_id, + ) + except Exception: + try: + view_svc.drop_view(view_fqn) + except Exception as cleanup_err: + logger.warning( + "Failed to drop temp view %s after profile submit failure: %s", + view_fqn, + cleanup_err, + ) + raise return ProfileRunOut(run_id=run_id, job_run_id=job_run_id, view_fqn=view_fqn) except HTTPException: @@ -214,34 +226,47 @@ def submit_batch_profile_run( view_fqn = view_svc.create_view(table_fqn, sample_limit=body.sample_limit) - config = { - "sample_limit": body.sample_limit, - "source_table_fqn": table_fqn, - "columns": None, - "profile_options": body.profile_options, - } - job_run_id = job_svc.submit_run( - task_type="profile", - view_fqn=view_fqn, - config=config, - run_id=run_id, - requesting_user=requesting_user, - ) - - runs.append(ProfileRunOut(run_id=run_id, job_run_id=job_run_id, view_fqn=view_fqn)) - logger.info("Submitted batch profile run for %s (run_id=%s)", table_fqn, run_id) - - # Write RUNNING placeholder so job appears in history immediately - results_table = f"{app_conf.catalog}.{app_conf.schema_name}.dq_profiling_results" - job_svc.record_run_started( - table=results_table, - run_id=run_id, - requesting_user=requesting_user, - source_table_fqn=table_fqn, - view_fqn=view_fqn, - sample_limit=body.sample_limit, - job_run_id=job_run_id, - ) + # See submit_profile_run: anything past create_view must + # drop the view on failure or we leak it. + try: + config = { + "sample_limit": body.sample_limit, + "source_table_fqn": table_fqn, + "columns": None, + "profile_options": body.profile_options, + } + job_run_id = job_svc.submit_run( + task_type="profile", + view_fqn=view_fqn, + config=config, + run_id=run_id, + requesting_user=requesting_user, + ) + + runs.append(ProfileRunOut(run_id=run_id, job_run_id=job_run_id, view_fqn=view_fqn)) + logger.info("Submitted batch profile run for %s (run_id=%s)", table_fqn, run_id) + + results_table = f"{app_conf.catalog}.{app_conf.schema_name}.dq_profiling_results" + job_svc.record_run_started( + table=results_table, + run_id=run_id, + requesting_user=requesting_user, + source_table_fqn=table_fqn, + view_fqn=view_fqn, + sample_limit=body.sample_limit, + job_run_id=job_run_id, + ) + except Exception: + try: + view_svc.drop_view(view_fqn) + except Exception as cleanup_err: + logger.warning( + "Failed to drop temp view %s after batch profile submit failure for %s: %s", + view_fqn, + table_fqn, + cleanup_err, + ) + raise except Exception as table_err: # Classify the per-table failure so the response carries # a stable error code (``INSUFFICIENT_PERMISSIONS`` etc.) diff --git a/app/src/databricks_labs_dqx_app/backend/routes/v1/quarantine.py b/app/src/databricks_labs_dqx_app/backend/routes/v1/quarantine.py index ca0aab44c..77172b0fa 100644 --- a/app/src/databricks_labs_dqx_app/backend/routes/v1/quarantine.py +++ b/app/src/databricks_labs_dqx_app/backend/routes/v1/quarantine.py @@ -3,6 +3,7 @@ import csv import io import json +import re from typing import Annotated, Any from fastapi import APIRouter, Depends, HTTPException, Query @@ -18,6 +19,30 @@ _APPROVERS_AND_ABOVE = [UserRole.ADMIN, UserRole.RULE_APPROVER] +# Match DQX check identifiers exactly: leading letter / underscore, +# letters / digits / underscores after that. Validating up front keeps +# the value safe to inline into a Spark SQL literal further down without +# needing additional escaping. +_CHECK_NAME_RE = re.compile(r"^[A-Za-z_][A-Za-z0-9_]{0,127}$") + + +def _check_name_predicate(check_name: str) -> str: + """Build a Spark SQL predicate that matches quarantine rows whose + ``errors`` or ``warnings`` VARIANT array contains a struct with the + given ``name``. + + We project the VARIANT to a typed ``array>`` via + ``from_json(to_json(...))`` so the higher-order ``exists`` can run. + The ``or`` covers warning-level rules (which DQX writes to the + sibling ``warnings`` column starting in migration v4). + """ + return ( + "(EXISTS(from_json(to_json(errors), 'array>'), " + f"e -> e.name = '{check_name}') " + "OR EXISTS(from_json(to_json(warnings), 'array>'), " + f"w -> w.name = '{check_name}'))" + ) + def _query_quarantine( sql: SqlExecutor, @@ -25,14 +50,29 @@ def _query_quarantine( run_id: str, offset: int = 0, limit: int = 50, + check_name: str | None = None, ) -> tuple[list[dict[str, Any]], int]: - """Query quarantine records for a given run_id with pagination.""" + """Query quarantine records for a given run_id with pagination. + + When ``check_name`` is supplied, only rows where that DQX check + appears in the ``errors`` or ``warnings`` VARIANT payload are + returned. Both the COUNT and the data query apply the filter so + pagination stays consistent. + """ from databricks_labs_dqx_app.backend.sql_utils import escape_sql_string table = f"{app_conf.catalog}.{app_conf.schema_name}.dq_quarantine_records" er = escape_sql_string(run_id) - count_sql = f"SELECT COUNT(*) AS cnt FROM {table} WHERE run_id = '{er}'" # noqa: S608 + where = f"run_id = '{er}'" + if check_name: + if not _CHECK_NAME_RE.match(check_name): + # Caller-controlled input; refuse anything that doesn't look + # like a normal DQX identifier rather than risk injection. + raise ValueError(f"Invalid check_name: '{check_name}'") + where = f"{where} AND {_check_name_predicate(check_name)}" + + count_sql = f"SELECT COUNT(*) AS cnt FROM {table} WHERE {where}" # noqa: S608 count_rows = sql.query(count_sql) total_count = int(count_rows[0][0] or 0) if count_rows and count_rows[0] else 0 @@ -44,7 +84,7 @@ def _query_quarantine( f"to_json(row_data) AS row_data, to_json(errors) AS errors, " f"to_json(warnings) AS warnings, " f"CAST(created_at AS STRING) AS created_at " - f"FROM {table} WHERE run_id = '{er}' " + f"FROM {table} WHERE {where} " # noqa: S608 f"ORDER BY created_at DESC LIMIT {limit} OFFSET {offset}" ) rows = sql.query_dicts(data_sql) @@ -62,9 +102,23 @@ def _row_to_record(row: dict[str, Any]) -> QuarantineRecordOut: errors = None if row.get("errors"): try: - errors = json.loads(row["errors"]) + parsed_errors = json.loads(row["errors"]) except (json.JSONDecodeError, TypeError): errors = [row["errors"]] + else: + # Normalise to a list. DQX's ``dq_result_item_schema`` (and + # the current SQL-check writer) emit a list of ``{name, + # message, ...}`` structs, but legacy SQL-check rows written + # before that fix used a single ``{check_name: message}`` + # dict. Coerce the legacy shape so those rows still display + # — and so Pydantic's ``list[Any]`` validation doesn't 500 + # the whole endpoint when one historical row is wrong. + if isinstance(parsed_errors, list): + errors = parsed_errors + elif isinstance(parsed_errors, dict): + errors = [{"name": k, "message": v} for k, v in parsed_errors.items()] + elif parsed_errors is not None: + errors = [parsed_errors] # ``warnings`` is missing on rows written before migration v4; the # column is ``null`` for SQL-check quarantines. @@ -101,11 +155,19 @@ def list_quarantine_records( app_conf: Annotated[AppConfig, Depends(get_conf)], offset: int = Query(0, ge=0), limit: int = Query(50, ge=1, le=500), + check_name: str | None = Query( + None, + description="Filter to rows that failed only this DQX check (matches either errors or warnings).", + max_length=128, + ), ) -> QuarantineListOut: try: - rows, total_count = _query_quarantine(sql, app_conf, run_id, offset, limit) + rows, total_count = _query_quarantine(sql, app_conf, run_id, offset, limit, check_name) records = [_row_to_record(r) for r in rows] return QuarantineListOut(records=records, total_count=total_count, offset=offset, limit=limit) + except ValueError as exc: + # _query_quarantine raises ValueError for malformed check_name. + raise HTTPException(status_code=400, detail=str(exc)) from exc except Exception as exc: raise HTTPException(status_code=500, detail=str(exc)) from exc @@ -119,10 +181,17 @@ def get_quarantine_count( run_id: str, sql: Annotated[SqlExecutor, Depends(get_sp_sql_executor)], app_conf: Annotated[AppConfig, Depends(get_conf)], + check_name: str | None = Query( + None, + description="Filter to rows that failed only this DQX check (matches either errors or warnings).", + max_length=128, + ), ) -> dict[str, int]: try: - _, total_count = _query_quarantine(sql, app_conf, run_id, 0, 0) + _, total_count = _query_quarantine(sql, app_conf, run_id, 0, 0, check_name) return {"count": total_count} + except ValueError as exc: + raise HTTPException(status_code=400, detail=str(exc)) from exc except Exception as exc: raise HTTPException(status_code=500, detail=str(exc)) from exc @@ -141,6 +210,11 @@ def export_quarantine_records( app_conf: Annotated[AppConfig, Depends(get_conf)], format: str = Query("csv", pattern="^(csv|json|xlsx)$"), max_rows: int = Query(_EXPORT_MAX_ROWS, ge=1, le=_EXPORT_MAX_ROWS), + check_name: str | None = Query( + None, + description="Filter the export to rows that failed only this DQX check.", + max_length=128, + ), ) -> StreamingResponse: """Export quarantine records for a run as CSV or JSON download (capped).""" from databricks_labs_dqx_app.backend.sql_utils import escape_sql_string @@ -148,23 +222,33 @@ def export_quarantine_records( table = f"{app_conf.catalog}.{app_conf.schema_name}.dq_quarantine_records" er = escape_sql_string(run_id) + where = f"run_id = '{er}'" + if check_name: + if not _CHECK_NAME_RE.match(check_name): + raise HTTPException(status_code=400, detail=f"Invalid check_name: '{check_name}'") + where = f"{where} AND {_check_name_predicate(check_name)}" + stmt = ( f"SELECT quarantine_id, run_id, source_table_fqn, requesting_user, " f"to_json(row_data) AS row_data, to_json(errors) AS errors, " f"to_json(warnings) AS warnings, " f"CAST(created_at AS STRING) AS created_at " - f"FROM {table} WHERE run_id = '{er}' ORDER BY created_at DESC " # noqa: S608 + f"FROM {table} WHERE {where} ORDER BY created_at DESC " # noqa: S608 f"LIMIT {int(max_rows)}" ) rows = sql.query_dicts(stmt) + # Reflect the filter in the downloaded filename so an exported file + # is self-describing once it's left the UI. + filename_suffix = f"_check_{check_name}" if check_name else "" + if format == "json": records = [_row_to_record(r).model_dump() for r in rows] content = json.dumps(records, indent=2) return StreamingResponse( io.BytesIO(content.encode("utf-8")), media_type="application/json", - headers={"Content-Disposition": f'attachment; filename="quarantine_{run_id}.json"'}, + headers={"Content-Disposition": (f'attachment; filename="quarantine_{run_id}{filename_suffix}.json"')}, ) if format == "xlsx": @@ -212,7 +296,7 @@ def export_quarantine_records( return StreamingResponse( xlsx_buf, media_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", - headers={"Content-Disposition": f'attachment; filename="quarantine_{run_id}.xlsx"'}, + headers={"Content-Disposition": (f'attachment; filename="quarantine_{run_id}{filename_suffix}.xlsx"')}, ) buf = io.StringIO() @@ -251,5 +335,5 @@ def export_quarantine_records( return StreamingResponse( io.BytesIO(buf.getvalue().encode("utf-8")), media_type="text/csv", - headers={"Content-Disposition": f'attachment; filename="quarantine_{run_id}.csv"'}, + headers={"Content-Disposition": (f'attachment; filename="quarantine_{run_id}{filename_suffix}.csv"')}, ) diff --git a/app/src/databricks_labs_dqx_app/backend/routes/v1/review_status.py b/app/src/databricks_labs_dqx_app/backend/routes/v1/review_status.py new file mode 100644 index 000000000..6bbae302e --- /dev/null +++ b/app/src/databricks_labs_dqx_app/backend/routes/v1/review_status.py @@ -0,0 +1,208 @@ +"""Per-run review status endpoints. + +Surface for the dropdown that lives next to the comments thread on the +Runs detail page, plus the audit-history list shown below it. Any +authenticated user can change a run's status (matches the comments +behaviour — business reviewers don't have a special role); the +catalogue of allowed values is admin-managed in +``/api/v1/config/run-review-statuses`` (see ``config.py``). + +Route shape +----------- +``GET /api/v1/runs/{run_id}/review-status`` — current effective value +(persisted row or virtual default) plus ``updated_by`` / ``updated_at`` +metadata. + +``PUT /api/v1/runs/{run_id}/review-status`` — set the status. Validated +against the live catalogue server-side. + +``DELETE /api/v1/runs/{run_id}/review-status`` — revert to the +catalogue default (drops the explicit row, appends a history entry). + +``GET /api/v1/runs/{run_id}/review-status/history`` — last ≤200 audit +rows, newest first. Surfaced as a collapsible list on the run detail +page. +""" + +from __future__ import annotations + +from typing import Annotated + +from databricks.sdk import WorkspaceClient +from fastapi import APIRouter, Depends, HTTPException + +from databricks_labs_dqx_app.backend.common.authorization import UserRole +from databricks_labs_dqx_app.backend.dependencies import ( + get_obo_ws, + get_review_status_service, + require_role, +) +from databricks_labs_dqx_app.backend.logger import logger +from databricks_labs_dqx_app.backend.services.review_status_service import ( + ReviewStatusHistoryEntry, + ReviewStatusRecord, + ReviewStatusService, +) +from pydantic import BaseModel + +router = APIRouter() + +# Visible to every authenticated principal. Matches the comments +# routes — review actions are deliberately broad-reach so business +# users can self-serve without being assigned a role. +_ALL_ROLES = [UserRole.ADMIN, UserRole.RULE_APPROVER, UserRole.RULE_AUTHOR, UserRole.VIEWER] + + +class ReviewStatusOut(BaseModel): + """Current effective review status for a run.""" + + run_id: str + status: str + updated_by: str | None = None + updated_at: str | None = None + is_default: bool = False + + +class SetReviewStatusIn(BaseModel): + status: str + + +class ReviewStatusHistoryEntryOut(BaseModel): + run_id: str + status: str + previous_status: str | None = None + changed_by: str + changed_at: str | None = None + + +class ReviewStatusHistoryOut(BaseModel): + history: list[ReviewStatusHistoryEntryOut] + + +def _record_to_out(record: ReviewStatusRecord) -> ReviewStatusOut: + return ReviewStatusOut( + run_id=record.run_id, + status=record.status, + updated_by=record.updated_by, + updated_at=record.updated_at, + is_default=record.is_default, + ) + + +def _history_entry_to_out(entry: ReviewStatusHistoryEntry) -> ReviewStatusHistoryEntryOut: + return ReviewStatusHistoryEntryOut( + run_id=entry.run_id, + status=entry.status, + previous_status=entry.previous_status, + changed_by=entry.changed_by, + changed_at=entry.changed_at, + ) + + +@router.get( + "/{run_id}/review-status", + response_model=ReviewStatusOut, + operation_id="getRunReviewStatus", + dependencies=[require_role(*_ALL_ROLES)], +) +def get_run_review_status( + run_id: str, + svc: Annotated[ReviewStatusService, Depends(get_review_status_service)], +) -> ReviewStatusOut: + """Return the effective status for the run. + + Falls back to the catalogue default when no explicit row exists — + the UI uses ``is_default`` to render an "(auto)" hint and skip the + ``updated_by`` / ``updated_at`` metadata that would otherwise be + misleading for an unreviewed run. + """ + try: + return _record_to_out(svc.get_effective(run_id)) + except Exception as e: + logger.error("Failed to get review status for run %s: %s", run_id, e, exc_info=True) + raise HTTPException(status_code=500, detail=f"Failed to get review status: {e}") + + +@router.put( + "/{run_id}/review-status", + response_model=ReviewStatusOut, + operation_id="setRunReviewStatus", + dependencies=[require_role(*_ALL_ROLES)], +) +def set_run_review_status( + run_id: str, + body: SetReviewStatusIn, + svc: Annotated[ReviewStatusService, Depends(get_review_status_service)], + obo_ws: Annotated[WorkspaceClient, Depends(get_obo_ws)], +) -> ReviewStatusOut: + """Set the review status for a run. + + Records the change in the audit history with the previous effective + value (virtual default included) so the run detail page can render + "Pending review → Acknowledged" naturally. + """ + try: + user = obo_ws.current_user.me() + user_email = user.user_name or "unknown" + record = svc.set_status(run_id, body.status, user_email=user_email) + return _record_to_out(record) + except ValueError as e: + # Service raises ValueError for unknown status / empty inputs. + raise HTTPException(status_code=400, detail=str(e)) + except Exception as e: + logger.error("Failed to set review status for run %s: %s", run_id, e, exc_info=True) + raise HTTPException(status_code=500, detail=f"Failed to set review status: {e}") + + +@router.delete( + "/{run_id}/review-status", + response_model=ReviewStatusOut, + operation_id="clearRunReviewStatus", + dependencies=[require_role(*_ALL_ROLES)], +) +def clear_run_review_status( + run_id: str, + svc: Annotated[ReviewStatusService, Depends(get_review_status_service)], + obo_ws: Annotated[WorkspaceClient, Depends(get_obo_ws)], +) -> ReviewStatusOut: + """Revert the run to the catalogue default. + + Drops the explicit row and appends a history entry. Useful when a + reviewer wants to "unacknowledge" something without picking another + explicit value (e.g. the run was re-classified and should go back + into the unreviewed queue). + """ + try: + user = obo_ws.current_user.me() + user_email = user.user_name or "unknown" + record = svc.clear_status(run_id, user_email=user_email) + return _record_to_out(record) + except ValueError as e: + raise HTTPException(status_code=400, detail=str(e)) + except Exception as e: + logger.error("Failed to clear review status for run %s: %s", run_id, e, exc_info=True) + raise HTTPException(status_code=500, detail=f"Failed to clear review status: {e}") + + +@router.get( + "/{run_id}/review-status/history", + response_model=ReviewStatusHistoryOut, + operation_id="getRunReviewStatusHistory", + dependencies=[require_role(*_ALL_ROLES)], +) +def get_run_review_status_history( + run_id: str, + svc: Annotated[ReviewStatusService, Depends(get_review_status_service)], +) -> ReviewStatusHistoryOut: + """Return up to 200 most-recent audit rows, newest first.""" + try: + history = svc.get_history(run_id) + return ReviewStatusHistoryOut(history=[_history_entry_to_out(h) for h in history]) + except Exception as e: + logger.error( + "Failed to load review status history for run %s: %s", + run_id, + e, + exc_info=True, + ) + raise HTTPException(status_code=500, detail=f"Failed to load review status history: {e}") diff --git a/app/src/databricks_labs_dqx_app/backend/routes/v1/roles.py b/app/src/databricks_labs_dqx_app/backend/routes/v1/roles.py index 04716b770..1cea443af 100644 --- a/app/src/databricks_labs_dqx_app/backend/routes/v1/roles.py +++ b/app/src/databricks_labs_dqx_app/backend/routes/v1/roles.py @@ -19,9 +19,13 @@ from databricks_labs_dqx_app.backend.models import ( CreateRoleMappingIn, GroupOut, + RoleMappingHistoryOut, RoleMappingOut, ) -from databricks_labs_dqx_app.backend.services.role_service import RoleService +from databricks_labs_dqx_app.backend.services.role_service import ( + RoleMappingHistoryEntry, + RoleService, +) router = APIRouter(dependencies=[require_role(UserRole.ADMIN)]) @@ -37,6 +41,16 @@ def _mapping_to_out(mapping) -> RoleMappingOut: ) +def _history_to_out(entry: RoleMappingHistoryEntry) -> RoleMappingHistoryOut: + return RoleMappingHistoryOut( + role=entry.role, + group_name=entry.group_name, + action=entry.action, + changed_by=entry.changed_by, + changed_at=entry.changed_at.isoformat() if entry.changed_at else None, + ) + + @router.get("", response_model=list[RoleMappingOut], operation_id="listRoleMappings") def list_role_mappings( svc: Annotated[RoleService, Depends(get_role_service)], @@ -74,20 +88,67 @@ def delete_role_mapping( role: str, group_name: str, svc: Annotated[RoleService, Depends(get_role_service)], + obo_ws: Annotated[WorkspaceClient, Depends(get_obo_ws)], ) -> dict[str, str]: - """Delete a role-to-group mapping (Admin only).""" + """Delete a role-to-group mapping (Admin only). + + The acting admin's email is captured into ``dq_role_mappings_history`` + so the audit log answers "who removed this mapping?" without having to + cross-reference workspace audit events. + """ valid_roles = {r.value for r in UserRole} if role not in valid_roles: raise HTTPException(status_code=400, detail=f"Invalid role: {role}. Must be one of {sorted(valid_roles)}") try: - svc.delete_mapping(role, group_name) + user = obo_ws.current_user.me() + user_email = user.user_name or "unknown" + svc.delete_mapping(role, group_name, user_email) return {"status": "deleted", "role": role, "group_name": group_name} except Exception as e: logger.error(f"Failed to delete role mapping: {e}", exc_info=True) raise HTTPException(status_code=500, detail=f"Failed to delete role mapping: {e}") +@router.get( + "/history", + response_model=list[RoleMappingHistoryOut], + operation_id="listRoleMappingHistory", +) +def list_role_mapping_history( + svc: Annotated[RoleService, Depends(get_role_service)], + role: Annotated[ + str | None, + Query(description="Optional exact-match filter on role name."), + ] = None, + group_name: Annotated[ + str | None, + Query(description="Optional exact-match filter on Databricks workspace group name."), + ] = None, + limit: Annotated[ + int, + Query( + ge=1, + le=1000, + description="Maximum number of history rows to return (default 200, max 1000).", + ), + ] = 200, +) -> list[RoleMappingHistoryOut]: + """List role-mapping audit history, newest first (Admin only). + + Returns rows from ``dq_role_mappings_history`` — the append-only + audit trail of every create/delete against ``dq_role_mappings``. + Independent of the live mapping table, so deleted mappings still + appear in the timeline. + """ + try: + entries = svc.list_history(role=role, group_name=group_name, limit=limit) + return [_history_to_out(e) for e in entries] + except Exception as e: + logger.error(f"Failed to list role mapping history: {e}", exc_info=True) + raise HTTPException(status_code=500, detail=f"Failed to list role mapping history: {e}") + + @router.get("/groups", response_model=list[GroupOut], operation_id="listWorkspaceGroups") def list_workspace_groups( sp_ws: Annotated[WorkspaceClient, Depends(get_sp_ws)], diff --git a/app/src/databricks_labs_dqx_app/backend/routes/v1/settings.py b/app/src/databricks_labs_dqx_app/backend/routes/v1/settings.py index 8804134cc..921259945 100644 --- a/app/src/databricks_labs_dqx_app/backend/routes/v1/settings.py +++ b/app/src/databricks_labs_dqx_app/backend/routes/v1/settings.py @@ -12,12 +12,12 @@ router = APIRouter() -@router.get("", response_model=InstallationSettings, operation_id="get_settings") +@router.get("", response_model=InstallationSettings, operation_id="getSettings") def get_settings(obo_ws: Annotated[WorkspaceClient, Depends(get_obo_ws)]): return SettingsManager(obo_ws).get_settings() -@router.post("", response_model=InstallationSettings, operation_id="save_settings") +@router.post("", response_model=InstallationSettings, operation_id="saveSettings") def save_settings(settings: InstallationSettings, obo_ws: Annotated[WorkspaceClient, Depends(get_obo_ws)]): try: return SettingsManager(obo_ws).save_settings(settings) diff --git a/app/src/databricks_labs_dqx_app/backend/services/ai_rules_service.py b/app/src/databricks_labs_dqx_app/backend/services/ai_rules_service.py index df0761994..904f206b2 100644 --- a/app/src/databricks_labs_dqx_app/backend/services/ai_rules_service.py +++ b/app/src/databricks_labs_dqx_app/backend/services/ai_rules_service.py @@ -136,11 +136,39 @@ def generate(self, user_input: str, table_fqn: str | None = None) -> list[dict[s Returns: List of DQX rule dicts. """ - system = SystemMessage(content=_SYSTEM_TEMPLATE.format(available_functions=self._get_available_functions())) schema_info = self._get_schema_info(table_fqn) if table_fqn else "" + return self.generate_from_schema_info(user_input=user_input, schema_info=schema_info) + + def generate_from_schema_info(self, user_input: str, schema_info: str = "") -> list[dict[str, Any]]: + """Generate DQX rules from natural language with a pre-built schema_info. + + Used by the data-contract importer for text/natural-language quality + expectations: the schema is already known from the contract, so there + is no UC table to look up. This reuses the same ChatDatabricks prompt + and few-shot context as :meth:`generate` — DQX's own contract text-rule + path needs ``dspy`` + a SparkSession, which the stateless app container + doesn't have, so we route contract text rules through this LLM leg + instead and tag the results with ``rule_type: text_llm`` upstream. + + Args: + user_input: Natural language description of the quality expectation. + schema_info: JSON string describing the table columns (may be empty). + + Returns: + List of DQX rule dicts. + """ + system = SystemMessage(content=_SYSTEM_TEMPLATE.format(available_functions=self._get_available_functions())) human = HumanMessage(content=f"schema_info: {schema_info}\nbusiness_description: {user_input}") messages: list[BaseMessage] = [system, *self._get_few_shot_messages(), human] - llm = ChatDatabricks(endpoint=conf.llm_endpoint, workspace_client=self._sp_ws) + # ``max_tokens`` caps the per-call output budget (AGENTS.md / OWASP + # LLM04): rule generation returns small JSON payloads, so a bounded + # cap protects against pathological prompts triggering runaway, + # expensive inference without truncating legitimate responses. + llm = ChatDatabricks( + endpoint=conf.llm_endpoint, + workspace_client=self._sp_ws, + max_tokens=conf.llm_max_tokens, + ) response = llm.invoke(messages) return self._parse_response(str(response.content)) diff --git a/app/src/databricks_labs_dqx_app/backend/services/app_settings_service.py b/app/src/databricks_labs_dqx_app/backend/services/app_settings_service.py index a25cd2d0f..4fdb62b00 100644 --- a/app/src/databricks_labs_dqx_app/backend/services/app_settings_service.py +++ b/app/src/databricks_labs_dqx_app/backend/services/app_settings_service.py @@ -199,3 +199,219 @@ def _get_int_setting(self, key: str) -> int | None: except (TypeError, ValueError): logger.warning("Setting %s is not parseable as int (%r); ignoring", key, raw) return None + + # ------------------------------------------------------------------ + # Embedded dashboard — Insights page renders a Databricks AI/BI + # dashboard inside an iframe. Admins set the dashboard ID + an + # optional display title via the Configuration page; the GET + # endpoint falls back to ``conf.default_dashboard_id`` (env) when + # this setting is unset, so a bundle can ship a starter dashboard + # ID without preventing customer overrides. + # ------------------------------------------------------------------ + + _EMBEDDED_DASHBOARD_KEY = "embedded_dashboard_v1" + + def get_embedded_dashboard(self) -> dict | None: + """Return ``{"dashboard_id": str, "title": str | None}`` or ``None`` if unset.""" + raw = self.get_setting(self._EMBEDDED_DASHBOARD_KEY) + if not raw: + return None + try: + parsed = json.loads(raw) + except (TypeError, json.JSONDecodeError): + logger.warning("embedded_dashboard_v1 setting is not valid JSON; ignoring") + return None + if not isinstance(parsed, dict): + logger.warning("embedded_dashboard_v1 setting is not a dict; ignoring") + return None + dashboard_id = parsed.get("dashboard_id") + if not isinstance(dashboard_id, str) or not dashboard_id.strip(): + return None + title = parsed.get("title") + return { + "dashboard_id": dashboard_id.strip(), + "title": title.strip() if isinstance(title, str) and title.strip() else None, + } + + def save_embedded_dashboard( + self, + dashboard_id: str, + title: str | None = None, + *, + user_email: str | None = None, + ) -> dict: + """Persist the embedded dashboard ID + optional title. Returns the saved payload.""" + cleaned_id = (dashboard_id or "").strip() + cleaned_title = (title or "").strip() or None + payload = {"dashboard_id": cleaned_id, "title": cleaned_title} + self.save_setting(self._EMBEDDED_DASHBOARD_KEY, json.dumps(payload), user_email=user_email) + return payload + + def delete_embedded_dashboard(self, *, user_email: str | None = None) -> None: + """Clear the embedded dashboard setting so the env default takes over again.""" + self.save_setting(self._EMBEDDED_DASHBOARD_KEY, "", user_email=user_email) + + # ------------------------------------------------------------------ + # Run review statuses — admin-managed list of labels surfaced on the + # Runs detail page (next to comments) and as a Runs History filter. + # Stored as a JSON array under ``run_review_statuses_v1``. One entry + # MUST be flagged ``is_default``; that value is what + # ``ReviewStatusService`` returns virtually for runs that have never + # been explicitly reviewed. + # + # We seed a sensible default list on first read (rather than at + # migration time) so the feature works out-of-the-box even on + # already-deployed workspaces, and so the seed list can evolve in + # code without needing a fresh migration. + # ------------------------------------------------------------------ + + _RUN_REVIEW_STATUSES_KEY = "run_review_statuses_v1" + + # Default catalogue shipped on first read. The colors are token + # names the UI maps to its design-system palette so we can rebrand + # without touching backend data. + _RUN_REVIEW_STATUSES_SEED: list[dict] = [ + { + "value": "Pending review", + "description": "Awaiting business review", + "color": "gray", + "is_default": True, + }, + { + "value": "Acknowledged", + "description": "Known issue, accepted by owners", + "color": "amber", + "is_default": False, + }, + { + "value": "Resolved", + "description": "Fixed upstream", + "color": "green", + "is_default": False, + }, + { + "value": "False positive", + "description": "Rule is wrong, not a real issue", + "color": "blue", + "is_default": False, + }, + ] + + def get_run_review_statuses(self) -> list[dict]: + """Return the admin-managed catalogue of run review status values. + + Falls back to (and persists) the seed list on first read so the + Runs detail dropdown is never empty after a fresh deploy. + Returned entries are always normalised — ``value`` trimmed, + ``description`` defaulted to empty string, ``color`` defaulted + to ``"gray"``, ``is_default`` coerced to bool — so call sites + can index by field without defensive lookups. + """ + raw = self.get_setting(self._RUN_REVIEW_STATUSES_KEY) + if not raw: + logger.info("Seeding default run_review_statuses on first read") + self._persist_run_review_statuses(self._RUN_REVIEW_STATUSES_SEED, user_email=None) + return [self._normalise_status_entry(e) for e in self._RUN_REVIEW_STATUSES_SEED] + + try: + parsed = json.loads(raw) + except (TypeError, json.JSONDecodeError): + logger.warning("run_review_statuses_v1 is not valid JSON; falling back to seed") + return [self._normalise_status_entry(e) for e in self._RUN_REVIEW_STATUSES_SEED] + if not isinstance(parsed, list): + logger.warning("run_review_statuses_v1 is not a list; falling back to seed") + return [self._normalise_status_entry(e) for e in self._RUN_REVIEW_STATUSES_SEED] + + out: list[dict] = [] + for item in parsed: + if not isinstance(item, dict): + continue + normalised = self._normalise_status_entry(item) + if normalised["value"]: + out.append(normalised) + # Defensive: if the persisted list is malformed and we end up + # with nothing, surface the seed rather than an empty dropdown. + return out or [self._normalise_status_entry(e) for e in self._RUN_REVIEW_STATUSES_SEED] + + def save_run_review_statuses( + self, + statuses: list[dict], + *, + user_email: str | None = None, + ) -> list[dict]: + """Replace the admin-managed catalogue, enforcing exactly one default. + + Validation rules (raise ``ValueError`` on violation so the route + can turn them into a 400): + - At least one entry is required (callers always need a default + to surface for unreviewed runs). + - ``value`` must be non-empty, trimmed, and unique within the list. + - Exactly one entry must have ``is_default=True``. + """ + cleaned: list[dict] = [] + seen: set[str] = set() + for item in statuses or []: + if not isinstance(item, dict): + continue + normalised = self._normalise_status_entry(item) + value = normalised["value"] + if not value: + raise ValueError("Run review status 'value' must be non-empty.") + if value in seen: + raise ValueError(f"Duplicate run review status value: {value!r}.") + seen.add(value) + cleaned.append(normalised) + + if not cleaned: + raise ValueError("At least one run review status is required.") + + defaults = [e for e in cleaned if e["is_default"]] + if len(defaults) != 1: + raise ValueError(f"Exactly one run review status must be marked as default; got {len(defaults)}.") + + self._persist_run_review_statuses(cleaned, user_email=user_email) + return cleaned + + def get_default_run_review_status(self) -> str: + """Return the ``value`` of the catalogue entry flagged ``is_default``. + + Called by ``ReviewStatusService`` to surface an effective status + for runs that have no explicit row. Guaranteed non-empty because + :meth:`save_run_review_statuses` enforces the invariant. + """ + for entry in self.get_run_review_statuses(): + if entry["is_default"]: + return entry["value"] + # Should be unreachable thanks to the save-side invariant, but + # we fall back to the seed default rather than raise so a buggy + # write can't take down the whole listings endpoint. + return self._RUN_REVIEW_STATUSES_SEED[0]["value"] + + def _persist_run_review_statuses( + self, + entries: list[dict], + *, + user_email: str | None, + ) -> None: + self.save_setting( + self._RUN_REVIEW_STATUSES_KEY, + json.dumps([self._normalise_status_entry(e) for e in entries]), + user_email=user_email, + ) + logger.info("Saved %d run review status(es) (by=%s)", len(entries), user_email or "system") + + @staticmethod + def _normalise_status_entry(item: dict) -> dict: + value = (item.get("value") or "").strip() if isinstance(item.get("value"), str) else "" + description = item.get("description") or "" + if not isinstance(description, str): + description = "" + color = item.get("color") or "gray" + if not isinstance(color, str) or not color.strip(): + color = "gray" + return { + "value": value, + "description": description.strip(), + "color": color.strip(), + "is_default": bool(item.get("is_default")), + } diff --git a/app/src/databricks_labs_dqx_app/backend/services/contract_rules_service.py b/app/src/databricks_labs_dqx_app/backend/services/contract_rules_service.py new file mode 100644 index 000000000..c2af8b1e6 --- /dev/null +++ b/app/src/databricks_labs_dqx_app/backend/services/contract_rules_service.py @@ -0,0 +1,551 @@ +"""Generate DQX quality rules from ODCS v3.x data contracts. + +Thin wrapper around DQX's :class:`DataContractRulesGenerator`. We do **not** +go through :class:`DQGenerator`, because that class eagerly calls +``SparkSession.builder.getOrCreate()`` in its constructor — fine on a +Databricks cluster, fatal inside a stateless Databricks App container that +has no Spark runtime. + +Generated rules are returned untouched from DQX. We additionally bucket +them by ODCS ``schema`` (extracted from ``user_metadata.schema``) so the +UI can let the user assign each ODCS schema to a Unity Catalog table +before saving — most contracts describe the data product abstractly and +don't carry a fully-qualified UC name. +""" + +from __future__ import annotations + +import json +import logging +import re +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any, ClassVar + +from databricks.sdk import WorkspaceClient + +if TYPE_CHECKING: + from databricks_labs_dqx_app.backend.services.ai_rules_service import AiRulesService + +logger = logging.getLogger(__name__) + + +@dataclass(frozen=True) +class ContractMetadata: + """Subset of ODCS top-level fields surfaced in the UI.""" + + contract_id: str | None + name: str | None + version: str | None + odcs_api_version: str | None + status: str | None + owner: str | None + domain: str | None + description: str | None + + +@dataclass(frozen=True) +class ContractSchemaRules: + """One ODCS schema and the rules generated for it.""" + + schema_name: str + physical_name: str | None + property_count: int + rules: list[dict[str, Any]] + + +@dataclass(frozen=True) +class ContractGenerationResult: + metadata: ContractMetadata + schemas: list[ContractSchemaRules] + unassigned_rules: list[dict[str, Any]] + total_rules: int + warnings: list[str] + validation_errors: list[str] + + +@dataclass(frozen=True) +class _TextExpectation: + """One ODCS ``type: text`` quality expectation to feed to the LLM.""" + + schema_name: str + field: str | None + description: str + schema_info: str # JSON string of the schema's columns for LLM context + + +class ContractRulesService: + """Generate DQX rules from a raw ODCS contract YAML/JSON string.""" + + # Upper bound on how many ``type: text`` expectations we route to the + # LLM in a single contract import. Each expectation costs one + # ChatDatabricks call, so without a cap an author-supplied contract with + # thousands of text quality entries amplifies into thousands of LLM + # invocations (cost/latency DoS — OWASP LLM04, see AGENTS.md). Beyond + # this limit we process the first N and warn the caller. + _MAX_TEXT_EXPECTATIONS: ClassVar[int] = 100 + + def __init__( + self, + sp_ws: WorkspaceClient, + ai_service: "AiRulesService | None" = None, + ) -> None: + # Service principal client is sufficient: contract parsing is + # local, no UC reads happen during predefined/explicit/schema + # rule generation. + self._ws = sp_ws + # Optional AI service for natural-language (``type: text``) quality + # expectations. DQX's own contract text-rule path requires ``dspy`` + # + a SparkSession (see DataContractRulesGenerator.__init__), neither + # of which exists in the stateless app container — so we extract the + # text expectations here and route them through the same + # ChatDatabricks LLM leg the AI-Assisted Generation page uses. + self._ai_service = ai_service + + def generate( + self, + contract_text: str, + *, + generate_predefined_rules: bool = True, + process_text_rules: bool = False, + generate_schema_validation: bool = True, + strict_schema_validation: bool = True, + default_criticality: str = "error", + ) -> ContractGenerationResult: + # Imports are local so a missing ``[datacontract]`` extra surfaces + # as a clean 500 in this single endpoint instead of breaking app + # boot. + try: + from datacontract.data_contract import DataContract # type: ignore[import-untyped] + from databricks.labs.dqx.datacontract.contract_rules_generator import ( + DataContractRulesGenerator, + ) + except ImportError as exc: + raise RuntimeError( + "Data contract support is not installed. Install with " "'databricks-labs-dqx[datacontract]'." + ) from exc + + if not contract_text.strip(): + raise ValueError("Contract text is empty") + + metadata, schema_index = self._parse_metadata_and_schemas(contract_text) + + contract = DataContract(data_contract_str=contract_text) + generator = DataContractRulesGenerator(workspace_client=self._ws) + + warnings: list[str] = [] + + # Predefined + explicit + schema-validation rules come straight from + # DQX. ``process_text_rules=False`` here because DQX's text path needs + # an LLM engine (dspy + Spark) we can't construct in-container; we + # handle text expectations separately below via the app's LLM leg. + rules = generator.generate_rules_from_contract( + contract=contract, + generate_predefined_rules=generate_predefined_rules, + process_text_rules=False, + generate_schema_validation=generate_schema_validation, + strict_schema_validation=strict_schema_validation, + default_criticality=default_criticality, + ) + + # Natural-language (``type: text``) expectations: run each through the + # ChatDatabricks-backed AI service and tag the output as ``text_llm`` + # so it carries the same lineage metadata as DQX-native contract rules. + if process_text_rules: + text_rules, text_warnings = self._generate_text_rules( + contract_text, + metadata, + default_criticality=default_criticality, + ) + rules.extend(text_rules) + warnings.extend(text_warnings) + + # Gate every generated rule (DQX-native predefined/schema rules *and* + # LLM-produced text rules) through the same DQEngine.validate_checks + # used by the AI-assisted ``/generate`` endpoint, so malformed or + # unresolvable rules are surfaced here instead of only failing later + # at execution time. Non-blocking: errors are returned for the UI to + # flag, mirroring the AI page's behaviour. + validation_errors = self._validate_rules(rules) + + buckets, unassigned = self._bucket_rules_by_schema(rules, schema_index) + return ContractGenerationResult( + metadata=metadata, + schemas=buckets, + unassigned_rules=unassigned, + total_rules=len(rules), + warnings=warnings, + validation_errors=validation_errors, + ) + + @staticmethod + def _validate_rules(rules: list[dict[str, Any]]) -> list[str]: + """Run ``DQEngine.validate_checks`` over the generated rules. + + Returns a flat list of validation error strings (empty when all + rules are valid). Validation failures are reported, not raised, so a + single bad rule doesn't sink an otherwise-usable contract import. + """ + if not rules: + return [] + try: + from databricks.labs.dqx.engine import DQEngine + except ImportError: # pragma: no cover - core dep, defensive only + logger.warning("DQEngine is unavailable; skipping contract rule validation.") + return [] + status = DQEngine.validate_checks(rules) + return list(status.errors) if status.has_errors else [] + + # ------------------------------------------------------------------ + # Text / natural-language expectations + # ------------------------------------------------------------------ + + def _generate_text_rules( + self, + contract_text: str, + metadata: ContractMetadata, + *, + default_criticality: str, + ) -> tuple[list[dict[str, Any]], list[str]]: + """Generate DQX rules from the contract's ``type: text`` expectations. + + Returns ``(rules, warnings)``. Each generated rule is tagged with + ``user_metadata`` mirroring DQX's contract lineage fields plus + ``rule_type: text_llm`` and the original ``text_expectation`` so the + UI groups and traces them exactly like DQX-native contract rules. + """ + if self._ai_service is None: + return [], [ + "Natural-language expectations were skipped: AI-Assisted generation " + "is not configured on this deployment." + ] + + expectations = self._extract_text_expectations(contract_text) + if not expectations: + return [], [] + + rules: list[dict[str, Any]] = [] + warnings: list[str] = [] + + # Bound LLM fan-out before issuing any calls: one ChatDatabricks + # invocation per expectation, so cap the count to keep cost/latency + # bounded for adversarially large contracts (OWASP LLM04). + if len(expectations) > self._MAX_TEXT_EXPECTATIONS: + warnings.append( + f"Contract has {len(expectations)} natural-language expectations; only the " + f"first {self._MAX_TEXT_EXPECTATIONS} were processed to bound LLM cost. " + "Reduce the number of text quality expectations or split the contract to " + "process the rest." + ) + expectations = expectations[: self._MAX_TEXT_EXPECTATIONS] + + for exp in expectations: + try: + generated = self._ai_service.generate_from_schema_info( + user_input=exp.description, + schema_info=exp.schema_info, + ) + except Exception: # pragma: no cover - defensive guard + # Log full detail server-side (with newline-scrubbed, + # untrusted contract values) but never relay the raw LLM/SDK + # exception text back to the caller — it can echo prompt + # content or internal structure (AGENTS.md LLM06 / CWE-209). + logger.warning( + "Failed to generate text rule for schema '%s' field '%s'.", + _scrub_for_log(exp.schema_name), + _scrub_for_log(exp.field), + exc_info=True, + ) + warnings.append( + "Could not generate a rule for the text expectation on " + f"'{exp.field or exp.schema_name}'. See server logs for details." + ) + continue + for rule in generated: + if not isinstance(rule, dict): + continue + # LLM output is untrusted (AGENTS.md): drop any rule whose + # check function does not resolve through CHECK_FUNC_REGISTRY + # or whose generated SQL fails is_sql_query_safe(), before it + # can flow to the UI / be saved / executed. + if not self._is_llm_rule_safe(rule): + logger.warning( + "Discarded unsafe/unresolved LLM-generated rule for schema '%s' field '%s'.", + _scrub_for_log(exp.schema_name), + _scrub_for_log(exp.field), + ) + warnings.append( + "An AI-generated rule for " + f"'{exp.field or exp.schema_name}' was discarded because it " + "referenced an unknown check function or unsafe SQL." + ) + continue + user_metadata = { + "contract_id": metadata.contract_id or "unknown", + "contract_version": metadata.version or "unknown", + "odcs_version": metadata.odcs_api_version or "unknown", + "schema": exp.schema_name, + "rule_type": "text_llm", + "text_expectation": exp.description, + **(rule.get("user_metadata") or {}), + } + if exp.field: + user_metadata["field"] = exp.field + rule["user_metadata"] = user_metadata + rule.setdefault("criticality", default_criticality) + rules.append(rule) + if not rules and not warnings: + warnings.append("No rules were generated from the contract's text expectations.") + return rules, warnings + + # SQL-bearing fields an LLM could populate with unsafe statements: the + # rule's top-level ``filter`` plus the arguments used by SQL-based check + # functions (sql_expression → ``expression``, sql_query → ``query``, + # several checks → ``row_filter``). + _SQL_BEARING_KEYS: ClassVar[tuple[str, ...]] = ("expression", "query", "row_filter", "filter") + + @classmethod + def _is_llm_rule_safe(cls, rule: dict[str, Any]) -> bool: + """Validate an LLM-produced rule before it reaches the UI. + + Per AGENTS.md, LLM-generated output is untrusted: the check function + name must resolve through ``CHECK_FUNC_REGISTRY`` and any generated + SQL fragment must pass ``is_sql_query_safe()``. Returns ``False`` for + hallucinated function names or unsafe SQL so the caller drops the rule. + """ + # Local imports keep module import cheap and ensure the check-function + # registry is populated (importing ``check_funcs`` runs the + # ``@register_rule`` decorators). + from databricks.labs.dqx import check_funcs # noqa: F401 pylint: disable=unused-import + from databricks.labs.dqx.rule import CHECK_FUNC_REGISTRY + from databricks.labs.dqx.utils import is_sql_query_safe + + check = rule.get("check") + if not isinstance(check, dict): + return False + function = check.get("function") + if not isinstance(function, str) or function not in CHECK_FUNC_REGISTRY: + return False + + sql_fragments: list[str] = [] + top_filter = rule.get("filter") + if isinstance(top_filter, str): + sql_fragments.append(top_filter) + arguments = check.get("arguments") + if isinstance(arguments, dict): + for key in cls._SQL_BEARING_KEYS: + value = arguments.get(key) + if isinstance(value, str): + sql_fragments.append(value) + return all(is_sql_query_safe(sql) for sql in sql_fragments if sql.strip()) + + @staticmethod + def _extract_text_expectations(contract_text: str) -> list[_TextExpectation]: + """Pull ODCS ``type: text`` quality expectations from the contract. + + Mirrors DQX's ``_process_text_rules_for_schema`` extraction: both + property-level and schema-level ``quality`` entries with ``type: + text`` are collected, each paired with a JSON schema_info blob so the + LLM has column context. + """ + import yaml + + try: + data = yaml.safe_load(contract_text) or {} + except yaml.YAMLError: + return [] + if not isinstance(data, dict): + return [] + + raw_schemas = data.get("schema") or [] + if not isinstance(raw_schemas, list): + return [] + + expectations: list[_TextExpectation] = [] + for entry in raw_schemas: + if not isinstance(entry, dict): + continue + schema_name = _first_str(entry.get("name")) or "unknown_schema" + schema_info = ContractRulesService._build_schema_info(entry) + + for q in _text_quality_descriptions(entry.get("quality")): + expectations.append( + _TextExpectation(schema_name=schema_name, field=None, description=q, schema_info=schema_info) + ) + + props = entry.get("properties") or [] + if isinstance(props, list): + for prop in props: + if not isinstance(prop, dict): + continue + field = _first_str(prop.get("name")) + for q in _text_quality_descriptions(prop.get("quality")): + expectations.append( + _TextExpectation( + schema_name=schema_name, + field=field, + description=q, + schema_info=schema_info, + ) + ) + return expectations + + @staticmethod + def _build_schema_info(schema_entry: dict[str, Any]) -> str: + """Build a JSON ``{name, columns:[...]}`` blob for LLM schema context.""" + columns: list[dict[str, str]] = [] + props = schema_entry.get("properties") or [] + if isinstance(props, list): + for prop in props: + if not isinstance(prop, dict): + continue + name = _first_str(prop.get("name")) + if not name: + continue + col: dict[str, str] = {"name": name} + type_value = _first_str(prop.get("logicalType"), prop.get("physicalType")) + if type_value: + col["type"] = type_value + description = _first_str(prop.get("description")) + if description: + col["description"] = description + columns.append(col) + return json.dumps({"name": _first_str(schema_entry.get("name")), "columns": columns}) + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + @staticmethod + def _parse_metadata_and_schemas( + contract_text: str, + ) -> tuple[ContractMetadata, dict[str, dict[str, Any]]]: + """Pull display metadata + per-schema property counts directly from YAML. + + We intentionally do this on the raw dict rather than the parsed + ``OpenDataContractStandard`` model so we don't fail rendering when + the contract has minor unknown fields the model doesn't like — the + downstream generator will fail with a clear error if the contract + is truly invalid. + """ + import yaml # local import to keep module import cheap + + try: + data = yaml.safe_load(contract_text) or {} + except yaml.YAMLError as exc: + raise ValueError(f"Contract YAML is invalid: {exc}") from exc + if not isinstance(data, dict): + raise ValueError("Contract must be a YAML mapping at the top level") + + info = data.get("info") or {} + owner_value = data.get("owner") or info.get("owner") + metadata = ContractMetadata( + contract_id=_first_str(data.get("id"), info.get("id")), + name=_first_str(data.get("name"), info.get("title"), info.get("name")), + version=_first_str(data.get("version"), info.get("version")), + odcs_api_version=_first_str(data.get("apiVersion")), + status=_first_str(data.get("status"), info.get("status")), + owner=_first_str(owner_value), + domain=_first_str(data.get("domain"), info.get("domain")), + description=_first_str(data.get("description"), info.get("description")), + ) + + raw_schemas = data.get("schema") or [] + if not isinstance(raw_schemas, list): + raw_schemas = [] + schema_index: dict[str, dict[str, Any]] = {} + for entry in raw_schemas: + if not isinstance(entry, dict): + continue + name = _first_str(entry.get("name")) + if not name: + continue + props = entry.get("properties") or [] + schema_index[name] = { + "physical_name": _first_str(entry.get("physicalName")), + "property_count": len(props) if isinstance(props, list) else 0, + } + return metadata, schema_index + + @staticmethod + def _bucket_rules_by_schema( + rules: list[dict[str, Any]], + schema_index: dict[str, dict[str, Any]], + ) -> tuple[list[ContractSchemaRules], list[dict[str, Any]]]: + buckets: dict[str, list[dict[str, Any]]] = {name: [] for name in schema_index} + unassigned: list[dict[str, Any]] = [] + for rule in rules: + schema_name = _extract_schema_name(rule) + if schema_name and schema_name in buckets: + buckets[schema_name].append(rule) + elif schema_name: + # Schema appeared in metadata of a rule but not in the + # contract's ``schema`` list — keep it visible rather than + # silently dropping. + buckets.setdefault(schema_name, []).append(rule) + else: + unassigned.append(rule) + result: list[ContractSchemaRules] = [] + for name, info in schema_index.items(): + result.append( + ContractSchemaRules( + schema_name=name, + physical_name=info.get("physical_name"), + property_count=int(info.get("property_count") or 0), + rules=buckets.pop(name, []), + ) + ) + # Tail-end: any schema that only appeared in rule metadata. + for name, rules_for_schema in buckets.items(): + result.append( + ContractSchemaRules( + schema_name=name, + physical_name=None, + property_count=0, + rules=rules_for_schema, + ) + ) + return result, unassigned + + +def _scrub_for_log(value: str | None) -> str: + """Strip newlines/control chars from untrusted strings before logging. + + Contract-supplied identifiers (schema/field names) are user-controlled; + embedding them verbatim in log messages risks log forging/injection + (CWE-117). Collapse control characters and bound the length. + """ + if not value: + return "" + return re.sub(r"[\x00-\x1f\x7f]+", " ", value)[:200] + + +def _first_str(*values: Any) -> str | None: + for v in values: + if isinstance(v, str) and v.strip(): + return v.strip() + return None + + +def _text_quality_descriptions(quality: Any) -> list[str]: + """Return the ``description`` of every ``type: text`` entry in a quality list.""" + if not isinstance(quality, list): + return [] + out: list[str] = [] + for q in quality: + if not isinstance(q, dict): + continue + if q.get("type") != "text": + continue + desc = _first_str(q.get("description")) + if desc: + out.append(desc) + return out + + +def _extract_schema_name(rule: dict[str, Any]) -> str | None: + meta = rule.get("user_metadata") + if isinstance(meta, dict): + schema = meta.get("schema") + if isinstance(schema, str) and schema.strip(): + return schema.strip() + return None diff --git a/app/src/databricks_labs_dqx_app/backend/services/discovery.py b/app/src/databricks_labs_dqx_app/backend/services/discovery.py index eb4b44f51..f14582503 100644 --- a/app/src/databricks_labs_dqx_app/backend/services/discovery.py +++ b/app/src/databricks_labs_dqx_app/backend/services/discovery.py @@ -1,5 +1,6 @@ import asyncio import logging +import re from dataclasses import dataclass from databricks.sdk import WorkspaceClient @@ -66,6 +67,32 @@ def get_table_columns(self, catalog: str, schema: str, table: str) -> list[Table for col in table_info.columns ] + def get_table_schema_ddl(self, table_fqn: str) -> str: + """Return the Spark DDL string for a UC table, ordered by column position. + + Used by the schema-validation rule builder to snapshot the current + shape of a table so the user has something concrete to start from. + We prefer ``type_text`` because it preserves parameterised and + nested types (``DECIMAL(10,2)``, ``ARRAY``, ``STRUCT<...>``) + — the SDK ``type_name`` enum loses precision on those. + """ + table_info = self._ws.tables.get(full_name=table_fqn) + columns = table_info.columns or [] + ordered = sorted(columns, key=lambda c: c.position or 0) + parts: list[str] = [] + for col in ordered: + name = (col.name or "").strip() + if not name: + continue + type_text = (getattr(col, "type_text", None) or "").strip() + if not type_text and col.type_name: + # Fallback for older SDKs that don't surface ``type_text``. + type_text = col.type_name.value + if not type_text: + continue + parts.append(f"{_quote_ddl_identifier(name)} {type_text}") + return ", ".join(parts) + def get_table_tags(self, catalog: str, schema: str, table: str) -> TableTags: """Get tags for a table and its columns from Unity Catalog.""" full_name = f"{catalog}.{schema}.{table}" @@ -128,3 +155,19 @@ async def get_table_columns_async(self, catalog: str, schema: str, table: str) - @app_cache.cached("discovery:{_user}:tags:{catalog}:{schema}:{table}", ttl=_TAGS_TTL) async def get_table_tags_async(self, catalog: str, schema: str, table: str) -> TableTags: return await asyncio.to_thread(self.get_table_tags, catalog, schema, table) + + @app_cache.cached("discovery:{_user}:schema_ddl:{table_fqn}", ttl=_COLUMN_TTL) + async def get_table_schema_ddl_async(self, table_fqn: str) -> str: + return await asyncio.to_thread(self.get_table_schema_ddl, table_fqn) + + +# Identifiers with characters outside ``[A-Za-z0-9_]`` must be back-tick +# quoted in DDL, otherwise ``has_valid_schema`` will fail to parse the +# expected schema. Embedded back-ticks are doubled per Spark's grammar. +_SAFE_IDENTIFIER = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*$") + + +def _quote_ddl_identifier(name: str) -> str: + if _SAFE_IDENTIFIER.match(name): + return name + return "`" + name.replace("`", "``") + "`" diff --git a/app/src/databricks_labs_dqx_app/backend/services/review_status_service.py b/app/src/databricks_labs_dqx_app/backend/services/review_status_service.py new file mode 100644 index 000000000..8942a8806 --- /dev/null +++ b/app/src/databricks_labs_dqx_app/backend/services/review_status_service.py @@ -0,0 +1,366 @@ +"""Per-run review-status tracking for DQX validation runs. + +A reviewer (any authenticated user) tags a validation run with one of the +admin-configured values from ``dq_app_settings.run_review_statuses_v1`` — +e.g. ``Pending review`` (auto-default) → ``Acknowledged`` once business +owners have triaged the failures. The Runs detail page renders the +current status next to the comments thread, and the Runs History page +exposes a multi-select filter on it. + +Storage layout +-------------- +- ``dq_run_review_status`` — one mutable row per run that has been + explicitly reviewed. Runs without a row are *not* unreviewed — they + surface the configured default ``value`` from ``AppSettingsService`` + virtually, so dashboards and filters never see a NULL state. +- ``dq_run_review_status_history`` — append-only audit log. One row per + explicit change (including the very first time someone moves a run + off the virtual default). ``previous_status`` carries the *effective* + value before the change, which is what reviewers care about reading + even though it may have been virtual. + +Both tables live in the OLTP executor — Lakebase Postgres when enabled, +Delta fallback otherwise — and share the same service surface as +``CommentsService`` / ``AppSettingsService``. The validation-run rows +themselves stay in Delta because they're append-only Spark output; +listing routes bulk-fetch effective statuses from this service and merge +in Python (see :meth:`bulk_get_effective`). +""" + +from __future__ import annotations + +import logging +from datetime import datetime, timezone + +from databricks_labs_dqx_app.backend.services.app_settings_service import AppSettingsService +from databricks_labs_dqx_app.backend.sql_executor import OltpExecutorProtocol, RawSql +from databricks_labs_dqx_app.backend.sql_utils import escape_sql_string + +logger = logging.getLogger(__name__) + + +class ReviewStatusRecord: + """A persisted ``dq_run_review_status`` row + a flag telling the UI + whether the value came from the table or from the virtual default.""" + + __slots__ = ("run_id", "status", "updated_by", "updated_at", "is_default") + + def __init__( + self, + run_id: str, + status: str, + updated_by: str | None, + updated_at: str | None, + is_default: bool, + ) -> None: + self.run_id = run_id + self.status = status + self.updated_by = updated_by + self.updated_at = updated_at + # True iff this record reflects the catalogue default rather + # than an explicit row in dq_run_review_status. The UI uses + # this to render an "(auto)" hint and skip showing meaningless + # ``updated_by`` / ``updated_at`` metadata. + self.is_default = is_default + + +class ReviewStatusHistoryEntry: + """One row from ``dq_run_review_status_history``.""" + + __slots__ = ("run_id", "status", "previous_status", "changed_by", "changed_at") + + def __init__( + self, + run_id: str, + status: str, + previous_status: str | None, + changed_by: str, + changed_at: str | None, + ) -> None: + self.run_id = run_id + self.status = status + self.previous_status = previous_status + self.changed_by = changed_by + self.changed_at = changed_at + + +class ReviewStatusService: + """CRUD + audit log for ``dq_run_review_status``. + + Constructed against the OLTP executor so the data co-locates with + ``dq_app_settings`` (status catalogue) and ``dq_comments`` (the + sibling reviewer-action table). All writes go through the app's + service principal, mirroring :class:`CommentsService`. + """ + + # Capped to keep the detail-page request payload predictable. Most + # reviews have <10 changes; the cap protects against pathological + # cases where a script churns the status thousands of times. + _HISTORY_LIMIT = 200 + + def __init__(self, sql: OltpExecutorProtocol, settings: AppSettingsService) -> None: + self._sql = sql + self._settings = settings + self._table = sql.fqn("dq_run_review_status") + self._history_table = sql.fqn("dq_run_review_status_history") + + # ------------------------------------------------------------------ + # Reads + # ------------------------------------------------------------------ + + def get_explicit(self, run_id: str) -> ReviewStatusRecord | None: + """Return the persisted row for *run_id* or ``None`` if unset. + + Callers that want a value (even the catalogue default) for runs + without a row should use :meth:`get_effective` instead. + """ + er = escape_sql_string(run_id) + sql = ( + f"SELECT run_id, status, updated_by, {self._sql.ts_text('updated_at')} " + f"FROM {self._table} WHERE run_id = '{er}' LIMIT 1" + ) + rows = self._sql.query(sql) + if not rows: + return None + return ReviewStatusRecord( + run_id=rows[0][0] or "", + status=rows[0][1] or "", + updated_by=rows[0][2], + updated_at=rows[0][3], + is_default=False, + ) + + def get_effective(self, run_id: str) -> ReviewStatusRecord: + """Return the explicit row, or a virtual default record if unset. + + The default value comes from + :meth:`AppSettingsService.get_default_run_review_status`. + """ + explicit = self.get_explicit(run_id) + if explicit is not None: + return explicit + return ReviewStatusRecord( + run_id=run_id, + status=self._settings.get_default_run_review_status(), + updated_by=None, + updated_at=None, + is_default=True, + ) + + def bulk_get_effective(self, run_ids: list[str]) -> dict[str, ReviewStatusRecord]: + """Return ``{run_id: ReviewStatusRecord}`` for every input run_id. + + Runs without an explicit row get a virtual-default record so the + listing route can render *something* for every row. Uses a single + ``WHERE run_id IN (...)`` query — at the existing listing LIMIT + of 500 this is well inside any reasonable IN-list ceiling. + """ + if not run_ids: + return {} + + # Deduplicate before formatting so the ``IN`` list stays + # compact even if the caller passes the same run twice. + unique_ids = list(dict.fromkeys(run_ids)) + in_list = ", ".join(f"'{escape_sql_string(r)}'" for r in unique_ids) + sql = ( + f"SELECT run_id, status, updated_by, {self._sql.ts_text('updated_at')} " + f"FROM {self._table} WHERE run_id IN ({in_list})" + ) + rows = self._sql.query(sql) + + explicit: dict[str, ReviewStatusRecord] = { + (row[0] or ""): ReviewStatusRecord( + run_id=row[0] or "", + status=row[1] or "", + updated_by=row[2], + updated_at=row[3], + is_default=False, + ) + for row in rows + if row and row[0] + } + + default_value = self._settings.get_default_run_review_status() + out: dict[str, ReviewStatusRecord] = {} + for run_id in unique_ids: + record = explicit.get(run_id) + if record is not None: + out[run_id] = record + else: + out[run_id] = ReviewStatusRecord( + run_id=run_id, + status=default_value, + updated_by=None, + updated_at=None, + is_default=True, + ) + return out + + def get_history(self, run_id: str) -> list[ReviewStatusHistoryEntry]: + """Return up to ``_HISTORY_LIMIT`` recent history rows, newest first.""" + er = escape_sql_string(run_id) + sql = ( + f"SELECT run_id, status, previous_status, changed_by, " + f"{self._sql.ts_text('changed_at')} " + f"FROM {self._history_table} " + f"WHERE run_id = '{er}' " + f"ORDER BY changed_at DESC LIMIT {self._HISTORY_LIMIT}" + ) + rows = self._sql.query(sql) + return [ + ReviewStatusHistoryEntry( + run_id=row[0] or "", + status=row[1] or "", + previous_status=row[2], + changed_by=row[3] or "", + changed_at=row[4], + ) + for row in rows + ] + + # ------------------------------------------------------------------ + # Writes + # ------------------------------------------------------------------ + + def set_status( + self, + run_id: str, + status: str, + *, + user_email: str, + ) -> ReviewStatusRecord: + """Upsert the explicit row + append an audit row. + + ``status`` is validated against the live catalogue from + :class:`AppSettingsService`; passing an unknown value raises + ``ValueError`` so the route can return a 400. The audit row + carries the effective previous status (virtual default included) + so the history reads naturally on the UI. + """ + if not run_id or not run_id.strip(): + raise ValueError("run_id must be a non-empty string.") + if not status or not status.strip(): + raise ValueError("status must be a non-empty string.") + if not user_email or not user_email.strip(): + raise ValueError("user_email must be a non-empty string.") + + cleaned_status = status.strip() + allowed = {entry["value"] for entry in self._settings.get_run_review_statuses()} + if cleaned_status not in allowed: + raise ValueError(f"Unknown review status {cleaned_status!r}. " f"Allowed values: {sorted(allowed)}") + + previous_effective = self.get_effective(run_id).status + + # No-op writes still go through so the history captures the + # human action (e.g. the reviewer re-confirmed the existing + # status). UI shows the no-op rows as "Confirmed" if we want + # that polish later; for now we just record the duplicate. + self._sql.upsert( + self._table, + key_cols={"run_id": run_id}, + value_cols={ + "status": cleaned_status, + "updated_by": user_email, + "updated_at": RawSql("current_timestamp()"), + }, + ) + + self._append_history( + run_id=run_id, + status=cleaned_status, + previous_status=previous_effective, + changed_by=user_email, + ) + + logger.info( + "Run %s review status set to %r (prev=%r) by %s", + run_id, + cleaned_status, + previous_effective, + user_email, + ) + return ReviewStatusRecord( + run_id=run_id, + status=cleaned_status, + updated_by=user_email, + updated_at=datetime.now(timezone.utc).isoformat(), + is_default=False, + ) + + def clear_status(self, run_id: str, *, user_email: str) -> ReviewStatusRecord: + """Delete the explicit row + append an audit row for the revert. + + The run reverts to the catalogue default. The audit row records + the default value as the new status so the history reads + "Acknowledged → Pending review" rather than "Acknowledged → + (none)" which would be ambiguous. + """ + if not run_id or not run_id.strip(): + raise ValueError("run_id must be a non-empty string.") + if not user_email or not user_email.strip(): + raise ValueError("user_email must be a non-empty string.") + + previous_effective = self.get_effective(run_id).status + default_value = self._settings.get_default_run_review_status() + + er = escape_sql_string(run_id) + self._sql.execute(f"DELETE FROM {self._table} WHERE run_id = '{er}'") + + self._append_history( + run_id=run_id, + status=default_value, + previous_status=previous_effective, + changed_by=user_email, + ) + + logger.info( + "Run %s review status cleared (was %r) by %s", + run_id, + previous_effective, + user_email, + ) + return ReviewStatusRecord( + run_id=run_id, + status=default_value, + updated_by=None, + updated_at=None, + is_default=True, + ) + + # ------------------------------------------------------------------ + # Internals + # ------------------------------------------------------------------ + + def _append_history( + self, + *, + run_id: str, + status: str, + previous_status: str | None, + changed_by: str, + ) -> None: + """Insert one row into ``dq_run_review_status_history``. + + Hand-rolled INSERT (no ``upsert``) because the table is + append-only with no natural key. We can't piggyback on the + executor's ``_render_value`` helpers either — the Delta one + passes ``current_timestamp()`` through verbatim (Postgres + rejects the parenthesised form), and the Postgres one is not + exposed publicly. ``now()`` is portable across both dialects + (Postgres ships it as a built-in synonym for + ``CURRENT_TIMESTAMP``; Delta SQL accepts it too — same idiom + as :class:`CommentsService`), so we inline it directly here. + """ + run_id_lit = f"'{escape_sql_string(run_id)}'" + status_lit = f"'{escape_sql_string(status)}'" + if previous_status is None: + prev_lit = "NULL" + else: + prev_lit = f"'{escape_sql_string(previous_status)}'" + changed_by_lit = f"'{escape_sql_string(changed_by)}'" + + self._sql.execute( + f"INSERT INTO {self._history_table} " + f"(run_id, status, previous_status, changed_by, changed_at) " + f"VALUES ({run_id_lit}, {status_lit}, {prev_lit}, {changed_by_lit}, now())" + ) diff --git a/app/src/databricks_labs_dqx_app/backend/services/role_service.py b/app/src/databricks_labs_dqx_app/backend/services/role_service.py index 40aaa39b5..0ad8784f7 100644 --- a/app/src/databricks_labs_dqx_app/backend/services/role_service.py +++ b/app/src/databricks_labs_dqx_app/backend/services/role_service.py @@ -30,6 +30,21 @@ class RoleMapping: updated_at: datetime | None = None +@dataclass +class RoleMappingHistoryEntry: + """One row from the ``dq_role_mappings_history`` audit table. + + See :meth:`RoleService.list_history` for the read contract and + :meth:`RoleService._record_history` for the write contract. + """ + + role: str + group_name: str + action: str + changed_by: str | None + changed_at: datetime | None + + class RoleService: """Manages role-to-group mappings in ``dq_role_mappings``. @@ -37,11 +52,31 @@ class RoleService: user's OBO token. The table lives on Lakebase Postgres when Lakebase is enabled and on Delta otherwise — the injected executor decides which. + + Every mutation (``create_mapping`` / ``delete_mapping``) is mirrored + into the append-only ``dq_role_mappings_history`` audit table so the + full timeline of who-changed-what-when survives even after a mapping + is removed. The mutable :attr:`_table` only ever holds the *current* + set of (role, group) pairs; the history table is the source of truth + for compliance / "what changed last week" questions. See the docstring + on :data:`backend.migrations._V7_ROLE_MAPPINGS_HISTORY` for the table + schema and the rationale for the ``action`` vocabulary. """ + # Action vocabulary written into ``dq_role_mappings_history.action``. + # Kept narrow (only ``create`` and ``delete``) because the live row + # has no mutable value columns — re-saving an existing (role, + # group_name) pair is a no-op semantically, so we don't emit an + # ``update`` row for it. Re-adding a previously-deleted mapping is + # recorded as a fresh ``create`` (sequence in the audit log + # disambiguates). + _ACTION_CREATE = "create" + _ACTION_DELETE = "delete" + def __init__(self, sql: OltpExecutorProtocol) -> None: self._sql = sql self._table = sql.fqn("dq_role_mappings") + self._history_table = sql.fqn("dq_role_mappings_history") self._mappings_cache: list[RoleMapping] | None = None self._mappings_cache_expires: float = 0.0 @@ -105,6 +140,10 @@ def create_mapping(self, role: str, group_name: str, user_email: str) -> RoleMap ``MERGE INTO`` (Delta) / ``INSERT ... ON CONFLICT DO UPDATE`` (Postgres) choice so the original ``created_*`` survive on UPDATE. + + Also records a ``create`` row in ``dq_role_mappings_history`` + (best-effort; history failures do not roll back the mapping + change — see :meth:`_record_history`). """ if role not in [r.value for r in UserRole]: raise ValueError(f"Invalid role: {role}. Must be one of {[r.value for r in UserRole]}") @@ -121,6 +160,12 @@ def create_mapping(self, role: str, group_name: str, user_email: str) -> RoleMap }, preserve_created=True, ) + self._record_history( + role=role, + group_name=group_name, + action=self._ACTION_CREATE, + user_email=user_email, + ) self.invalidate_mappings_cache() logger.info(f"Created/updated role mapping: {role} -> {group_name}") @@ -133,16 +178,118 @@ def create_mapping(self, role: str, group_name: str, user_email: str) -> RoleMap updated_at=datetime.now(timezone.utc), ) - def delete_mapping(self, role: str, group_name: str) -> None: - """Delete a role-to-group mapping.""" + def delete_mapping(self, role: str, group_name: str, user_email: str | None = None) -> None: + """Delete a role-to-group mapping. + + ``user_email`` is the actor who performed the deletion (recorded + in the audit log). Defaulted to ``None`` so existing call sites + that don't yet pass it keep working — those rows will surface in + the audit log with ``changed_by = NULL``. New call sites should + always pass the OBO user email. + """ escaped_role = escape_sql_string(role) escaped_group = escape_sql_string(group_name) sql = f"DELETE FROM {self._table} WHERE role = '{escaped_role}' AND group_name = '{escaped_group}'" self._sql.execute(sql) + self._record_history( + role=role, + group_name=group_name, + action=self._ACTION_DELETE, + user_email=user_email, + ) self.invalidate_mappings_cache() logger.info(f"Deleted role mapping: {role} -> {group_name}") + # ------------------------------------------------------------------ + # Audit history + # ------------------------------------------------------------------ + + def list_history( + self, + *, + role: str | None = None, + group_name: str | None = None, + limit: int = 200, + ) -> list[RoleMappingHistoryEntry]: + """Return audit-log rows for role mapping changes, newest first. + + Args: + role: Optional exact-match filter on ``role``. + group_name: Optional exact-match filter on ``group_name``. + limit: Hard cap on the number of rows returned (defaults to + 200 — same as the workspace-groups dropdown, plenty for + the admin Settings page UI). + """ + # Hard cap so a hostile caller can't OOM the app by passing an + # absurdly large limit. 1000 matches the cap on the + # /roles/groups endpoint. + limit = max(1, min(int(limit), 1000)) + + where_clauses: list[str] = [] + if role is not None: + where_clauses.append(f"role = '{escape_sql_string(role)}'") + if group_name is not None: + where_clauses.append(f"group_name = '{escape_sql_string(group_name)}'") + where = f"WHERE {' AND '.join(where_clauses)} " if where_clauses else "" + + sql = ( + f"SELECT role, group_name, action, changed_by, " + f"{self._sql.ts_text('changed_at')} " + f"FROM {self._history_table} " + f"{where}" + f"ORDER BY changed_at DESC " + f"LIMIT {limit}" + ) + rows = self._sql.query(sql) + return [ + RoleMappingHistoryEntry( + role=row[0], + group_name=row[1], + action=row[2], + changed_by=row[3] if row[3] else None, + changed_at=datetime.fromisoformat(row[4]) if row[4] else None, + ) + for row in rows + ] + + def _record_history( + self, + *, + role: str, + group_name: str, + action: str, + user_email: str | None, + ) -> None: + """Insert an audit row into ``dq_role_mappings_history`` (best-effort). + + Same contract as :meth:`RulesCatalogService._record_history`: + a history-write failure must NEVER roll back the primary + mutation, because losing one audit row is far less harmful + than refusing a legitimate admin change. Failures are logged at + WARNING with the stack trace so they're still investigable + post-hoc. + """ + try: + escaped_role = escape_sql_string(role) + escaped_group = escape_sql_string(group_name) + escaped_action = escape_sql_string(action) + user_sql = f"'{escape_sql_string(user_email)}'" if user_email else "NULL" + sql = ( + f"INSERT INTO {self._history_table} " + f"(role, group_name, action, changed_by, changed_at) VALUES " + f"('{escaped_role}', '{escaped_group}', '{escaped_action}', " + f"{user_sql}, now())" + ) + self._sql.execute(sql) + except Exception: + logger.warning( + "Failed to record role-mapping history for %s -> %s (non-fatal)", + role, + group_name, + exc_info=True, + ) + def resolve_role(self, user_groups: list[str], admin_group: str | None = None) -> UserRole: """Determine user's highest *primary* role based on group membership. diff --git a/app/src/databricks_labs_dqx_app/backend/services/scheduler_service.py b/app/src/databricks_labs_dqx_app/backend/services/scheduler_service.py index 2eb6531b3..4fce47bd0 100644 --- a/app/src/databricks_labs_dqx_app/backend/services/scheduler_service.py +++ b/app/src/databricks_labs_dqx_app/backend/services/scheduler_service.py @@ -13,6 +13,7 @@ from __future__ import annotations import asyncio +import calendar import json import re from datetime import datetime, timedelta, timezone @@ -30,6 +31,11 @@ _VALID_TRACKER_STATUSES = {"pending", "success", "partial_failure", "failed"} +# Fallback gap used to push ``next_run_at`` into the future when a schedule's +# trigger fails *and* its next occurrence cannot be computed. Prevents a +# deterministic failure from re-firing the schedule on every tick. +_FAILURE_BACKOFF = timedelta(hours=1) + # Length of the hex suffix on ``tmp_view_*`` names. ``uuid4().hex`` is # always 32 lowercase hex chars; we slice to keep schema-qualified # names short. Centralised so the GC regex below, the creation paths @@ -245,6 +251,13 @@ async def _tick(self, *, recalc: bool = False) -> None: freq = cfg.get("frequency", "manual") if freq == "manual": continue + # ``paused`` is a soft kill switch toggled from the Schedules + # list. Skipping here (rather than at config-save time) means a + # paused schedule keeps its existing ``next_run_at`` tracker, so + # resuming it does not retroactively fire any missed runs — the + # next tick simply picks the schedule back up. + if cfg.get("paused"): + continue try: tracker = await asyncio.to_thread(self._get_tracker, name) @@ -284,17 +297,53 @@ async def _tick(self, *, recalc: bool = False) -> None: run_id = uuid4().hex[:16] logger.info("Schedule '%s' is due (next_run_at=%s), triggering run %s", name, next_run, run_id) - errors = await asyncio.to_thread(self._trigger_run, name, cfg, run_id) - - new_next = self._compute_next_run(cfg, now) - status = "success" if not errors else "partial_failure" - if errors: - logger.warning("Schedule '%s' run %s had errors: %s", name, run_id, errors) - - await asyncio.to_thread(self._upsert_tracker, name, now, new_next, run_id, status) + try: + errors = await asyncio.to_thread(self._trigger_run, name, cfg, run_id) + + new_next = self._compute_next_run(cfg, now) + status = "success" if not errors else "partial_failure" + if errors: + logger.warning("Schedule '%s' run %s had errors: %s", name, run_id, errors) + + await asyncio.to_thread(self._upsert_tracker, name, now, new_next, run_id, status) + except Exception: + # A hard failure inside ``_trigger_run`` (e.g. ``_resolve_scope``, + # ``_load_custom_metrics``, or DB access) — anything outside the + # per-table try that returns ``errors`` — would otherwise skip the + # ``_upsert_tracker`` above and leave ``next_run_at`` in the past. + # The schedule then stays "due" and re-fires every tick, turning a + # deterministic error into a tight retry loop that keeps submitting + # jobs. Advance ``next_run_at`` (persisting a ``failed`` status) so + # the schedule resumes at its next occurrence instead of hammering. + logger.exception( + "Schedule '%s' run %s failed to trigger; advancing next_run_at to avoid a retry loop", + name, + run_id, + ) + await asyncio.to_thread(self._advance_after_failure, name, cfg, now, run_id) except Exception: logger.exception("Scheduler failed processing schedule '%s'", name) + def _advance_after_failure(self, name: str, cfg: dict[str, Any], now: datetime, run_id: str) -> None: + """Persist a failed run and push ``next_run_at`` forward after a trigger failure. + + Computes the next scheduled occurrence so a deterministic failure does not + re-fire every tick. If even the next-run computation fails, falls back to a + fixed backoff so the schedule still moves off "due". + """ + try: + new_next = self._compute_next_run(cfg, now) + except Exception: + logger.exception( + "Schedule '%s': could not compute next_run_at after a failure; using backoff", + name, + ) + new_next = now + _FAILURE_BACKOFF + try: + self._upsert_tracker(name, now, new_next, run_id, "failed") + except Exception: + logger.exception("Schedule '%s': failed to persist tracker after a trigger failure", name) + # ------------------------------------------------------------------ # Config loading # ------------------------------------------------------------------ @@ -447,20 +496,42 @@ def _trigger_run(self, schedule_name: str, cfg: dict[str, Any], run_id_prefix: s continue run_id = f"{run_id_prefix}_{i}" - is_sql_check = table_fqn.startswith(_SQL_CHECK_PREFIX) - + # Dataset-level rules (``__sql_check__/``) come in + # two flavours that need different runner inputs: + # * ``sql_query`` checks → the task runner creates + # a Spark temp view from the embedded query, so + # ``view_fqn`` can stay as the synthetic key. + # * ``has_valid_schema`` checks → the rule operates + # against a real UC table. We pass the real target as + # ``view_fqn`` so the runner's standard row-level + # path (``is_sql_check=False``) can read it. + # ``source_table_fqn`` keeps the synthetic key in either + # case so the run history still groups under the rule. + is_synthetic = table_fqn.startswith(_SQL_CHECK_PREFIX) sql_query: str | None = None - if is_sql_check: + runner_view_fqn = table_fqn + + if is_synthetic: sql_query = self._extract_sql_query(entry["checks"]) - if not sql_query: - errors.append(f"{table_fqn}: SQL check has no query") - continue + if sql_query is None: + schema_target = self._extract_schema_check_target(entry["checks"]) + if not schema_target: + errors.append( + f"{table_fqn}: dataset-level rule must provide either a " + "sql_query or a has_valid_schema check with " + "user_metadata.target_table" + ) + continue + runner_view_fqn = schema_target config = { "checks": entry["checks"], "sample_size": sample_size, "source_table_fqn": table_fqn, - "is_sql_check": is_sql_check, + # Only true SQL queries take the SQL fast-path in the + # runner. Schema-validation rules go through the + # row-level engine like any other check. + "is_sql_check": sql_query is not None, } if custom_metrics: @@ -473,7 +544,7 @@ def _trigger_run(self, schedule_name: str, cfg: dict[str, Any], run_id_prefix: s job_id=int(self._job_id), job_parameters={ "task_type": "scheduled", - "view_fqn": table_fqn, + "view_fqn": runner_view_fqn, "result_catalog": self._catalog, "result_schema": self._schema, "config_json": json.dumps(config), @@ -672,6 +743,27 @@ def _extract_sql_query(checks: list[dict[str, Any]]) -> str | None: return (check.get("check") or {}).get("arguments", {}).get("query") return None + @staticmethod + def _extract_schema_check_target(checks: list[dict[str, Any]]) -> str | None: + """Return the real Unity Catalog target table for a ``has_valid_schema`` rule. + + Mirrors ``dryrun._extract_schema_check_target`` — kept here as a + static helper to avoid the scheduler depending on the route + module. Schema-validation rules live under the synthetic + ``__sql_check__/`` table_fqn so we look up the *real* + target on ``user_metadata.target_table`` and run the row-level + engine against that table. + """ + for check in checks: + fn = (check.get("check") or {}).get("function", "") + if fn != "has_valid_schema": + continue + meta = check.get("user_metadata") or {} + target = meta.get("target_table") + if isinstance(target, str) and target.strip(): + return target.strip() + return None + # ------------------------------------------------------------------ # Orphan tmp-view GC (weekly, Saturday 01:00 UTC) # ------------------------------------------------------------------ @@ -1036,12 +1128,30 @@ def _compute_next_run(cfg: dict[str, Any], after: datetime) -> datetime: if freq == "monthly": dom = int(cfg.get("day_of_month") or 1) - candidate = after.replace(day=min(dom, 28), hour=hour, minute=minute, second=0, microsecond=0) + + # Clamp the configured day to the actual number of days in the + # target month (e.g. day 31 → 30 in April, 28/29 in February) + # rather than always capping at 28 — capping made schedules set + # for the 29th–31st silently fire on the 28th every month while + # the UI still showed the configured day. + def _clamp_day(year: int, month: int) -> int: + return min(dom, calendar.monthrange(year, month)[1]) + + candidate = after.replace( + day=_clamp_day(after.year, after.month), + hour=hour, + minute=minute, + second=0, + microsecond=0, + ) if candidate <= after: if after.month == 12: - candidate = candidate.replace(year=after.year + 1, month=1) + year, month = after.year + 1, 1 else: - candidate = candidate.replace(month=after.month + 1) + year, month = after.year, after.month + 1 + # Re-clamp for the next month before setting the day so a 31 → + # 30/28 rollover doesn't raise ValueError on a short month. + candidate = candidate.replace(year=year, month=month, day=_clamp_day(year, month)) return candidate return after + timedelta(hours=1) diff --git a/app/src/databricks_labs_dqx_app/ui/CLAUDE.md b/app/src/databricks_labs_dqx_app/ui/CLAUDE.md index f6a16db1d..478467d9a 100644 --- a/app/src/databricks_labs_dqx_app/ui/CLAUDE.md +++ b/app/src/databricks_labs_dqx_app/ui/CLAUDE.md @@ -11,19 +11,35 @@ ui/ ├── main.tsx # App bootstrap (QueryClient, Router, AuthGuard) ├── routes/ # File-based routing (TanStack Router) │ ├── __root.tsx # Root layout (ThemeProvider, AIAssistantProvider, Toaster) -│ ├── index.tsx # Home page — welcome screen, create config +│ ├── index.tsx # Home redirect │ └── _sidebar/ # Sidebar layout group (prefix _ = layout route) -│ ├── route.tsx # Sidebar nav (Config, Runs, Discovery) -│ ├── config.tsx # Config editor + settings modal -│ ├── runs.tsx # Run list + editor + AI assistant -│ ├── runs.index.tsx # Runs placeholder/redirect -│ ├── runs.$runName.tsx # Dynamic route for specific run +│ ├── route.tsx # Sidebar nav + persistent in-app docs link +│ ├── home.tsx # Landing page (welcome, primary CTAs) +│ ├── config.tsx # Workspace config + storage settings │ ├── discovery.tsx # Catalog browser (catalog → schema → table → columns) -│ └── profile.tsx # User profile +│ ├── insights.tsx # Stub route (renders null); dashboard hosted persistently in the layout (see components/insights/) +│ ├── profile.tsx # User profile + language preference +│ ├── profiler.tsx # Profiler launch + Profiler & Generate results modal +│ ├── rules.tsx # Rules layout (tabs) +│ ├── rules.index.tsx # Redirect to default rules tab +│ ├── rules.active.tsx # Active rules library (grouped by target table; __sql_check__ bucket) +│ ├── rules.drafts.tsx # Drafts & Review +│ ├── rules.create.tsx # Create rules landing (tiles) +│ ├── rules.single-table.tsx # Single-table rule editor +│ ├── rules.create-sql.tsx # Cross-table SQL editor (synthetic __sql_check__ FQN) +│ ├── rules.schema.tsx # has_valid_schema editor (edits to existing rules) + dry-run + snapshot +│ ├── rules.create-reusable.tsx # Reusable-rule template editor +│ ├── rules.import.tsx # Bulk import — TABBED: "yaml" + "contract" (?tab=) +│ ├── rules.from-contract.tsx # Legacy URL → Navigate redirect to /rules/import?tab=contract +│ ├── runs.tsx # Run editor + AI assistant +│ ├── runs.index.tsx # Runs placeholder/redirect +│ ├── runs.$runName.tsx # Manual-run launcher + per-table error details +│ └── runs-history.tsx # Run history with ratio bars + click-failed-check filter; schedules tab w/ pause/delete row actions ├── components/ │ ├── ui/ # shadcn/ui primitives (button, card, dialog, select, etc.) │ ├── layout/ # Shell components (Navbar, SidebarLayout, ThemeProvider, Logo) │ ├── anim/ # Animation components (FadeIn, ShinyText) +│ ├── insights/ # Persistent Insights dashboard host (iframe survives navigation) │ ├── backgrounds/ # Decorative backgrounds (gradient, stars) │ ├── AuthGuard.tsx # Blocks render until OBO auth confirmed (exponential backoff) │ ├── AIAssistantProvider.tsx # Context + Sheet modal for AI rule generator @@ -100,7 +116,7 @@ yarn vite preview # Preview production build ### Data Flow -1. Route component mounts → calls React Query hook (e.g., `useConfig()`) +1. Route component mounts → calls React Query hook (e.g., `useGetConfig()`) 2. Hook makes Axios request to `/api/v1/*` (OBO token in header automatically) 3. orval-generated hook transforms response via `selector` (extracts `.data`) 4. Component renders with cached data @@ -146,6 +162,33 @@ The UI is fully localized with **react-i18next**. Locale bundles live in `lib/i1 - **Pluralize with native i18next, not string concatenation.** Use `_one` / `_other` suffix keys with `{{count}}` (e.g. `columnsCount_one` / `columnsCount_other`). Never build plurals with a hard-coded `"s"` suffix or an interpolated `{{somethingPlural}}` placeholder — that bakes English grammar into the translation layer and breaks other locales. - **Adding a new language:** add it to `SUPPORTED_LANGUAGES` and register a loader in `localeLoaders` (both in `lib/i18n/index.ts`), then create the matching `locales/.json`. Only `en` ships in the initial JS bundle; other locales lazy-load on demand via `ensureLocaleLoaded`, so don't statically import them. +### Theming (CSS custom properties) + +Themes are CSS custom properties on `:root` and `.dark` in `styles/globals.css`. shadcn `Button` (and friends) read `--` (background) and `---foreground` (text) — **both must contrast**. We've already shipped a bug where `--destructive-foreground` matched `--destructive` and the "Delete" button text was invisible on red. If you change a `--*-foreground` token, eyeball the corresponding role in both light and dark themes before merging. + +### Dataset-level rules: routing & editing + +Cross-table SQL checks and `has_valid_schema` schema-validation rules share the synthetic `__sql_check__/` `table_fqn` so they bucket together in the **Cross-table rules** group on the Active Rules page. The Edit/View action must route each to its native editor — not lump everything into the SQL editor: + +- SQL check → `/rules/create-sql?...` +- `has_valid_schema` → `/rules/schema?...` + +This dispatch lives in `routes/_sidebar/rules.active.tsx`. When you add another dataset-level rule kind, add its editor route and extend the dispatch. + +> **Authoring vs editing:** New reference-table checks (`has_valid_schema`, `foreign_key`) are now authored inside the **single-table editor** (`rules.single-table.tsx`), which renders the reference-table picker and the `expected_schema`/`ref_table` mutually-exclusive fields. `/rules/schema` is retained for *editing existing* `has_valid_schema` rules opened from the Active Rules page (per the dispatch above). + +### Schema rule subset filtering (DDL trimming) + +`has_valid_schema` only filters the *actual* DataFrame when you pass `columns` / `exclude_columns` — it does **not** trim the *expected* schema. To keep both sides aligned in DDL mode, `routes/_sidebar/rules.schema.tsx#buildRule()` calls `filterDdlByColumns()` from `lib/format-utils.ts` to trim `expected_schema` before saving. Reference-table mode can't trim a remote schema client-side, so the editor shows a warning instead. Don't remove either branch without porting the trimming server-side first. + +### Import rules: tabbed page (?tab=yaml|contract) + +`/rules/import` is a single tabbed page hosting two flows; `/rules/from-contract` is now just a `Navigate` redirect to `/rules/import?tab=contract` to keep old bookmarks working. The contract flow's main component (`ContractWorkspace`) is exported from `rules.from-contract.tsx` and imported by `rules.import.tsx`. If you split or rename either file, update both the redirect and the import — and remember to re-run `make app-build` (or the dev server) so `routeTree.gen.ts` picks up new files. + +### Insights dashboard: persistent iframe host + +The Insights page embeds a Lakeview dashboard in an `