76 changes: 63 additions & 13 deletions .claude/commands/onboarding-clean.md
@@ -9,16 +9,18 @@ Write and run data cleaning code for a Data Basis dataset.

## Folder structure

Work in a folder **external to the `pipelines/` repo**:
Raw data and output live **external to the `pipelines/` repo** (large files are
never committed). Cleaning code lives **inside** the repo under the model folder.

```text
<dataset_root>/
├── input/                      ← raw files (CSV, Excel, JSON, etc.) — do not modify
├── output/
│   └── <table_slug>/
│       ├── ano=<year>/sigla_uf=<uf>/   (municipio/UF tables)
│       └── ano=<year>/                 (Brasil-level tables)
└── code/
<dataset_root>/                 ← external working directory (e.g. ~/Downloads/<slug>/)
├── input/                      ← raw files (CSV, Excel, JSON, etc.) — do not modify
└── output/
    └── <table_slug>/
        ├── ano=<year>/sigla_uf=<uf>/   (municipio/UF tables)
        └── ano=<year>/                 (Brasil-level tables)

pipelines/models/<dataset_gcp_id>/code/   ← write cleaning scripts here
├── clean.py           (one script per dataset if tables share raw source)
└── clean_<table>.py   (one per table if they don't)
```
@@ -35,10 +37,15 @@ Read the architecture tables from Drive (URLs from `databasis-architecture` output)

Read the first 20 rows of each raw file to understand structure. Check:
- File format (CSV, Excel, JSON, fixed-width, etc.)
- Encoding (UTF-8, ISO-8859-1, etc.)
- Encoding: try `utf-8-sig` first, fall back to `latin1`. For files where column
  names with accents come out garbled under `latin1`, re-decode with
  `.encode("latin1").decode("utf-8", errors="replace")`.
- Column names and their mapping to architecture names
- Any header rows, footer rows, or skip rows
- Date formats
- Number formatting: Brazilian CSVs often use `.` as thousands separator and `,`
  as decimal separator (e.g. `"1.234,56"` = 1234.56). Strip `.` first, then
  replace `,` with `.` before calling `pd.to_numeric` (see the sketch after this list).
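
A minimal sketch of the reading and parsing steps above, assuming the pandas stack used elsewhere in this guide; the `;` separator and the helper names are illustrative assumptions, not from the repo:

```python
import pandas as pd

def read_raw_csv(path: str) -> pd.DataFrame:
    # Try utf-8-sig first, fall back to latin1 (sep=";" is an assumption).
    try:
        return pd.read_csv(path, encoding="utf-8-sig", sep=";", dtype=str)
    except UnicodeDecodeError:
        return pd.read_csv(path, encoding="latin1", sep=";", dtype=str)

def fix_mojibake(col: pd.Series) -> pd.Series:
    # Re-decode values whose accented characters came out garbled under latin1.
    return col.str.encode("latin1").str.decode("utf-8", errors="replace")

def parse_brazilian_number(col: pd.Series) -> pd.Series:
    # "1.234,56" -> 1234.56: strip thousands ".", then swap "," for ".".
    cleaned = col.str.replace(".", "", regex=False).str.replace(",", ".", regex=False)
    return pd.to_numeric(cleaned, errors="coerce")
```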

## Step 3 — Write cleaning code

@@ -58,13 +65,56 @@ Standard column types:
- STRING: `.astype(str).str.strip().replace('nan', pd.NA)`
- DATE: `pd.to_datetime(col, errors='coerce').dt.date`
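
A sketch of applying the conversions above from a column-to-type mapping; `ARCHITECTURE_TYPES` is a hypothetical stand-in for the types read from the architecture table:

```python
import pandas as pd

# Hypothetical mapping from column name to architecture type.
ARCHITECTURE_TYPES = {"nome_uc": "STRING", "data_publicacao_cnuc": "DATE"}

def apply_standard_types(df: pd.DataFrame) -> pd.DataFrame:
    for col, typ in ARCHITECTURE_TYPES.items():
        if typ == "STRING":
            df[col] = df[col].astype(str).str.strip().replace("nan", pd.NA)
        elif typ == "DATE":
            df[col] = pd.to_datetime(df[col], errors="coerce").dt.date
    return df
```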

### Explicit pyarrow schema (required)

Always build an explicit `pa.Schema` and pass it to `pa.Table.from_pandas`. This
prevents INT64/FLOAT64 mismatches when some partitions have all-integer values
in columns that should be FLOAT64.

```python
import pyarrow as pa
import pyarrow.parquet as pq

# OUTPUT_COLUMNS, partition_cols, INT_COLS, FLOAT_COLS and DATE_COLS are
# module-level constants derived from the architecture table.

def _build_schema() -> pa.Schema:
    fields = []
    for col in OUTPUT_COLUMNS:
        if col in partition_cols:
            continue
        if col in INT_COLS:
            fields.append(pa.field(col, pa.int64()))
        elif col in FLOAT_COLS:
            fields.append(pa.field(col, pa.float64()))
        elif col in DATE_COLS:
            fields.append(pa.field(col, pa.date32()))
        else:
            fields.append(pa.field(col, pa.string()))
    return pa.schema(fields)

_SCHEMA = _build_schema()

def write_partition(df, out):
    table = pa.Table.from_pandas(df, schema=_SCHEMA, preserve_index=False)
    pq.write_table(table, out / "data.parquet", compression="snappy")
```

### Geometry columns

If the dataset includes a shapefile or WKT geometry:
- Convert to WKT using `geopandas`: ensure CRS is EPSG:4674 (SIRGAS 2000).
- Store as STRING in parquet (`pa.string()`).
- **Verify the join key** between the shapefile and tabular data — shapefile IDs
  and tabular IDs are often different systems (e.g. `cd_cnuc` vs `id_uc`).
  Inspect both before joining.
- In the dbt model, cast with `ST_GEOGFROMTEXT(col, make_valid => true)` and
  type the column as GEOGRAPHY, not STRING.
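
A minimal geopandas sketch of the WKT conversion above; the function name and the `geometria` output column are illustrative assumptions:

```python
import geopandas as gpd

def shapefile_to_wkt(shp_path: str) -> gpd.GeoDataFrame:
    gdf = gpd.read_file(shp_path)
    if gdf.crs is None:
        # Assume SIRGAS 2000 when the source declares no CRS.
        gdf = gdf.set_crs(epsg=4674)
    elif gdf.crs.to_epsg() != 4674:
        gdf = gdf.to_crs(epsg=4674)
    # Store the geometry as a plain WKT string column for the parquet output.
    gdf["geometria"] = gdf.geometry.to_wkt()
    return gdf.drop(columns="geometry")
```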

## Step 4 — Validate subset output

After running on the subset:
1. Verify column names match architecture exactly
2. Verify types are correct
3. Check for unexpected nulls in primary key columns
4. Print row counts and a sample
1. Check the parquet schema with `pq.read_schema(path)` — verify all column types
   match the architecture before uploading.
2. Verify column names match architecture exactly.
3. Check for unexpected nulls in primary key columns.
4. Print row counts and a sample.
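
For check 1, a quick spot-check might look like this (the partition path follows the placeholder style above):

```python
import pyarrow.parquet as pq

# Compare each field's type against the architecture table before uploading.
schema = pq.read_schema("output/<table_slug>/ano=<year>/sigla_uf=<uf>/data.parquet")
print(schema)
```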

Only proceed to full data after subset is verified. Ask the user to confirm.

45 changes: 40 additions & 5 deletions .claude/commands/onboarding-dbt.md
@@ -42,6 +42,17 @@ from

Column order must match the architecture table exactly.

### Geometry columns

If the dataset has a WKT geometry column, cast it to GEOGRAPHY — not STRING:

```sql
st_geogfromtext(safe_cast(geometria as string), make_valid => true) geometria,
```

`make_valid => true` handles degenerate polygons (rings with fewer than 3 unique
vertices) that may exist in source shapefiles.

## Step 3 — Write schema.yml

One file: `models/<dataset_slug>/schema.yml`
@@ -52,8 +63,12 @@ Template:
version: 2
models:
  - name: <dataset_slug>__<table_slug>
    description: <description in Portuguese from architecture>
    description: >
      <description in Portuguese from architecture — use > block scalar whenever
      the description spans multiple lines or contains a colon>
    tests:
      - dbt_utils.unique_combination_of_columns:
          combination_of_columns: [<partition_col>, <primary_key_col>]
      - not_null_proportion_multiple_columns:
          at_least: 0.05
    columns:
@@ -69,10 +84,30 @@ models:
```

Rules:
- Add `not_null` test to partition columns and primary keys
- Add `relationships` test to any column with a `directory_column` in the architecture
- Add `not_null_proportion_multiple_columns` at 0.05 to every model
- Use Portuguese descriptions from architecture
- **Always use `>` block scalar** for multi-line descriptions or any description
  that may contain a `:` — bare scalars with `:` in continuation lines break YAML
  parsing (e.g. `"Fonte: MMA"` on a continuation line triggers a parse error).
- Add `not_null` test to partition columns and primary keys.
- Add `relationships` test to any column with a `directory_column` in the architecture.
- Add `not_null_proportion_multiple_columns` at 0.05 to every model.
- Use Portuguese descriptions from architecture.
- For the uniqueness test, prefer a stable string identifier (e.g. `codigo_uc`)
  over an integer ID that may be NULL in older snapshots.

### Excluding columns from `not_null_proportion_multiple_columns`

The test macro supports an `ignore_values` parameter (not `exclude`):

```yaml
- not_null_proportion_multiple_columns:
    at_least: 0.05
    ignore_values:
      - column_that_is_legitimately_empty
      - another_sparse_column
```

Use this for columns that are 100% null in the source (headers present but never
populated by the provider).

## Step 4 — Check dbt_project.yml

11 changes: 8 additions & 3 deletions .claude/commands/onboarding-discover.md
@@ -17,7 +17,11 @@ Use the `discover_ids` MCP tool (env from argument):
discover_ids(env=<env>)
```

This returns IDs for: status, bigquery_type, entity, area, license, availability, organization.
This returns IDs for: status, bigquery_type, entity, license, availability, organization, theme.

**Never search the web, hardcode IDs, or guess slugs.** All reference IDs (themes,
organizations, licenses, tags, entities, statuses) must come from `discover_ids`
or `lookup_area`. IDs differ between dev and prod environments.
Comment on lines +20 to +24

⚠️ Potential issue | 🟡 Minor

Documentation inconsistency: "area" and "tags" not listed in discover_ids output.

Line 20 lists what discover_ids returns but omits "area" and "tags," yet:

  • Line 23 mentions "tags" as a reference ID type
  • Line 54 in the example shows area.br

Since line 24 clarifies that lookup_area is a separate tool for areas, consider either:

  1. Adding "area" and "tags" to line 20 if discover_ids returns them, OR
  2. Clarifying in the example (around line 54) that area.br comes from lookup_area (Step 1 mentions only discover_ids)
📝 Suggested clarification

Option 1: If discover_ids does return area and tags, update line 20:

-This returns IDs for: status, bigquery_type, entity, license, availability, organization, theme.
+This returns IDs for: status, bigquery_type, entity, area, license, availability, organization, theme, tags.

Option 2: If areas come from lookup_area, clarify in the example section by adding a comment or separate subsection showing the lookup_area call result.



## Step 2 — Fetch dataset state

@@ -47,10 +51,11 @@ Reference IDs:
entity.year: <id>
entity.state: <id>
entity.municipality: <id>
entity.financing_phase: <id>
entity.financing_account: <id>
area.br: <id>
bigquery_type.INT64: <id>
availability.online: <id>
organization.<slug>: <id>
theme.<slug>: <id>
...

Dataset:
70 changes: 70 additions & 0 deletions models/br_mma_cnuc/br_mma_cnuc__unidades_conservacao.sql
@@ -0,0 +1,70 @@
{{
    config(
        alias="unidades_conservacao",
        schema="br_mma_cnuc",
        materialized="table",
    )
}}

select
    safe_cast(ano as int64) ano,
    safe_cast(semestre as int64) semestre,
Collaborator: String variables with UC descriptions and similar fields are sometimes in ALL CAPS and sometimes in Title Case.

    safe_cast(id_uc as int64) id_uc,
    safe_cast(codigo_uc as string) codigo_uc,
    safe_cast(nome_uc as string) nome_uc,
    safe_cast(esfera_administrativa as string) esfera_administrativa,
    safe_cast(categoria_manejo as string) categoria_manejo,
    safe_cast(categoria_iucn as string) categoria_iucn,
    safe_cast(grupo as string) grupo,
Collaborator: [attached image] Convert to NULL.

Collaborator: The same happens in the orgao_gestor and informacoes_gerais variables.

Collaborator: This behavior occurs in several columns.

    safe_cast(protecao_integral as int64) protecao_integral,
    safe_cast(uso_sustentavel as int64) uso_sustentavel,
    safe_cast(sigla_uf as string) sigla_uf,
    safe_cast(municipios_abrangidos as string) municipios_abrangidos,
    safe_cast(ano_criacao as int64) ano_criacao,
    safe_cast(ano_ato_legal_recente as int64) ano_ato_legal_recente,
    safe_cast(ato_legal_criacao as string) ato_legal_criacao,
    safe_cast(outros_atos_legais as string) outros_atos_legais,
    safe_cast(plano_manejo as string) plano_manejo,
    safe_cast(conselho_gestor as string) conselho_gestor,
    safe_cast(orgao_gestor as string) orgao_gestor,
    safe_cast(informacoes_gerais as string) informacoes_gerais,
    safe_cast(fonte_area as int64) fonte_area,
    safe_cast(area_soma_biomas as float64) area_soma_biomas,
    safe_cast(area_soma_biomas_continental as float64) area_soma_biomas_continental,
    safe_cast(area_ato_legal_criacao as float64) area_ato_legal_criacao,
    safe_cast(area_amazonia as float64) area_amazonia,
    safe_cast(area_caatinga as float64) area_caatinga,
    safe_cast(area_cerrado as float64) area_cerrado,
    safe_cast(area_mata_atlantica as float64) area_mata_atlantica,
    safe_cast(area_pampa as float64) area_pampa,
    safe_cast(area_pantanal as float64) area_pantanal,
    safe_cast(area_marinha as float64) area_marinha,
    safe_cast(bioma_declarado as string) bioma_declarado,
    safe_cast(biomas_abrangidos as string) biomas_abrangidos,
    safe_cast(percentual_alem_linha_costa as float64) percentual_alem_linha_costa,
    safe_cast(recortes as float64) recortes,
    safe_cast(mar_territorial as float64) mar_territorial,
    safe_cast(municipio_costeiro as float64) municipio_costeiro,
    safe_cast(
        municipio_costeiro_area_marinha as float64
    ) municipio_costeiro_area_marinha,
    safe_cast(amazonia_legal as float64) amazonia_legal,
    safe_cast(lei_mata_atlantica as float64) lei_mata_atlantica,
    safe_cast(sobreposicao_ti_tq as float64) sobreposicao_ti_tq,
    safe_cast(programa_projeto as string) programa_projeto,
    safe_cast(sitios_patrimonio_mundial as string) sitios_patrimonio_mundial,
    safe_cast(sitios_ramsar as string) sitios_ramsar,
    safe_cast(mosaico as string) mosaico,
    safe_cast(reserva_biosfera as string) reserva_biosfera,
    safe_cast(codigo_wdpa as string) codigo_wdpa,
    safe_cast(regiao as string) regiao,
    safe_cast(
        qualidade_dados_georreferenciados as string
    ) qualidade_dados_georreferenciados,
    safe_cast(presente_versao_anterior as string) presente_versao_anterior,
    safe_cast(diferenca_area as float64) diferenca_area,
    safe_cast(razao_diferenca_area as float64) razao_diferenca_area,
    safe_cast(data_publicacao_cnuc as date) data_publicacao_cnuc,
    safe_cast(data_ultima_certificacao as date) data_ultima_certificacao,
    st_geogfromtext(safe_cast(geometria as string), make_valid => true) geometria,
Collaborator: geometria should be typed as GEOGRAPHY. Updating the style manual and integrating it with the MCP would be a good next step.

Collaborator: Another point: to guarantee quality, it is worth validating the geometries against an approximate bounding box of Brazil. The goal is to check whether the polygons fall within Brazilian territory.
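
A sketch of the bounding-box check suggested above, assuming the WKT data is loaded into geopandas; the coordinates are rough approximations of Brazil's extent, not authoritative values:

```python
import geopandas as gpd

# Rough bounding box of Brazil in lon/lat, EPSG:4674 (approximate values).
MIN_LON, MIN_LAT, MAX_LON, MAX_LAT = -74.0, -34.0, -28.8, 5.3

def geometries_outside_brazil(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    bounds = gdf.geometry.bounds  # per-row minx/miny/maxx/maxy
    outside = (
        (bounds["minx"] < MIN_LON)
        | (bounds["maxx"] > MAX_LON)
        | (bounds["miny"] < MIN_LAT)
        | (bounds["maxy"] > MAX_LAT)
    )
    return gdf[outside]
```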

Collaborator: Of the ~36k rows, only 2,927 have non-null geometries. That seems strange to me.

Collaborator: There are null geometries whose rows still have area values in hectares in the other columns.

from {{ set_datalake_project("br_mma_cnuc_staging.unidades_conservacao") }} as t