dandi · bendichter · Mar 29, 2026 · Mar 29, 2026 · Mar 29, 2026 · Mar 29, 2026
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -22,7 +22,7 @@ repos:
     rev: v2.4.1
     hooks:
     -   id: codespell
-        exclude: ^(dandi/_version\.py|dandi/due\.py|versioneer\.py|pyproject\.toml|dandi/data/allen_ccf_structures\.json)$
+        exclude: ^(dandi/_version\.py|dandi/due\.py|versioneer\.py|pyproject\.toml|dandi/data/allen_ccf_structures\.json|dandi/data/uberon_brain_structures\.json)$
         additional_dependencies:
         - tomli; python_version<'3.11'
 -   repo: https://github.com/PyCQA/flake8

diff --git a/dandi/data/generate_uberon_structures.py b/dandi/data/generate_uberon_structures.py
@@ -0,0 +1,133 @@
+#!/usr/bin/env python3
+"""Regenerate uberon_brain_structures.json from the UBERON OBO file.
+
+Run: python -m dandi.data.generate_uberon_structures
+
+Downloads the UBERON OBO file, parses it without any library dependency,
+extracts brain/nervous system descendants, and writes a compact JSON file.
+"""
+
+from __future__ import annotations
+
+from collections import defaultdict
+import json
+from pathlib import Path
+import re
+
+import requests
+
+# Traversal seeds: terms reachable from these via is_a / part_of are kept.
+_ROOT_IDS = frozenset({"UBERON:0001016", "UBERON:0000955"})  # nervous system, brain
+
+
+def _parse_obo_terms(text: str) -> list[dict]:  # pragma: no cover
+    """Parse the non-obsolete ``[Term]`` stanzas out of raw OBO text.
+
+    Each returned dict has ``id``, ``name``, ``synonyms`` (a list of
+    ``{"text", "scope"}`` dicts) and ``parents`` (UBERON ids taken from
+    ``is_a`` and ``relationship: part_of`` lines).  Other stanza types
+    and obsolete terms are skipped.
+    """
+    terms: list[dict] = []
+    in_term = False
+    current: dict = {}
+
+    for line in text.splitlines():
+        line = line.strip()
+        if line.startswith("[") and line.endswith("]"):
+            # Any stanza header closes the previous stanza; only [Term]
+            # (as opposed to e.g. [Typedef]) opens one we collect.
+            if current.get("id"):
+                terms.append(current)
+            in_term = line == "[Term]"
+            current = {}
+            if in_term:
+                current = {"id": "", "name": "", "synonyms": [], "parents": []}
+            continue
+        if not in_term or not line or line.startswith("!"):
+            continue
+
+        if line == "is_obsolete: true":
+            current["id"] = ""  # blank id => dropped when the stanza is flushed
+        elif line.startswith("id: "):
+            current["id"] = line[4:]
+        elif line.startswith("name: "):
+            current["name"] = line[6:]
+        elif line.startswith("is_a: "):
+            parent_id = line[6:].split("!")[0].strip()  # strip "! label"
+            if parent_id.startswith("UBERON:"):
+                current["parents"].append(parent_id)
+        elif line.startswith("relationship: part_of "):
+            parent_id = line[len("relationship: part_of ") :].split("!")[0].strip()
+            if parent_id.startswith("UBERON:"):
+                current["parents"].append(parent_id)
+        elif line.startswith("synonym: "):
+            m = re.match(r'synonym:\s+"(.+?)"\s+(EXACT|RELATED|NARROW|BROAD)', line)
+            if m:
+                current["synonyms"].append({"text": m.group(1), "scope": m.group(2)})
+
+    if current.get("id"):
+        terms.append(current)
+    return terms
+
+
+def _collect_descendants(
+    terms: list[dict], root_ids: frozenset[str]
+) -> set[str]:  # pragma: no cover
+    """Return root_ids plus all ids reachable through reversed parent edges."""
+    # Invert the child -> parent links so the graph can be walked downwards.
+    kids_of: dict[str, list[str]] = defaultdict(list)
+    for term in terms:
+        for parent_id in term["parents"]:
+            kids_of[parent_id].append(term["id"])
+
+    reached: set[str] = set()
+    pending = list(root_ids)
+    while pending:
+        current = pending.pop()
+        if current not in reached:
+            reached.add(current)
+            pending.extend(kids_of.get(current, ()))
+    return reached
+
+
+def main() -> None:  # pragma: no cover
+    """Download UBERON, keep the nervous-system subtree, write the JSON file."""
+    url = "http://purl.obolibrary.org/obo/uberon.obo"
+    print(f"Downloading {url} ...")
+    response = requests.get(url, timeout=120)
+    response.raise_for_status()
+    obo_text = response.text
+    print(f"Downloaded {len(obo_text)} bytes, parsing ...")
+
+    all_terms = _parse_obo_terms(obo_text)
+    print(f"Parsed {len(all_terms)} terms")
+
+    # Drop stanzas that belong to other ontologies.
+    uberon_terms = [t for t in all_terms if t["id"].startswith("UBERON:")]
+    print(f"UBERON terms: {len(uberon_terms)}")
+
+    descendant_ids = _collect_descendants(uberon_terms, _ROOT_IDS)
+    print(f"Nervous system descendants (including roots): {len(descendant_ids)}")
+
+    def _compact(term: dict) -> dict:
+        # Bare numeric id; synonyms become [text, scope_letter] pairs
+        # to keep the file small.
+        entry: dict = {"id": term["id"].replace("UBERON:", ""), "name": term["name"]}
+        if term["synonyms"]:
+            entry["synonyms"] = [[s["text"], s["scope"][0]] for s in term["synonyms"]]
+        return entry
+
+    structures = sorted(
+        (_compact(t) for t in uberon_terms if t["id"] in descendant_ids),
+        key=lambda entry: entry["id"],
+    )
+    out_path = Path(__file__).with_name("uberon_brain_structures.json")
+    with open(out_path, "w") as f:
+        json.dump(structures, f, separators=(",", ":"))
+        f.write("\n")
+    print(f"Wrote {len(structures)} structures to {out_path}")
+
+
+if __name__ == "__main__":  # pragma: no cover
+    main()
diff --git a/dandi/data/uberon_brain_structures.json b/dandi/data/uberon_brain_structures.json
diff --git a/dandi/metadata/brain_areas.py b/dandi/metadata/brain_areas.py
@@ -14,6 +14,7 @@
 lgr = get_logger()
 
 MBAO_URI_TEMPLATE = "http://purl.obolibrary.org/obo/MBA_{}"
+UBERON_URI_TEMPLATE = "http://purl.obolibrary.org/obo/UBERON_{}"
 
 # Values that should be treated as missing / uninformative
 _TRIVIAL_VALUES = frozenset(
@@ -253,3 +254,206 @@ def locations_to_ccf_mouse_anatomy(locations: list[str]) -> list[models.Anatomy]
                     seen_ids.add(id_str)
                     results.append(anatomy)
     return results
+
+
+# ---------------------------------------------------------------------------
+# UBERON matching
+# ---------------------------------------------------------------------------
+
+
+@lru_cache(maxsize=1)
+def _load_uberon_structures() -> list[dict[str, Any]]:
+    """Read ``data/uberon_brain_structures.json`` once and memoize the result."""
+    # This module sits one directory below the package root that holds "data".
+    package_dir = Path(__file__).resolve().parent.parent
+    json_path = package_dir / "data" / "uberon_brain_structures.json"
+    with open(json_path) as fp:
+        loaded: list[dict[str, Any]] = json.load(fp)
+    return loaded
+
+
+# Synonym scopes, most precise first, plus the one-letter codes used for them.
+_SCOPE_ORDER = ("EXACT", "NARROW", "BROAD", "RELATED")
+_SCOPE_LETTER = {"E": "EXACT", "N": "NARROW", "B": "BROAD", "R": "RELATED"}
+
+
+def _scopes_up_to(max_scope: str) -> tuple[str, ...]:
+    """Return the leading slice of ``_SCOPE_ORDER`` ending at *max_scope*."""
+    last = _SCOPE_ORDER.index(max_scope) + 1  # ValueError on unknown scope
+    return _SCOPE_ORDER[:last]
+
+
+@lru_cache(maxsize=5)
+def _build_uberon_lookup_dicts(
+    scope: str,
+) -> tuple[dict[str, dict], dict[str, dict]]:
+    """Build the lookup dictionaries for one UBERON matching tier.
+
+    Each tier is separate: ``"_NAME"`` indexes term names only, and a
+    synonym scope indexes only the synonyms tagged with that scope.
+    When several structures share a text, the first one in file order
+    wins.
+
+    The cache holds 5 entries because there are 5 possible tiers
+    (``"_NAME"`` plus the 4 synonym scopes); a smaller cache would evict
+    and rebuild a tier on every pass when all of them are walked in order.
+
+    Parameters
+    ----------
+    scope : str
+        ``"_NAME"`` for term names, or a single synonym scope
+        (``"EXACT"``, ``"NARROW"``, ``"BROAD"``, ``"RELATED"``).
+
+    Returns
+    -------
+    tuple of 2 dicts
+        (name_exact, name_lower): exact-text and lowercased-text keys,
+        each mapping to the structure dict.
+    """
+    structures = _load_uberon_structures()
+    name_exact: dict[str, dict] = {}
+    name_lower: dict[str, dict] = {}
+    for s in structures:
+        if scope == "_NAME":
+            texts = [s["name"]]
+        else:
+            # Stored scopes may be one-letter codes; expand before comparing.
+            texts = [
+                syn[0]
+                for syn in s.get("synonyms", [])
+                if _SCOPE_LETTER.get(syn[1], syn[1]) == scope
+            ]
+        for text in texts:
+            # setdefault keeps the first structure seen for a given text.
+            name_exact.setdefault(text, s)
+            name_lower.setdefault(text.lower(), s)
+    return name_exact, name_lower
+
+
+def _lookup_in_dicts(
+    token: str, name_exact: dict[str, dict], name_lower: dict[str, dict]
+) -> dict | None:
+    """Return the structure for *token*: verbatim match first, then lowercased."""
+    verbatim = name_exact.get(token)
+    if verbatim is None:
+        return name_lower.get(token.lower())
+    return verbatim
+
+
+def match_location_to_uberon(
+    token: str,
+    max_synonym_scope: str = "EXACT",
+) -> models.Anatomy | None:
+    """Resolve one location token to a UBERON brain structure.
+
+    Tiers are consulted in order and the first hit wins: term names,
+    then synonyms by decreasing precision (EXACT > NARROW > BROAD >
+    RELATED), stopping after *max_synonym_scope*.
+
+    Parameters
+    ----------
+    token : str
+        Location string to match; surrounding whitespace is ignored.
+    max_synonym_scope : str
+        Least precise synonym scope still consulted.  With ``"EXACT"``
+        (default) only exact synonyms are used; with ``"BROAD"`` the
+        order is EXACT, NARROW, BROAD.
+
+    Returns
+    -------
+    models.Anatomy or None
+        The matched structure, or None for a blank or unmatched token.
+    """
+    needle = token.strip()
+    if not needle:
+        return None
+
+    def _tiers():
+        # Lazy on purpose: the synonym scopes are only resolved once the
+        # name tier has missed, as in a plain two-step lookup.
+        yield "_NAME"
+        yield from _scopes_up_to(max_synonym_scope)
+
+    for tier in _tiers():
+        found = _lookup_in_dicts(needle, *_build_uberon_lookup_dicts(tier))
+        if found is not None:
+            return _uberon_structure_to_anatomy(found)
+    return None
+
+
+def _uberon_structure_to_anatomy(s: dict[str, Any]) -> models.Anatomy:
+    """Build a ``models.Anatomy`` from a structure's ``id`` and ``name``."""
+    # The template expands the bare id into the full UBERON PURL.
+    uri = UBERON_URI_TEMPLATE.format(s["id"])
+    label = s["name"]
+    return models.Anatomy(identifier=uri, name=label)
+
+
+def locations_to_uberon_anatomy(
+    locations: list[str],
+    max_synonym_scope: str = "EXACT",
+) -> list[models.Anatomy]:
+    """Map raw NWB location strings to a deduplicated UBERON Anatomy list.
+
+    Every location is split into tokens with ``_parse_location_string``
+    and each token is matched on its own; unmatched tokens are dropped.
+
+    Parameters
+    ----------
+    locations : list[str]
+        Raw location strings from NWB file.
+    max_synonym_scope : str
+        Least precise synonym scope to consult (see
+        :func:`match_location_to_uberon`).
+
+    Returns
+    -------
+    list[models.Anatomy]
+        One entry per distinct identifier, ordered by first
+        appearance.
+    """
+    # A dict keeps insertion order, so it both dedupes and orders.
+    by_identifier: dict[str, models.Anatomy] = {}
+    for location in locations:
+        for token in _parse_location_string(location):
+            matched = match_location_to_uberon(token, max_synonym_scope)
+            if matched is not None:
+                by_identifier.setdefault(str(matched.identifier), matched)
+    return list(by_identifier.values())
+
+
+def locations_to_mouse_anatomy(
+    locations: list[str],
+    max_synonym_scope: str = "EXACT",
+) -> list[models.Anatomy]:
+    """Map raw NWB location strings from a mouse to Anatomy entries.
+
+    Each location is tokenized with ``_parse_location_string``.  A token
+    is matched against Allen CCF first; UBERON is consulted only when
+    Allen CCF yields no match.
+
+    Parameters
+    ----------
+    locations : list[str]
+        Raw location strings from NWB file.
+    max_synonym_scope : str
+        Least precise synonym scope for the UBERON fallback (see
+        :func:`match_location_to_uberon`).
+
+    Returns
+    -------
+    list[models.Anatomy]
+        One entry per distinct identifier, ordered by first
+        appearance.
+    """
+    # A dict keeps insertion order, so it both dedupes and orders.
+    by_identifier: dict[str, models.Anatomy] = {}
+    for location in locations:
+        for token in _parse_location_string(location):
+            # UBERON is only the fallback for tokens Allen CCF cannot place.
+            matched = match_location_to_allen(token)
+            if matched is None:
+                matched = match_location_to_uberon(token, max_synonym_scope)
+            if matched is not None:
+                by_identifier.setdefault(str(matched.identifier), matched)
+    return list(by_identifier.values())