Skip to content
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ repos:
rev: v2.4.1
hooks:
- id: codespell
exclude: ^(dandi/_version\.py|dandi/due\.py|versioneer\.py|pyproject\.toml|dandi/data/allen_ccf_structures\.json)$
exclude: ^(dandi/_version\.py|dandi/due\.py|versioneer\.py|pyproject\.toml|dandi/data/allen_ccf_structures\.json|dandi/data/uberon_brain_structures\.json)$
Comment thread
yarikoptic marked this conversation as resolved.
Outdated
additional_dependencies:
- tomli; python_version<'3.11'
- repo: https://github.com/PyCQA/flake8
Expand Down
133 changes: 133 additions & 0 deletions dandi/data/generate_uberon_structures.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
#!/usr/bin/env python3
"""Regenerate uberon_brain_structures.json from the UBERON OBO file.

Run: python -m dandi.data.generate_uberon_structures
Comment thread
yarikoptic marked this conversation as resolved.
Outdated

Downloads the UBERON OBO file, parses it without any library dependency,
extracts brain/nervous system descendants, and writes a compact JSON file.
"""

from __future__ import annotations

from collections import defaultdict
import json
from pathlib import Path
import re

import requests

# Root terms whose descendants (via is_a and part_of edges) are collected.
# The traversal starts from these IDs and adds them to its result, so the
# roots themselves are part of the collected set.
_ROOT_IDS = frozenset({"UBERON:0001016", "UBERON:0000955"}) # nervous system, brain


def _parse_obo_terms(text: str) -> list[dict]: # pragma: no cover
"""Parse [Term] stanzas from raw OBO text."""
terms: list[dict] = []
in_term = False
current: dict = {}

for line in text.splitlines():
line = line.strip()
if line == "[Term]":
if current.get("id"):
terms.append(current)
current = {"id": "", "name": "", "synonyms": [], "parents": []}
in_term = True
continue
if line.startswith("[") and line.endswith("]"):
# Another stanza type (e.g. [Typedef])
if current.get("id"):
terms.append(current)
current = {}
in_term = False
continue
if not in_term:
continue
if not line or line.startswith("!"):
continue

if line == "is_obsolete: true":
current["id"] = "" # mark for skipping
continue
if line.startswith("id: "):
current["id"] = line[4:]
elif line.startswith("name: "):
current["name"] = line[6:]
elif line.startswith("is_a: "):
parent_id = line[6:].split("!")[0].strip()
if parent_id.startswith("UBERON:"):
current["parents"].append(parent_id)
elif line.startswith("relationship: part_of "):
parent_id = line[len("relationship: part_of ") :].split("!")[0].strip()
if parent_id.startswith("UBERON:"):
current["parents"].append(parent_id)
elif line.startswith("synonym: "):
m = re.match(r'synonym:\s+"(.+?)"\s+(EXACT|RELATED|NARROW|BROAD)', line)
if m:
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tend to instruct my AIs to do walrus for such... I guess we need to adjust DEVELOPMENT.md and/or .lad for that to be auto-picked up

Suggested change
m = re.match(r'synonym:\s+"(.+?)"\s+(EXACT|RELATED|NARROW|BROAD)', line)
if m:
if (m := re.match(r'synonym:\s+"(.+?)"\s+(EXACT|RELATED|NARROW|BROAD)', line)):

current["synonyms"].append({"text": m.group(1), "scope": m.group(2)})

if current.get("id"):
terms.append(current)
return terms


def _collect_descendants(
terms: list[dict], root_ids: frozenset[str]
) -> set[str]: # pragma: no cover
"""BFS from root_ids through children (reverse of is_a/part_of) edges."""
children: dict[str, list[str]] = defaultdict(list)
for t in terms:
for parent in t["parents"]:
children[parent].append(t["id"])

visited: set[str] = set()
queue = list(root_ids)
while queue:
node = queue.pop()
if node in visited:
continue
visited.add(node)
queue.extend(children.get(node, []))
return visited


def main() -> None:  # pragma: no cover
    """Download UBERON and regenerate ``uberon_brain_structures.json``.

    Steps: fetch the OBO file, parse its ``[Term]`` stanzas, keep terms in
    the ``UBERON:`` namespace, collect everything reachable from
    ``_ROOT_IDS``, and write the result as a compact JSON list next to this
    module.  Each entry has ``"id"`` (the UBERON ID without its prefix),
    ``"name"`` and, when the term has synonyms, ``"synonyms"`` as
    ``[text, scope_letter]`` pairs.

    Raises
    ------
    requests.HTTPError
        If the download returns an error status.
    """
    url = "http://purl.obolibrary.org/obo/uberon.obo"
    print(f"Downloading {url} ...")
    resp = requests.get(url, timeout=120)
    resp.raise_for_status()
    # Report the real payload size: len(resp.text) would count characters.
    print(f"Downloaded {len(resp.content)} bytes, parsing ...")

    # Decode as UTF-8 explicitly instead of relying on header-based or
    # heuristic charset detection, which can be wrong (or slow) for a large
    # file served without a charset.
    resp.encoding = "utf-8"
    all_terms = _parse_obo_terms(resp.text)
    print(f"Parsed {len(all_terms)} terms")

    # Filter to UBERON terms only (skip cross-ontology references)
    uberon_terms = [t for t in all_terms if t["id"].startswith("UBERON:")]
    print(f"UBERON terms: {len(uberon_terms)}")

    descendant_ids = _collect_descendants(uberon_terms, _ROOT_IDS)
    print(f"Nervous system descendants (including roots): {len(descendant_ids)}")

    structures: list[dict] = []
    for t in uberon_terms:
        if t["id"] not in descendant_ids:
            continue
        numeric_id = t["id"].replace("UBERON:", "")
        entry: dict = {"id": numeric_id, "name": t["name"]}
        if t["synonyms"]:
            # Compact format: [text, scope_letter] to keep file under 500KB
            entry["synonyms"] = [
                [syn["text"], syn["scope"][0]] for syn in t["synonyms"]
            ]
        structures.append(entry)

    # Sort by ID so regenerated files diff cleanly.
    structures.sort(key=lambda s: s["id"])
    out_path = Path(__file__).with_name("uberon_brain_structures.json")
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(structures, f, separators=(",", ":"))
        f.write("\n")
    print(f"Wrote {len(structures)} structures to {out_path}")


if __name__ == "__main__": # pragma: no cover
main()
1 change: 1 addition & 0 deletions dandi/data/uberon_brain_structures.json

Large diffs are not rendered by default.

204 changes: 204 additions & 0 deletions dandi/metadata/brain_areas.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
lgr = get_logger()

MBAO_URI_TEMPLATE = "http://purl.obolibrary.org/obo/MBA_{}"
UBERON_URI_TEMPLATE = "http://purl.obolibrary.org/obo/UBERON_{}"

# Values that should be treated as missing / uninformative
_TRIVIAL_VALUES = frozenset(
Expand Down Expand Up @@ -253,3 +254,206 @@ def locations_to_ccf_mouse_anatomy(locations: list[str]) -> list[models.Anatomy]
seen_ids.add(id_str)
results.append(anatomy)
return results


# ---------------------------------------------------------------------------
# UBERON matching
# ---------------------------------------------------------------------------


@lru_cache(maxsize=1)
def _load_uberon_structures() -> list[dict[str, Any]]:
    """Read the bundled UBERON brain-structure records from disk.

    The file is ``data/uberon_brain_structures.json`` under the directory
    two levels above this module.  The result is memoized, so the file is
    parsed once and every caller receives the same list object.

    Returns
    -------
    list[dict[str, Any]]
        The decoded JSON content.
    """
    package_root = Path(__file__).resolve().parent.parent
    json_path = package_root / "data" / "uberon_brain_structures.json"
    with json_path.open() as fh:
        loaded: list[dict[str, Any]] = json.load(fh)
    return loaded


# Synonym scopes ordered from most to least precise.
_SCOPE_ORDER = ("EXACT", "NARROW", "BROAD", "RELATED")
_SCOPE_LETTER = {"E": "EXACT", "N": "NARROW", "B": "BROAD", "R": "RELATED"}


def _scopes_up_to(max_scope: str) -> tuple[str, ...]:
"""Return scope tiers from EXACT up to and including *max_scope*."""
idx = _SCOPE_ORDER.index(max_scope)
return _SCOPE_ORDER[: idx + 1]


# Five distinct keys are used ("_NAME" plus the four synonym scopes).  A
# smaller cache would evict on every call when all tiers are cycled through
# (max scope "RELATED"), rebuilding the dictionaries each time.
@lru_cache(maxsize=5)
def _build_uberon_lookup_dicts(
    scope: str,
) -> tuple[dict[str, dict], dict[str, dict]]:
    """Build lookup dictionaries for a single UBERON matching tier.

    Each tier is built in isolation: the ``"_NAME"`` tier holds term names
    only, and a synonym tier holds only synonyms whose scope equals *scope*
    (term names are not repeated there).  When several structures share a
    text, the first one in the loaded structure list wins.

    Parameters
    ----------
    scope : str
        A single synonym scope to include (``"EXACT"``, ``"NARROW"``, etc.).
        Pass ``"_NAME"`` to build the name-only tier (no synonyms).

    Returns
    -------
    tuple of 2 dicts
        ``(name_exact, name_lower)`` mapping the original text and its
        lower-cased form, respectively, to structure dicts.
    """
    structures = _load_uberon_structures()
    name_exact: dict[str, dict] = {}
    name_lower: dict[str, dict] = {}
    for s in structures:
        if scope == "_NAME":
            texts = [s["name"]]
        else:
            # Stored scopes may be single letters or full names; normalise
            # to the full name before comparing.
            texts = [
                syn[0]
                for syn in s.get("synonyms", [])
                if _SCOPE_LETTER.get(syn[1], syn[1]) == scope
            ]
        for text in texts:
            # setdefault keeps the first structure seen for a given text.
            name_exact.setdefault(text, s)
            name_lower.setdefault(text.lower(), s)
    return name_exact, name_lower


def _lookup_in_dicts(
token: str, name_exact: dict[str, dict], name_lower: dict[str, dict]
) -> dict | None:
"""Try exact then case-insensitive lookup, return structure or None."""
s = name_exact.get(token)
if s is not None:
return s
return name_lower.get(token.lower())


def match_location_to_uberon(
    token: str,
    max_synonym_scope: str = "EXACT",
) -> models.Anatomy | None:
    """Resolve one location token to a UBERON brain structure, if possible.

    The lookup is tiered.  Term names are consulted first; only when no name
    matches are synonyms consulted, one scope at a time in precision order
    (EXACT > NARROW > BROAD > RELATED), stopping at *max_synonym_scope*.
    The first tier that produces a hit decides the result.

    Parameters
    ----------
    token : str
        Location string to match; surrounding whitespace is ignored.
    max_synonym_scope : str
        Most permissive synonym scope to try.  ``"EXACT"`` (default)
        only uses exact synonyms.  ``"BROAD"`` tries EXACT, then
        NARROW, then BROAD.

    Returns
    -------
    models.Anatomy or None
        The matched structure, or None for a blank or unmatched token.
    """
    needle = token.strip()
    if not needle:
        return None

    # Term names take precedence over every synonym tier.
    hit = _lookup_in_dicts(needle, *_build_uberon_lookup_dicts("_NAME"))
    if hit is None:
        # Fall back to synonyms, most precise scope first.
        for scope in _scopes_up_to(max_synonym_scope):
            hit = _lookup_in_dicts(needle, *_build_uberon_lookup_dicts(scope))
            if hit is not None:
                break

    return None if hit is None else _uberon_structure_to_anatomy(hit)


def _uberon_structure_to_anatomy(s: dict[str, Any]) -> models.Anatomy:
    """Build a ``dandischema`` Anatomy model from a UBERON structure dict.

    The identifier is ``UBERON_URI_TEMPLATE`` filled with the structure's
    ``"id"``; the name is taken from its ``"name"``.
    """
    uri = UBERON_URI_TEMPLATE.format(s["id"])
    label = s["name"]
    return models.Anatomy(identifier=uri, name=label)


def locations_to_uberon_anatomy(
    locations: list[str],
    max_synonym_scope: str = "EXACT",
) -> list[models.Anatomy]:
    """Map raw NWB location strings to a deduplicated list of UBERON anatomy.

    Every location string is split into tokens and each token is matched
    against UBERON.  Unmatched tokens are dropped, and a structure matched
    more than once is reported only once, at its first position.

    Parameters
    ----------
    locations : list[str]
        Raw location strings from NWB file.
    max_synonym_scope : str
        Most permissive synonym scope to try (see
        :func:`match_location_to_uberon`).

    Returns
    -------
    list[models.Anatomy]
        Matched and deduplicated anatomy entries.
    """
    # Keyed by identifier string; dict insertion order preserves the order
    # in which structures were first matched.
    matched: dict[str, models.Anatomy] = {}
    for raw_location in locations:
        for piece in _parse_location_string(raw_location):
            found = match_location_to_uberon(piece, max_synonym_scope)
            if found is None:
                continue
            matched.setdefault(str(found.identifier), found)
    return list(matched.values())


def locations_to_mouse_anatomy(
    locations: list[str],
    max_synonym_scope: str = "EXACT",
) -> list[models.Anatomy]:
    """Map raw NWB location strings to anatomy entries for mouse subjects.

    Each token is matched against the Allen CCF first.  UBERON is consulted
    only for tokens the Allen lookup does not resolve.

    Parameters
    ----------
    locations : list[str]
        Raw location strings from NWB file.
    max_synonym_scope : str
        Most permissive synonym scope for UBERON fallback (see
        :func:`match_location_to_uberon`).

    Returns
    -------
    list[models.Anatomy]
        Matched and deduplicated anatomy entries.
    """
    # Keyed by identifier string; dict insertion order preserves the order
    # in which structures were first matched.
    matched: dict[str, models.Anatomy] = {}
    for raw_location in locations:
        for piece in _parse_location_string(raw_location):
            found = match_location_to_allen(piece)
            if found is None:
                found = match_location_to_uberon(piece, max_synonym_scope)
            if found is None:
                continue
            matched.setdefault(str(found.identifier), found)
    return list(matched.values())
Loading
Loading