Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
e89f6f4
Add owner: operator to advanced search
bendichter May 7, 2026
fd0567b
owner: also match by display name (first/last/full)
bendichter May 7, 2026
74fcc5b
Apply ruff format to test_dandiset.py
bendichter May 8, 2026
a55ec56
owner: keep owner:me magic; add quoted-form escape; consolidate tests
bendichter May 8, 2026
a328da1
Drop owner:me magic alias (defer to a follow-up PR)
bendichter May 11, 2026
6ea71f6
Add contributor: + 28 per-role operators with ORCID/ROR identifier lo…
bendichter May 9, 2026
2ab3257
affiliation: query nested Person.affiliation[] field, not roleName
bendichter May 9, 2026
01b4bb1
Drop ethics_approval and other operators
bendichter May 9, 2026
86ff471
Extract operator dispatch into operators.py; trim role list
bendichter May 11, 2026
463c366
Use project_leader (full schema name) as the operator name
bendichter May 11, 2026
76c61f7
Test: rename baker/cody variable names + fix earlier sed damage
bendichter May 11, 2026
e4d03d3
Apply ruff format
bendichter May 11, 2026
5d6c277
Asset operators: AND at the dandiset level, not at the asset level
bendichter May 11, 2026
9fee8a5
Fix SQL for contributor + affiliation operators
bendichter May 11, 2026
6fa2f4b
Parser: case-insensitive operator keys
bendichter May 11, 2026
a994d90
Update dandiapi/api/services/search/filters.py
bendichter May 13, 2026
f9a8155
test: expand affiliation fixture with Organization contributors
bendichter May 13, 2026
37a78b4
Add `num_subjects:` advanced-search operator
bendichter May 13, 2026
9b8b9ad
ruff format
bendichter May 14, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
256 changes: 180 additions & 76 deletions dandiapi/api/services/search/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,43 +3,36 @@
from __future__ import annotations

from datetime import UTC, datetime
import json
import re
from typing import TYPE_CHECKING

from django.db.models import OuterRef, Subquery
from django.contrib.auth.models import User
from django.db.models import OuterRef, Q, Subquery, Value
from django.db.models.functions import Concat

from dandiapi.api.models import Version
from dandiapi.api.models.dandiset import DandisetUserObjectPermission
from dandiapi.api.services.search.operators import (
AFFILIATION_OPS,
ASSET_NAME_PATH_OPS,
ASSET_OPS,
CONTRIBUTOR_ROLE_OPS,
COUNT_OPS,
DATE_OPS,
FILE_TYPE_ALIASES,
OWNER_OPS,
)
from dandiapi.api.services.search.parser import SearchSyntaxError
from dandiapi.search.models import AssetSearch

if TYPE_CHECKING:
from django.contrib.auth.models import AnonymousUser, User
from django.contrib.auth.models import AnonymousUser
from django.db.models import QuerySet

from dandiapi.api.models import Dandiset
from dandiapi.api.services.search.parser import ParsedSearch

# Aliases for the file-type operator: short name → MIME prefix matched with
# istartswith. Keep in sync with DandisetSearchQueryParameterSerializer.
_FILE_TYPE_ALIASES = {
'nwb': 'application/x-nwb',
'image': 'image/',
'text': 'text/',
'video': 'video/',
}

_DATE_OPS = frozenset(
{
'created_before',
'created_after',
'modified_before',
'modified_after',
'published_before',
'published_after',
}
)
_ASSET_OPS = frozenset({'species', 'approach', 'technique', 'standard', 'file_type'})


def _annotate_latest_version_modified(queryset):
latest_version = Version.objects.filter(dandiset=OuterRef('pk')).order_by('-created')[:1]
Expand All @@ -59,29 +52,8 @@ def _annotate_latest_published_created(queryset):
)


# Each entry maps an operator to a Postgres jsonpath that selects the names
# we want to match against. `[*]` wildcards mean we match if ANY element of
# the array satisfies the predicate — important for assets that list
# multiple species, approaches, etc. Paths MUST be trusted constants:
# they're interpolated into the SQL.
_NAME_PATH_OPS = {
'species': '$.wasAttributedTo[*].species.name',
'approach': '$.approach[*].name',
'technique': '$.measurementTechnique[*].name',
'standard': '$.dataStandard[*].name',
}


def _jsonpath_name_match(path: str, value: str) -> tuple[str, list[str]]:
"""Build a parameterized Postgres `jsonb_path_exists` predicate.

Matches `value` case-insensitively as a substring against any node
selected by `path`. `path` MUST come from a trusted allowlist; `value`
is parameterized and regex-escaped.
"""
# No table prefix on `asset_metadata`: Django may alias the AssetSearch
# table (e.g. inside a subquery), and qualifying the column would break
# those queries. The unqualified column is unambiguous in our usage.
"""Asset-metadata jsonpath substring predicate; `path` must be trusted."""
where = (
'jsonb_path_exists(asset_metadata, '
f"('{path} ? (@ like_regex ' "
Expand All @@ -93,17 +65,140 @@ def _jsonpath_name_match(path: str, value: str) -> tuple[str, list[str]]:

def _apply_asset_filter(queryset, operator: str, value: str):
    """Apply one parsed asset operator to an AssetSearch queryset.

    Two operator families are handled:

    * Operators listed in `ASSET_NAME_PATH_OPS` map to a jsonpath; the value
      is matched through the predicate built by `_jsonpath_name_match`.
    * `file_type` resolves `value` through `FILE_TYPE_ALIASES` (looked up by
      its lower-cased form, falling back to the raw value) and matches it as
      a case-insensitive prefix of `asset_metadata.encodingFormat`.

    Returns the narrowed queryset. Raises `ValueError` for any other
    operator, which indicates a dispatch bug in the caller rather than bad
    user input.
    """
    if operator in ASSET_NAME_PATH_OPS:
        # Only the jsonpath taken from ASSET_NAME_PATH_OPS ends up in the SQL
        # text; the user-supplied value travels separately in `params`.
        where, params = _jsonpath_name_match(ASSET_NAME_PATH_OPS[operator], value)
        return queryset.extra(where=[where], params=params)  # noqa: S610
    if operator == 'file_type':
        mime_prefix = FILE_TYPE_ALIASES.get(value.lower(), value)
        return queryset.filter(asset_metadata__encodingFormat__istartswith=mime_prefix)
    raise ValueError(f'unknown asset operator: {operator}')  # pragma: no cover


def _apply_owner_filter(queryset: QuerySet[Dandiset], value: str) -> QuerySet[Dandiset]:
    """Restrict `queryset` to dandisets with an owner identified by `value`.

    A user is considered a match when `value` equals, ignoring case, any of:
    username, email, first name, last name, or "first last" (the two name
    fields joined by a single space). A dandiset is kept when at least one
    matching user holds the `owner` object permission on it.
    """
    by_account = Q(username__iexact=value) | Q(email__iexact=value)
    by_name = (
        Q(first_name__iexact=value) | Q(last_name__iexact=value) | Q(_full_name__iexact=value)
    )
    # `_full_name` only exists as an annotation, so annotate before filtering.
    candidates = User.objects.annotate(_full_name=Concat('first_name', Value(' '), 'last_name'))
    user_ids = candidates.filter(by_account | by_name).values_list('pk', flat=True)

    owner_grants = DandisetUserObjectPermission.objects.filter(
        user__in=user_ids, permission__codename='owner'
    )
    return queryset.filter(pk__in=owner_grants.values('content_object'))


# Postgres jsonpath quirk: `like_regex` requires its pattern to be a STRING
# LITERAL inside the jsonpath text — not a `$variable`. So we can't use the
# `vars` arg of `jsonb_path_exists` for the regex pattern; instead the
# jsonpath text is assembled by the database when the statement runs, by
# concatenating `to_jsonb(%s::text)::text` (a properly-quoted JSON string
# literal) into the path. The user value is still bound as a query parameter
# — never inlined into the SQL text — so the value remains
# SQL-injection-safe. Callers pass the value through `re.escape` to
# neutralize regex metachars.
#
# Usage contract: the fragment closes the SQL string literal that precedes
# it and opens a new one after the bound value, so the surrounding SQL must
# supply the opening and closing single quotes of the jsonpath text. Each
# occurrence of the fragment consumes exactly one bound parameter.
_LIKE_REGEX_PATTERN = ' like_regex \' || to_jsonb(%s::text)::text || \' flag "i"'


def _contributor_where(value: str, role: str | None) -> tuple[str, list[str]]:
    """Return `(where_sql, params)` selecting metadata with a matching contributor.

    The generated `jsonb_path_exists(metadata, ...)` predicate is true when
    some element of `contributor[]` has a `name`, `email`, or `identifier`
    containing `value` (case-insensitive). When `role` is not None, that
    same element must also carry a `roleName[]` entry containing `role`
    (case-insensitive).

    `params` holds one regex-escaped value per placeholder, in the order the
    placeholders appear in `where_sql`: three copies of `value`, then `role`
    if given.
    """
    searched_fields = ('name', 'email', 'identifier')
    predicate = ' || '.join(f'@.{field}{_LIKE_REGEX_PATTERN}' for field in searched_fields)
    bound = [re.escape(value) for _ in searched_fields]

    if role is not None:
        # The role test lives inside the same `contributor[*]` filter, so it
        # constrains the very element that matched `value`.
        predicate = f'({predicate}) && exists(@.roleName[*] ? (@{_LIKE_REGEX_PATTERN}))'
        bound.append(re.escape(role))

    path_sql = f"'$.contributor[*] ? ({predicate})'"
    return f'jsonb_path_exists(metadata, ({path_sql})::jsonpath)', bound


def _affiliation_where(value: str) -> tuple[str, list[str]]:
    """Return `(where_sql, params)` selecting metadata with a matching affiliation.

    The predicate inspects every `contributor[].affiliation[]` entry and is
    true when an entry's `name` or `identifier` contains `value`
    (case-insensitive). `params` carries the regex-escaped value twice, once
    per placeholder in `where_sql`.
    """
    needle = re.escape(value)
    predicate = ' || '.join(
        f'@.{field}{_LIKE_REGEX_PATTERN}' for field in ('name', 'identifier')
    )
    where_sql = (
        'jsonb_path_exists(metadata, '
        f"('$.contributor[*].affiliation[*] ? ({predicate})')::jsonpath)"
    )
    return where_sql, [needle, needle]


def _apply_contributor_filters(
    queryset: QuerySet[Dandiset], wheres: list[tuple[str, list[str]]]
) -> QuerySet[Dandiset]:
    """Filter dandisets by accumulated contributor predicates.

    `wheres` is a list of `(where_clause, params)` pairs (one per operator).
    The returned queryset is restricted to dandisets that have at least ONE
    Version whose `metadata` satisfies ALL the predicates simultaneously.
    Operators thus AND on the same Version (a draft and a published version
    with disjoint contributor lists never combine into a spurious match).

    Each operator is independent: `author:Doe funder:NIH` matches if SOME
    contributor element has Doe as Author AND SOME contributor element (the
    same OR a different one) has NIH as Funder.

    Each dandiset appears at most once in the result, however many of its
    versions match.
    """
    matching_versions = Version.objects.all()
    for where, params in wheres:
        # Each `where` is a trusted SQL template; the user value is never
        # interpolated into it. It is bound through `params` (and was
        # regex-escaped by whoever built the pair). Chained `.extra()` calls
        # AND their clauses together on the same Version row.
        matching_versions = matching_versions.extra(  # noqa: S610
            where=[where], params=params
        )
    # Match on the dandiset's own primary key rather than filtering through
    # the `versions` relation: the latter joins the version table and yields
    # one duplicate dandiset row per matching version.
    return queryset.filter(pk__in=matching_versions.values('dandiset_id'))


def _parse_count_value(value: str) -> int | None:
    """Return `value` as a non-negative int, or None if it is not one."""
    # `str.isdigit()` rules out signs, whitespace and decimal points, but it
    # also accepts characters (e.g. superscript digits) that `int()` rejects,
    # and `int()` refuses very long digit strings — so the conversion itself
    # still has to be guarded.
    if not value.isdigit():
        return None
    try:
        return int(value)
    except ValueError:
        return None


def _apply_count_filter(
    queryset: QuerySet[Dandiset], jsonpath_path: str, value: str
) -> QuerySet[Dandiset]:
    """Filter dandisets where `Version.metadata` at `jsonpath_path` is >= `value`.

    The value is a non-negative integer (e.g. `num_subjects:10`). Treated as
    a lower bound — "at least N" is the search intent users actually have;
    upper bounds are rare enough not to be worth the syntax cost.

    `jsonpath_path` is a trusted constant pointing into `Version.metadata`.
    The integer is bound via `jsonb_path_exists`'s `vars` parameter (not
    inlined into SQL or jsonpath), so the value is injection-safe.

    A dandiset matches if at least one of its versions satisfies the
    predicate. Versions whose metadata lacks the count field don't match —
    the jsonpath `?` filter drops missing/null/non-numeric values naturally.

    Raises `SearchSyntaxError` when `value` cannot be read as a non-negative
    integer, including digit-like input that `int()` cannot convert.
    """
    n = _parse_count_value(value)
    if n is None:
        raise SearchSyntaxError(
            f'Invalid count value {value!r}. Use a non-negative integer (e.g. `num_subjects:10`).'
        )
    # Both the jsonpath text and its `$val` variable are passed as bound
    # parameters; the SQL template itself contains no caller-supplied text.
    jsonpath = f'{jsonpath_path} ? (@ >= $val)'
    vars_json = json.dumps({'val': n})
    where = 'jsonb_path_exists(metadata, %s::jsonpath, %s::jsonb)'
    matching_versions = Version.objects.all().extra(  # noqa: S610
        where=[where], params=[jsonpath, vars_json]
    )
    return queryset.filter(id__in=matching_versions.values('dandiset_id'))


# Private names reserved for search-time queryset annotations; the `_search_`
# prefix keeps them apart from ordinary field names.
# NOTE(review): presumably set by the `_annotate_latest_*` helpers and read by
# `_apply_date_filter` — neither is fully visible in this view; confirm.
_MODIFIED_ALIAS = '_search_latest_version_modified'
_PUBLISHED_ALIAS = '_search_latest_published_created'

Expand Down Expand Up @@ -134,7 +229,7 @@ def _apply_date_filter(queryset, operator: str, ts: datetime, annotated: set[str
raise ValueError(f'unknown date operator: {operator}') # pragma: no cover


def apply_search_filters(
def apply_search_filters( # noqa: C901 (one branch per operator category — splitting the dispatch loop wouldn't make it more readable)
queryset: QuerySet[Dandiset],
parsed: ParsedSearch,
*,
Expand All @@ -148,42 +243,51 @@ def apply_search_filters(
if not parsed.operators:
return queryset

# `asset_qs` is built lazily so a query with no asset ops pays nothing.
# Note on semantics: chaining `.filter()` AND's the conditions on a SINGLE
# AssetSearch row, so e.g. `species:mouse approach:ephys` returns dandisets
# with at least one asset that satisfies BOTH (cross-key AND on the same
# asset). Repeated keys (`species:mouse species:rat`) likewise require a
# single asset to match both substrings — same as GitHub's default.
asset_qs = None
# Semantics: every operator describes a property of the dandiset, and
# multiple operators are AND'd at the dandiset level — NOT at the asset
# or version level. So `species:mouse species:rat` returns dandisets that
# have at least one mouse asset AND at least one rat asset (possibly the
# same asset, possibly different ones). Each asset operator therefore
# builds an independent AssetSearch subquery and we filter dandisets to
# those whose IDs appear in EVERY such subquery.
#
# Contributor predicates are different: they apply to a single Version's
# metadata.contributor[] array (since contributors live on the version,
# not on individual assets), so we accumulate them and AND on the same
# Version to avoid cross-version weirdness when a draft and a published
# version have disjoint contributor lists. Within that single version
# each predicate independently scans `contributor[*]`, so two operators
# may match different contributor entries.
annotated: set[str] = set()
contributor_wheres: list[tuple[str, list[str]]] = []

for key, raw_value in parsed.operators:
value = raw_value.strip()
for op in parsed.operators:
key = op.key
value = op.value.strip()
if not value:
raise SearchSyntaxError(f'Operator "{key}" requires a value (e.g. {key}:something).')

if key in _DATE_OPS:
if key in DATE_OPS:
try:
ts = datetime.strptime(value, '%Y-%m-%d').replace(tzinfo=UTC)
except ValueError as exc:
raise SearchSyntaxError(
f'Invalid date for "{key}": {value!r}. Use YYYY-MM-DD.'
) from exc
queryset = _apply_date_filter(queryset, key, ts, annotated)
elif key in _ASSET_OPS:
if asset_qs is None:
asset_qs = AssetSearch.objects.visible_to(user)
asset_qs = _apply_asset_filter(asset_qs, key, value)

if asset_qs is not None:
# NOTE perf: jsonb_path_exists with a runtime-built jsonpath cannot
# use the existing per-field GIN indexes; the path-scan operators
# (species/approach/technique/standard) currently sequential-scan the
# asset_search materialized view. The view is small enough today
# (~one row per asset) that this is acceptable, but if it becomes a
# hot path the fix is expression GIN indexes on each path or
# denormalized text columns + trgm_ops indexes.
matching_dandiset_ids = asset_qs.values_list('dandiset_id', flat=True).distinct()
queryset = queryset.filter(id__in=matching_dandiset_ids)
elif key in ASSET_OPS:
asset_match = _apply_asset_filter(AssetSearch.objects.visible_to(user), key, value)
queryset = queryset.filter(id__in=asset_match.values('dandiset_id'))
elif key in OWNER_OPS:
queryset = _apply_owner_filter(queryset, value)
elif key in CONTRIBUTOR_ROLE_OPS:
contributor_wheres.append(_contributor_where(value, CONTRIBUTOR_ROLE_OPS[key]))
elif key in AFFILIATION_OPS:
contributor_wheres.append(_affiliation_where(value))
elif key in COUNT_OPS:
queryset = _apply_count_filter(queryset, COUNT_OPS[key], value)

if contributor_wheres:
queryset = _apply_contributor_filters(queryset, contributor_wheres)

return queryset
Loading
Loading