diff --git a/dandiapi/api/services/search/filters.py b/dandiapi/api/services/search/filters.py index db3c0cf7c..467ed2bea 100644 --- a/dandiapi/api/services/search/filters.py +++ b/dandiapi/api/services/search/filters.py @@ -3,43 +3,36 @@ from __future__ import annotations from datetime import UTC, datetime +import json import re from typing import TYPE_CHECKING -from django.db.models import OuterRef, Subquery +from django.contrib.auth.models import User +from django.db.models import OuterRef, Q, Subquery, Value +from django.db.models.functions import Concat from dandiapi.api.models import Version +from dandiapi.api.models.dandiset import DandisetUserObjectPermission +from dandiapi.api.services.search.operators import ( + AFFILIATION_OPS, + ASSET_NAME_PATH_OPS, + ASSET_OPS, + CONTRIBUTOR_ROLE_OPS, + COUNT_OPS, + DATE_OPS, + FILE_TYPE_ALIASES, + OWNER_OPS, +) from dandiapi.api.services.search.parser import SearchSyntaxError from dandiapi.search.models import AssetSearch if TYPE_CHECKING: - from django.contrib.auth.models import AnonymousUser, User + from django.contrib.auth.models import AnonymousUser from django.db.models import QuerySet from dandiapi.api.models import Dandiset from dandiapi.api.services.search.parser import ParsedSearch -# Aliases for the file-type operator: short name → MIME prefix matched with -# istartswith. Keep in sync with DandisetSearchQueryParameterSerializer. -_FILE_TYPE_ALIASES = { - 'nwb': 'application/x-nwb', - 'image': 'image/', - 'text': 'text/', - 'video': 'video/', -} - -_DATE_OPS = frozenset( - { - 'created_before', - 'created_after', - 'modified_before', - 'modified_after', - 'published_before', - 'published_after', - } -) -_ASSET_OPS = frozenset({'species', 'approach', 'technique', 'standard', 'file_type'}) - def _annotate_latest_version_modified(queryset): latest_version = Version.objects.filter(dandiset=OuterRef('pk')).order_by('-created')[:1] @@ -59,29 +52,8 @@ def _annotate_latest_published_created(queryset): ) -# Each entry maps an operator to a Postgres jsonpath that selects the names -# we want to match against. `[*]` wildcards mean we match if ANY element of -# the array satisfies the predicate — important for assets that list -# multiple species, approaches, etc. Paths MUST be trusted constants: -# they're interpolated into the SQL. -_NAME_PATH_OPS = { - 'species': '$.wasAttributedTo[*].species.name', - 'approach': '$.approach[*].name', - 'technique': '$.measurementTechnique[*].name', - 'standard': '$.dataStandard[*].name', -} - - def _jsonpath_name_match(path: str, value: str) -> tuple[str, list[str]]: - """Build a parameterized Postgres `jsonb_path_exists` predicate. - - Matches `value` case-insensitively as a substring against any node - selected by `path`. `path` MUST come from a trusted allowlist; `value` - is parameterized and regex-escaped. - """ - # No table prefix on `asset_metadata`: Django may alias the AssetSearch - # table (e.g. inside a subquery), and qualifying the column would break - # those queries. The unqualified column is unambiguous in our usage. + """Asset-metadata jsonpath substring predicate; `path` must be trusted.""" where = ( 'jsonb_path_exists(asset_metadata, ' f"('{path} ? (@ like_regex ' " @@ -93,17 +65,140 @@ def _jsonpath_name_match(path: str, value: str) -> tuple[str, list[str]]: def _apply_asset_filter(queryset, operator: str, value: str): """Apply one parsed asset operator to an AssetSearch queryset.""" - if operator in _NAME_PATH_OPS: - where, params = _jsonpath_name_match(_NAME_PATH_OPS[operator], value) - # `where` interpolates only an allowlisted jsonpath; the user value - # is bound via params (and re-escaped against regex injection). + if operator in ASSET_NAME_PATH_OPS: + where, params = _jsonpath_name_match(ASSET_NAME_PATH_OPS[operator], value) return queryset.extra(where=[where], params=params) # noqa: S610 if operator == 'file_type': - mime_prefix = _FILE_TYPE_ALIASES.get(value.lower(), value) + mime_prefix = FILE_TYPE_ALIASES.get(value.lower(), value) return queryset.filter(asset_metadata__encodingFormat__istartswith=mime_prefix) raise ValueError(f'unknown asset operator: {operator}') # pragma: no cover +def _apply_owner_filter(queryset: QuerySet[Dandiset], value: str) -> QuerySet[Dandiset]: + """Filter dandisets to those owned by users matching `value` (iexact).""" + matched_user_pks = ( + User.objects.annotate(_full_name=Concat('first_name', Value(' '), 'last_name')) + .filter( + Q(username__iexact=value) + | Q(email__iexact=value) + | Q(first_name__iexact=value) + | Q(last_name__iexact=value) + | Q(_full_name__iexact=value) + ) + .values_list('pk', flat=True) + ) + owned_pks = DandisetUserObjectPermission.objects.filter( + user__in=matched_user_pks, permission__codename='owner' + ).values('content_object') + return queryset.filter(pk__in=owned_pks) + + +# Postgres jsonpath quirk: `like_regex` requires its pattern to be a STRING +# LITERAL inside the jsonpath text — not a `$variable`. So we can't use the +# `vars` arg of `jsonb_path_exists` for the regex pattern; we have to build +# the jsonpath at SQL execution time by concatenating `to_jsonb(?::text)::text` +# (a properly-quoted JSON string literal) into the path. The user value is +# still bound as a parameter — never inlined into the SQL — so the value +# remains SQL-injection-safe. `re.escape` neutralizes regex metachars. +_LIKE_REGEX_PATTERN = ' like_regex \' || to_jsonb(%s::text)::text || \' flag "i"' + + +def _contributor_where(value: str, role: str | None) -> tuple[str, list[str]]: + """Build a `jsonb_path_exists(metadata, ...)` where clause for a contributor[] predicate. + + Matches a `contributor[]` element whose `name`, `email`, OR `identifier` + contains `value` (case-insensitive). If `role` is given, additionally + requires that element's `roleName` array to contain a string matching + `role` (also case-insensitive substring). + """ + val_clause = ( + f'@.name{_LIKE_REGEX_PATTERN}' + f' || @.email{_LIKE_REGEX_PATTERN}' + f' || @.identifier{_LIKE_REGEX_PATTERN}' + ) + params = [re.escape(value)] * 3 + if role is None: + jsonpath_expr = f"'$.contributor[*] ? ({val_clause})'" + else: + jsonpath_expr = ( + f"'$.contributor[*] ? (({val_clause})" + f" && exists(@.roleName[*] ? (@{_LIKE_REGEX_PATTERN})))'" + ) + params.append(re.escape(role)) + where = f'jsonb_path_exists(metadata, ({jsonpath_expr})::jsonpath)' + return where, params + + +def _affiliation_where(value: str) -> tuple[str, list[str]]: + """Build a `jsonb_path_exists(metadata, ...)` where clause for the affiliation predicate. + + Affiliations live at `contributor[].affiliation[]`, each with a `name` and + optionally an `identifier` (ROR URL). Matches case-insensitive substring + on either. + """ + clause = f'@.name{_LIKE_REGEX_PATTERN} || @.identifier{_LIKE_REGEX_PATTERN}' + where = ( + f"jsonb_path_exists(metadata, ('$.contributor[*].affiliation[*] ? ({clause})')::jsonpath)" + ) + return where, [re.escape(value)] * 2 + + +def _apply_contributor_filters( + queryset: QuerySet[Dandiset], wheres: list[tuple[str, list[str]]] +) -> QuerySet[Dandiset]: + """Filter dandisets by accumulated contributor predicates. + + `wheres` is a list of `(where_clause, params)` pairs (one per operator). + The returned queryset is restricted to dandisets that have at least ONE + Version whose `metadata` satisfies ALL the predicates simultaneously. + Operators thus AND on the same Version (a draft and a published version + with disjoint contributor lists never combine into a spurious match). + + Each operator is independent: `author:Doe funder:NIH` matches if SOME + contributor element has Doe as Author AND SOME contributor element (the + same OR a different one) has NIH as Funder. + """ + matching_versions = Version.objects.all() + for where, params in wheres: + # Trusted jsonpath template (no user value interpolated); user value + # is bound via the jsonb vars param and additionally regex-escaped. + matching_versions = matching_versions.extra( # noqa: S610 + where=[where], params=params + ) + return queryset.filter(versions__pk__in=matching_versions.values('pk')) + + +def _apply_count_filter( + queryset: QuerySet[Dandiset], jsonpath_path: str, value: str +) -> QuerySet[Dandiset]: + """Filter dandisets where `Version.metadata` at `jsonpath_path` is >= `value`. + + The value is a non-negative integer (e.g. `num_subjects:10`). Treated as + a lower bound — "at least N" is the search intent users actually have; + upper bounds are rare enough not to be worth the syntax cost. + + `jsonpath_path` is a trusted constant pointing into `Version.metadata`. + The integer is bound via `jsonb_path_exists`'s `vars` parameter (not + inlined into SQL or jsonpath), so the value is injection-safe. + + A dandiset matches if at least one of its versions satisfies the + predicate. Versions whose metadata lacks the count field don't match — + the jsonpath `?` filter drops missing/null/non-numeric values naturally. + """ + if not value.isdigit(): + raise SearchSyntaxError( + f'Invalid count value {value!r}. Use a non-negative integer (e.g. `num_subjects:10`).' + ) + n = int(value) + jsonpath = f'{jsonpath_path} ? (@ >= $val)' + vars_json = json.dumps({'val': n}) + where = 'jsonb_path_exists(metadata, %s::jsonpath, %s::jsonb)' + matching_versions = Version.objects.all().extra( # noqa: S610 + where=[where], params=[jsonpath, vars_json] + ) + return queryset.filter(id__in=matching_versions.values('dandiset_id')) + + _MODIFIED_ALIAS = '_search_latest_version_modified' _PUBLISHED_ALIAS = '_search_latest_published_created' @@ -134,7 +229,7 @@ def _apply_date_filter(queryset, operator: str, ts: datetime, annotated: set[str raise ValueError(f'unknown date operator: {operator}') # pragma: no cover -def apply_search_filters( +def apply_search_filters( # noqa: C901 (one branch per operator category — splitting the dispatch loop wouldn't make it more readable) queryset: QuerySet[Dandiset], parsed: ParsedSearch, *, @@ -148,21 +243,31 @@ def apply_search_filters( if not parsed.operators: return queryset - # `asset_qs` is built lazily so a query with no asset ops pays nothing. - # Note on semantics: chaining `.filter()` AND's the conditions on a SINGLE - # AssetSearch row, so e.g. `species:mouse approach:ephys` returns dandisets - # with at least one asset that satisfies BOTH (cross-key AND on the same - # asset). Repeated keys (`species:mouse species:rat`) likewise require a - # single asset to match both substrings — same as GitHub's default. - asset_qs = None + # Semantics: every operator describes a property of the dandiset, and + # multiple operators are AND'd at the dandiset level — NOT at the asset + # or version level. So `species:mouse species:rat` returns dandisets that + # have at least one mouse asset AND at least one rat asset (possibly the + # same asset, possibly different ones). Each asset operator therefore + # builds an independent AssetSearch subquery and we filter dandisets to + # those whose IDs appear in EVERY such subquery. + # + # Contributor predicates are different: they apply to a single Version's + # metadata.contributor[] array (since contributors live on the version, + # not on individual assets), so we accumulate them and AND on the same + # Version to avoid cross-version weirdness when a draft and a published + # version have disjoint contributor lists. Within that single version + # each predicate independently scans `contributor[*]`, so two operators + # may match different contributor entries. annotated: set[str] = set() + contributor_wheres: list[tuple[str, list[str]]] = [] - for key, raw_value in parsed.operators: - value = raw_value.strip() + for op in parsed.operators: + key = op.key + value = op.value.strip() if not value: raise SearchSyntaxError(f'Operator "{key}" requires a value (e.g. {key}:something).') - if key in _DATE_OPS: + if key in DATE_OPS: try: ts = datetime.strptime(value, '%Y-%m-%d').replace(tzinfo=UTC) except ValueError as exc: @@ -170,20 +275,19 @@ def apply_search_filters( f'Invalid date for "{key}": {value!r}. Use YYYY-MM-DD.' ) from exc queryset = _apply_date_filter(queryset, key, ts, annotated) - elif key in _ASSET_OPS: - if asset_qs is None: - asset_qs = AssetSearch.objects.visible_to(user) - asset_qs = _apply_asset_filter(asset_qs, key, value) - - if asset_qs is not None: - # NOTE perf: jsonb_path_exists with a runtime-built jsonpath cannot - # use the existing per-field GIN indexes; the path-scan operators - # (species/approach/technique/standard) currently sequential-scan the - # asset_search materialized view. The view is small enough today - # (~one row per asset) that this is acceptable, but if it becomes a - # hot path the fix is expression GIN indexes on each path or - # denormalized text columns + trgm_ops indexes. - matching_dandiset_ids = asset_qs.values_list('dandiset_id', flat=True).distinct() - queryset = queryset.filter(id__in=matching_dandiset_ids) + elif key in ASSET_OPS: + asset_match = _apply_asset_filter(AssetSearch.objects.visible_to(user), key, value) + queryset = queryset.filter(id__in=asset_match.values('dandiset_id')) + elif key in OWNER_OPS: + queryset = _apply_owner_filter(queryset, value) + elif key in CONTRIBUTOR_ROLE_OPS: + contributor_wheres.append(_contributor_where(value, CONTRIBUTOR_ROLE_OPS[key])) + elif key in AFFILIATION_OPS: + contributor_wheres.append(_affiliation_where(value)) + elif key in COUNT_OPS: + queryset = _apply_count_filter(queryset, COUNT_OPS[key], value) + + if contributor_wheres: + queryset = _apply_contributor_filters(queryset, contributor_wheres) return queryset diff --git a/dandiapi/api/services/search/operators.py b/dandiapi/api/services/search/operators.py new file mode 100644 index 000000000..e36dc16fb --- /dev/null +++ b/dandiapi/api/services/search/operators.py @@ -0,0 +1,98 @@ +"""Operator vocabulary and dispatch tables for the advanced-search syntax. + +Pure-Python module so :mod:`parser` (which knows nothing about Django) can +import the operator allowlist alongside the filter dispatch tables. Adding +a new operator is one entry in the relevant table here; the parser picks +it up automatically via :data:`OPERATOR_KEYS`. +""" + +from __future__ import annotations + +# File-type operator: short name → MIME prefix matched with istartswith. +# Keep in sync with DandisetSearchQueryParameterSerializer. +FILE_TYPE_ALIASES = { + 'nwb': 'application/x-nwb', + 'image': 'image/', + 'text': 'text/', + 'video': 'video/', +} + +DATE_OPS = frozenset( + { + 'created_before', + 'created_after', + 'modified_before', + 'modified_after', + 'published_before', + 'published_after', + } +) + +ASSET_OPS = frozenset({'species', 'approach', 'technique', 'standard', 'file_type'}) + +OWNER_OPS = frozenset({'owner'}) + +# Per-role contributor operators: snake_case operator → PascalCase +# `dandischema.RoleType.name`. The catch-all `contributor:` matches any +# role; each named operator additionally requires the matched contributor's +# `roleName` array to contain `dcite:`. +# +# Kept as an explicit allowlist (rather than auto-derived from RoleType) so +# schema-level renames or additions can't silently change the public search +# syntax. A unit test validates each value against `RoleType.name`. +CONTRIBUTOR_ROLE_OPS: dict[str, str | None] = { + 'contributor': None, # catch-all + 'author': 'Author', + 'contact_person': 'ContactPerson', + 'data_collector': 'DataCollector', + 'data_curator': 'DataCurator', + 'data_manager': 'DataManager', + 'maintainer': 'Maintainer', + 'project_leader': 'ProjectLeader', + 'funder': 'Funder', + 'sponsor': 'Sponsor', + # The other RoleType values exist on `dandischema` but aren't exposed as + # search operators — see PR discussion. The catch-all `contributor:` still + # finds anyone in any role; only the role-restricting shortcuts are pruned. +} + +AFFILIATION_OPS = frozenset({'affiliation'}) + +# Numeric "at least N" operators: short name → jsonpath into `Version.metadata`. +# Value is a non-negative integer; `num_subjects:10` matches dandisets whose +# `assetsSummary.numberOfSubjects` is >= 10. The intuition is "studies with +# at least N ", which is what users actually search for; an upper +# bound is rarely useful in practice. +# +# Only `num_subjects` is exposed for now. The `assetsSummary` schema also +# carries `numberOfFiles`, `numberOfBytes`, `numberOfSamples`, and +# `numberOfCells` — each would be a one-line entry here. +# +# `num_sessions` is intentionally absent: dandischema's `AssetsSummary` does +# not aggregate sessions, and there is no per-asset `sessionId` field from +# which to derive a count. Adding a `num_sessions:` operator would require +# upstream schema work. +COUNT_OPS: dict[str, str] = { + 'num_subjects': '$.assetsSummary.numberOfSubjects', +} + +# Asset-name jsonpaths: each operator selects a different array path on +# `asset_metadata` whose elements have a `.name` we substring-match against. +# Paths MUST be trusted constants (interpolated into the SQL). +ASSET_NAME_PATH_OPS = { + 'species': '$.wasAttributedTo[*].species.name', + 'approach': '$.approach[*].name', + 'technique': '$.measurementTechnique[*].name', + 'standard': '$.dataStandard[*].name', +} + +# Union of every operator key. The parser uses this for its allowlist; +# adding a new operator anywhere above is automatically known to the parser. +OPERATOR_KEYS: frozenset[str] = ( + DATE_OPS + | ASSET_OPS + | OWNER_OPS + | AFFILIATION_OPS + | frozenset(CONTRIBUTOR_ROLE_OPS) + | frozenset(COUNT_OPS) +) diff --git a/dandiapi/api/services/search/parser.py b/dandiapi/api/services/search/parser.py index 817642341..87282ebbd 100644 --- a/dandiapi/api/services/search/parser.py +++ b/dandiapi/api/services/search/parser.py @@ -18,21 +18,7 @@ from difflib import get_close_matches import re -OPERATOR_KEYS: frozenset[str] = frozenset( - { - 'created_before', - 'created_after', - 'modified_before', - 'modified_after', - 'published_before', - 'published_after', - 'species', - 'approach', - 'technique', - 'standard', - 'file_type', - } -) +from dandiapi.api.services.search.operators import OPERATOR_KEYS # A token in the input is one of: # key:"quoted value" — operator with quoted value @@ -42,12 +28,14 @@ # # We deliberately match `key:"value"` and `"value"` *before* the bare-token # alternative so quoted segments stay together. +# Operator keys are matched case-insensitively (`AUTHOR:doe` works the same +# as `author:doe`) — we lowercase the captured key before validation/dispatch. _TOKEN_RE = re.compile( - r'(?P[a-z_]+):"(?P[^"]*)"' + r'(?P[A-Za-z_]+):"(?P[^"]*)"' r'|"(?P[^"]*)"' r'|(?P\S+)' ) -_BARE_OP_RE = re.compile(r'^([a-z_]+):(.+)$') +_BARE_OP_RE = re.compile(r'^([A-Za-z_]+):(.+)$') # Defense-in-depth: cap search-term length so an unauthenticated caller can't @@ -60,10 +48,18 @@ class SearchSyntaxError(ValueError): """Raised when a search query can't be parsed.""" +@dataclass +class Operator: + """One parsed `key:value` operator.""" + + key: str + value: str + + @dataclass class ParsedSearch: free_text: list[str] = field(default_factory=list) - operators: list[tuple[str, str]] = field(default_factory=list) + operators: list[Operator] = field(default_factory=list) def _check_balanced_quotes(query: str) -> None: @@ -98,16 +94,17 @@ def parse_search(query: str) -> ParsedSearch: for match in _TOKEN_RE.finditer(query): if (key := match.group('op_key')) is not None: + key = key.lower() _validate_operator_key(key) - parsed.operators.append((key, match.group('op_qval'))) + parsed.operators.append(Operator(key, match.group('op_qval'))) elif (free := match.group('free_quoted')) is not None: parsed.free_text.append(free) else: bare = match.group('bare') if op_match := _BARE_OP_RE.match(bare): - key = op_match.group(1) + key = op_match.group(1).lower() _validate_operator_key(key) - parsed.operators.append((key, op_match.group(2))) + parsed.operators.append(Operator(key, op_match.group(2))) else: parsed.free_text.append(bare) return parsed diff --git a/dandiapi/api/tests/test_dandiset.py b/dandiapi/api/tests/test_dandiset.py index 712399fc3..d547bb6c4 100644 --- a/dandiapi/api/tests/test_dandiset.py +++ b/dandiapi/api/tests/test_dandiset.py @@ -2003,24 +2003,47 @@ def test_advanced_search_file_type_alias_and_mime(api_client): @pytest.mark.ai_generated @pytest.mark.django_db -def test_advanced_search_repeated_asset_operators_intersect(api_client): - # Cross-key semantics: a SINGLE asset must satisfy ALL constraints. So - # `species:mouse approach:electrophysiological` requires one asset that - # is both attributed to a mouse AND uses an electrophysiological approach. - # An asset that has the species but a different approach (and vice versa - # for a sibling dandiset) does NOT qualify. - mouse_ephys = _seed_dandiset_with_asset( +def test_advanced_search_asset_operators_combine_at_dandiset_level(api_client): + # Multiple asset operators are AND'd at the DANDISET level, not at the + # asset level. So `species:mouse approach:electrophysiological` returns + # any dandiset that has SOME mouse asset AND SOME ephys asset — they may + # be the same asset, but they don't have to be. + one_asset_both = _seed_dandiset_with_asset( asset_metadata={ 'wasAttributedTo': [{'species': {'name': 'House mouse'}}], 'approach': [{'name': 'electrophysiological approach'}], }, ) + # Two distinct assets, each satisfying one of the operators. + two_assets_split = DandisetFactory.create() + two_assets_split_version = DraftVersionFactory.create(dandiset=two_assets_split) + two_assets_split_version.assets.add( + DraftAssetFactory.create( + metadata={ + 'schemaVersion': DANDI_SCHEMA_VERSION, + 'schemaKey': 'Asset', + 'encodingFormat': 'application/x-nwb', + 'wasAttributedTo': [{'species': {'name': 'House mouse'}}], + }, + ), + DraftAssetFactory.create( + metadata={ + 'schemaVersion': DANDI_SCHEMA_VERSION, + 'schemaKey': 'Asset', + 'encodingFormat': 'application/x-nwb', + 'approach': [{'name': 'electrophysiological approach'}], + }, + ), + ) + add_version_asset_paths(two_assets_split_version) + # Has mouse but no ephys — should NOT match. _seed_dandiset_with_asset( asset_metadata={ 'wasAttributedTo': [{'species': {'name': 'House mouse'}}], 'approach': [{'name': 'behavioral approach'}], }, ) + # Has ephys but no mouse — should NOT match. _seed_dandiset_with_asset( asset_metadata={ 'wasAttributedTo': [{'species': {'name': 'Norway rat'}}], @@ -2030,17 +2053,20 @@ def test_advanced_search_repeated_asset_operators_intersect(api_client): _refresh_asset_search() query = 'species:mouse approach:electrophysiological' - assert _search_ids(api_client, query) == {mouse_ephys.identifier} + assert _search_ids(api_client, query) == { + one_asset_both.identifier, + two_assets_split.identifier, + } @pytest.mark.ai_generated @pytest.mark.django_db -def test_advanced_search_repeated_same_key_operator_combines_with_and(api_client): - # Same-key semantics: `species:mouse species:rat` requires a single - # asset whose species set contains BOTH "mouse" and "rat" (matches - # GitHub's default for repeated keys). Pinning this so a future change - # to OR-within-key is a deliberate decision, not a regression. - multi = _seed_dandiset_with_asset( +def test_advanced_search_repeated_same_key_operator_combines_at_dandiset_level(api_client): + # `species:mouse species:rat` returns dandisets that have a mouse asset + # AND a rat asset. The two assets can be the same row (multi-species) or + # two different rows — what matters is that the dandiset, as a whole, + # has both kinds of data represented. + multi_species_one_asset = _seed_dandiset_with_asset( asset_metadata={ 'wasAttributedTo': [ {'species': {'name': 'House mouse'}}, @@ -2048,6 +2074,31 @@ def test_advanced_search_repeated_same_key_operator_combines_with_and(api_client ], }, ) + # Two assets: one mouse, one rat. This is the case that distinguishes the + # dandiset-level semantic from a same-asset semantic — it would have been + # excluded under the older "single asset matches both" rule. + two_assets_mouse_and_rat = DandisetFactory.create() + two_assets_mouse_and_rat_version = DraftVersionFactory.create(dandiset=two_assets_mouse_and_rat) + two_assets_mouse_and_rat_version.assets.add( + DraftAssetFactory.create( + metadata={ + 'schemaVersion': DANDI_SCHEMA_VERSION, + 'schemaKey': 'Asset', + 'encodingFormat': 'application/x-nwb', + 'wasAttributedTo': [{'species': {'name': 'House mouse'}}], + }, + ), + DraftAssetFactory.create( + metadata={ + 'schemaVersion': DANDI_SCHEMA_VERSION, + 'schemaKey': 'Asset', + 'encodingFormat': 'application/x-nwb', + 'wasAttributedTo': [{'species': {'name': 'Norway rat'}}], + }, + ), + ) + add_version_asset_paths(two_assets_mouse_and_rat_version) + # Only mouse, only rat — neither should match the combined query. _seed_dandiset_with_asset( asset_metadata={'wasAttributedTo': [{'species': {'name': 'House mouse'}}]}, ) @@ -2056,7 +2107,10 @@ def test_advanced_search_repeated_same_key_operator_combines_with_and(api_client ) _refresh_asset_search() - assert _search_ids(api_client, 'species:mouse species:rat') == {multi.identifier} + assert _search_ids(api_client, 'species:mouse species:rat') == { + multi_species_one_asset.identifier, + two_assets_mouse_and_rat.identifier, + } @pytest.mark.ai_generated @@ -2086,3 +2140,393 @@ def test_advanced_search_species_respects_embargo_visibility(api_client): # Anonymous request: embargoed must be filtered out. assert _search_ids(api_client, 'species:mouse') == {open_ds.identifier} + + +# --- owner: operator ----------------------------------------------------------------------------- + + +@pytest.mark.ai_generated +@pytest.mark.django_db +def test_advanced_search_owner_lookup_paths_and_combinations(api_client): + """One setup, many assertions for the owner: operator. + + Resolves users by every documented lookup path, unions across multiple + matched users, returns 0 for unknown values, is case-insensitive, and + combines correctly with other operators (cross-key AND on the same + dandiset). + """ + # Three users with overlapping last names so we can exercise every lookup + # path AND the multi-user union in a single setup. + alice = UserFactory.create( + username='Alice', email='Alice@Example.com', first_name='Alice', last_name='Smith' + ) + bob = UserFactory.create( + username='bob', email='bob@example.com', first_name='Bob', last_name='Smith' + ) + carol = UserFactory.create( + username='carol', email='carol@example.com', first_name='Carol', last_name='Jones' + ) + alice_old = DandisetFactory.create(owners=[alice]) + alice_new = DandisetFactory.create(owners=[alice]) + bob_ds = DandisetFactory.create(owners=[bob]) + carol_ds = DandisetFactory.create(owners=[carol]) + for ds in (alice_old, alice_new, bob_ds, carol_ds): + DraftVersionFactory.create(dandiset=ds) + + # Backdate alice_old so we can intersect with a date operator below. + cutoff = timezone.now() - datetime.timedelta(days=1) + Dandiset.objects.filter(pk=alice_old.pk).update(created=cutoff - datetime.timedelta(days=30)) + after_str = (cutoff + datetime.timedelta(seconds=1)).date().isoformat() + + alice_dsets = {alice_old.identifier, alice_new.identifier} + + # username (case-insensitive) + assert _search_ids(api_client, 'owner:alice') == alice_dsets + assert _search_ids(api_client, 'owner:ALICE') == alice_dsets + + # email (case-insensitive) + assert _search_ids(api_client, 'owner:alice@example.com') == alice_dsets + assert _search_ids(api_client, 'owner:ALICE@Example.com') == alice_dsets + + # first / last / full name + assert _search_ids(api_client, 'owner:Bob') == {bob_ds.identifier} + assert _search_ids(api_client, 'owner:Jones') == {carol_ds.identifier} + assert _search_ids(api_client, 'owner:"Carol Jones"') == {carol_ds.identifier} + + # union: shared last name returns dandisets from both users + assert _search_ids(api_client, 'owner:Smith') == alice_dsets | {bob_ds.identifier} + + # unknown user → 0 results, not 400 (a valid 0-hit query) + assert _search_ids(api_client, 'owner:no_such_user_anywhere') == set() + + # combines with other operators: cross-key AND on the same dandiset. + # Only alice_new satisfies BOTH owner:alice AND created_after. + assert _search_ids(api_client, f'owner:alice created_after:{after_str}') == { + alice_new.identifier + } + + +@pytest.mark.ai_generated +@pytest.mark.django_db +def test_advanced_search_owner_does_not_inflate_to_superuser_archive(api_client): + # Guardian's get_objects_for_user(with_superuser=True) returns ALL objects + # for superusers — wrong semantics for owner: searches. We pass + # with_superuser=False so `owner:admin` returns only what admin + # explicitly owns, not the entire archive. + admin = UserFactory.create(username='admin', is_superuser=True) + other = UserFactory.create() + DraftVersionFactory.create(dandiset=DandisetFactory.create(owners=[other])) + admin_owned = DandisetFactory.create(owners=[admin]) + DraftVersionFactory.create(dandiset=admin_owned) + + assert _search_ids(api_client, 'owner:admin') == {admin_owned.identifier} + + +# --- contributor / role operators ---------------------------------------------------------------- + + +def _seed_dandiset_with_contributors(*, contributors: list[dict]) -> Dandiset: + """Create a draft dandiset with the given contributor list on its version. + + Used by the contributor/role tests below. + """ + dandiset = DandisetFactory.create() + version = DraftVersionFactory.create(dandiset=dandiset) + version.metadata = {**version.metadata, 'contributor': contributors} + version.save() + return dandiset + + +@pytest.mark.ai_generated +@pytest.mark.django_db +def test_advanced_search_contributor_and_role_operators(api_client): + """One setup, many assertions for the contributor + role operators. + + Covers the catch-all `contributor:`, several role-specific operators + (author/data_curator/funder/contact_person), independent-operator AND + semantics across roles, name vs email lookup, case-insensitive substring, + role substring (`data_curator:` matches `dcite:DataCurator`), and the + "different elements may satisfy different operators" composability that + distinguishes Option D from same-element semantics. + """ + # Three dandisets with overlapping names but different role assignments. + # Realistic identifiers: ORCID for Persons, ROR URL for Organizations. + ds_doe_curator = _seed_dandiset_with_contributors( + contributors=[ + { + 'name': 'Doe, Jane', + 'email': 'jane.doe@example.com', + 'identifier': '0000-0002-2990-9889', + 'roleName': ['dcite:DataCurator', 'dcite:Author'], + 'schemaKey': 'Person', + }, + { + 'name': 'National Institutes of Health', + 'identifier': 'https://ror.org/01cwqze88', + 'roleName': ['dcite:Funder'], + 'schemaKey': 'Organization', + }, + ], + ) + ds_doe_author_only = _seed_dandiset_with_contributors( + contributors=[ + { + 'name': 'Doe, Jane', + 'identifier': '0000-0001-2222-3333', + 'roleName': ['dcite:Author'], + 'schemaKey': 'Person', + }, + ], + ) + ds_smith_curator = _seed_dandiset_with_contributors( + contributors=[ + { + 'name': 'Smith, Alice', + 'roleName': ['dcite:DataCurator'], + 'schemaKey': 'Person', + }, + ], + ) + + # `contributor:` (catch-all) — any role + assert _search_ids(api_client, 'contributor:Doe') == { + ds_doe_curator.identifier, + ds_doe_author_only.identifier, + } + assert _search_ids(api_client, 'contributor:NIH') == set() # full org name only + assert _search_ids(api_client, 'contributor:"National Institutes"') == { + ds_doe_curator.identifier + } + # Email lookup also works via the catch-all. + assert _search_ids(api_client, 'contributor:jane.doe@example.com') == { + ds_doe_curator.identifier + } + + # role-specific: author:Doe matches the two dandisets where Doe has Author role + assert _search_ids(api_client, 'author:Doe') == { + ds_doe_curator.identifier, + ds_doe_author_only.identifier, + } + # data_curator:Doe matches only the dandiset where Doe is also a curator + assert _search_ids(api_client, 'data_curator:Doe') == {ds_doe_curator.identifier} + # data_curator:Smith matches the smith dandiset + assert _search_ids(api_client, 'data_curator:Smith') == {ds_smith_curator.identifier} + # funder:"National Institutes" only the one with NIH funder + assert _search_ids(api_client, 'funder:"National Institutes"') == {ds_doe_curator.identifier} + + # Case-insensitive on both name and role + assert _search_ids(api_client, 'AUTHOR:doe') == { + ds_doe_curator.identifier, + ds_doe_author_only.identifier, + } + + # Identifier lookup: full ORCID for Person, full ROR URL for Org, AND + # bare-id substring forms (`01cwqze88` matches the ROR URL via icontains). + assert _search_ids(api_client, 'contributor:0000-0002-2990-9889') == {ds_doe_curator.identifier} + assert _search_ids(api_client, 'contributor:"https://ror.org/01cwqze88"') == { + ds_doe_curator.identifier + } + assert _search_ids(api_client, 'contributor:01cwqze88') == {ds_doe_curator.identifier} + # Identifier lookup composes with role: a role-specific operator with an + # ORCID also requires the matched contributor to hold that role. + assert _search_ids(api_client, 'data_curator:0000-0002-2990-9889') == { + ds_doe_curator.identifier + } + # Wrong role for that ORCID → 0 (Doe @ ds_doe_curator IS a curator, + # but Doe @ ds_doe_author_only has a different ORCID). + assert _search_ids(api_client, 'data_curator:0000-0001-2222-3333') == set() + + # Independent-operator AND across DIFFERENT contributor elements: + # `author:Doe funder:"National Institutes"` matches the dandiset where + # one contributor element is Doe-as-Author AND a different contributor + # element is NIH-as-Funder. (This is the Option D semantic the user + # specifically wanted — with same-element semantics the query would + # require Doe to himself be a Funder, which is wrong.) + assert _search_ids(api_client, 'author:Doe funder:"National Institutes"') == { + ds_doe_curator.identifier + } + + +@pytest.mark.ai_generated +@pytest.mark.django_db +def test_advanced_search_contributor_role_substring_match(api_client): + """Role substring matches the dcite:-prefixed stored value. + + `data_curator:` should match the stored value `dcite:DataCurator` via + case-insensitive substring on `roleName` — users don't have to type + the `dcite:` prefix. + """ + ds = _seed_dandiset_with_contributors( + contributors=[ + { + 'name': 'Curator, Connie', + 'roleName': ['dcite:DataCurator'], # stored with dcite: prefix + 'schemaKey': 'Person', + }, + ], + ) + + # The operator name maps to the substring "DataCurator" (without the + # "dcite:" prefix) and matches the stored "dcite:DataCurator" via + # case-insensitive regex inside the jsonpath. + assert _search_ids(api_client, 'data_curator:Curator') == {ds.identifier} + + +@pytest.mark.ai_generated +@pytest.mark.django_db +def test_advanced_search_unknown_role_operator_returns_400_with_suggestion(api_client): + response = api_client.get( + '/api/dandisets/', + {'draft': 'true', 'empty': 'true', 'search': 'data_curatr:Doe'}, + ) + assert response.status_code == 400 + assert 'data_curator' in response.json()['search'] + + +@pytest.mark.ai_generated +@pytest.mark.django_db +def test_advanced_search_affiliation_operator(api_client): + """`affiliation:` queries the nested Person.affiliation[] field, not roleName. + + Real DANDI data stores affiliations on the Person object itself (e.g. + `Doe, Jane` is affiliated with Stanford via `Doe.affiliation[0].name`), + not as a `dcite:Affiliation` roleName entry. Match by org name OR by + the affiliation's ROR identifier; substring forms work for both. + """ + ds_stanford = _seed_dandiset_with_contributors( + contributors=[ + { + 'name': 'Doe, Jane', + 'roleName': ['dcite:Author'], + 'schemaKey': 'Person', + 'affiliation': [ + { + 'name': 'Stanford University', + 'identifier': 'https://ror.org/00f54p054', + 'schemaKey': 'Affiliation', + }, + ], + }, + # An Organization contributor (no `affiliation` field of its own — + # affiliations live on Persons in the schema). The affiliation + # jsonpath must walk past this element without exploding. + { + 'name': 'National Institutes of Health (NIH)', + 'identifier': 'https://ror.org/01cwqze88', + 'roleName': ['dcite:Funder'], + 'schemaKey': 'Organization', + }, + ], + ) + ds_ucl = _seed_dandiset_with_contributors( + contributors=[ + # Same shape but the Organization comes first this time — the + # jsonpath shouldn't be order-sensitive. + { + 'name': 'Wellcome Trust', + 'roleName': ['dcite:Funder'], + 'schemaKey': 'Organization', + }, + { + 'name': 'Doe, Jane', + 'roleName': ['dcite:Author'], + 'schemaKey': 'Person', + 'affiliation': [ + { + 'name': 'University College London', + 'schemaKey': 'Affiliation', + }, + ], + }, + ], + ) + + # Affiliation by organization name + assert _search_ids(api_client, 'affiliation:Stanford') == {ds_stanford.identifier} + assert _search_ids(api_client, 'affiliation:"University College London"') == {ds_ucl.identifier} + # Affiliation by ROR identifier (full URL or bare ID via substring) + assert _search_ids(api_client, 'affiliation:00f54p054') == {ds_stanford.identifier} + # The Organization contributors' own identifiers are NOT matched by + # `affiliation:` — that operator queries `contributor[].affiliation[]`, + # not the contributor's own identifier. (Use `funder:` instead.) + assert _search_ids(api_client, 'affiliation:01cwqze88') == set() + + # Composes with role/contributor operators on the same Version (different + # contributor elements OK — here `author:Doe` matches the Person and + # `affiliation:Stanford` matches via that Person's affiliation, while + # the unrelated Organization contributor is harmlessly ignored). + assert _search_ids(api_client, 'author:Doe affiliation:Stanford') == {ds_stanford.identifier} + # Cross-key with role on the Organization side also works. + assert _search_ids(api_client, 'funder:NIH affiliation:Stanford') == {ds_stanford.identifier} + + +def _seed_dandiset_with_subject_count(n: int | None) -> Dandiset: + """Create a dandiset whose draft version's `assetsSummary.numberOfSubjects` is `n`. + + Pass `None` to omit the field entirely (simulates a dandiset that hasn't + had its assets summarized — the count operator should not match these). + """ + dandiset = DandisetFactory.create() + version = DraftVersionFactory.create(dandiset=dandiset) + summary = {'schemaKey': 'AssetsSummary', 'numberOfBytes': 0, 'numberOfFiles': 0} + if n is not None: + summary['numberOfSubjects'] = n + version.metadata = {**version.metadata, 'assetsSummary': summary} + version.save() + return dandiset + + +@pytest.mark.ai_generated +@pytest.mark.django_db +def test_advanced_search_num_subjects_operator(api_client): + """`num_subjects:N` is "at least N subjects" against assetsSummary.numberOfSubjects. + + Covers the threshold (exact + above match, below doesn't), missing + field (doesn't match), case-insensitivity, and composition with another + operator on the same dandiset. + """ + ds_2 = _seed_dandiset_with_subject_count(2) + ds_10 = _seed_dandiset_with_subject_count(10) + ds_50 = _seed_dandiset_with_subject_count(50) + ds_unknown = _seed_dandiset_with_subject_count(None) # no numberOfSubjects field + + # >= threshold semantics + assert _search_ids(api_client, 'num_subjects:10') == {ds_10.identifier, ds_50.identifier} + assert _search_ids(api_client, 'num_subjects:0') == { + ds_2.identifier, + ds_10.identifier, + ds_50.identifier, + } + # ds_unknown never matches: a missing count must not be coerced to 0 + assert ds_unknown.identifier not in _search_ids(api_client, 'num_subjects:0') + + # Case-insensitive operator key + assert _search_ids(api_client, 'NUM_SUBJECTS:10') == {ds_10.identifier, ds_50.identifier} + + # Composes with another operator (here `name` free-text would compose + # via the existing path; use a date operator to stay structured-only). + # Constrain `ds_50` only via its created date. + cutoff = timezone.now() - datetime.timedelta(days=1) + Dandiset.objects.filter(pk=ds_2.pk).update(created=cutoff - datetime.timedelta(days=30)) + after_str = (cutoff + datetime.timedelta(seconds=1)).date().isoformat() + # `num_subjects:2 created_after:` → dandisets with >=2 subjects + # AND created in the last day → ds_10 + ds_50 (ds_2 is older). + assert _search_ids(api_client, f'num_subjects:2 created_after:{after_str}') == { + ds_10.identifier, + ds_50.identifier, + } + + +@pytest.mark.ai_generated +@pytest.mark.django_db +def test_advanced_search_num_subjects_invalid_value_returns_400(api_client): + # Non-integer values are rejected with a helpful message; negatives and + # decimals fail the digits-only check. `num_subjects:` alone (no value) + # doesn't reach this path — the parser treats it as free text since the + # operator regex requires at least one character after the colon. + for bad in ('abc', '-5', '5.5'): + response = api_client.get( + '/api/dandisets/', + {'draft': 'true', 'empty': 'true', 'search': f'num_subjects:{bad}'}, + ) + assert response.status_code == 400, bad diff --git a/dandiapi/api/tests/test_search_parser.py b/dandiapi/api/tests/test_search_parser.py index 2ab5b42f8..6b3d3123c 100644 --- a/dandiapi/api/tests/test_search_parser.py +++ b/dandiapi/api/tests/test_search_parser.py @@ -3,6 +3,7 @@ import pytest from dandiapi.api.services.search.parser import ( + Operator, SearchSyntaxError, parse_search, ) @@ -22,29 +23,32 @@ ( 'species:mouse created_after:2024-01-01', [], - [('species', 'mouse'), ('created_after', '2024-01-01')], + [Operator('species', 'mouse'), Operator('created_after', '2024-01-01')], ), # Mixed ( 'place cells species:mouse created_after:2024-01-01 ca1', ['place', 'cells', 'ca1'], - [('species', 'mouse'), ('created_after', '2024-01-01')], + [Operator('species', 'mouse'), Operator('created_after', '2024-01-01')], ), # Quoted phrase as free text ('"place cells" hippocampus', ['place cells', 'hippocampus'], []), # Quoted operator value (multi-word) - ('technique:"patch clamp"', [], [('technique', 'patch clamp')]), + ('technique:"patch clamp"', [], [Operator('technique', 'patch clamp')]), # Repeated operator keeps every entry (AND'd downstream) ( 'species:mouse species:rat', [], - [('species', 'mouse'), ('species', 'rat')], + [Operator('species', 'mouse'), Operator('species', 'rat')], ), # Special characters preserved inside quoted operator value - ('species:"C57BL/6"', [], [('species', 'C57BL/6')]), + ('species:"C57BL/6"', [], [Operator('species', 'C57BL/6')]), # Quoted token that *looks* like an operator is treated as free text — - # this is the documented escape hatch for searching for a literal colon. + # documented escape hatch for searching for a literal colon. ('"foo:bar" hippocampus', ['foo:bar', 'hippocampus'], []), + # Owner operator + ('owner:jdoe', [], [Operator('owner', 'jdoe')]), + ('owner:user@example.com', [], [Operator('owner', 'user@example.com')]), ], ids=[ 'empty', @@ -57,6 +61,8 @@ 'repeated-operator-key', 'special-chars-in-quoted-value', 'quoted-operator-like-token-is-free-text', + 'owner-username', + 'owner-email', ], ) def test_parse_search(query, expected_free_text, expected_operators): @@ -92,3 +98,25 @@ def test_parse_search(query, expected_free_text, expected_operators): def test_parse_search_raises_on_invalid_query(query, expected_message_fragment): with pytest.raises(SearchSyntaxError, match=expected_message_fragment): parse_search(query) + + +def test_contributor_role_ops_match_actual_dandischema_roletype(): + """Guard against schema drift. + + Every non-catch-all `CONTRIBUTOR_ROLE_OPS` value must match a real + `dandischema.RoleType` member name. Renames or removals on the schema + side trip this test, forcing an explicit decision here instead of + silently changing user-facing search syntax. + """ + from dandischema.models import RoleType + + from dandiapi.api.services.search.operators import CONTRIBUTOR_ROLE_OPS + + role_names = {r.name for r in RoleType} + for op_name, role_name in CONTRIBUTOR_ROLE_OPS.items(): + if role_name is None: + continue + assert role_name in role_names, ( + f'CONTRIBUTOR_ROLE_OPS[{op_name!r}] = {role_name!r} is not a valid ' + 'dandischema.RoleType member name' + ) diff --git a/dandiapi/api/views/serializers.py b/dandiapi/api/views/serializers.py index 0d64477c8..2bf87efe5 100644 --- a/dandiapi/api/views/serializers.py +++ b/dandiapi/api/views/serializers.py @@ -311,7 +311,20 @@ class DandisetQueryParameterSerializer(serializers.Serializer): 'published_before, published_after (all take YYYY-MM-DD); ' 'species, approach, technique, standard (case-insensitive ' 'substring against the corresponding asset_metadata array); ' - 'file_type (nwb, image, text, video — or any MIME prefix). ' + 'file_type (nwb, image, text, video — or any MIME prefix); ' + 'owner (case-insensitive match against username, email, first ' + 'name, last name, or "first last"); ' + 'contributor (matches a contributor by name, email, or identifier ' + '— ORCID for Persons, ROR URL for Organizations; any role); ' + 'role-restricting shortcuts that match a contributor by ' + 'name/email/identifier AND require that contributor to hold a ' + 'specific role: author, contact_person, data_collector, ' + 'data_curator, data_manager, maintainer, project_leader, funder, ' + 'sponsor. ' + 'affiliation (matches the nested Person.affiliation[] field by ' + 'organization name or ROR identifier — e.g. affiliation:Stanford); ' + 'num_subjects (matches dandisets with at least N subjects per ' + 'assetsSummary.numberOfSubjects — e.g. num_subjects:10). ' 'Invalid syntax returns HTTP 400 with the offending token; ' 'unknown operators get a "Did you mean?" suggestion.' ), diff --git a/web/src/components/DandisetSearchField.vue b/web/src/components/DandisetSearchField.vue index 5b3b5a6db..7c8b47a7a 100644 --- a/web/src/components/DandisetSearchField.vue +++ b/web/src/components/DandisetSearchField.vue @@ -95,6 +95,16 @@ const operatorHelp = [ { example: 'technique:"patch clamp"', description: 'Has assets using a measurement technique' }, { example: 'standard:nwb', description: 'Has assets in a data standard' }, { example: 'file_type:nwb', description: 'Has assets of a file type (nwb, image, text, video)' }, + { example: 'owner:"Jane Doe"', description: 'Owned by a user (name, username, or email)' }, + { example: 'contributor:"Doe, Jane"', description: 'Listed as a contributor (any role; matches name, email, ORCID, or ROR ID)' }, + { example: 'author:Doe', description: 'Listed as an Author (matches name, email, ORCID, or ROR ID)' }, + { example: 'data_curator:Doe', description: 'Listed as a Data Curator' }, + { example: 'funder:NIH', description: 'Listed as a Funder' }, + { example: 'contact_person:Doe', description: 'Listed as the Contact Person' }, + { example: 'maintainer:Doe', description: 'Listed as a Maintainer' }, + { example: 'project_leader:Doe', description: 'Listed as the Project Leader (also: data_collector, data_manager, sponsor)' }, + { example: 'affiliation:Stanford', description: 'Has a contributor affiliated with the named organization (or ROR ID)' }, + { example: 'num_subjects:10', description: 'Has at least N subjects (assetsSummary.numberOfSubjects)' }, ]; function updateSearch(search: string) {