From e89f6f45147c548a25e17f6428a6b3df796cd08d Mon Sep 17 00:00:00 2001 From: Ben Dichter Date: Wed, 6 May 2026 20:04:44 -0400 Subject: [PATCH 01/19] Add owner: operator to advanced search MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Filters dandisets to those owned by a given user. The value is matched case-insensitively against User.username OR User.email. The special form `owner:me` resolves to the requesting user (consistent with the existing ?user=me query parameter) and returns 400 if the request is anonymous. Implementation reuses the existing `get_owned_dandisets()` permission helper. We pass `with_superuser=False` so `owner:admin` returns only what admin explicitly owns — guardian's default would otherwise inflate to the entire archive for any superuser. Unknown users return zero results (not an error): a search for a nonexistent owner is a valid 0-hit query. Tests cover username/email lookup, case-insensitivity, unknown user, `owner:me` for an authenticated user, anonymous `owner:me` → 400, the superuser non-inflation guarantee, and combination with other operators. OpenAPI help text and the frontend operator popover updated. --- dandiapi/api/services/search/filters.py | 35 ++++++- dandiapi/api/services/search/parser.py | 1 + dandiapi/api/tests/test_dandiset.py | 111 +++++++++++++++++++++ dandiapi/api/tests/test_search_parser.py | 6 ++ dandiapi/api/views/serializers.py | 4 +- web/src/components/DandisetSearchField.vue | 1 + 6 files changed, 155 insertions(+), 3 deletions(-) diff --git a/dandiapi/api/services/search/filters.py b/dandiapi/api/services/search/filters.py index db3c0cf7c..1b5af3534 100644 --- a/dandiapi/api/services/search/filters.py +++ b/dandiapi/api/services/search/filters.py @@ -6,14 +6,16 @@ import re from typing import TYPE_CHECKING -from django.db.models import OuterRef, Subquery +from django.contrib.auth.models import User +from django.db.models import OuterRef, Q, Subquery from dandiapi.api.models import Version +from dandiapi.api.services.permissions.dandiset import get_owned_dandisets from dandiapi.api.services.search.parser import SearchSyntaxError from dandiapi.search.models import AssetSearch if TYPE_CHECKING: - from django.contrib.auth.models import AnonymousUser, User + from django.contrib.auth.models import AnonymousUser from django.db.models import QuerySet from dandiapi.api.models import Dandiset @@ -39,6 +41,7 @@ } ) _ASSET_OPS = frozenset({'species', 'approach', 'technique', 'standard', 'file_type'}) +_OWNER_OPS = frozenset({'owner'}) def _annotate_latest_version_modified(queryset): @@ -104,6 +107,32 @@ def _apply_asset_filter(queryset, operator: str, value: str): raise ValueError(f'unknown asset operator: {operator}') # pragma: no cover +def _apply_owner_filter( + queryset: QuerySet[Dandiset], value: str, *, request_user: User | AnonymousUser +) -> QuerySet[Dandiset]: + """Filter dandisets to those owned by the given username/email. + + `owner:me` resolves to the requesting user; otherwise we look up the User + by exact (case-insensitive) username or email. Unknown user → empty result + (not an error — searching for a nonexistent owner is a valid 0-hit query). + `with_superuser=False` so `owner:some_admin` returns only what they + actually own, not the entire archive. + """ + if value.lower() == 'me': + if request_user.is_anonymous: + raise SearchSyntaxError( + 'owner:me requires authentication. Sign in or specify a username.' + ) + owner_pks = get_owned_dandisets(request_user, include_superusers=False).values('pk') + return queryset.filter(pk__in=owner_pks) + + matched_user = User.objects.filter(Q(username__iexact=value) | Q(email__iexact=value)).first() + if matched_user is None: + return queryset.none() + owner_pks = get_owned_dandisets(matched_user, include_superusers=False).values('pk') + return queryset.filter(pk__in=owner_pks) + + _MODIFIED_ALIAS = '_search_latest_version_modified' _PUBLISHED_ALIAS = '_search_latest_published_created' @@ -174,6 +203,8 @@ def apply_search_filters( if asset_qs is None: asset_qs = AssetSearch.objects.visible_to(user) asset_qs = _apply_asset_filter(asset_qs, key, value) + elif key in _OWNER_OPS: + queryset = _apply_owner_filter(queryset, value, request_user=user) if asset_qs is not None: # NOTE perf: jsonb_path_exists with a runtime-built jsonpath cannot diff --git a/dandiapi/api/services/search/parser.py b/dandiapi/api/services/search/parser.py index 817642341..37a012788 100644 --- a/dandiapi/api/services/search/parser.py +++ b/dandiapi/api/services/search/parser.py @@ -31,6 +31,7 @@ 'technique', 'standard', 'file_type', + 'owner', } ) diff --git a/dandiapi/api/tests/test_dandiset.py b/dandiapi/api/tests/test_dandiset.py index 712399fc3..be50f0eb1 100644 --- a/dandiapi/api/tests/test_dandiset.py +++ b/dandiapi/api/tests/test_dandiset.py @@ -2086,3 +2086,114 @@ def test_advanced_search_species_respects_embargo_visibility(api_client): # Anonymous request: embargoed must be filtered out. assert _search_ids(api_client, 'species:mouse') == {open_ds.identifier} + + +# --- owner: operator ----------------------------------------------------------------------------- + + +@pytest.mark.ai_generated +@pytest.mark.django_db +def test_advanced_search_owner_by_username_returns_owned_dandisets(api_client): + alice = UserFactory.create(username='alice', email='alice@example.com') + bob = UserFactory.create(username='bob', email='bob@example.com') + alice_ds = DandisetFactory.create(owners=[alice]) + bob_ds = DandisetFactory.create(owners=[bob]) + DraftVersionFactory.create(dandiset=alice_ds) + DraftVersionFactory.create(dandiset=bob_ds) + + assert _search_ids(api_client, 'owner:alice') == {alice_ds.identifier} + assert _search_ids(api_client, 'owner:bob') == {bob_ds.identifier} + + +@pytest.mark.ai_generated +@pytest.mark.django_db +def test_advanced_search_owner_by_email_matches(api_client): + alice = UserFactory.create(username='alice', email='alice@example.com') + alice_ds = DandisetFactory.create(owners=[alice]) + DraftVersionFactory.create(dandiset=alice_ds) + + assert _search_ids(api_client, 'owner:alice@example.com') == {alice_ds.identifier} + + +@pytest.mark.ai_generated +@pytest.mark.django_db +def test_advanced_search_owner_lookup_is_case_insensitive(api_client): + alice = UserFactory.create(username='Alice', email='Alice@Example.com') + alice_ds = DandisetFactory.create(owners=[alice]) + DraftVersionFactory.create(dandiset=alice_ds) + + assert _search_ids(api_client, 'owner:alice') == {alice_ds.identifier} + assert _search_ids(api_client, 'owner:ALICE@example.COM') == {alice_ds.identifier} + + +@pytest.mark.ai_generated +@pytest.mark.django_db +def test_advanced_search_owner_unknown_user_returns_zero(api_client): + DraftVersionFactory.create(dandiset=DandisetFactory.create()) + # No SearchSyntaxError — a search for a nonexistent owner is a valid + # zero-hit query, not a malformed query. + assert _search_ids(api_client, 'owner:no_such_user_anywhere') == set() + + +@pytest.mark.ai_generated +@pytest.mark.django_db +def test_advanced_search_owner_me_resolves_to_authenticated_user(api_client): + alice = UserFactory.create() + bob = UserFactory.create() + alice_ds = DandisetFactory.create(owners=[alice]) + bob_ds = DandisetFactory.create(owners=[bob]) + DraftVersionFactory.create(dandiset=alice_ds) + DraftVersionFactory.create(dandiset=bob_ds) + + api_client.force_authenticate(user=alice) + assert _search_ids(api_client, 'owner:me') == {alice_ds.identifier} + + +@pytest.mark.ai_generated +@pytest.mark.django_db +def test_advanced_search_owner_me_anonymous_returns_400(api_client): + response = api_client.get( + '/api/dandisets/', + {'draft': 'true', 'empty': 'true', 'search': 'owner:me'}, + ) + assert response.status_code == 400 + assert 'requires authentication' in response.json()['search'] + + +@pytest.mark.ai_generated +@pytest.mark.django_db +def test_advanced_search_owner_does_not_inflate_to_superuser_archive(api_client): + # Guardian's get_objects_for_user(with_superuser=True) returns ALL objects + # for superusers — wrong semantics for owner: searches. We pass + # with_superuser=False so `owner:admin` returns only what admin + # explicitly owns, not the entire archive. + admin = UserFactory.create(username='admin', is_superuser=True) + other = UserFactory.create() + DraftVersionFactory.create(dandiset=DandisetFactory.create(owners=[other])) + admin_owned = DandisetFactory.create(owners=[admin]) + DraftVersionFactory.create(dandiset=admin_owned) + + assert _search_ids(api_client, 'owner:admin') == {admin_owned.identifier} + + +@pytest.mark.ai_generated +@pytest.mark.django_db +def test_advanced_search_owner_combines_with_other_operators(api_client): + alice = UserFactory.create(username='alice') + bob = UserFactory.create(username='bob') + alice_old = DandisetFactory.create(owners=[alice]) + alice_new = DandisetFactory.create(owners=[alice]) + bob_new = DandisetFactory.create(owners=[bob]) + for ds in (alice_old, alice_new, bob_new): + DraftVersionFactory.create(dandiset=ds) + + cutoff = timezone.now() - datetime.timedelta(days=1) + Dandiset.objects.filter(pk=alice_old.pk).update( + created=cutoff - datetime.timedelta(days=30) + ) + + after_str = (cutoff + datetime.timedelta(seconds=1)).date().isoformat() + # Only alice_new satisfies BOTH owner:alice AND created_after. + assert _search_ids(api_client, f'owner:alice created_after:{after_str}') == { + alice_new.identifier + } diff --git a/dandiapi/api/tests/test_search_parser.py b/dandiapi/api/tests/test_search_parser.py index 2ab5b42f8..fa99496a5 100644 --- a/dandiapi/api/tests/test_search_parser.py +++ b/dandiapi/api/tests/test_search_parser.py @@ -45,6 +45,10 @@ # Quoted token that *looks* like an operator is treated as free text — # this is the documented escape hatch for searching for a literal colon. ('"foo:bar" hippocampus', ['foo:bar', 'hippocampus'], []), + # Owner operator + ('owner:jdoe', [], [('owner', 'jdoe')]), + # Owner with email value (the parser doesn't validate the value shape) + ('owner:user@example.com', [], [('owner', 'user@example.com')]), ], ids=[ 'empty', @@ -57,6 +61,8 @@ 'repeated-operator-key', 'special-chars-in-quoted-value', 'quoted-operator-like-token-is-free-text', + 'owner-username', + 'owner-email', ], ) def test_parse_search(query, expected_free_text, expected_operators): diff --git a/dandiapi/api/views/serializers.py b/dandiapi/api/views/serializers.py index 0d64477c8..a80c1cc80 100644 --- a/dandiapi/api/views/serializers.py +++ b/dandiapi/api/views/serializers.py @@ -311,7 +311,9 @@ class DandisetQueryParameterSerializer(serializers.Serializer): 'published_before, published_after (all take YYYY-MM-DD); ' 'species, approach, technique, standard (case-insensitive ' 'substring against the corresponding asset_metadata array); ' - 'file_type (nwb, image, text, video — or any MIME prefix). ' + 'file_type (nwb, image, text, video — or any MIME prefix); ' + 'owner (exact username or email; "owner:me" resolves to the ' + 'requesting user). ' 'Invalid syntax returns HTTP 400 with the offending token; ' 'unknown operators get a "Did you mean?" suggestion.' ), diff --git a/web/src/components/DandisetSearchField.vue b/web/src/components/DandisetSearchField.vue index 5b3b5a6db..557d47644 100644 --- a/web/src/components/DandisetSearchField.vue +++ b/web/src/components/DandisetSearchField.vue @@ -95,6 +95,7 @@ const operatorHelp = [ { example: 'technique:"patch clamp"', description: 'Has assets using a measurement technique' }, { example: 'standard:nwb', description: 'Has assets in a data standard' }, { example: 'file_type:nwb', description: 'Has assets of a file type (nwb, image, text, video)' }, + { example: 'owner:jdoe', description: 'Owned by a user (username/email; or "owner:me")' }, ]; function updateSearch(search: string) { From fd0567b583bc581ce051434f9b078adf676b7ce1 Mon Sep 17 00:00:00 2001 From: Ben Dichter Date: Wed, 6 May 2026 20:13:16 -0400 Subject: [PATCH 02/19] owner: also match by display name (first/last/full) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Real users encounter the dandiset list with owners shown by display name (e.g. "Super User"), not by username. Searching that string was returning 0 because the lookup only matched username/email. Now matches case-insensitively against username, email, first_name, last_name, OR "first_name last_name" — so owner:"Super User" works the same as owner:ben.dichter@gmail.com. Multiple users may match (e.g. shared last name); we union dandisets owned by any of them via a direct DandisetUserObjectPermission query. Updated OpenAPI help text and the frontend popover example to `owner:"Jane Doe"` so users discover the new shape. --- dandiapi/api/services/search/filters.py | 48 ++++++++++++++++------ dandiapi/api/tests/test_dandiset.py | 38 +++++++++++++++++ dandiapi/api/views/serializers.py | 3 +- web/src/components/DandisetSearchField.vue | 2 +- 4 files changed, 76 insertions(+), 15 deletions(-) diff --git a/dandiapi/api/services/search/filters.py b/dandiapi/api/services/search/filters.py index 1b5af3534..493e5b289 100644 --- a/dandiapi/api/services/search/filters.py +++ b/dandiapi/api/services/search/filters.py @@ -7,9 +7,11 @@ from typing import TYPE_CHECKING from django.contrib.auth.models import User -from django.db.models import OuterRef, Q, Subquery +from django.db.models import OuterRef, Q, Subquery, Value +from django.db.models.functions import Concat from dandiapi.api.models import Version +from dandiapi.api.models.dandiset import DandisetUserObjectPermission from dandiapi.api.services.permissions.dandiset import get_owned_dandisets from dandiapi.api.services.search.parser import SearchSyntaxError from dandiapi.search.models import AssetSearch @@ -110,13 +112,23 @@ def _apply_asset_filter(queryset, operator: str, value: str): def _apply_owner_filter( queryset: QuerySet[Dandiset], value: str, *, request_user: User | AnonymousUser ) -> QuerySet[Dandiset]: - """Filter dandisets to those owned by the given username/email. - - `owner:me` resolves to the requesting user; otherwise we look up the User - by exact (case-insensitive) username or email. Unknown user → empty result - (not an error — searching for a nonexistent owner is a valid 0-hit query). - `with_superuser=False` so `owner:some_admin` returns only what they - actually own, not the entire archive. + """Filter dandisets to those owned by the given user identifier. + + `owner:me` resolves to the requesting user; otherwise we match `value` + case-insensitively against: + - User.username + - User.email + - User.first_name + - User.last_name + - "first_name last_name" (so the display name shown in the UI works) + + Multiple users may match (common when only a first or last name is given); + we union dandisets owned by any of them. Unknown user → empty result (not + an error — a search for a nonexistent owner is a valid 0-hit query). + + Direct query against `DandisetUserObjectPermission` rather than guardian's + `get_objects_for_user` so we can intersect across multiple matched users + in a single query, and to bypass the superuser-sees-everything default. """ if value.lower() == 'me': if request_user.is_anonymous: @@ -126,11 +138,21 @@ def _apply_owner_filter( owner_pks = get_owned_dandisets(request_user, include_superusers=False).values('pk') return queryset.filter(pk__in=owner_pks) - matched_user = User.objects.filter(Q(username__iexact=value) | Q(email__iexact=value)).first() - if matched_user is None: - return queryset.none() - owner_pks = get_owned_dandisets(matched_user, include_superusers=False).values('pk') - return queryset.filter(pk__in=owner_pks) + matched_user_pks = ( + User.objects.annotate(_full_name=Concat('first_name', Value(' '), 'last_name')) + .filter( + Q(username__iexact=value) + | Q(email__iexact=value) + | Q(first_name__iexact=value) + | Q(last_name__iexact=value) + | Q(_full_name__iexact=value) + ) + .values_list('pk', flat=True) + ) + owned_pks = DandisetUserObjectPermission.objects.filter( + user__in=matched_user_pks, permission__codename='owner' + ).values('content_object') + return queryset.filter(pk__in=owned_pks) _MODIFIED_ALIAS = '_search_latest_version_modified' diff --git a/dandiapi/api/tests/test_dandiset.py b/dandiapi/api/tests/test_dandiset.py index be50f0eb1..6253a3a30 100644 --- a/dandiapi/api/tests/test_dandiset.py +++ b/dandiapi/api/tests/test_dandiset.py @@ -2197,3 +2197,41 @@ def test_advanced_search_owner_combines_with_other_operators(api_client): assert _search_ids(api_client, f'owner:alice created_after:{after_str}') == { alice_new.identifier } + + +@pytest.mark.ai_generated +@pytest.mark.django_db +def test_advanced_search_owner_by_full_name_matches(api_client): + # The dandiset list shows owners by display name (first + ' ' + last). + # `owner:"Super User"` should match a user with that full name even + # though their username is an email address. + user = UserFactory.create( + username='ben.dichter@gmail.com', + email='ben.dichter@gmail.com', + first_name='Super', + last_name='User', + ) + user_ds = DandisetFactory.create(owners=[user]) + DraftVersionFactory.create(dandiset=user_ds) + + assert _search_ids(api_client, 'owner:"Super User"') == {user_ds.identifier} + # First-name-only and last-name-only also work. + assert _search_ids(api_client, 'owner:Super') == {user_ds.identifier} + assert _search_ids(api_client, 'owner:User') == {user_ds.identifier} + + +@pytest.mark.ai_generated +@pytest.mark.django_db +def test_advanced_search_owner_unions_multiple_matched_users(api_client): + # Two distinct users share a last name. `owner:Smith` should return + # dandisets owned by either of them. + alice = UserFactory.create(username='alice', last_name='Smith') + bob = UserFactory.create(username='bob', last_name='Smith') + eve = UserFactory.create(username='eve', last_name='Jones') + alice_ds = DandisetFactory.create(owners=[alice]) + bob_ds = DandisetFactory.create(owners=[bob]) + DandisetFactory.create(owners=[eve]) + for ds in Dandiset.objects.all(): + DraftVersionFactory.create(dandiset=ds) + + assert _search_ids(api_client, 'owner:Smith') == {alice_ds.identifier, bob_ds.identifier} diff --git a/dandiapi/api/views/serializers.py b/dandiapi/api/views/serializers.py index a80c1cc80..45e85bf6a 100644 --- a/dandiapi/api/views/serializers.py +++ b/dandiapi/api/views/serializers.py @@ -312,7 +312,8 @@ class DandisetQueryParameterSerializer(serializers.Serializer): 'species, approach, technique, standard (case-insensitive ' 'substring against the corresponding asset_metadata array); ' 'file_type (nwb, image, text, video — or any MIME prefix); ' - 'owner (exact username or email; "owner:me" resolves to the ' + 'owner (case-insensitive match against username, email, first ' + 'name, last name, or "first last"; "owner:me" resolves to the ' 'requesting user). ' 'Invalid syntax returns HTTP 400 with the offending token; ' 'unknown operators get a "Did you mean?" suggestion.' diff --git a/web/src/components/DandisetSearchField.vue b/web/src/components/DandisetSearchField.vue index 557d47644..fc1bc8625 100644 --- a/web/src/components/DandisetSearchField.vue +++ b/web/src/components/DandisetSearchField.vue @@ -95,7 +95,7 @@ const operatorHelp = [ { example: 'technique:"patch clamp"', description: 'Has assets using a measurement technique' }, { example: 'standard:nwb', description: 'Has assets in a data standard' }, { example: 'file_type:nwb', description: 'Has assets of a file type (nwb, image, text, video)' }, - { example: 'owner:jdoe', description: 'Owned by a user (username/email; or "owner:me")' }, + { example: 'owner:"Jane Doe"', description: 'Owned by a user (name, username, or email; or "owner:me")' }, ]; function updateSearch(search: string) { From 74fcc5b8480350bcc0427af609cbb8acf11261e4 Mon Sep 17 00:00:00 2001 From: Ben Dichter Date: Fri, 8 May 2026 08:59:38 -0400 Subject: [PATCH 03/19] Apply ruff format to test_dandiset.py --- dandiapi/api/tests/test_dandiset.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/dandiapi/api/tests/test_dandiset.py b/dandiapi/api/tests/test_dandiset.py index 6253a3a30..67618faf7 100644 --- a/dandiapi/api/tests/test_dandiset.py +++ b/dandiapi/api/tests/test_dandiset.py @@ -2188,9 +2188,7 @@ def test_advanced_search_owner_combines_with_other_operators(api_client): DraftVersionFactory.create(dandiset=ds) cutoff = timezone.now() - datetime.timedelta(days=1) - Dandiset.objects.filter(pk=alice_old.pk).update( - created=cutoff - datetime.timedelta(days=30) - ) + Dandiset.objects.filter(pk=alice_old.pk).update(created=cutoff - datetime.timedelta(days=30)) after_str = (cutoff + datetime.timedelta(seconds=1)).date().isoformat() # Only alice_new satisfies BOTH owner:alice AND created_after. From a55ec5643ee1e844c84f21ed3d70aaf0f3e705e4 Mon Sep 17 00:00:00 2001 From: Ben Dichter Date: Fri, 8 May 2026 12:12:18 -0400 Subject: [PATCH 04/19] owner: keep owner:me magic; add quoted-form escape; consolidate tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round-2 review feedback on #2821: - @yarikoptic flagged that owner:me silently shadows a real user named "Me". Fix: distinguish quoted vs unquoted at the parser level. Unquoted owner:me → magic alias for the requesting user. Quoted owner:"me" → literal lookup (matches a user whose first/last name is "Me"). Same pattern lets owner:"Me Someoneyou" reach the literal full-name match while keeping the convenient owner:me shortcut. Implementation: ParsedSearch.operators is now a list of `Operator` dataclasses (key, value, quoted) instead of bare tuples. Filters consume the new shape and the owner filter switches on the quoted flag. - Replaced personal email (ben.dichter@gmail.com) in the full-name test fixture with a generic example user. - Consolidated 10 small owner-tests into 3 denser ones that share setup per @yarikoptic's "make each test matter more" feedback. Coverage is unchanged (every documented lookup path is asserted; cross-key AND with another operator; multi-user union via shared last name; unknown user → 0; superuser non-inflation; owner:me magic; owner:"me" literal-escape; anonymous owner:me → 400). DB setup runs ~3x instead of ~10x. Updated OpenAPI help text and the search popover to mention the owner:me alias and the quoted-escape. --- dandiapi/api/services/search/filters.py | 36 +++-- dandiapi/api/services/search/parser.py | 20 ++- dandiapi/api/tests/test_dandiset.py | 174 +++++++++------------ dandiapi/api/tests/test_search_parser.py | 37 +++-- dandiapi/api/views/serializers.py | 5 +- web/src/components/DandisetSearchField.vue | 2 +- 6 files changed, 141 insertions(+), 133 deletions(-) diff --git a/dandiapi/api/services/search/filters.py b/dandiapi/api/services/search/filters.py index 493e5b289..ec01a1c9f 100644 --- a/dandiapi/api/services/search/filters.py +++ b/dandiapi/api/services/search/filters.py @@ -110,30 +110,35 @@ def _apply_asset_filter(queryset, operator: str, value: str): def _apply_owner_filter( - queryset: QuerySet[Dandiset], value: str, *, request_user: User | AnonymousUser + queryset: QuerySet[Dandiset], + value: str, + *, + quoted: bool, + request_user: User | AnonymousUser, ) -> QuerySet[Dandiset]: """Filter dandisets to those owned by the given user identifier. - `owner:me` resolves to the requesting user; otherwise we match `value` - case-insensitively against: - - User.username - - User.email - - User.first_name - - User.last_name - - "first_name last_name" (so the display name shown in the UI works) + The unquoted token `owner:me` resolves to the requesting user. To search + for a literal user named "Me" instead, quote the value: `owner:"me"`. + The quoted form bypasses the magic alias and goes straight to the + case-insensitive lookup. + Lookups (for any non-magic value) match case-insensitively against + `User.username`, `User.email`, `User.first_name`, `User.last_name`, or + `"first_name last_name"` (so the display name shown in the UI works). Multiple users may match (common when only a first or last name is given); we union dandisets owned by any of them. Unknown user → empty result (not an error — a search for a nonexistent owner is a valid 0-hit query). Direct query against `DandisetUserObjectPermission` rather than guardian's - `get_objects_for_user` so we can intersect across multiple matched users - in a single query, and to bypass the superuser-sees-everything default. + `get_objects_for_user` so we can union across multiple matched users in a + single query, and to bypass the superuser-sees-everything default. """ - if value.lower() == 'me': + if not quoted and value.lower() == 'me': if request_user.is_anonymous: raise SearchSyntaxError( - 'owner:me requires authentication. Sign in or specify a username.' + 'owner:me requires authentication. Sign in, or use owner:"me" ' + 'to search for a literal user named Me.' ) owner_pks = get_owned_dandisets(request_user, include_superusers=False).values('pk') return queryset.filter(pk__in=owner_pks) @@ -208,8 +213,9 @@ def apply_search_filters( asset_qs = None annotated: set[str] = set() - for key, raw_value in parsed.operators: - value = raw_value.strip() + for op in parsed.operators: + key = op.key + value = op.value.strip() if not value: raise SearchSyntaxError(f'Operator "{key}" requires a value (e.g. {key}:something).') @@ -226,7 +232,7 @@ def apply_search_filters( asset_qs = AssetSearch.objects.visible_to(user) asset_qs = _apply_asset_filter(asset_qs, key, value) elif key in _OWNER_OPS: - queryset = _apply_owner_filter(queryset, value, request_user=user) + queryset = _apply_owner_filter(queryset, value, quoted=op.quoted, request_user=user) if asset_qs is not None: # NOTE perf: jsonb_path_exists with a runtime-built jsonpath cannot diff --git a/dandiapi/api/services/search/parser.py b/dandiapi/api/services/search/parser.py index 37a012788..d20367284 100644 --- a/dandiapi/api/services/search/parser.py +++ b/dandiapi/api/services/search/parser.py @@ -61,10 +61,24 @@ class SearchSyntaxError(ValueError): """Raised when a search query can't be parsed.""" +@dataclass +class Operator: + """One parsed `key:value` operator. + + `quoted` records whether the value came from a quoted form (`key:"value"`). + Most operators ignore this, but it lets `owner:` distinguish the magic + `owner:me` (current user) from `owner:"me"` (literal user named "Me"). + """ + + key: str + value: str + quoted: bool + + @dataclass class ParsedSearch: free_text: list[str] = field(default_factory=list) - operators: list[tuple[str, str]] = field(default_factory=list) + operators: list[Operator] = field(default_factory=list) def _check_balanced_quotes(query: str) -> None: @@ -100,7 +114,7 @@ def parse_search(query: str) -> ParsedSearch: for match in _TOKEN_RE.finditer(query): if (key := match.group('op_key')) is not None: _validate_operator_key(key) - parsed.operators.append((key, match.group('op_qval'))) + parsed.operators.append(Operator(key, match.group('op_qval'), quoted=True)) elif (free := match.group('free_quoted')) is not None: parsed.free_text.append(free) else: @@ -108,7 +122,7 @@ def parse_search(query: str) -> ParsedSearch: if op_match := _BARE_OP_RE.match(bare): key = op_match.group(1) _validate_operator_key(key) - parsed.operators.append((key, op_match.group(2))) + parsed.operators.append(Operator(key, op_match.group(2), quoted=False)) else: parsed.free_text.append(bare) return parsed diff --git a/dandiapi/api/tests/test_dandiset.py b/dandiapi/api/tests/test_dandiset.py index 67618faf7..0ebb4b920 100644 --- a/dandiapi/api/tests/test_dandiset.py +++ b/dandiapi/api/tests/test_dandiset.py @@ -2093,65 +2093,83 @@ def test_advanced_search_species_respects_embargo_visibility(api_client): @pytest.mark.ai_generated @pytest.mark.django_db -def test_advanced_search_owner_by_username_returns_owned_dandisets(api_client): - alice = UserFactory.create(username='alice', email='alice@example.com') - bob = UserFactory.create(username='bob', email='bob@example.com') - alice_ds = DandisetFactory.create(owners=[alice]) - bob_ds = DandisetFactory.create(owners=[bob]) - DraftVersionFactory.create(dandiset=alice_ds) - DraftVersionFactory.create(dandiset=bob_ds) - - assert _search_ids(api_client, 'owner:alice') == {alice_ds.identifier} - assert _search_ids(api_client, 'owner:bob') == {bob_ds.identifier} +def test_advanced_search_owner_lookup_paths_and_combinations(api_client): + """One setup, many assertions for the owner: operator. + Resolves users by every documented lookup path, unions across multiple + matched users, returns 0 for unknown values, is case-insensitive, and + combines correctly with other operators (cross-key AND on the same + dandiset). + """ + # Three users with overlapping last names so we can exercise every lookup + # path AND the multi-user union in a single setup. + alice = UserFactory.create( + username='Alice', email='Alice@Example.com', first_name='Alice', last_name='Smith' + ) + bob = UserFactory.create( + username='bob', email='bob@example.com', first_name='Bob', last_name='Smith' + ) + carol = UserFactory.create( + username='carol', email='carol@example.com', first_name='Carol', last_name='Jones' + ) + alice_old = DandisetFactory.create(owners=[alice]) + alice_new = DandisetFactory.create(owners=[alice]) + bob_ds = DandisetFactory.create(owners=[bob]) + carol_ds = DandisetFactory.create(owners=[carol]) + for ds in (alice_old, alice_new, bob_ds, carol_ds): + DraftVersionFactory.create(dandiset=ds) -@pytest.mark.ai_generated -@pytest.mark.django_db -def test_advanced_search_owner_by_email_matches(api_client): - alice = UserFactory.create(username='alice', email='alice@example.com') - alice_ds = DandisetFactory.create(owners=[alice]) - DraftVersionFactory.create(dandiset=alice_ds) + # Backdate alice_old so we can intersect with a date operator below. + cutoff = timezone.now() - datetime.timedelta(days=1) + Dandiset.objects.filter(pk=alice_old.pk).update(created=cutoff - datetime.timedelta(days=30)) + after_str = (cutoff + datetime.timedelta(seconds=1)).date().isoformat() - assert _search_ids(api_client, 'owner:alice@example.com') == {alice_ds.identifier} + alice_dsets = {alice_old.identifier, alice_new.identifier} + # username (case-insensitive) + assert _search_ids(api_client, 'owner:alice') == alice_dsets + assert _search_ids(api_client, 'owner:ALICE') == alice_dsets -@pytest.mark.ai_generated -@pytest.mark.django_db -def test_advanced_search_owner_lookup_is_case_insensitive(api_client): - alice = UserFactory.create(username='Alice', email='Alice@Example.com') - alice_ds = DandisetFactory.create(owners=[alice]) - DraftVersionFactory.create(dandiset=alice_ds) + # email (case-insensitive) + assert _search_ids(api_client, 'owner:alice@example.com') == alice_dsets + assert _search_ids(api_client, 'owner:ALICE@Example.com') == alice_dsets - assert _search_ids(api_client, 'owner:alice') == {alice_ds.identifier} - assert _search_ids(api_client, 'owner:ALICE@example.COM') == {alice_ds.identifier} + # first / last / full name + assert _search_ids(api_client, 'owner:Bob') == {bob_ds.identifier} + assert _search_ids(api_client, 'owner:Jones') == {carol_ds.identifier} + assert _search_ids(api_client, 'owner:"Carol Jones"') == {carol_ds.identifier} + # union: shared last name returns dandisets from both users + assert _search_ids(api_client, 'owner:Smith') == alice_dsets | {bob_ds.identifier} -@pytest.mark.ai_generated -@pytest.mark.django_db -def test_advanced_search_owner_unknown_user_returns_zero(api_client): - DraftVersionFactory.create(dandiset=DandisetFactory.create()) - # No SearchSyntaxError — a search for a nonexistent owner is a valid - # zero-hit query, not a malformed query. + # unknown user → 0 results, not 400 (a valid 0-hit query) assert _search_ids(api_client, 'owner:no_such_user_anywhere') == set() + # combines with other operators: cross-key AND on the same dandiset. + # Only alice_new satisfies BOTH owner:alice AND created_after. + assert _search_ids(api_client, f'owner:alice created_after:{after_str}') == { + alice_new.identifier + } + @pytest.mark.ai_generated @pytest.mark.django_db -def test_advanced_search_owner_me_resolves_to_authenticated_user(api_client): - alice = UserFactory.create() - bob = UserFactory.create() +def test_advanced_search_owner_me_magic_and_literal_escape(api_client): + """`owner:me` (unquoted) is the magic alias for "the current user". + + Anonymous → 400. To search for a real user named "Me" instead, quote + the value (`owner:"me"`) — the quoted form opts out of the magic and + falls back to the case-insensitive lookup against username / email / + first / last / full name. + """ + alice = UserFactory.create(username='alice') + me_user = UserFactory.create(username='me_actual_user', first_name='Me', last_name='Someoneyou') alice_ds = DandisetFactory.create(owners=[alice]) - bob_ds = DandisetFactory.create(owners=[bob]) + me_ds = DandisetFactory.create(owners=[me_user]) DraftVersionFactory.create(dandiset=alice_ds) - DraftVersionFactory.create(dandiset=bob_ds) - - api_client.force_authenticate(user=alice) - assert _search_ids(api_client, 'owner:me') == {alice_ds.identifier} + DraftVersionFactory.create(dandiset=me_ds) - -@pytest.mark.ai_generated -@pytest.mark.django_db -def test_advanced_search_owner_me_anonymous_returns_400(api_client): + # Anonymous + `owner:me` → 400 with explicit message response = api_client.get( '/api/dandisets/', {'draft': 'true', 'empty': 'true', 'search': 'owner:me'}, @@ -2159,6 +2177,17 @@ def test_advanced_search_owner_me_anonymous_returns_400(api_client): assert response.status_code == 400 assert 'requires authentication' in response.json()['search'] + # Authenticated + unquoted `owner:me` → only alice's dandisets + # (does NOT match the literal user named "Me"). + api_client.force_authenticate(user=alice) + assert _search_ids(api_client, 'owner:me') == {alice_ds.identifier} + + # Authenticated + quoted `owner:"me"` → escapes the magic and matches + # the literal user "Me" by first_name (NOT alice). + assert _search_ids(api_client, 'owner:"me"') == {me_ds.identifier} + # Full display name lookup also works for that user. + assert _search_ids(api_client, 'owner:"Me Someoneyou"') == {me_ds.identifier} + @pytest.mark.ai_generated @pytest.mark.django_db @@ -2174,62 +2203,3 @@ def test_advanced_search_owner_does_not_inflate_to_superuser_archive(api_client) DraftVersionFactory.create(dandiset=admin_owned) assert _search_ids(api_client, 'owner:admin') == {admin_owned.identifier} - - -@pytest.mark.ai_generated -@pytest.mark.django_db -def test_advanced_search_owner_combines_with_other_operators(api_client): - alice = UserFactory.create(username='alice') - bob = UserFactory.create(username='bob') - alice_old = DandisetFactory.create(owners=[alice]) - alice_new = DandisetFactory.create(owners=[alice]) - bob_new = DandisetFactory.create(owners=[bob]) - for ds in (alice_old, alice_new, bob_new): - DraftVersionFactory.create(dandiset=ds) - - cutoff = timezone.now() - datetime.timedelta(days=1) - Dandiset.objects.filter(pk=alice_old.pk).update(created=cutoff - datetime.timedelta(days=30)) - - after_str = (cutoff + datetime.timedelta(seconds=1)).date().isoformat() - # Only alice_new satisfies BOTH owner:alice AND created_after. - assert _search_ids(api_client, f'owner:alice created_after:{after_str}') == { - alice_new.identifier - } - - -@pytest.mark.ai_generated -@pytest.mark.django_db -def test_advanced_search_owner_by_full_name_matches(api_client): - # The dandiset list shows owners by display name (first + ' ' + last). - # `owner:"Super User"` should match a user with that full name even - # though their username is an email address. - user = UserFactory.create( - username='ben.dichter@gmail.com', - email='ben.dichter@gmail.com', - first_name='Super', - last_name='User', - ) - user_ds = DandisetFactory.create(owners=[user]) - DraftVersionFactory.create(dandiset=user_ds) - - assert _search_ids(api_client, 'owner:"Super User"') == {user_ds.identifier} - # First-name-only and last-name-only also work. - assert _search_ids(api_client, 'owner:Super') == {user_ds.identifier} - assert _search_ids(api_client, 'owner:User') == {user_ds.identifier} - - -@pytest.mark.ai_generated -@pytest.mark.django_db -def test_advanced_search_owner_unions_multiple_matched_users(api_client): - # Two distinct users share a last name. `owner:Smith` should return - # dandisets owned by either of them. - alice = UserFactory.create(username='alice', last_name='Smith') - bob = UserFactory.create(username='bob', last_name='Smith') - eve = UserFactory.create(username='eve', last_name='Jones') - alice_ds = DandisetFactory.create(owners=[alice]) - bob_ds = DandisetFactory.create(owners=[bob]) - DandisetFactory.create(owners=[eve]) - for ds in Dandiset.objects.all(): - DraftVersionFactory.create(dandiset=ds) - - assert _search_ids(api_client, 'owner:Smith') == {alice_ds.identifier, bob_ds.identifier} diff --git a/dandiapi/api/tests/test_search_parser.py b/dandiapi/api/tests/test_search_parser.py index fa99496a5..4a72fb36c 100644 --- a/dandiapi/api/tests/test_search_parser.py +++ b/dandiapi/api/tests/test_search_parser.py @@ -3,6 +3,7 @@ import pytest from dandiapi.api.services.search.parser import ( + Operator, SearchSyntaxError, parse_search, ) @@ -10,6 +11,17 @@ pytestmark = pytest.mark.ai_generated +# Convenience aliases so the parametrize table stays readable. +def _u(key: str, value: str) -> Operator: + """Unquoted operator (e.g. parsed from `key:value`).""" + return Operator(key, value, quoted=False) + + +def _q(key: str, value: str) -> Operator: + """Quoted operator (e.g. parsed from `key:"value"`).""" + return Operator(key, value, quoted=True) + + @pytest.mark.parametrize( ('query', 'expected_free_text', 'expected_operators'), [ @@ -22,33 +34,36 @@ ( 'species:mouse created_after:2024-01-01', [], - [('species', 'mouse'), ('created_after', '2024-01-01')], + [_u('species', 'mouse'), _u('created_after', '2024-01-01')], ), # Mixed ( 'place cells species:mouse created_after:2024-01-01 ca1', ['place', 'cells', 'ca1'], - [('species', 'mouse'), ('created_after', '2024-01-01')], + [_u('species', 'mouse'), _u('created_after', '2024-01-01')], ), # Quoted phrase as free text ('"place cells" hippocampus', ['place cells', 'hippocampus'], []), - # Quoted operator value (multi-word) - ('technique:"patch clamp"', [], [('technique', 'patch clamp')]), + # Quoted operator value (multi-word) — `quoted=True` + ('technique:"patch clamp"', [], [_q('technique', 'patch clamp')]), # Repeated operator keeps every entry (AND'd downstream) ( 'species:mouse species:rat', [], - [('species', 'mouse'), ('species', 'rat')], + [_u('species', 'mouse'), _u('species', 'rat')], ), # Special characters preserved inside quoted operator value - ('species:"C57BL/6"', [], [('species', 'C57BL/6')]), + ('species:"C57BL/6"', [], [_q('species', 'C57BL/6')]), # Quoted token that *looks* like an operator is treated as free text — - # this is the documented escape hatch for searching for a literal colon. + # documented escape hatch for searching for a literal colon. ('"foo:bar" hippocampus', ['foo:bar', 'hippocampus'], []), - # Owner operator - ('owner:jdoe', [], [('owner', 'jdoe')]), + # Owner operator (unquoted vs quoted distinguished — used by the + # `owner:me` magic alias which `owner:"me"` opts out of). + ('owner:jdoe', [], [_u('owner', 'jdoe')]), + ('owner:me', [], [_u('owner', 'me')]), + ('owner:"me"', [], [_q('owner', 'me')]), # Owner with email value (the parser doesn't validate the value shape) - ('owner:user@example.com', [], [('owner', 'user@example.com')]), + ('owner:user@example.com', [], [_u('owner', 'user@example.com')]), ], ids=[ 'empty', @@ -62,6 +77,8 @@ 'special-chars-in-quoted-value', 'quoted-operator-like-token-is-free-text', 'owner-username', + 'owner-me-unquoted', + 'owner-me-quoted', 'owner-email', ], ) diff --git a/dandiapi/api/views/serializers.py b/dandiapi/api/views/serializers.py index 45e85bf6a..44f1d3aad 100644 --- a/dandiapi/api/views/serializers.py +++ b/dandiapi/api/views/serializers.py @@ -313,8 +313,9 @@ class DandisetQueryParameterSerializer(serializers.Serializer): 'substring against the corresponding asset_metadata array); ' 'file_type (nwb, image, text, video — or any MIME prefix); ' 'owner (case-insensitive match against username, email, first ' - 'name, last name, or "first last"; "owner:me" resolves to the ' - 'requesting user). ' + 'name, last name, or "first last"; the magic value "owner:me" ' + 'resolves to the requesting user — quote it as "owner:\\"me\\"" ' + 'to match a literal user named Me instead). ' 'Invalid syntax returns HTTP 400 with the offending token; ' 'unknown operators get a "Did you mean?" suggestion.' ), diff --git a/web/src/components/DandisetSearchField.vue b/web/src/components/DandisetSearchField.vue index fc1bc8625..e136c5fb2 100644 --- a/web/src/components/DandisetSearchField.vue +++ b/web/src/components/DandisetSearchField.vue @@ -95,7 +95,7 @@ const operatorHelp = [ { example: 'technique:"patch clamp"', description: 'Has assets using a measurement technique' }, { example: 'standard:nwb', description: 'Has assets in a data standard' }, { example: 'file_type:nwb', description: 'Has assets of a file type (nwb, image, text, video)' }, - { example: 'owner:"Jane Doe"', description: 'Owned by a user (name, username, or email; or "owner:me")' }, + { example: 'owner:"Jane Doe"', description: 'Owned by a user (name, username, email; or owner:me)' }, ]; function updateSearch(search: string) { From a328da1d281195dbd02b5afe86f5eab226bad3e7 Mon Sep 17 00:00:00 2001 From: Ben Dichter Date: Mon, 11 May 2026 11:42:22 -0400 Subject: [PATCH 05/19] Drop owner:me magic alias (defer to a follow-up PR) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The unquoted owner:me → current-user shortcut required threading a `quoted` flag through the parser and a `request_user` arg through the filter dispatch — non-trivial machinery to support one alias. Per #2822 review discussion, removing it from this PR keeps the owner operator focused on literal lookup-by-value (username / email / first / last / "first last") and avoids the design debate about the right escape mechanism for "I literally want a user named Me." The alias can come back in a focused follow-up PR if/when there's appetite for it. Concrete drops: - owner:me magic + 400-on-anonymous in `_apply_owner_filter` - `Operator.quoted` field on the parser dataclass - `quoted` and `request_user` parameters on `_apply_owner_filter` - `get_owned_dandisets` import (no longer used here) - `test_advanced_search_owner_me_magic_and_literal_escape` test - The two `owner-me-quoted` / `owner-me-unquoted` parser test cases - "owner:me" mentions in OpenAPI help text and the popover entry --- dandiapi/api/services/search/filters.py | 39 ++++------------------ dandiapi/api/services/search/parser.py | 12 ++----- dandiapi/api/tests/test_dandiset.py | 37 -------------------- dandiapi/api/tests/test_search_parser.py | 35 +++++-------------- dandiapi/api/views/serializers.py | 4 +-- web/src/components/DandisetSearchField.vue | 2 +- 6 files changed, 20 insertions(+), 109 deletions(-) diff --git a/dandiapi/api/services/search/filters.py b/dandiapi/api/services/search/filters.py index ec01a1c9f..d31a74e17 100644 --- a/dandiapi/api/services/search/filters.py +++ b/dandiapi/api/services/search/filters.py @@ -12,7 +12,6 @@ from dandiapi.api.models import Version from dandiapi.api.models.dandiset import DandisetUserObjectPermission -from dandiapi.api.services.permissions.dandiset import get_owned_dandisets from dandiapi.api.services.search.parser import SearchSyntaxError from dandiapi.search.models import AssetSearch @@ -109,40 +108,14 @@ def _apply_asset_filter(queryset, operator: str, value: str): raise ValueError(f'unknown asset operator: {operator}') # pragma: no cover -def _apply_owner_filter( - queryset: QuerySet[Dandiset], - value: str, - *, - quoted: bool, - request_user: User | AnonymousUser, -) -> QuerySet[Dandiset]: +def _apply_owner_filter(queryset: QuerySet[Dandiset], value: str) -> QuerySet[Dandiset]: """Filter dandisets to those owned by the given user identifier. - The unquoted token `owner:me` resolves to the requesting user. To search - for a literal user named "Me" instead, quote the value: `owner:"me"`. - The quoted form bypasses the magic alias and goes straight to the - case-insensitive lookup. - - Lookups (for any non-magic value) match case-insensitively against - `User.username`, `User.email`, `User.first_name`, `User.last_name`, or - `"first_name last_name"` (so the display name shown in the UI works). - Multiple users may match (common when only a first or last name is given); - we union dandisets owned by any of them. Unknown user → empty result (not - an error — a search for a nonexistent owner is a valid 0-hit query). - - Direct query against `DandisetUserObjectPermission` rather than guardian's - `get_objects_for_user` so we can union across multiple matched users in a - single query, and to bypass the superuser-sees-everything default. + `value` is matched case-insensitively against `User.username`, `User.email`, + `User.first_name`, `User.last_name`, or `"first_name last_name"` (so the + display name shown in the UI works). Multiple users may match; we union + dandisets owned by any of them. Unknown user → empty result. """ - if not quoted and value.lower() == 'me': - if request_user.is_anonymous: - raise SearchSyntaxError( - 'owner:me requires authentication. Sign in, or use owner:"me" ' - 'to search for a literal user named Me.' - ) - owner_pks = get_owned_dandisets(request_user, include_superusers=False).values('pk') - return queryset.filter(pk__in=owner_pks) - matched_user_pks = ( User.objects.annotate(_full_name=Concat('first_name', Value(' '), 'last_name')) .filter( @@ -232,7 +205,7 @@ def apply_search_filters( asset_qs = AssetSearch.objects.visible_to(user) asset_qs = _apply_asset_filter(asset_qs, key, value) elif key in _OWNER_OPS: - queryset = _apply_owner_filter(queryset, value, quoted=op.quoted, request_user=user) + queryset = _apply_owner_filter(queryset, value) if asset_qs is not None: # NOTE perf: jsonb_path_exists with a runtime-built jsonpath cannot diff --git a/dandiapi/api/services/search/parser.py b/dandiapi/api/services/search/parser.py index d20367284..c3b688222 100644 --- a/dandiapi/api/services/search/parser.py +++ b/dandiapi/api/services/search/parser.py @@ -63,16 +63,10 @@ class SearchSyntaxError(ValueError): @dataclass class Operator: - """One parsed `key:value` operator. - - `quoted` records whether the value came from a quoted form (`key:"value"`). - Most operators ignore this, but it lets `owner:` distinguish the magic - `owner:me` (current user) from `owner:"me"` (literal user named "Me"). - """ + """One parsed `key:value` operator.""" key: str value: str - quoted: bool @dataclass @@ -114,7 +108,7 @@ def parse_search(query: str) -> ParsedSearch: for match in _TOKEN_RE.finditer(query): if (key := match.group('op_key')) is not None: _validate_operator_key(key) - parsed.operators.append(Operator(key, match.group('op_qval'), quoted=True)) + parsed.operators.append(Operator(key, match.group('op_qval'))) elif (free := match.group('free_quoted')) is not None: parsed.free_text.append(free) else: @@ -122,7 +116,7 @@ def parse_search(query: str) -> ParsedSearch: if op_match := _BARE_OP_RE.match(bare): key = op_match.group(1) _validate_operator_key(key) - parsed.operators.append(Operator(key, op_match.group(2), quoted=False)) + parsed.operators.append(Operator(key, op_match.group(2))) else: parsed.free_text.append(bare) return parsed diff --git a/dandiapi/api/tests/test_dandiset.py b/dandiapi/api/tests/test_dandiset.py index 0ebb4b920..871c1b062 100644 --- a/dandiapi/api/tests/test_dandiset.py +++ b/dandiapi/api/tests/test_dandiset.py @@ -2152,43 +2152,6 @@ def test_advanced_search_owner_lookup_paths_and_combinations(api_client): } -@pytest.mark.ai_generated -@pytest.mark.django_db -def test_advanced_search_owner_me_magic_and_literal_escape(api_client): - """`owner:me` (unquoted) is the magic alias for "the current user". - - Anonymous → 400. To search for a real user named "Me" instead, quote - the value (`owner:"me"`) — the quoted form opts out of the magic and - falls back to the case-insensitive lookup against username / email / - first / last / full name. - """ - alice = UserFactory.create(username='alice') - me_user = UserFactory.create(username='me_actual_user', first_name='Me', last_name='Someoneyou') - alice_ds = DandisetFactory.create(owners=[alice]) - me_ds = DandisetFactory.create(owners=[me_user]) - DraftVersionFactory.create(dandiset=alice_ds) - DraftVersionFactory.create(dandiset=me_ds) - - # Anonymous + `owner:me` → 400 with explicit message - response = api_client.get( - '/api/dandisets/', - {'draft': 'true', 'empty': 'true', 'search': 'owner:me'}, - ) - assert response.status_code == 400 - assert 'requires authentication' in response.json()['search'] - - # Authenticated + unquoted `owner:me` → only alice's dandisets - # (does NOT match the literal user named "Me"). - api_client.force_authenticate(user=alice) - assert _search_ids(api_client, 'owner:me') == {alice_ds.identifier} - - # Authenticated + quoted `owner:"me"` → escapes the magic and matches - # the literal user "Me" by first_name (NOT alice). - assert _search_ids(api_client, 'owner:"me"') == {me_ds.identifier} - # Full display name lookup also works for that user. - assert _search_ids(api_client, 'owner:"Me Someoneyou"') == {me_ds.identifier} - - @pytest.mark.ai_generated @pytest.mark.django_db def test_advanced_search_owner_does_not_inflate_to_superuser_archive(api_client): diff --git a/dandiapi/api/tests/test_search_parser.py b/dandiapi/api/tests/test_search_parser.py index 4a72fb36c..2498a0b20 100644 --- a/dandiapi/api/tests/test_search_parser.py +++ b/dandiapi/api/tests/test_search_parser.py @@ -11,17 +11,6 @@ pytestmark = pytest.mark.ai_generated -# Convenience aliases so the parametrize table stays readable. -def _u(key: str, value: str) -> Operator: - """Unquoted operator (e.g. parsed from `key:value`).""" - return Operator(key, value, quoted=False) - - -def _q(key: str, value: str) -> Operator: - """Quoted operator (e.g. parsed from `key:"value"`).""" - return Operator(key, value, quoted=True) - - @pytest.mark.parametrize( ('query', 'expected_free_text', 'expected_operators'), [ @@ -34,36 +23,32 @@ def _q(key: str, value: str) -> Operator: ( 'species:mouse created_after:2024-01-01', [], - [_u('species', 'mouse'), _u('created_after', '2024-01-01')], + [Operator('species', 'mouse'), Operator('created_after', '2024-01-01')], ), # Mixed ( 'place cells species:mouse created_after:2024-01-01 ca1', ['place', 'cells', 'ca1'], - [_u('species', 'mouse'), _u('created_after', '2024-01-01')], + [Operator('species', 'mouse'), Operator('created_after', '2024-01-01')], ), # Quoted phrase as free text ('"place cells" hippocampus', ['place cells', 'hippocampus'], []), - # Quoted operator value (multi-word) — `quoted=True` - ('technique:"patch clamp"', [], [_q('technique', 'patch clamp')]), + # Quoted operator value (multi-word) + ('technique:"patch clamp"', [], [Operator('technique', 'patch clamp')]), # Repeated operator keeps every entry (AND'd downstream) ( 'species:mouse species:rat', [], - [_u('species', 'mouse'), _u('species', 'rat')], + [Operator('species', 'mouse'), Operator('species', 'rat')], ), # Special characters preserved inside quoted operator value - ('species:"C57BL/6"', [], [_q('species', 'C57BL/6')]), + ('species:"C57BL/6"', [], [Operator('species', 'C57BL/6')]), # Quoted token that *looks* like an operator is treated as free text — # documented escape hatch for searching for a literal colon. ('"foo:bar" hippocampus', ['foo:bar', 'hippocampus'], []), - # Owner operator (unquoted vs quoted distinguished — used by the - # `owner:me` magic alias which `owner:"me"` opts out of). - ('owner:jdoe', [], [_u('owner', 'jdoe')]), - ('owner:me', [], [_u('owner', 'me')]), - ('owner:"me"', [], [_q('owner', 'me')]), - # Owner with email value (the parser doesn't validate the value shape) - ('owner:user@example.com', [], [_u('owner', 'user@example.com')]), + # Owner operator + ('owner:jdoe', [], [Operator('owner', 'jdoe')]), + ('owner:user@example.com', [], [Operator('owner', 'user@example.com')]), ], ids=[ 'empty', @@ -77,8 +62,6 @@ def _q(key: str, value: str) -> Operator: 'special-chars-in-quoted-value', 'quoted-operator-like-token-is-free-text', 'owner-username', - 'owner-me-unquoted', - 'owner-me-quoted', 'owner-email', ], ) diff --git a/dandiapi/api/views/serializers.py b/dandiapi/api/views/serializers.py index 44f1d3aad..fb032a157 100644 --- a/dandiapi/api/views/serializers.py +++ b/dandiapi/api/views/serializers.py @@ -313,9 +313,7 @@ class DandisetQueryParameterSerializer(serializers.Serializer): 'substring against the corresponding asset_metadata array); ' 'file_type (nwb, image, text, video — or any MIME prefix); ' 'owner (case-insensitive match against username, email, first ' - 'name, last name, or "first last"; the magic value "owner:me" ' - 'resolves to the requesting user — quote it as "owner:\\"me\\"" ' - 'to match a literal user named Me instead). ' + 'name, last name, or "first last"). ' 'Invalid syntax returns HTTP 400 with the offending token; ' 'unknown operators get a "Did you mean?" suggestion.' ), diff --git a/web/src/components/DandisetSearchField.vue b/web/src/components/DandisetSearchField.vue index e136c5fb2..30f6738dc 100644 --- a/web/src/components/DandisetSearchField.vue +++ b/web/src/components/DandisetSearchField.vue @@ -95,7 +95,7 @@ const operatorHelp = [ { example: 'technique:"patch clamp"', description: 'Has assets using a measurement technique' }, { example: 'standard:nwb', description: 'Has assets in a data standard' }, { example: 'file_type:nwb', description: 'Has assets of a file type (nwb, image, text, video)' }, - { example: 'owner:"Jane Doe"', description: 'Owned by a user (name, username, email; or owner:me)' }, + { example: 'owner:"Jane Doe"', description: 'Owned by a user (name, username, or email)' }, ]; function updateSearch(search: string) { From 6ea71f6d14b0dfeaf91d2e97635e9ceabdee20ac Mon Sep 17 00:00:00 2001 From: Ben Dichter Date: Sat, 9 May 2026 11:38:17 -0400 Subject: [PATCH 06/19] Add contributor: + 28 per-role operators with ORCID/ROR identifier lookup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 29 new operators total: catch-all `contributor:` plus one per dandi-schema RoleType (`author`, `data_curator`, `funder`, `contact_person`, etc.). Independent-operator semantics — `author:Doe funder:NIH` returns dandisets where SOME contributor has Doe-as-Author AND SOME contributor (possibly different) has NIH-as-Funder. Each role-specific operator constrains a single contributor[] element to have BOTH the name match AND the role. Implementation: - A single `_CONTRIBUTOR_ROLE_OPS` dict drives both the parser allowlist and the filter dispatch; adding a future role is one new entry. - `_contributor_jsonpath()` builds a Postgres jsonb_path_exists predicate that ORs across `name`, `email`, AND `identifier` (so ORCID for Persons and ROR URL for Organizations both work, including bare-ID substring forms like `01cwqze88` matching the full ROR URL). - All contributor operators in a single query AND on the same Version's metadata so a draft + published version with disjoint contributor lists never combine into a spurious match. Why 29 separate operators rather than a `contributor: + role:` pair: independent operators compose cleanly (cross-key AND falls out naturally; no ambiguity about which role applies to which contributor when there are multiple). Same precedent as Gmail's `from:`/`to:`/`cc:`. The 28 role names come straight from `dandischema.RoleType`. Test: one consolidated test covers catch-all + role-specific lookup, case-insensitivity, identifier (ORCID + ROR + bare-ID substring), role-substring matching `dcite:`-prefixed stored values, role + ORCID composition (positive and negative), and independent cross-role AND. Plus a separate test for the typo → 400-with-suggestion path. Anonymous test fixtures use generic Doe placeholders, no real names. OpenAPI help text and the search popover updated. --- dandiapi/api/services/search/filters.py | 116 ++++++++++++++- dandiapi/api/services/search/parser.py | 34 +++++ dandiapi/api/tests/test_dandiset.py | 163 +++++++++++++++++++++ dandiapi/api/views/serializers.py | 14 +- web/src/components/DandisetSearchField.vue | 6 + 5 files changed, 331 insertions(+), 2 deletions(-) diff --git a/dandiapi/api/services/search/filters.py b/dandiapi/api/services/search/filters.py index d31a74e17..10d16d10b 100644 --- a/dandiapi/api/services/search/filters.py +++ b/dandiapi/api/services/search/filters.py @@ -3,6 +3,7 @@ from __future__ import annotations from datetime import UTC, datetime +import json import re from typing import TYPE_CHECKING @@ -44,6 +45,47 @@ _ASSET_OPS = frozenset({'species', 'approach', 'technique', 'standard', 'file_type'}) _OWNER_OPS = frozenset({'owner'}) +# Contributor / per-role operators. The catch-all `contributor:` matches any +# role; each named operator additionally requires the matched contributor's +# `roleName` array to contain the corresponding `dcite:Role` value. Keys MUST +# be in OPERATOR_KEYS (parser allowlist) — keep the two in sync. +# +# Independent-operator semantics: each operator constrains a single +# `contributor[]` element, but two operators (e.g. `author:Baker funder:NIH`) +# are AND'd against the same Version's metadata — they may match the same OR +# different contributor elements. Same composability as the asset operators. +_CONTRIBUTOR_ROLE_OPS: dict[str, str | None] = { + 'contributor': None, # catch-all, no role constraint + 'author': 'Author', + 'conceptualization': 'Conceptualization', + 'contact_person': 'ContactPerson', + 'data_collector': 'DataCollector', + 'data_curator': 'DataCurator', + 'data_manager': 'DataManager', + 'formal_analysis': 'FormalAnalysis', + 'funding_acquisition': 'FundingAcquisition', + 'investigation': 'Investigation', + 'maintainer': 'Maintainer', + 'methodology': 'Methodology', + 'producer': 'Producer', + 'project_leader': 'ProjectLeader', + 'project_manager': 'ProjectManager', + 'project_member': 'ProjectMember', + 'project_administration': 'ProjectAdministration', + 'researcher': 'Researcher', + 'resources': 'Resources', + 'software': 'Software', + 'supervision': 'Supervision', + 'validation': 'Validation', + 'visualization': 'Visualization', + 'funder': 'Funder', + 'sponsor': 'Sponsor', + 'study_participant': 'StudyParticipant', + 'affiliation': 'Affiliation', + 'ethics_approval': 'EthicsApproval', + 'other': 'Other', +} + def _annotate_latest_version_modified(queryset): latest_version = Version.objects.filter(dandiset=OuterRef('pk')).order_by('-created')[:1] @@ -133,6 +175,68 @@ def _apply_owner_filter(queryset: QuerySet[Dandiset], value: str) -> QuerySet[Da return queryset.filter(pk__in=owned_pks) +def _contributor_jsonpath(value: str, role: str | None) -> tuple[str, list[str]]: + """Build a (where_clause, params) for a single contributor element predicate. + + Matches a `contributor[]` element whose `name`, `email`, OR `identifier` + contains `value` (case-insensitive). The identifier covers ORCIDs for + Person contributors (e.g. `0000-0002-2990-9889`) and ROR URLs for + Organization contributors (e.g. `https://ror.org/01cwqze88`); the + substring match means bare-ID forms like `01cwqze88` work too. + + If `role` is given, additionally requires that element's `roleName` array + to contain a string matching `role` (also case-insensitive substring). + + Uses the third argument of `jsonb_path_exists` to bind named variables + (`$val`, `$role`) so the user values are properly quoted by Postgres + rather than concatenated into the path string. + """ + name_or_email_or_id = ( + '@.name like_regex $val flag "i" ' + ' || @.email like_regex $val flag "i" ' + ' || @.identifier like_regex $val flag "i"' + ) + if role is None: + jsonpath = f'$.contributor[*] ? ({name_or_email_or_id})' + vars_obj = {'val': re.escape(value)} + else: + jsonpath = ( + '$.contributor[*] ? ' + f'(({name_or_email_or_id}) ' + ' && exists(@.roleName[*] ? (@ like_regex $role flag "i")))' + ) + vars_obj = {'val': re.escape(value), 'role': re.escape(role)} + where = f"jsonb_path_exists(metadata, '{jsonpath}'::jsonpath, %s::jsonb)" + return where, [json.dumps(vars_obj)] + + +def _apply_contributor_filters( + queryset: QuerySet[Dandiset], specs: list[tuple[str, str | None]] +) -> QuerySet[Dandiset]: + """Filter dandisets by contributor / per-role operators. + + `specs` is a list of `(value, role_or_None)` pairs. The returned queryset + is restricted to dandisets that have at least ONE Version whose + `metadata.contributor[]` satisfies ALL the predicates simultaneously. + Multiple operators thus AND on the same Version (so a draft and a + published version with disjoint contributor lists never combine into a + spurious match). + + Each operator is independent: `author:Baker funder:NIH` matches if SOME + contributor element has Baker as Author AND SOME contributor element (the + same OR a different one) has NIH as Funder. + """ + matching_versions = Version.objects.all() + for value, role in specs: + where, params = _contributor_jsonpath(value, role) + # Trusted jsonpath template (no user value interpolated); user value + # is bound via the jsonb vars param and additionally regex-escaped. + matching_versions = matching_versions.extra( # noqa: S610 + where=[where], params=params + ) + return queryset.filter(versions__pk__in=matching_versions.values('pk')) + + _MODIFIED_ALIAS = '_search_latest_version_modified' _PUBLISHED_ALIAS = '_search_latest_published_created' @@ -163,7 +267,7 @@ def _apply_date_filter(queryset, operator: str, ts: datetime, annotated: set[str raise ValueError(f'unknown date operator: {operator}') # pragma: no cover -def apply_search_filters( +def apply_search_filters( # noqa: C901 (one branch per operator category — splitting the dispatch loop wouldn't make it more readable) queryset: QuerySet[Dandiset], parsed: ParsedSearch, *, @@ -185,6 +289,11 @@ def apply_search_filters( # single asset to match both substrings — same as GitHub's default. asset_qs = None annotated: set[str] = set() + # Contributor specs collected here, then applied in a single batch so all + # operators AND on the same Version (avoids cross-version weirdness when + # a dandiset has both a draft and a published version with disjoint + # contributor lists). + contributor_specs: list[tuple[str, str | None]] = [] for op in parsed.operators: key = op.key @@ -206,6 +315,11 @@ def apply_search_filters( asset_qs = _apply_asset_filter(asset_qs, key, value) elif key in _OWNER_OPS: queryset = _apply_owner_filter(queryset, value) + elif key in _CONTRIBUTOR_ROLE_OPS: + contributor_specs.append((value, _CONTRIBUTOR_ROLE_OPS[key])) + + if contributor_specs: + queryset = _apply_contributor_filters(queryset, contributor_specs) if asset_qs is not None: # NOTE perf: jsonb_path_exists with a runtime-built jsonpath cannot diff --git a/dandiapi/api/services/search/parser.py b/dandiapi/api/services/search/parser.py index c3b688222..512468492 100644 --- a/dandiapi/api/services/search/parser.py +++ b/dandiapi/api/services/search/parser.py @@ -20,18 +20,52 @@ OPERATOR_KEYS: frozenset[str] = frozenset( { + # Date-range 'created_before', 'created_after', 'modified_before', 'modified_after', 'published_before', 'published_after', + # Asset metadata 'species', 'approach', 'technique', 'standard', 'file_type', + # Permissions 'owner', + # Contributor (catch-all + one operator per dandi-schema RoleType, + # snake_cased and stripped of the "dcite:" prefix) + 'contributor', + 'author', + 'conceptualization', + 'contact_person', + 'data_collector', + 'data_curator', + 'data_manager', + 'formal_analysis', + 'funding_acquisition', + 'investigation', + 'maintainer', + 'methodology', + 'producer', + 'project_leader', + 'project_manager', + 'project_member', + 'project_administration', + 'researcher', + 'resources', + 'software', + 'supervision', + 'validation', + 'visualization', + 'funder', + 'sponsor', + 'study_participant', + 'affiliation', + 'ethics_approval', + 'other', } ) diff --git a/dandiapi/api/tests/test_dandiset.py b/dandiapi/api/tests/test_dandiset.py index 871c1b062..b25b0e26b 100644 --- a/dandiapi/api/tests/test_dandiset.py +++ b/dandiapi/api/tests/test_dandiset.py @@ -2166,3 +2166,166 @@ def test_advanced_search_owner_does_not_inflate_to_superuser_archive(api_client) DraftVersionFactory.create(dandiset=admin_owned) assert _search_ids(api_client, 'owner:admin') == {admin_owned.identifier} + + +# --- contributor / role operators ---------------------------------------------------------------- + + +def _seed_dandiset_with_contributors(*, contributors: list[dict]) -> Dandiset: + """Create a draft dandiset with the given contributor list on its version. + + Used by the contributor/role tests below. + """ + dandiset = DandisetFactory.create() + version = DraftVersionFactory.create(dandiset=dandiset) + version.metadata = {**version.metadata, 'contributor': contributors} + version.save() + return dandiset + + +@pytest.mark.ai_generated +@pytest.mark.django_db +def test_advanced_search_contributor_and_role_operators(api_client): + """One setup, many assertions for the contributor + role operators. + + Covers the catch-all `contributor:`, several role-specific operators + (author/data_curator/funder/contact_person), independent-operator AND + semantics across roles, name vs email lookup, case-insensitive substring, + role substring (`data_curator:` matches `dcite:DataCurator`), and the + "different elements may satisfy different operators" composability that + distinguishes Option D from same-element semantics. + """ + # Three dandisets with overlapping names but different role assignments. + # Realistic identifiers: ORCID for Persons, ROR URL for Organizations. + ds_baker_curator = _seed_dandiset_with_contributors( + contributors=[ + { + 'name': 'Doe, Jane', + 'email': 'jane.doe.com', + 'identifier': '0000-0002-2990-9889', + 'roleName': ['dcite:DataCurator', 'dcite:Author'], + 'schemaKey': 'Person', + }, + { + 'name': 'National Institutes of Health', + 'identifier': 'https://ror.org/01cwqze88', + 'roleName': ['dcite:Funder'], + 'schemaKey': 'Organization', + }, + ], + ) + ds_baker_author_only = _seed_dandiset_with_contributors( + contributors=[ + { + 'name': 'Doe, Jane', + 'identifier': '0000-0001-2222-3333', + 'roleName': ['dcite:Author'], + 'schemaKey': 'Person', + }, + ], + ) + ds_smith_curator = _seed_dandiset_with_contributors( + contributors=[ + { + 'name': 'Smith, Alice', + 'roleName': ['dcite:DataCurator'], + 'schemaKey': 'Person', + }, + ], + ) + + # `contributor:` (catch-all) — any role + assert _search_ids(api_client, 'contributor:Doe') == { + ds_baker_curator.identifier, + ds_baker_author_only.identifier, + } + assert _search_ids(api_client, 'contributor:NIH') == set() # full org name only + assert _search_ids(api_client, 'contributor:"National Institutes"') == { + ds_baker_curator.identifier + } + # Email lookup also works via the catch-all. + assert _search_ids(api_client, 'contributor:jane.doe.com') == { + ds_baker_curator.identifier + } + + # role-specific: author:Doe matches the two dandisets where Doe has Author role + assert _search_ids(api_client, 'author:Doe') == { + ds_baker_curator.identifier, + ds_baker_author_only.identifier, + } + # data_curator:Doe matches only the dandiset where Doe is also a curator + assert _search_ids(api_client, 'data_curator:Doe') == {ds_baker_curator.identifier} + # data_curator:Smith matches the smith dandiset + assert _search_ids(api_client, 'data_curator:Smith') == {ds_smith_curator.identifier} + # funder:"National Institutes" only the one with NIH funder + assert _search_ids(api_client, 'funder:"National Institutes"') == {ds_baker_curator.identifier} + + # Case-insensitive on both name and role + assert _search_ids(api_client, 'AUTHOR:baker') == { + ds_baker_curator.identifier, + ds_baker_author_only.identifier, + } + + # Identifier lookup: full ORCID for Person, full ROR URL for Org, AND + # bare-id substring forms (`01cwqze88` matches the ROR URL via icontains). + assert _search_ids(api_client, 'contributor:0000-0002-2990-9889') == { + ds_baker_curator.identifier + } + assert _search_ids(api_client, 'contributor:"https://ror.org/01cwqze88"') == { + ds_baker_curator.identifier + } + assert _search_ids(api_client, 'contributor:01cwqze88') == {ds_baker_curator.identifier} + # Identifier lookup composes with role: a role-specific operator with an + # ORCID also requires the matched contributor to hold that role. + assert _search_ids(api_client, 'data_curator:0000-0002-2990-9889') == { + ds_baker_curator.identifier + } + # Wrong role for that ORCID → 0 (Doe @ ds_baker_curator IS a curator, + # but Doe @ ds_baker_author_only has a different ORCID). + assert _search_ids(api_client, 'data_curator:0000-0001-2222-3333') == set() + + # Independent-operator AND across DIFFERENT contributor elements: + # `author:Doe funder:"National Institutes"` matches the dandiset where + # one contributor element is Doe-as-Author AND a different contributor + # element is NIH-as-Funder. (This is the Option D semantic the user + # specifically wanted — with same-element semantics the query would + # require Doe to himself be a Funder, which is wrong.) + assert _search_ids(api_client, 'author:Doe funder:"National Institutes"') == { + ds_baker_curator.identifier + } + + +@pytest.mark.ai_generated +@pytest.mark.django_db +def test_advanced_search_contributor_role_substring_match(api_client): + """Role substring matches the dcite:-prefixed stored value. + + `data_curator:` should match the stored value `dcite:DataCurator` via + case-insensitive substring on `roleName` — users don't have to type + the `dcite:` prefix. + """ + ds = _seed_dandiset_with_contributors( + contributors=[ + { + 'name': 'Curator, Connie', + 'roleName': ['dcite:DataCurator'], # stored with dcite: prefix + 'schemaKey': 'Person', + }, + ], + ) + + # The operator name maps to the substring "DataCurator" (without the + # "dcite:" prefix) and matches the stored "dcite:DataCurator" via + # case-insensitive regex inside the jsonpath. + assert _search_ids(api_client, 'data_curator:Curator') == {ds.identifier} + + +@pytest.mark.ai_generated +@pytest.mark.django_db +def test_advanced_search_unknown_role_operator_returns_400_with_suggestion(api_client): + response = api_client.get( + '/api/dandisets/', + {'draft': 'true', 'empty': 'true', 'search': 'data_curatr:Doe'}, + ) + assert response.status_code == 400 + assert 'data_curator' in response.json()['search'] diff --git a/dandiapi/api/views/serializers.py b/dandiapi/api/views/serializers.py index fb032a157..e2e9e1c28 100644 --- a/dandiapi/api/views/serializers.py +++ b/dandiapi/api/views/serializers.py @@ -313,7 +313,19 @@ class DandisetQueryParameterSerializer(serializers.Serializer): 'substring against the corresponding asset_metadata array); ' 'file_type (nwb, image, text, video — or any MIME prefix); ' 'owner (case-insensitive match against username, email, first ' - 'name, last name, or "first last"). ' + 'name, last name, or "first last"); ' + 'contributor (matches a contributor by name, email, or identifier ' + '— ORCID for Persons, ROR URL for Organizations; any role); ' + 'plus one operator per dandi-schema RoleType for filtering by ' + 'role: author, contact_person, data_curator, funder, maintainer, ' + 'project_leader, sponsor, conceptualization, data_collector, ' + 'data_manager, formal_analysis, funding_acquisition, ' + 'investigation, methodology, producer, project_manager, ' + 'project_member, project_administration, researcher, resources, ' + 'software, supervision, validation, visualization, ' + 'study_participant, affiliation, ethics_approval, other — each ' + 'matches a contributor by name/email/identifier AND requires that ' + 'contributor to hold that role. ' 'Invalid syntax returns HTTP 400 with the offending token; ' 'unknown operators get a "Did you mean?" suggestion.' ), diff --git a/web/src/components/DandisetSearchField.vue b/web/src/components/DandisetSearchField.vue index 30f6738dc..3a96b014e 100644 --- a/web/src/components/DandisetSearchField.vue +++ b/web/src/components/DandisetSearchField.vue @@ -96,6 +96,12 @@ const operatorHelp = [ { example: 'standard:nwb', description: 'Has assets in a data standard' }, { example: 'file_type:nwb', description: 'Has assets of a file type (nwb, image, text, video)' }, { example: 'owner:"Jane Doe"', description: 'Owned by a user (name, username, or email)' }, + { example: 'contributor:"Doe, Jane"', description: 'Listed as a contributor (any role; matches name, email, ORCID, or ROR ID)' }, + { example: 'author:Doe', description: 'Listed as an Author (matches name, email, ORCID, or ROR ID)' }, + { example: 'data_curator:Doe', description: 'Listed as a Data Curator' }, + { example: 'funder:NIH', description: 'Listed as a Funder' }, + { example: 'contact_person:Doe', description: 'Listed as the Contact Person' }, + { example: 'maintainer:Doe', description: 'Listed as a Maintainer (and many more — see API docs)' }, ]; function updateSearch(search: string) { From 2ab3257f0b41d266d28b825f0fb20e5d63d6cd98 Mon Sep 17 00:00:00 2001 From: Ben Dichter Date: Sat, 9 May 2026 11:44:24 -0400 Subject: [PATCH 07/19] affiliation: query nested Person.affiliation[] field, not roleName MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous commit treated `affiliation:` as a role-name match (looking for `dcite:Affiliation` in `contributor[].roleName`), but real DANDI data never uses that role; affiliations live in a separate nested field `contributor[].affiliation[]`. The operator silently returned 0 hits despite plenty of (e.g.) Stanford-affiliated contributors. Fix: route `affiliation:` through a dedicated jsonpath that scans `$.contributor[*].affiliation[*]` and matches against the affiliation's `name` OR `identifier` (case-insensitive substring). So: affiliation:Stanford → matches Stanford University affiliation:"University College London" → quoted multi-word affiliation:00f54p054 → matches via ROR ID substring Composes with role/contributor operators on the same Version, same as the other contributor-style operators (independent-operator AND). Also refactored `_apply_contributor_filters` to accept a list of (where, params) pairs rather than (value, role) — cleaner since both the role-based and affiliation operators now share the same dispatch. --- dandiapi/api/services/search/filters.py | 90 +++++++++++++--------- dandiapi/api/tests/test_dandiset.py | 58 +++++++++++++- dandiapi/api/views/serializers.py | 6 +- web/src/components/DandisetSearchField.vue | 1 + 4 files changed, 114 insertions(+), 41 deletions(-) diff --git a/dandiapi/api/services/search/filters.py b/dandiapi/api/services/search/filters.py index 10d16d10b..eddc1e605 100644 --- a/dandiapi/api/services/search/filters.py +++ b/dandiapi/api/services/search/filters.py @@ -81,11 +81,23 @@ 'funder': 'Funder', 'sponsor': 'Sponsor', 'study_participant': 'StudyParticipant', - 'affiliation': 'Affiliation', 'ethics_approval': 'EthicsApproval', 'other': 'Other', + # Note: `affiliation` is intentionally NOT here. Despite `dcite:Affiliation` + # existing as a RoleType, in real DANDI data affiliations live in a + # separate nested field — `Person.affiliation[]` — not as a contributor's + # role. The `affiliation:` operator queries that nested path; see + # `_AFFILIATION_JSONPATH` below. } +# Affiliation has its own jsonpath because it lives at +# `contributor[].affiliation[]`, not `contributor[].roleName[]`. +_AFFILIATION_JSONPATH = ( + '$.contributor[*].affiliation[*] ? ' + '(@.name like_regex $val flag "i" ' + ' || @.identifier like_regex $val flag "i")' +) + def _annotate_latest_version_modified(queryset): latest_version = Version.objects.filter(dandiset=OuterRef('pk')).order_by('-created')[:1] @@ -175,8 +187,8 @@ def _apply_owner_filter(queryset: QuerySet[Dandiset], value: str) -> QuerySet[Da return queryset.filter(pk__in=owned_pks) -def _contributor_jsonpath(value: str, role: str | None) -> tuple[str, list[str]]: - """Build a (where_clause, params) for a single contributor element predicate. +def _contributor_role_jsonpath(value: str, role: str | None) -> tuple[str, dict[str, str]]: + """Jsonpath + vars for a contributor[] element predicate. Matches a `contributor[]` element whose `name`, `email`, OR `identifier` contains `value` (case-insensitive). The identifier covers ORCIDs for @@ -186,10 +198,6 @@ def _contributor_jsonpath(value: str, role: str | None) -> tuple[str, list[str]] If `role` is given, additionally requires that element's `roleName` array to contain a string matching `role` (also case-insensitive substring). - - Uses the third argument of `jsonb_path_exists` to bind named variables - (`$val`, `$role`) so the user values are properly quoted by Postgres - rather than concatenated into the path string. """ name_or_email_or_id = ( '@.name like_regex $val flag "i" ' @@ -197,38 +205,43 @@ def _contributor_jsonpath(value: str, role: str | None) -> tuple[str, list[str]] ' || @.identifier like_regex $val flag "i"' ) if role is None: - jsonpath = f'$.contributor[*] ? ({name_or_email_or_id})' - vars_obj = {'val': re.escape(value)} - else: - jsonpath = ( - '$.contributor[*] ? ' - f'(({name_or_email_or_id}) ' - ' && exists(@.roleName[*] ? (@ like_regex $role flag "i")))' - ) - vars_obj = {'val': re.escape(value), 'role': re.escape(role)} + return f'$.contributor[*] ? ({name_or_email_or_id})', {'val': re.escape(value)} + return ( + '$.contributor[*] ? ' + f'(({name_or_email_or_id}) ' + ' && exists(@.roleName[*] ? (@ like_regex $role flag "i")))', + {'val': re.escape(value), 'role': re.escape(role)}, + ) + + +def _build_jsonpath_where(jsonpath: str, vars_obj: dict[str, str]) -> tuple[str, list[str]]: + """Wrap a jsonpath + vars into a `jsonb_path_exists(metadata, ...)` predicate. + + Uses the third argument of `jsonb_path_exists` to bind named variables + so user values are properly quoted by Postgres rather than concatenated + into the path string. + """ where = f"jsonb_path_exists(metadata, '{jsonpath}'::jsonpath, %s::jsonb)" return where, [json.dumps(vars_obj)] def _apply_contributor_filters( - queryset: QuerySet[Dandiset], specs: list[tuple[str, str | None]] + queryset: QuerySet[Dandiset], wheres: list[tuple[str, list[str]]] ) -> QuerySet[Dandiset]: - """Filter dandisets by contributor / per-role operators. + """Filter dandisets by accumulated contributor predicates. - `specs` is a list of `(value, role_or_None)` pairs. The returned queryset - is restricted to dandisets that have at least ONE Version whose - `metadata.contributor[]` satisfies ALL the predicates simultaneously. - Multiple operators thus AND on the same Version (so a draft and a - published version with disjoint contributor lists never combine into a - spurious match). + `wheres` is a list of `(where_clause, params)` pairs (one per operator). + The returned queryset is restricted to dandisets that have at least ONE + Version whose `metadata` satisfies ALL the predicates simultaneously. + Operators thus AND on the same Version (a draft and a published version + with disjoint contributor lists never combine into a spurious match). - Each operator is independent: `author:Baker funder:NIH` matches if SOME - contributor element has Baker as Author AND SOME contributor element (the + Each operator is independent: `author:Doe funder:NIH` matches if SOME + contributor element has Doe as Author AND SOME contributor element (the same OR a different one) has NIH as Funder. """ matching_versions = Version.objects.all() - for value, role in specs: - where, params = _contributor_jsonpath(value, role) + for where, params in wheres: # Trusted jsonpath template (no user value interpolated); user value # is bound via the jsonb vars param and additionally regex-escaped. matching_versions = matching_versions.extra( # noqa: S610 @@ -289,11 +302,11 @@ def apply_search_filters( # noqa: C901 (one branch per operator category — s # single asset to match both substrings — same as GitHub's default. asset_qs = None annotated: set[str] = set() - # Contributor specs collected here, then applied in a single batch so all - # operators AND on the same Version (avoids cross-version weirdness when - # a dandiset has both a draft and a published version with disjoint + # Contributor predicates collected here, then applied in a single batch so + # all operators AND on the same Version (avoids cross-version weirdness + # when a dandiset has both a draft and a published version with disjoint # contributor lists). - contributor_specs: list[tuple[str, str | None]] = [] + contributor_wheres: list[tuple[str, list[str]]] = [] for op in parsed.operators: key = op.key @@ -316,10 +329,15 @@ def apply_search_filters( # noqa: C901 (one branch per operator category — s elif key in _OWNER_OPS: queryset = _apply_owner_filter(queryset, value) elif key in _CONTRIBUTOR_ROLE_OPS: - contributor_specs.append((value, _CONTRIBUTOR_ROLE_OPS[key])) - - if contributor_specs: - queryset = _apply_contributor_filters(queryset, contributor_specs) + jsonpath, vars_obj = _contributor_role_jsonpath(value, _CONTRIBUTOR_ROLE_OPS[key]) + contributor_wheres.append(_build_jsonpath_where(jsonpath, vars_obj)) + elif key == 'affiliation': + contributor_wheres.append( + _build_jsonpath_where(_AFFILIATION_JSONPATH, {'val': re.escape(value)}) + ) + + if contributor_wheres: + queryset = _apply_contributor_filters(queryset, contributor_wheres) if asset_qs is not None: # NOTE perf: jsonb_path_exists with a runtime-built jsonpath cannot diff --git a/dandiapi/api/tests/test_dandiset.py b/dandiapi/api/tests/test_dandiset.py index b25b0e26b..5fd4f7bfe 100644 --- a/dandiapi/api/tests/test_dandiset.py +++ b/dandiapi/api/tests/test_dandiset.py @@ -2244,9 +2244,7 @@ def test_advanced_search_contributor_and_role_operators(api_client): ds_baker_curator.identifier } # Email lookup also works via the catch-all. - assert _search_ids(api_client, 'contributor:jane.doe.com') == { - ds_baker_curator.identifier - } + assert _search_ids(api_client, 'contributor:jane.doe.com') == {ds_baker_curator.identifier} # role-specific: author:Doe matches the two dandisets where Doe has Author role assert _search_ids(api_client, 'author:Doe') == { @@ -2329,3 +2327,57 @@ def test_advanced_search_unknown_role_operator_returns_400_with_suggestion(api_c ) assert response.status_code == 400 assert 'data_curator' in response.json()['search'] + + +@pytest.mark.ai_generated +@pytest.mark.django_db +def test_advanced_search_affiliation_operator(api_client): + """`affiliation:` queries the nested Person.affiliation[] field, not roleName. + + Real DANDI data stores affiliations on the Person object itself (e.g. + `Doe, Jane` is affiliated with Stanford via `Doe.affiliation[0].name`), + not as a `dcite:Affiliation` roleName entry. Match by org name OR by + the affiliation's ROR identifier; substring forms work for both. + """ + ds_stanford = _seed_dandiset_with_contributors( + contributors=[ + { + 'name': 'Doe, Jane', + 'roleName': ['dcite:Author'], + 'schemaKey': 'Person', + 'affiliation': [ + { + 'name': 'Stanford University', + 'identifier': 'https://ror.org/00f54p054', + 'schemaKey': 'Affiliation', + }, + ], + }, + ], + ) + ds_ucl = _seed_dandiset_with_contributors( + contributors=[ + { + 'name': 'Doe, Jane', + 'roleName': ['dcite:Author'], + 'schemaKey': 'Person', + 'affiliation': [ + { + 'name': 'University College London', + 'schemaKey': 'Affiliation', + }, + ], + }, + ], + ) + + # Affiliation by organization name + assert _search_ids(api_client, 'affiliation:Stanford') == {ds_stanford.identifier} + assert _search_ids(api_client, 'affiliation:"University College London"') == {ds_ucl.identifier} + # Affiliation by ROR identifier (full URL or bare ID via substring) + assert _search_ids(api_client, 'affiliation:00f54p054') == {ds_stanford.identifier} + + # Composes with role/contributor operators on the same Version (different + # contributor elements OK — but here both must match Doe, who is the only + # contributor in each fixture, so the joint constraint is just an AND). + assert _search_ids(api_client, 'author:Doe affiliation:Stanford') == {ds_stanford.identifier} diff --git a/dandiapi/api/views/serializers.py b/dandiapi/api/views/serializers.py index e2e9e1c28..86f5d0e7b 100644 --- a/dandiapi/api/views/serializers.py +++ b/dandiapi/api/views/serializers.py @@ -323,9 +323,11 @@ class DandisetQueryParameterSerializer(serializers.Serializer): 'investigation, methodology, producer, project_manager, ' 'project_member, project_administration, researcher, resources, ' 'software, supervision, validation, visualization, ' - 'study_participant, affiliation, ethics_approval, other — each ' - 'matches a contributor by name/email/identifier AND requires that ' + 'study_participant, ethics_approval, other — each matches a ' + 'contributor by name/email/identifier AND requires that ' 'contributor to hold that role. ' + 'affiliation (matches the nested Person.affiliation[] field by ' + 'organization name or ROR identifier — e.g. affiliation:Stanford). ' 'Invalid syntax returns HTTP 400 with the offending token; ' 'unknown operators get a "Did you mean?" suggestion.' ), diff --git a/web/src/components/DandisetSearchField.vue b/web/src/components/DandisetSearchField.vue index 3a96b014e..03f691e7d 100644 --- a/web/src/components/DandisetSearchField.vue +++ b/web/src/components/DandisetSearchField.vue @@ -102,6 +102,7 @@ const operatorHelp = [ { example: 'funder:NIH', description: 'Listed as a Funder' }, { example: 'contact_person:Doe', description: 'Listed as the Contact Person' }, { example: 'maintainer:Doe', description: 'Listed as a Maintainer (and many more — see API docs)' }, + { example: 'affiliation:Stanford', description: 'Has a contributor affiliated with the named organization (or ROR ID)' }, ]; function updateSearch(search: string) { From 01b4bb16fe7c4428387c348729d8eb1b4256465a Mon Sep 17 00:00:00 2001 From: Ben Dichter Date: Sat, 9 May 2026 11:47:12 -0400 Subject: [PATCH 08/19] Drop ethics_approval and other operators MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per review: `other:` would be a thin surface for "uncategorized contributors" — not a useful filter — and `ethics_approval:` isn't a contributor-style role users would search by. Removing them tightens the operator vocabulary to the 25 substantive RoleType values + the contributor catch-all + affiliation. --- dandiapi/api/services/search/filters.py | 2 -- dandiapi/api/services/search/parser.py | 2 -- dandiapi/api/views/serializers.py | 4 ++-- 3 files changed, 2 insertions(+), 6 deletions(-) diff --git a/dandiapi/api/services/search/filters.py b/dandiapi/api/services/search/filters.py index eddc1e605..89b362bc1 100644 --- a/dandiapi/api/services/search/filters.py +++ b/dandiapi/api/services/search/filters.py @@ -81,8 +81,6 @@ 'funder': 'Funder', 'sponsor': 'Sponsor', 'study_participant': 'StudyParticipant', - 'ethics_approval': 'EthicsApproval', - 'other': 'Other', # Note: `affiliation` is intentionally NOT here. Despite `dcite:Affiliation` # existing as a RoleType, in real DANDI data affiliations live in a # separate nested field — `Person.affiliation[]` — not as a contributor's diff --git a/dandiapi/api/services/search/parser.py b/dandiapi/api/services/search/parser.py index 512468492..af3502786 100644 --- a/dandiapi/api/services/search/parser.py +++ b/dandiapi/api/services/search/parser.py @@ -64,8 +64,6 @@ 'sponsor', 'study_participant', 'affiliation', - 'ethics_approval', - 'other', } ) diff --git a/dandiapi/api/views/serializers.py b/dandiapi/api/views/serializers.py index 86f5d0e7b..c2c208b7d 100644 --- a/dandiapi/api/views/serializers.py +++ b/dandiapi/api/views/serializers.py @@ -323,8 +323,8 @@ class DandisetQueryParameterSerializer(serializers.Serializer): 'investigation, methodology, producer, project_manager, ' 'project_member, project_administration, researcher, resources, ' 'software, supervision, validation, visualization, ' - 'study_participant, ethics_approval, other — each matches a ' - 'contributor by name/email/identifier AND requires that ' + 'study_participant — each matches a contributor by ' + 'name/email/identifier AND requires that ' 'contributor to hold that role. ' 'affiliation (matches the nested Person.affiliation[] field by ' 'organization name or ROR identifier — e.g. affiliation:Stanford). ' From 86ff47178f38693f6ae39ec61fb12a737a8adab4 Mon Sep 17 00:00:00 2001 From: Ben Dichter Date: Mon, 11 May 2026 11:57:44 -0400 Subject: [PATCH 09/19] Extract operator dispatch into operators.py; trim role list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two structural improvements + one product trim, in response to the review on #2822: 1. New `dandiapi/api/services/search/operators.py` (pure Python, no Django) holds every operator-vocabulary constant: DATE_OPS, ASSET_OPS, OWNER_OPS, AFFILIATION_OPS, CONTRIBUTOR_ROLE_OPS, FILE_TYPE_ALIASES, ASSET_NAME_PATH_OPS, AFFILIATION_JSONPATH. OPERATOR_KEYS is now the union of those tables — single source of truth, no more duplication between parser.py (allowlist) and filters.py (dispatch). Adding a new operator is one entry; the parser automatically knows about it. 2. Trim the role-restricting shortcuts from 25 to 9. After review discussion: most RoleType values aren't operators users actually reach for (`conceptualization:`, `methodology:`, `validation:`, `visualization:`, etc.). Kept the ones that map to common search intents: contributor (catch-all), author, contact_person, data_collector, data_curator, data_manager, maintainer, project_lead, funder, sponsor The catch-all `contributor:` still matches anyone in any role; only the role-restricting shortcuts are pruned. `project_lead:` is intentionally shorter than the schema name `ProjectLeader`. 3. Shrank the verbose docstrings on private filter helpers (the rationale stays in commit messages, not as documentation rot on internal API). 4. Added test_contributor_role_ops_match_actual_dandischema_roletype as a drift guard: every non-catch-all CONTRIBUTOR_ROLE_OPS value must be a real RoleType.name. Renames or removals on the schema side trip the test, forcing an explicit decision instead of silently changing public search syntax. OpenAPI help text and the search popover updated to reflect the trimmed list (`project_lead`, `data_collector`, `data_manager`, `sponsor` now shown; the misleading "many more" tail removed). --- dandiapi/api/services/search/filters.py | 155 ++++------------------ dandiapi/api/services/search/operators.py | 84 ++++++++++++ dandiapi/api/services/search/parser.py | 49 +------ dandiapi/api/tests/test_search_parser.py | 22 +++ dandiapi/api/views/serializers.py | 15 +-- 5 files changed, 136 insertions(+), 189 deletions(-) create mode 100644 dandiapi/api/services/search/operators.py diff --git a/dandiapi/api/services/search/filters.py b/dandiapi/api/services/search/filters.py index 89b362bc1..f81fcd99e 100644 --- a/dandiapi/api/services/search/filters.py +++ b/dandiapi/api/services/search/filters.py @@ -13,6 +13,16 @@ from dandiapi.api.models import Version from dandiapi.api.models.dandiset import DandisetUserObjectPermission +from dandiapi.api.services.search.operators import ( + AFFILIATION_JSONPATH, + AFFILIATION_OPS, + ASSET_NAME_PATH_OPS, + ASSET_OPS, + CONTRIBUTOR_ROLE_OPS, + DATE_OPS, + FILE_TYPE_ALIASES, + OWNER_OPS, +) from dandiapi.api.services.search.parser import SearchSyntaxError from dandiapi.search.models import AssetSearch @@ -23,79 +33,6 @@ from dandiapi.api.models import Dandiset from dandiapi.api.services.search.parser import ParsedSearch -# Aliases for the file-type operator: short name → MIME prefix matched with -# istartswith. Keep in sync with DandisetSearchQueryParameterSerializer. -_FILE_TYPE_ALIASES = { - 'nwb': 'application/x-nwb', - 'image': 'image/', - 'text': 'text/', - 'video': 'video/', -} - -_DATE_OPS = frozenset( - { - 'created_before', - 'created_after', - 'modified_before', - 'modified_after', - 'published_before', - 'published_after', - } -) -_ASSET_OPS = frozenset({'species', 'approach', 'technique', 'standard', 'file_type'}) -_OWNER_OPS = frozenset({'owner'}) - -# Contributor / per-role operators. The catch-all `contributor:` matches any -# role; each named operator additionally requires the matched contributor's -# `roleName` array to contain the corresponding `dcite:Role` value. Keys MUST -# be in OPERATOR_KEYS (parser allowlist) — keep the two in sync. -# -# Independent-operator semantics: each operator constrains a single -# `contributor[]` element, but two operators (e.g. `author:Baker funder:NIH`) -# are AND'd against the same Version's metadata — they may match the same OR -# different contributor elements. Same composability as the asset operators. -_CONTRIBUTOR_ROLE_OPS: dict[str, str | None] = { - 'contributor': None, # catch-all, no role constraint - 'author': 'Author', - 'conceptualization': 'Conceptualization', - 'contact_person': 'ContactPerson', - 'data_collector': 'DataCollector', - 'data_curator': 'DataCurator', - 'data_manager': 'DataManager', - 'formal_analysis': 'FormalAnalysis', - 'funding_acquisition': 'FundingAcquisition', - 'investigation': 'Investigation', - 'maintainer': 'Maintainer', - 'methodology': 'Methodology', - 'producer': 'Producer', - 'project_leader': 'ProjectLeader', - 'project_manager': 'ProjectManager', - 'project_member': 'ProjectMember', - 'project_administration': 'ProjectAdministration', - 'researcher': 'Researcher', - 'resources': 'Resources', - 'software': 'Software', - 'supervision': 'Supervision', - 'validation': 'Validation', - 'visualization': 'Visualization', - 'funder': 'Funder', - 'sponsor': 'Sponsor', - 'study_participant': 'StudyParticipant', - # Note: `affiliation` is intentionally NOT here. Despite `dcite:Affiliation` - # existing as a RoleType, in real DANDI data affiliations live in a - # separate nested field — `Person.affiliation[]` — not as a contributor's - # role. The `affiliation:` operator queries that nested path; see - # `_AFFILIATION_JSONPATH` below. -} - -# Affiliation has its own jsonpath because it lives at -# `contributor[].affiliation[]`, not `contributor[].roleName[]`. -_AFFILIATION_JSONPATH = ( - '$.contributor[*].affiliation[*] ? ' - '(@.name like_regex $val flag "i" ' - ' || @.identifier like_regex $val flag "i")' -) - def _annotate_latest_version_modified(queryset): latest_version = Version.objects.filter(dandiset=OuterRef('pk')).order_by('-created')[:1] @@ -115,29 +52,8 @@ def _annotate_latest_published_created(queryset): ) -# Each entry maps an operator to a Postgres jsonpath that selects the names -# we want to match against. `[*]` wildcards mean we match if ANY element of -# the array satisfies the predicate — important for assets that list -# multiple species, approaches, etc. Paths MUST be trusted constants: -# they're interpolated into the SQL. -_NAME_PATH_OPS = { - 'species': '$.wasAttributedTo[*].species.name', - 'approach': '$.approach[*].name', - 'technique': '$.measurementTechnique[*].name', - 'standard': '$.dataStandard[*].name', -} - - def _jsonpath_name_match(path: str, value: str) -> tuple[str, list[str]]: - """Build a parameterized Postgres `jsonb_path_exists` predicate. - - Matches `value` case-insensitively as a substring against any node - selected by `path`. `path` MUST come from a trusted allowlist; `value` - is parameterized and regex-escaped. - """ - # No table prefix on `asset_metadata`: Django may alias the AssetSearch - # table (e.g. inside a subquery), and qualifying the column would break - # those queries. The unqualified column is unambiguous in our usage. + """Asset-metadata jsonpath substring predicate; `path` must be trusted.""" where = ( 'jsonb_path_exists(asset_metadata, ' f"('{path} ? (@ like_regex ' " @@ -149,25 +65,17 @@ def _jsonpath_name_match(path: str, value: str) -> tuple[str, list[str]]: def _apply_asset_filter(queryset, operator: str, value: str): """Apply one parsed asset operator to an AssetSearch queryset.""" - if operator in _NAME_PATH_OPS: - where, params = _jsonpath_name_match(_NAME_PATH_OPS[operator], value) - # `where` interpolates only an allowlisted jsonpath; the user value - # is bound via params (and re-escaped against regex injection). + if operator in ASSET_NAME_PATH_OPS: + where, params = _jsonpath_name_match(ASSET_NAME_PATH_OPS[operator], value) return queryset.extra(where=[where], params=params) # noqa: S610 if operator == 'file_type': - mime_prefix = _FILE_TYPE_ALIASES.get(value.lower(), value) + mime_prefix = FILE_TYPE_ALIASES.get(value.lower(), value) return queryset.filter(asset_metadata__encodingFormat__istartswith=mime_prefix) raise ValueError(f'unknown asset operator: {operator}') # pragma: no cover def _apply_owner_filter(queryset: QuerySet[Dandiset], value: str) -> QuerySet[Dandiset]: - """Filter dandisets to those owned by the given user identifier. - - `value` is matched case-insensitively against `User.username`, `User.email`, - `User.first_name`, `User.last_name`, or `"first_name last_name"` (so the - display name shown in the UI works). Multiple users may match; we union - dandisets owned by any of them. Unknown user → empty result. - """ + """Filter dandisets to those owned by users matching `value` (icontains).""" matched_user_pks = ( User.objects.annotate(_full_name=Concat('first_name', Value(' '), 'last_name')) .filter( @@ -186,17 +94,7 @@ def _apply_owner_filter(queryset: QuerySet[Dandiset], value: str) -> QuerySet[Da def _contributor_role_jsonpath(value: str, role: str | None) -> tuple[str, dict[str, str]]: - """Jsonpath + vars for a contributor[] element predicate. - - Matches a `contributor[]` element whose `name`, `email`, OR `identifier` - contains `value` (case-insensitive). The identifier covers ORCIDs for - Person contributors (e.g. `0000-0002-2990-9889`) and ROR URLs for - Organization contributors (e.g. `https://ror.org/01cwqze88`); the - substring match means bare-ID forms like `01cwqze88` work too. - - If `role` is given, additionally requires that element's `roleName` array - to contain a string matching `role` (also case-insensitive substring). - """ + """Jsonpath + vars for a contributor[] predicate (optional role constraint).""" name_or_email_or_id = ( '@.name like_regex $val flag "i" ' ' || @.email like_regex $val flag "i" ' @@ -213,12 +111,7 @@ def _contributor_role_jsonpath(value: str, role: str | None) -> tuple[str, dict[ def _build_jsonpath_where(jsonpath: str, vars_obj: dict[str, str]) -> tuple[str, list[str]]: - """Wrap a jsonpath + vars into a `jsonb_path_exists(metadata, ...)` predicate. - - Uses the third argument of `jsonb_path_exists` to bind named variables - so user values are properly quoted by Postgres rather than concatenated - into the path string. - """ + """Wrap a jsonpath + vars into a `jsonb_path_exists(metadata, ...)` predicate.""" where = f"jsonb_path_exists(metadata, '{jsonpath}'::jsonpath, %s::jsonb)" return where, [json.dumps(vars_obj)] @@ -312,7 +205,7 @@ def apply_search_filters( # noqa: C901 (one branch per operator category — s if not value: raise SearchSyntaxError(f'Operator "{key}" requires a value (e.g. {key}:something).') - if key in _DATE_OPS: + if key in DATE_OPS: try: ts = datetime.strptime(value, '%Y-%m-%d').replace(tzinfo=UTC) except ValueError as exc: @@ -320,18 +213,18 @@ def apply_search_filters( # noqa: C901 (one branch per operator category — s f'Invalid date for "{key}": {value!r}. Use YYYY-MM-DD.' ) from exc queryset = _apply_date_filter(queryset, key, ts, annotated) - elif key in _ASSET_OPS: + elif key in ASSET_OPS: if asset_qs is None: asset_qs = AssetSearch.objects.visible_to(user) asset_qs = _apply_asset_filter(asset_qs, key, value) - elif key in _OWNER_OPS: + elif key in OWNER_OPS: queryset = _apply_owner_filter(queryset, value) - elif key in _CONTRIBUTOR_ROLE_OPS: - jsonpath, vars_obj = _contributor_role_jsonpath(value, _CONTRIBUTOR_ROLE_OPS[key]) + elif key in CONTRIBUTOR_ROLE_OPS: + jsonpath, vars_obj = _contributor_role_jsonpath(value, CONTRIBUTOR_ROLE_OPS[key]) contributor_wheres.append(_build_jsonpath_where(jsonpath, vars_obj)) - elif key == 'affiliation': + elif key in AFFILIATION_OPS: contributor_wheres.append( - _build_jsonpath_where(_AFFILIATION_JSONPATH, {'val': re.escape(value)}) + _build_jsonpath_where(AFFILIATION_JSONPATH, {'val': re.escape(value)}) ) if contributor_wheres: diff --git a/dandiapi/api/services/search/operators.py b/dandiapi/api/services/search/operators.py new file mode 100644 index 000000000..4a8f72281 --- /dev/null +++ b/dandiapi/api/services/search/operators.py @@ -0,0 +1,84 @@ +"""Operator vocabulary and dispatch tables for the advanced-search syntax. + +Pure-Python module so :mod:`parser` (which knows nothing about Django) can +import the operator allowlist alongside the filter dispatch tables. Adding +a new operator is one entry in the relevant table here; the parser picks +it up automatically via :data:`OPERATOR_KEYS`. +""" + +from __future__ import annotations + +# File-type operator: short name → MIME prefix matched with istartswith. +# Keep in sync with DandisetSearchQueryParameterSerializer. +FILE_TYPE_ALIASES = { + 'nwb': 'application/x-nwb', + 'image': 'image/', + 'text': 'text/', + 'video': 'video/', +} + +DATE_OPS = frozenset( + { + 'created_before', + 'created_after', + 'modified_before', + 'modified_after', + 'published_before', + 'published_after', + } +) + +ASSET_OPS = frozenset({'species', 'approach', 'technique', 'standard', 'file_type'}) + +OWNER_OPS = frozenset({'owner'}) + +# Per-role contributor operators: snake_case operator → PascalCase +# `dandischema.RoleType.name`. The catch-all `contributor:` matches any +# role; each named operator additionally requires the matched contributor's +# `roleName` array to contain `dcite:`. +# +# Kept as an explicit allowlist (rather than auto-derived from RoleType) so +# schema-level renames or additions can't silently change the public search +# syntax. A unit test validates each value against `RoleType.name`. +CONTRIBUTOR_ROLE_OPS: dict[str, str | None] = { + 'contributor': None, # catch-all + 'author': 'Author', + 'contact_person': 'ContactPerson', + 'data_collector': 'DataCollector', + 'data_curator': 'DataCurator', + 'data_manager': 'DataManager', + 'maintainer': 'Maintainer', + # Operator name intentionally shorter than the schema name (`ProjectLeader`). + 'project_lead': 'ProjectLeader', + 'funder': 'Funder', + 'sponsor': 'Sponsor', + # The other RoleType values exist on `dandischema` but aren't exposed as + # search operators — see PR discussion. The catch-all `contributor:` still + # finds anyone in any role; only the role-restricting shortcuts are pruned. +} + +AFFILIATION_OPS = frozenset({'affiliation'}) + +# Asset-name jsonpaths: each operator selects a different array path on +# `asset_metadata` whose elements have a `.name` we substring-match against. +# Paths MUST be trusted constants (interpolated into the SQL). +ASSET_NAME_PATH_OPS = { + 'species': '$.wasAttributedTo[*].species.name', + 'approach': '$.approach[*].name', + 'technique': '$.measurementTechnique[*].name', + 'standard': '$.dataStandard[*].name', +} + +# Affiliation jsonpath: nested under each contributor[].affiliation[], +# matched on the affiliation's own `name` or `identifier` (ROR URL). +AFFILIATION_JSONPATH = ( + '$.contributor[*].affiliation[*] ? ' + '(@.name like_regex $val flag "i" ' + ' || @.identifier like_regex $val flag "i")' +) + +# Union of every operator key. The parser uses this for its allowlist; +# adding a new operator anywhere above is automatically known to the parser. +OPERATOR_KEYS: frozenset[str] = ( + DATE_OPS | ASSET_OPS | OWNER_OPS | AFFILIATION_OPS | frozenset(CONTRIBUTOR_ROLE_OPS) +) diff --git a/dandiapi/api/services/search/parser.py b/dandiapi/api/services/search/parser.py index af3502786..37f978a69 100644 --- a/dandiapi/api/services/search/parser.py +++ b/dandiapi/api/services/search/parser.py @@ -18,54 +18,7 @@ from difflib import get_close_matches import re -OPERATOR_KEYS: frozenset[str] = frozenset( - { - # Date-range - 'created_before', - 'created_after', - 'modified_before', - 'modified_after', - 'published_before', - 'published_after', - # Asset metadata - 'species', - 'approach', - 'technique', - 'standard', - 'file_type', - # Permissions - 'owner', - # Contributor (catch-all + one operator per dandi-schema RoleType, - # snake_cased and stripped of the "dcite:" prefix) - 'contributor', - 'author', - 'conceptualization', - 'contact_person', - 'data_collector', - 'data_curator', - 'data_manager', - 'formal_analysis', - 'funding_acquisition', - 'investigation', - 'maintainer', - 'methodology', - 'producer', - 'project_leader', - 'project_manager', - 'project_member', - 'project_administration', - 'researcher', - 'resources', - 'software', - 'supervision', - 'validation', - 'visualization', - 'funder', - 'sponsor', - 'study_participant', - 'affiliation', - } -) +from dandiapi.api.services.search.operators import OPERATOR_KEYS # A token in the input is one of: # key:"quoted value" — operator with quoted value diff --git a/dandiapi/api/tests/test_search_parser.py b/dandiapi/api/tests/test_search_parser.py index 2498a0b20..6b3d3123c 100644 --- a/dandiapi/api/tests/test_search_parser.py +++ b/dandiapi/api/tests/test_search_parser.py @@ -98,3 +98,25 @@ def test_parse_search(query, expected_free_text, expected_operators): def test_parse_search_raises_on_invalid_query(query, expected_message_fragment): with pytest.raises(SearchSyntaxError, match=expected_message_fragment): parse_search(query) + + +def test_contributor_role_ops_match_actual_dandischema_roletype(): + """Guard against schema drift. + + Every non-catch-all `CONTRIBUTOR_ROLE_OPS` value must match a real + `dandischema.RoleType` member name. Renames or removals on the schema + side trip this test, forcing an explicit decision here instead of + silently changing user-facing search syntax. + """ + from dandischema.models import RoleType + + from dandiapi.api.services.search.operators import CONTRIBUTOR_ROLE_OPS + + role_names = {r.name for r in RoleType} + for op_name, role_name in CONTRIBUTOR_ROLE_OPS.items(): + if role_name is None: + continue + assert role_name in role_names, ( + f'CONTRIBUTOR_ROLE_OPS[{op_name!r}] = {role_name!r} is not a valid ' + 'dandischema.RoleType member name' + ) diff --git a/dandiapi/api/views/serializers.py b/dandiapi/api/views/serializers.py index c2c208b7d..f8556fbf1 100644 --- a/dandiapi/api/views/serializers.py +++ b/dandiapi/api/views/serializers.py @@ -316,16 +316,11 @@ class DandisetQueryParameterSerializer(serializers.Serializer): 'name, last name, or "first last"); ' 'contributor (matches a contributor by name, email, or identifier ' '— ORCID for Persons, ROR URL for Organizations; any role); ' - 'plus one operator per dandi-schema RoleType for filtering by ' - 'role: author, contact_person, data_curator, funder, maintainer, ' - 'project_leader, sponsor, conceptualization, data_collector, ' - 'data_manager, formal_analysis, funding_acquisition, ' - 'investigation, methodology, producer, project_manager, ' - 'project_member, project_administration, researcher, resources, ' - 'software, supervision, validation, visualization, ' - 'study_participant — each matches a contributor by ' - 'name/email/identifier AND requires that ' - 'contributor to hold that role. ' + 'role-restricting shortcuts that match a contributor by ' + 'name/email/identifier AND require that contributor to hold a ' + 'specific role: author, contact_person, data_collector, ' + 'data_curator, data_manager, maintainer, project_lead, funder, ' + 'sponsor. ' 'affiliation (matches the nested Person.affiliation[] field by ' 'organization name or ROR identifier — e.g. affiliation:Stanford). ' 'Invalid syntax returns HTTP 400 with the offending token; ' From 463c366abda902482d965e77391ffa84f887ea9d Mon Sep 17 00:00:00 2001 From: Ben Dichter Date: Mon, 11 May 2026 12:00:10 -0400 Subject: [PATCH 10/19] Use project_leader (full schema name) as the operator name --- dandiapi/api/services/search/operators.py | 3 +-- dandiapi/api/views/serializers.py | 2 +- web/src/components/DandisetSearchField.vue | 3 ++- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dandiapi/api/services/search/operators.py b/dandiapi/api/services/search/operators.py index 4a8f72281..dbdad6962 100644 --- a/dandiapi/api/services/search/operators.py +++ b/dandiapi/api/services/search/operators.py @@ -48,8 +48,7 @@ 'data_curator': 'DataCurator', 'data_manager': 'DataManager', 'maintainer': 'Maintainer', - # Operator name intentionally shorter than the schema name (`ProjectLeader`). - 'project_lead': 'ProjectLeader', + 'project_leader': 'ProjectLeader', 'funder': 'Funder', 'sponsor': 'Sponsor', # The other RoleType values exist on `dandischema` but aren't exposed as diff --git a/dandiapi/api/views/serializers.py b/dandiapi/api/views/serializers.py index f8556fbf1..01d15b43a 100644 --- a/dandiapi/api/views/serializers.py +++ b/dandiapi/api/views/serializers.py @@ -319,7 +319,7 @@ class DandisetQueryParameterSerializer(serializers.Serializer): 'role-restricting shortcuts that match a contributor by ' 'name/email/identifier AND require that contributor to hold a ' 'specific role: author, contact_person, data_collector, ' - 'data_curator, data_manager, maintainer, project_lead, funder, ' + 'data_curator, data_manager, maintainer, project_leader, funder, ' 'sponsor. ' 'affiliation (matches the nested Person.affiliation[] field by ' 'organization name or ROR identifier — e.g. affiliation:Stanford). ' diff --git a/web/src/components/DandisetSearchField.vue b/web/src/components/DandisetSearchField.vue index 03f691e7d..e21c6e74c 100644 --- a/web/src/components/DandisetSearchField.vue +++ b/web/src/components/DandisetSearchField.vue @@ -101,7 +101,8 @@ const operatorHelp = [ { example: 'data_curator:Doe', description: 'Listed as a Data Curator' }, { example: 'funder:NIH', description: 'Listed as a Funder' }, { example: 'contact_person:Doe', description: 'Listed as the Contact Person' }, - { example: 'maintainer:Doe', description: 'Listed as a Maintainer (and many more — see API docs)' }, + { example: 'maintainer:Doe', description: 'Listed as a Maintainer' }, + { example: 'project_leader:Doe', description: 'Listed as the Project Leader (also: data_collector, data_manager, sponsor)' }, { example: 'affiliation:Stanford', description: 'Has a contributor affiliated with the named organization (or ROR ID)' }, ]; From 76c61f7e79bcb121c99439baa95c5ba77f7c537c Mon Sep 17 00:00:00 2001 From: Ben Dichter Date: Mon, 11 May 2026 12:05:14 -0400 Subject: [PATCH 11/19] Test: rename baker/cody variable names + fix earlier sed damage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Variable renames: ds_baker_curator → ds_doe_curator, ds_baker_author_only → ds_doe_author_only (the test data was already Doe; only the variable names still carried the old name). - One stale query string `AUTHOR:baker` updated to `AUTHOR:doe`. - One fixture email field `'jane.doe.com'` (broken: no @) restored to `'jane.doe@example.com'` — leftover from the earlier perl rename that stripped @example out. --- dandiapi/api/tests/test_dandiset.py | 44 +++++++++++++++-------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/dandiapi/api/tests/test_dandiset.py b/dandiapi/api/tests/test_dandiset.py index 5fd4f7bfe..589105cf0 100644 --- a/dandiapi/api/tests/test_dandiset.py +++ b/dandiapi/api/tests/test_dandiset.py @@ -2197,11 +2197,11 @@ def test_advanced_search_contributor_and_role_operators(api_client): """ # Three dandisets with overlapping names but different role assignments. # Realistic identifiers: ORCID for Persons, ROR URL for Organizations. - ds_baker_curator = _seed_dandiset_with_contributors( + ds_doe_curator = _seed_dandiset_with_contributors( contributors=[ { 'name': 'Doe, Jane', - 'email': 'jane.doe.com', + 'email': 'jane.doe@example.com', 'identifier': '0000-0002-2990-9889', 'roleName': ['dcite:DataCurator', 'dcite:Author'], 'schemaKey': 'Person', @@ -2214,7 +2214,7 @@ def test_advanced_search_contributor_and_role_operators(api_client): }, ], ) - ds_baker_author_only = _seed_dandiset_with_contributors( + ds_doe_author_only = _seed_dandiset_with_contributors( contributors=[ { 'name': 'Doe, Jane', @@ -2236,50 +2236,52 @@ def test_advanced_search_contributor_and_role_operators(api_client): # `contributor:` (catch-all) — any role assert _search_ids(api_client, 'contributor:Doe') == { - ds_baker_curator.identifier, - ds_baker_author_only.identifier, + ds_doe_curator.identifier, + ds_doe_author_only.identifier, } assert _search_ids(api_client, 'contributor:NIH') == set() # full org name only assert _search_ids(api_client, 'contributor:"National Institutes"') == { - ds_baker_curator.identifier + ds_doe_curator.identifier } # Email lookup also works via the catch-all. - assert _search_ids(api_client, 'contributor:jane.doe.com') == {ds_baker_curator.identifier} + assert _search_ids(api_client, 'contributor:jane.doe@example.com') == { + ds_doe_curator.identifier + } # role-specific: author:Doe matches the two dandisets where Doe has Author role assert _search_ids(api_client, 'author:Doe') == { - ds_baker_curator.identifier, - ds_baker_author_only.identifier, + ds_doe_curator.identifier, + ds_doe_author_only.identifier, } # data_curator:Doe matches only the dandiset where Doe is also a curator - assert _search_ids(api_client, 'data_curator:Doe') == {ds_baker_curator.identifier} + assert _search_ids(api_client, 'data_curator:Doe') == {ds_doe_curator.identifier} # data_curator:Smith matches the smith dandiset assert _search_ids(api_client, 'data_curator:Smith') == {ds_smith_curator.identifier} # funder:"National Institutes" only the one with NIH funder - assert _search_ids(api_client, 'funder:"National Institutes"') == {ds_baker_curator.identifier} + assert _search_ids(api_client, 'funder:"National Institutes"') == {ds_doe_curator.identifier} # Case-insensitive on both name and role - assert _search_ids(api_client, 'AUTHOR:baker') == { - ds_baker_curator.identifier, - ds_baker_author_only.identifier, + assert _search_ids(api_client, 'AUTHOR:doe') == { + ds_doe_curator.identifier, + ds_doe_author_only.identifier, } # Identifier lookup: full ORCID for Person, full ROR URL for Org, AND # bare-id substring forms (`01cwqze88` matches the ROR URL via icontains). assert _search_ids(api_client, 'contributor:0000-0002-2990-9889') == { - ds_baker_curator.identifier + ds_doe_curator.identifier } assert _search_ids(api_client, 'contributor:"https://ror.org/01cwqze88"') == { - ds_baker_curator.identifier + ds_doe_curator.identifier } - assert _search_ids(api_client, 'contributor:01cwqze88') == {ds_baker_curator.identifier} + assert _search_ids(api_client, 'contributor:01cwqze88') == {ds_doe_curator.identifier} # Identifier lookup composes with role: a role-specific operator with an # ORCID also requires the matched contributor to hold that role. assert _search_ids(api_client, 'data_curator:0000-0002-2990-9889') == { - ds_baker_curator.identifier + ds_doe_curator.identifier } - # Wrong role for that ORCID → 0 (Doe @ ds_baker_curator IS a curator, - # but Doe @ ds_baker_author_only has a different ORCID). + # Wrong role for that ORCID → 0 (Doe @ ds_doe_curator IS a curator, + # but Doe @ ds_doe_author_only has a different ORCID). assert _search_ids(api_client, 'data_curator:0000-0001-2222-3333') == set() # Independent-operator AND across DIFFERENT contributor elements: @@ -2289,7 +2291,7 @@ def test_advanced_search_contributor_and_role_operators(api_client): # specifically wanted — with same-element semantics the query would # require Doe to himself be a Funder, which is wrong.) assert _search_ids(api_client, 'author:Doe funder:"National Institutes"') == { - ds_baker_curator.identifier + ds_doe_curator.identifier } From e4d03d3024023952c53362419d966027a204ab31 Mon Sep 17 00:00:00 2001 From: Ben Dichter Date: Mon, 11 May 2026 12:05:22 -0400 Subject: [PATCH 12/19] Apply ruff format --- dandiapi/api/tests/test_dandiset.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/dandiapi/api/tests/test_dandiset.py b/dandiapi/api/tests/test_dandiset.py index 589105cf0..b3cfea2aa 100644 --- a/dandiapi/api/tests/test_dandiset.py +++ b/dandiapi/api/tests/test_dandiset.py @@ -2268,9 +2268,7 @@ def test_advanced_search_contributor_and_role_operators(api_client): # Identifier lookup: full ORCID for Person, full ROR URL for Org, AND # bare-id substring forms (`01cwqze88` matches the ROR URL via icontains). - assert _search_ids(api_client, 'contributor:0000-0002-2990-9889') == { - ds_doe_curator.identifier - } + assert _search_ids(api_client, 'contributor:0000-0002-2990-9889') == {ds_doe_curator.identifier} assert _search_ids(api_client, 'contributor:"https://ror.org/01cwqze88"') == { ds_doe_curator.identifier } From 5d6c2777fc0f559ce8d3843d202e899b139d6628 Mon Sep 17 00:00:00 2001 From: Ben Dichter Date: Mon, 11 May 2026 12:26:01 -0400 Subject: [PATCH 13/19] Asset operators: AND at the dandiset level, not at the asset level MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per #2822 review discussion: the old semantics required all asset operators to be satisfied by a SINGLE asset, which meant `species:mouse species:rat` only matched dandisets with a multi-species recording (rare). The natural user reading is "the dandiset has mouse data AND has rat data" — those can be on different assets, and that's the common case for comparative-species dandisets. Implementation: each asset operator now builds an independent AssetSearch subquery and the dandiset queryset is filtered with `id__in=...` per operator. Django generates one subquery per operator and AND's them at the dandiset level. Cross-key likewise: `species:mouse approach:electrophysiological` now matches any dandiset that has SOME mouse asset AND SOME ephys asset, not just dandisets with a mouse-ephys asset. Tests updated: - `test_advanced_search_repeated_same_key_operator_combines_with_and` is now `..._combines_at_dandiset_level`, with a new fixture that has two separate assets (one mouse, one rat) to actually exercise the cross-asset case the old semantic excluded. - `test_advanced_search_repeated_asset_operators_intersect` is now `test_advanced_search_asset_operators_combine_at_dandiset_level`, with a similar two-assets-split fixture that demonstrates the new inclusive behavior. Contributor / affiliation semantics unchanged — those still AND on the same Version's metadata (since contributors live per-version, not per-asset). Within that single version, predicates can match different contributor[] entries. --- dandiapi/api/services/search/filters.py | 42 +++++-------- dandiapi/api/tests/test_dandiset.py | 84 ++++++++++++++++++++----- 2 files changed, 86 insertions(+), 40 deletions(-) diff --git a/dandiapi/api/services/search/filters.py b/dandiapi/api/services/search/filters.py index f81fcd99e..6f1a70ca6 100644 --- a/dandiapi/api/services/search/filters.py +++ b/dandiapi/api/services/search/filters.py @@ -185,18 +185,22 @@ def apply_search_filters( # noqa: C901 (one branch per operator category — s if not parsed.operators: return queryset - # `asset_qs` is built lazily so a query with no asset ops pays nothing. - # Note on semantics: chaining `.filter()` AND's the conditions on a SINGLE - # AssetSearch row, so e.g. `species:mouse approach:ephys` returns dandisets - # with at least one asset that satisfies BOTH (cross-key AND on the same - # asset). Repeated keys (`species:mouse species:rat`) likewise require a - # single asset to match both substrings — same as GitHub's default. - asset_qs = None + # Semantics: every operator describes a property of the dandiset, and + # multiple operators are AND'd at the dandiset level — NOT at the asset + # or version level. So `species:mouse species:rat` returns dandisets that + # have at least one mouse asset AND at least one rat asset (possibly the + # same asset, possibly different ones). Each asset operator therefore + # builds an independent AssetSearch subquery and we filter dandisets to + # those whose IDs appear in EVERY such subquery. + # + # Contributor predicates are different: they apply to a single Version's + # metadata.contributor[] array (since contributors live on the version, + # not on individual assets), so we accumulate them and AND on the same + # Version to avoid cross-version weirdness when a draft and a published + # version have disjoint contributor lists. Within that single version + # each predicate independently scans `contributor[*]`, so two operators + # may match different contributor entries. annotated: set[str] = set() - # Contributor predicates collected here, then applied in a single batch so - # all operators AND on the same Version (avoids cross-version weirdness - # when a dandiset has both a draft and a published version with disjoint - # contributor lists). contributor_wheres: list[tuple[str, list[str]]] = [] for op in parsed.operators: @@ -214,9 +218,8 @@ def apply_search_filters( # noqa: C901 (one branch per operator category — s ) from exc queryset = _apply_date_filter(queryset, key, ts, annotated) elif key in ASSET_OPS: - if asset_qs is None: - asset_qs = AssetSearch.objects.visible_to(user) - asset_qs = _apply_asset_filter(asset_qs, key, value) + asset_match = _apply_asset_filter(AssetSearch.objects.visible_to(user), key, value) + queryset = queryset.filter(id__in=asset_match.values('dandiset_id')) elif key in OWNER_OPS: queryset = _apply_owner_filter(queryset, value) elif key in CONTRIBUTOR_ROLE_OPS: @@ -230,15 +233,4 @@ def apply_search_filters( # noqa: C901 (one branch per operator category — s if contributor_wheres: queryset = _apply_contributor_filters(queryset, contributor_wheres) - if asset_qs is not None: - # NOTE perf: jsonb_path_exists with a runtime-built jsonpath cannot - # use the existing per-field GIN indexes; the path-scan operators - # (species/approach/technique/standard) currently sequential-scan the - # asset_search materialized view. The view is small enough today - # (~one row per asset) that this is acceptable, but if it becomes a - # hot path the fix is expression GIN indexes on each path or - # denormalized text columns + trgm_ops indexes. - matching_dandiset_ids = asset_qs.values_list('dandiset_id', flat=True).distinct() - queryset = queryset.filter(id__in=matching_dandiset_ids) - return queryset diff --git a/dandiapi/api/tests/test_dandiset.py b/dandiapi/api/tests/test_dandiset.py index b3cfea2aa..0412b88fc 100644 --- a/dandiapi/api/tests/test_dandiset.py +++ b/dandiapi/api/tests/test_dandiset.py @@ -2003,24 +2003,47 @@ def test_advanced_search_file_type_alias_and_mime(api_client): @pytest.mark.ai_generated @pytest.mark.django_db -def test_advanced_search_repeated_asset_operators_intersect(api_client): - # Cross-key semantics: a SINGLE asset must satisfy ALL constraints. So - # `species:mouse approach:electrophysiological` requires one asset that - # is both attributed to a mouse AND uses an electrophysiological approach. - # An asset that has the species but a different approach (and vice versa - # for a sibling dandiset) does NOT qualify. - mouse_ephys = _seed_dandiset_with_asset( +def test_advanced_search_asset_operators_combine_at_dandiset_level(api_client): + # Multiple asset operators are AND'd at the DANDISET level, not at the + # asset level. So `species:mouse approach:electrophysiological` returns + # any dandiset that has SOME mouse asset AND SOME ephys asset — they may + # be the same asset, but they don't have to be. + one_asset_both = _seed_dandiset_with_asset( asset_metadata={ 'wasAttributedTo': [{'species': {'name': 'House mouse'}}], 'approach': [{'name': 'electrophysiological approach'}], }, ) + # Two distinct assets, each satisfying one of the operators. + two_assets_split = DandisetFactory.create() + two_assets_split_version = DraftVersionFactory.create(dandiset=two_assets_split) + two_assets_split_version.assets.add( + DraftAssetFactory.create( + metadata={ + 'schemaVersion': DANDI_SCHEMA_VERSION, + 'schemaKey': 'Asset', + 'encodingFormat': 'application/x-nwb', + 'wasAttributedTo': [{'species': {'name': 'House mouse'}}], + }, + ), + DraftAssetFactory.create( + metadata={ + 'schemaVersion': DANDI_SCHEMA_VERSION, + 'schemaKey': 'Asset', + 'encodingFormat': 'application/x-nwb', + 'approach': [{'name': 'electrophysiological approach'}], + }, + ), + ) + add_version_asset_paths(two_assets_split_version) + # Has mouse but no ephys — should NOT match. _seed_dandiset_with_asset( asset_metadata={ 'wasAttributedTo': [{'species': {'name': 'House mouse'}}], 'approach': [{'name': 'behavioral approach'}], }, ) + # Has ephys but no mouse — should NOT match. _seed_dandiset_with_asset( asset_metadata={ 'wasAttributedTo': [{'species': {'name': 'Norway rat'}}], @@ -2030,17 +2053,20 @@ def test_advanced_search_repeated_asset_operators_intersect(api_client): _refresh_asset_search() query = 'species:mouse approach:electrophysiological' - assert _search_ids(api_client, query) == {mouse_ephys.identifier} + assert _search_ids(api_client, query) == { + one_asset_both.identifier, + two_assets_split.identifier, + } @pytest.mark.ai_generated @pytest.mark.django_db -def test_advanced_search_repeated_same_key_operator_combines_with_and(api_client): - # Same-key semantics: `species:mouse species:rat` requires a single - # asset whose species set contains BOTH "mouse" and "rat" (matches - # GitHub's default for repeated keys). Pinning this so a future change - # to OR-within-key is a deliberate decision, not a regression. - multi = _seed_dandiset_with_asset( +def test_advanced_search_repeated_same_key_operator_combines_at_dandiset_level(api_client): + # `species:mouse species:rat` returns dandisets that have a mouse asset + # AND a rat asset. The two assets can be the same row (multi-species) or + # two different rows — what matters is that the dandiset, as a whole, + # has both kinds of data represented. + multi_species_one_asset = _seed_dandiset_with_asset( asset_metadata={ 'wasAttributedTo': [ {'species': {'name': 'House mouse'}}, @@ -2048,6 +2074,31 @@ def test_advanced_search_repeated_same_key_operator_combines_with_and(api_client ], }, ) + # Two assets: one mouse, one rat. This is the case that distinguishes the + # dandiset-level semantic from a same-asset semantic — it would have been + # excluded under the older "single asset matches both" rule. + two_assets_mouse_and_rat = DandisetFactory.create() + two_assets_mouse_and_rat_version = DraftVersionFactory.create(dandiset=two_assets_mouse_and_rat) + two_assets_mouse_and_rat_version.assets.add( + DraftAssetFactory.create( + metadata={ + 'schemaVersion': DANDI_SCHEMA_VERSION, + 'schemaKey': 'Asset', + 'encodingFormat': 'application/x-nwb', + 'wasAttributedTo': [{'species': {'name': 'House mouse'}}], + }, + ), + DraftAssetFactory.create( + metadata={ + 'schemaVersion': DANDI_SCHEMA_VERSION, + 'schemaKey': 'Asset', + 'encodingFormat': 'application/x-nwb', + 'wasAttributedTo': [{'species': {'name': 'Norway rat'}}], + }, + ), + ) + add_version_asset_paths(two_assets_mouse_and_rat_version) + # Only mouse, only rat — neither should match the combined query. _seed_dandiset_with_asset( asset_metadata={'wasAttributedTo': [{'species': {'name': 'House mouse'}}]}, ) @@ -2056,7 +2107,10 @@ def test_advanced_search_repeated_same_key_operator_combines_with_and(api_client ) _refresh_asset_search() - assert _search_ids(api_client, 'species:mouse species:rat') == {multi.identifier} + assert _search_ids(api_client, 'species:mouse species:rat') == { + multi_species_one_asset.identifier, + two_assets_mouse_and_rat.identifier, + } @pytest.mark.ai_generated From 9fee8a5e340d4bb699a231c7720e0790c5a6f463 Mon Sep 17 00:00:00 2001 From: Ben Dichter Date: Mon, 11 May 2026 13:12:21 -0400 Subject: [PATCH 14/19] Fix SQL for contributor + affiliation operators MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Postgres jsonpath quirk: `like_regex` requires its pattern to be a STRING LITERAL inside the jsonpath text — not a `$variable`. The contributor + affiliation builders I wrote tried to use the `vars` argument of `jsonb_path_exists` for the regex pattern, which Postgres rejects with `syntax error at or near "$val" of jsonpath input`. (The asset operators avoid this by concatenating `to_jsonb(?::text)::text` into the jsonpath at SQL execution time — the regex pattern ends up as a properly-quoted JSON string literal in the path. The user value is still bound as a parameter, never inlined into the SQL.) Refactor: applied the same SQL-time concatenation trick to the contributor + affiliation builders. Three new helpers — `_contributor_where`, `_affiliation_where`, and a shared `_LIKE_REGEX_PATTERN` constant — replace the old `_contributor_role_jsonpath` + `_build_jsonpath_where` pair that relied on the broken `vars` mechanism. Removed the unused `AFFILIATION_JSONPATH` constant from operators.py and dropped the `json` import from filters.py since we no longer marshal `vars` objects. Net behavior unchanged; the failing CI tests should pass now. --- dandiapi/api/services/search/filters.py | 70 +++++++++++++++-------- dandiapi/api/services/search/operators.py | 8 --- 2 files changed, 46 insertions(+), 32 deletions(-) diff --git a/dandiapi/api/services/search/filters.py b/dandiapi/api/services/search/filters.py index 6f1a70ca6..6ae499838 100644 --- a/dandiapi/api/services/search/filters.py +++ b/dandiapi/api/services/search/filters.py @@ -3,7 +3,6 @@ from __future__ import annotations from datetime import UTC, datetime -import json import re from typing import TYPE_CHECKING @@ -14,7 +13,6 @@ from dandiapi.api.models import Version from dandiapi.api.models.dandiset import DandisetUserObjectPermission from dandiapi.api.services.search.operators import ( - AFFILIATION_JSONPATH, AFFILIATION_OPS, ASSET_NAME_PATH_OPS, ASSET_OPS, @@ -93,27 +91,54 @@ def _apply_owner_filter(queryset: QuerySet[Dandiset], value: str) -> QuerySet[Da return queryset.filter(pk__in=owned_pks) -def _contributor_role_jsonpath(value: str, role: str | None) -> tuple[str, dict[str, str]]: - """Jsonpath + vars for a contributor[] predicate (optional role constraint).""" - name_or_email_or_id = ( - '@.name like_regex $val flag "i" ' - ' || @.email like_regex $val flag "i" ' - ' || @.identifier like_regex $val flag "i"' +# Postgres jsonpath quirk: `like_regex` requires its pattern to be a STRING +# LITERAL inside the jsonpath text — not a `$variable`. So we can't use the +# `vars` arg of `jsonb_path_exists` for the regex pattern; we have to build +# the jsonpath at SQL execution time by concatenating `to_jsonb(?::text)::text` +# (a properly-quoted JSON string literal) into the path. The user value is +# still bound as a parameter — never inlined into the SQL — so the value +# remains SQL-injection-safe. `re.escape` neutralizes regex metachars. +_LIKE_REGEX_PATTERN = ' like_regex \' || to_jsonb(%s::text)::text || \' flag "i"' + + +def _contributor_where(value: str, role: str | None) -> tuple[str, list[str]]: + """Build a `jsonb_path_exists(metadata, ...)` where clause for a contributor[] predicate. + + Matches a `contributor[]` element whose `name`, `email`, OR `identifier` + contains `value` (case-insensitive). If `role` is given, additionally + requires that element's `roleName` array to contain a string matching + `role` (also case-insensitive substring). + """ + val_clause = ( + f'@.name{_LIKE_REGEX_PATTERN}' + f' || @.email{_LIKE_REGEX_PATTERN}' + f' || @.identifier{_LIKE_REGEX_PATTERN}' ) + params = [re.escape(value)] * 3 if role is None: - return f'$.contributor[*] ? ({name_or_email_or_id})', {'val': re.escape(value)} - return ( - '$.contributor[*] ? ' - f'(({name_or_email_or_id}) ' - ' && exists(@.roleName[*] ? (@ like_regex $role flag "i")))', - {'val': re.escape(value), 'role': re.escape(role)}, - ) + jsonpath_expr = f"'$.contributor[*] ? ({val_clause})'" + else: + jsonpath_expr = ( + f"'$.contributor[*] ? (({val_clause})" + f" && exists(@.roleName[*] ? (@{_LIKE_REGEX_PATTERN})))'" + ) + params.append(re.escape(role)) + where = f'jsonb_path_exists(metadata, ({jsonpath_expr})::jsonpath)' + return where, params + +def _affiliation_where(value: str) -> tuple[str, list[str]]: + """Build a `jsonb_path_exists(metadata, ...)` where clause for the affiliation predicate. -def _build_jsonpath_where(jsonpath: str, vars_obj: dict[str, str]) -> tuple[str, list[str]]: - """Wrap a jsonpath + vars into a `jsonb_path_exists(metadata, ...)` predicate.""" - where = f"jsonb_path_exists(metadata, '{jsonpath}'::jsonpath, %s::jsonb)" - return where, [json.dumps(vars_obj)] + Affiliations live at `contributor[].affiliation[]`, each with a `name` and + optionally an `identifier` (ROR URL). Matches case-insensitive substring + on either. + """ + clause = f'@.name{_LIKE_REGEX_PATTERN} || @.identifier{_LIKE_REGEX_PATTERN}' + where = ( + f"jsonb_path_exists(metadata, ('$.contributor[*].affiliation[*] ? ({clause})')::jsonpath)" + ) + return where, [re.escape(value)] * 2 def _apply_contributor_filters( @@ -223,12 +248,9 @@ def apply_search_filters( # noqa: C901 (one branch per operator category — s elif key in OWNER_OPS: queryset = _apply_owner_filter(queryset, value) elif key in CONTRIBUTOR_ROLE_OPS: - jsonpath, vars_obj = _contributor_role_jsonpath(value, CONTRIBUTOR_ROLE_OPS[key]) - contributor_wheres.append(_build_jsonpath_where(jsonpath, vars_obj)) + contributor_wheres.append(_contributor_where(value, CONTRIBUTOR_ROLE_OPS[key])) elif key in AFFILIATION_OPS: - contributor_wheres.append( - _build_jsonpath_where(AFFILIATION_JSONPATH, {'val': re.escape(value)}) - ) + contributor_wheres.append(_affiliation_where(value)) if contributor_wheres: queryset = _apply_contributor_filters(queryset, contributor_wheres) diff --git a/dandiapi/api/services/search/operators.py b/dandiapi/api/services/search/operators.py index dbdad6962..331460c02 100644 --- a/dandiapi/api/services/search/operators.py +++ b/dandiapi/api/services/search/operators.py @@ -68,14 +68,6 @@ 'standard': '$.dataStandard[*].name', } -# Affiliation jsonpath: nested under each contributor[].affiliation[], -# matched on the affiliation's own `name` or `identifier` (ROR URL). -AFFILIATION_JSONPATH = ( - '$.contributor[*].affiliation[*] ? ' - '(@.name like_regex $val flag "i" ' - ' || @.identifier like_regex $val flag "i")' -) - # Union of every operator key. The parser uses this for its allowlist; # adding a new operator anywhere above is automatically known to the parser. OPERATOR_KEYS: frozenset[str] = ( From 6fa2f4b5b59b1cbad2641333663997e7e3b7db08 Mon Sep 17 00:00:00 2001 From: Ben Dichter Date: Mon, 11 May 2026 13:14:19 -0400 Subject: [PATCH 15/19] Parser: case-insensitive operator keys CI surfaced an assertion that AUTHOR:doe should match the same set as author:doe. The old _TOKEN_RE / _BARE_OP_RE only accepted lowercase operator keys, so uppercase tokens fell through to free text and returned 0 results. Accept either case in the regex and lowercase the captured key before validation/dispatch. Matches user expectations (GitHub's search operators are case-insensitive on the key side too). --- dandiapi/api/services/search/parser.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/dandiapi/api/services/search/parser.py b/dandiapi/api/services/search/parser.py index 37f978a69..87282ebbd 100644 --- a/dandiapi/api/services/search/parser.py +++ b/dandiapi/api/services/search/parser.py @@ -28,12 +28,14 @@ # # We deliberately match `key:"value"` and `"value"` *before* the bare-token # alternative so quoted segments stay together. +# Operator keys are matched case-insensitively (`AUTHOR:doe` works the same +# as `author:doe`) — we lowercase the captured key before validation/dispatch. _TOKEN_RE = re.compile( - r'(?P[a-z_]+):"(?P[^"]*)"' + r'(?P[A-Za-z_]+):"(?P[^"]*)"' r'|"(?P[^"]*)"' r'|(?P\S+)' ) -_BARE_OP_RE = re.compile(r'^([a-z_]+):(.+)$') +_BARE_OP_RE = re.compile(r'^([A-Za-z_]+):(.+)$') # Defense-in-depth: cap search-term length so an unauthenticated caller can't @@ -92,6 +94,7 @@ def parse_search(query: str) -> ParsedSearch: for match in _TOKEN_RE.finditer(query): if (key := match.group('op_key')) is not None: + key = key.lower() _validate_operator_key(key) parsed.operators.append(Operator(key, match.group('op_qval'))) elif (free := match.group('free_quoted')) is not None: @@ -99,7 +102,7 @@ def parse_search(query: str) -> ParsedSearch: else: bare = match.group('bare') if op_match := _BARE_OP_RE.match(bare): - key = op_match.group(1) + key = op_match.group(1).lower() _validate_operator_key(key) parsed.operators.append(Operator(key, op_match.group(2))) else: From a994d9046842898e94cffd0393f197635e61948e Mon Sep 17 00:00:00 2001 From: Ben Dichter Date: Wed, 13 May 2026 15:52:40 -0400 Subject: [PATCH 16/19] Update dandiapi/api/services/search/filters.py Co-authored-by: Isaac To --- dandiapi/api/services/search/filters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dandiapi/api/services/search/filters.py b/dandiapi/api/services/search/filters.py index 6ae499838..f3881bf49 100644 --- a/dandiapi/api/services/search/filters.py +++ b/dandiapi/api/services/search/filters.py @@ -73,7 +73,7 @@ def _apply_asset_filter(queryset, operator: str, value: str): def _apply_owner_filter(queryset: QuerySet[Dandiset], value: str) -> QuerySet[Dandiset]: - """Filter dandisets to those owned by users matching `value` (icontains).""" + """Filter dandisets to those owned by users matching `value` (iexact).""" matched_user_pks = ( User.objects.annotate(_full_name=Concat('first_name', Value(' '), 'last_name')) .filter( From f9a8155b25279e9e1bbe7d6b4b717a9c4138ffe9 Mon Sep 17 00:00:00 2001 From: Ben Dichter Date: Wed, 13 May 2026 16:00:42 -0400 Subject: [PATCH 17/19] test: expand affiliation fixture with Organization contributors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per @candleindark's review: a contributor can be an Organization as well as a Person, and the affiliation jsonpath (which traverses `contributor[*].affiliation[*]`) should walk past Organizations (which have no `affiliation` field of their own) without exploding. Added Organization contributors to both `ds_stanford` and `ds_ucl`: NIH as a Funder on ds_stanford and Wellcome Trust as a Funder on ds_ucl. The new assertions confirm: - `affiliation:Stanford` (and the other affiliation queries) keep working with mixed Person/Organization contributors. - The Organization's own `identifier` is NOT matched by `affiliation:` (it's not an affiliation; the test pins this). - Cross-key with `funder:NIH affiliation:Stanford` works — different contributor elements on the same Version. Also: used `National Institutes of Health (NIH)` for the org name so the `funder:NIH` substring test actually matches (the abbreviation isn't part of the spelled-out form alone). Realistic — DANDI contributors often use this parenthetical form. --- dandiapi/api/tests/test_dandiset.py | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/dandiapi/api/tests/test_dandiset.py b/dandiapi/api/tests/test_dandiset.py index 0412b88fc..03192b4de 100644 --- a/dandiapi/api/tests/test_dandiset.py +++ b/dandiapi/api/tests/test_dandiset.py @@ -2407,10 +2407,26 @@ def test_advanced_search_affiliation_operator(api_client): }, ], }, + # An Organization contributor (no `affiliation` field of its own — + # affiliations live on Persons in the schema). The affiliation + # jsonpath must walk past this element without exploding. + { + 'name': 'National Institutes of Health (NIH)', + 'identifier': 'https://ror.org/01cwqze88', + 'roleName': ['dcite:Funder'], + 'schemaKey': 'Organization', + }, ], ) ds_ucl = _seed_dandiset_with_contributors( contributors=[ + # Same shape but the Organization comes first this time — the + # jsonpath shouldn't be order-sensitive. + { + 'name': 'Wellcome Trust', + 'roleName': ['dcite:Funder'], + 'schemaKey': 'Organization', + }, { 'name': 'Doe, Jane', 'roleName': ['dcite:Author'], @@ -2430,8 +2446,15 @@ def test_advanced_search_affiliation_operator(api_client): assert _search_ids(api_client, 'affiliation:"University College London"') == {ds_ucl.identifier} # Affiliation by ROR identifier (full URL or bare ID via substring) assert _search_ids(api_client, 'affiliation:00f54p054') == {ds_stanford.identifier} + # The Organization contributors' own identifiers are NOT matched by + # `affiliation:` — that operator queries `contributor[].affiliation[]`, + # not the contributor's own identifier. (Use `funder:` instead.) + assert _search_ids(api_client, 'affiliation:01cwqze88') == set() # Composes with role/contributor operators on the same Version (different - # contributor elements OK — but here both must match Doe, who is the only - # contributor in each fixture, so the joint constraint is just an AND). + # contributor elements OK — here `author:Doe` matches the Person and + # `affiliation:Stanford` matches via that Person's affiliation, while + # the unrelated Organization contributor is harmlessly ignored). assert _search_ids(api_client, 'author:Doe affiliation:Stanford') == {ds_stanford.identifier} + # Cross-key with role on the Organization side also works. + assert _search_ids(api_client, 'funder:NIH affiliation:Stanford') == {ds_stanford.identifier} From 37a78b43b42e568eac3c2dda3f62aec4ca287750 Mon Sep 17 00:00:00 2001 From: Ben Dichter Date: Wed, 13 May 2026 19:56:31 -0400 Subject: [PATCH 18/19] Add `num_subjects:` advanced-search operator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `num_subjects:N` matches dandisets whose `assetsSummary.numberOfSubjects` is at least N. The threshold reads as "studies with ≥N subjects", which is the search intent users actually have (an upper bound is rarely useful and isn't worth the syntax cost). A `num_sessions:` operator was considered but is intentionally NOT included: dandischema's `AssetsSummary` does not aggregate sessions, and assets carry no `sessionId` field from which to derive a count. Adding it would require upstream schema work. Implementation: - `COUNT_OPS` dispatch table in `operators.py` maps op → jsonpath into `Version.metadata`; adding any of `num_files`, `num_bytes`, `num_samples`, `num_cells` is a one-line entry. - `_apply_count_filter` builds a `jsonb_path_exists(metadata, ...)` predicate. The integer is bound via the jsonpath `vars` parameter (not inlined into SQL or jsonpath text), so the value is injection-safe. Versions whose metadata lacks the count field never match — the jsonpath `?` filter drops missing/null/non-numeric values naturally. Co-Authored-By: Claude Opus 4.7 (1M context) --- dandiapi/api/services/search/filters.py | 36 +++++++++++ dandiapi/api/services/search/operators.py | 25 +++++++- dandiapi/api/tests/test_dandiset.py | 72 ++++++++++++++++++++++ dandiapi/api/views/serializers.py | 4 +- web/src/components/DandisetSearchField.vue | 1 + 5 files changed, 136 insertions(+), 2 deletions(-) diff --git a/dandiapi/api/services/search/filters.py b/dandiapi/api/services/search/filters.py index f3881bf49..dcad2bf86 100644 --- a/dandiapi/api/services/search/filters.py +++ b/dandiapi/api/services/search/filters.py @@ -3,6 +3,7 @@ from __future__ import annotations from datetime import UTC, datetime +import json import re from typing import TYPE_CHECKING @@ -17,6 +18,7 @@ ASSET_NAME_PATH_OPS, ASSET_OPS, CONTRIBUTOR_ROLE_OPS, + COUNT_OPS, DATE_OPS, FILE_TYPE_ALIASES, OWNER_OPS, @@ -166,6 +168,38 @@ def _apply_contributor_filters( return queryset.filter(versions__pk__in=matching_versions.values('pk')) +def _apply_count_filter( + queryset: QuerySet[Dandiset], jsonpath_path: str, value: str +) -> QuerySet[Dandiset]: + """Filter dandisets where `Version.metadata` at `jsonpath_path` is >= `value`. + + The value is a non-negative integer (e.g. `num_subjects:10`). Treated as + a lower bound — "at least N" is the search intent users actually have; + upper bounds are rare enough not to be worth the syntax cost. + + `jsonpath_path` is a trusted constant pointing into `Version.metadata`. + The integer is bound via `jsonb_path_exists`'s `vars` parameter (not + inlined into SQL or jsonpath), so the value is injection-safe. + + A dandiset matches if at least one of its versions satisfies the + predicate. Versions whose metadata lacks the count field don't match — + the jsonpath `?` filter drops missing/null/non-numeric values naturally. + """ + if not value.isdigit(): + raise SearchSyntaxError( + f'Invalid count value {value!r}. Use a non-negative integer ' + f'(e.g. `num_subjects:10`).' + ) + n = int(value) + jsonpath = f'{jsonpath_path} ? (@ >= $val)' + vars_json = json.dumps({'val': n}) + where = 'jsonb_path_exists(metadata, %s::jsonpath, %s::jsonb)' + matching_versions = Version.objects.all().extra( # noqa: S610 + where=[where], params=[jsonpath, vars_json] + ) + return queryset.filter(id__in=matching_versions.values('dandiset_id')) + + _MODIFIED_ALIAS = '_search_latest_version_modified' _PUBLISHED_ALIAS = '_search_latest_published_created' @@ -251,6 +285,8 @@ def apply_search_filters( # noqa: C901 (one branch per operator category — s contributor_wheres.append(_contributor_where(value, CONTRIBUTOR_ROLE_OPS[key])) elif key in AFFILIATION_OPS: contributor_wheres.append(_affiliation_where(value)) + elif key in COUNT_OPS: + queryset = _apply_count_filter(queryset, COUNT_OPS[key], value) if contributor_wheres: queryset = _apply_contributor_filters(queryset, contributor_wheres) diff --git a/dandiapi/api/services/search/operators.py b/dandiapi/api/services/search/operators.py index 331460c02..e36dc16fb 100644 --- a/dandiapi/api/services/search/operators.py +++ b/dandiapi/api/services/search/operators.py @@ -58,6 +58,24 @@ AFFILIATION_OPS = frozenset({'affiliation'}) +# Numeric "at least N" operators: short name → jsonpath into `Version.metadata`. +# Value is a non-negative integer; `num_subjects:10` matches dandisets whose +# `assetsSummary.numberOfSubjects` is >= 10. The intuition is "studies with +# at least N ", which is what users actually search for; an upper +# bound is rarely useful in practice. +# +# Only `num_subjects` is exposed for now. The `assetsSummary` schema also +# carries `numberOfFiles`, `numberOfBytes`, `numberOfSamples`, and +# `numberOfCells` — each would be a one-line entry here. +# +# `num_sessions` is intentionally absent: dandischema's `AssetsSummary` does +# not aggregate sessions, and there is no per-asset `sessionId` field from +# which to derive a count. Adding a `num_sessions:` operator would require +# upstream schema work. +COUNT_OPS: dict[str, str] = { + 'num_subjects': '$.assetsSummary.numberOfSubjects', +} + # Asset-name jsonpaths: each operator selects a different array path on # `asset_metadata` whose elements have a `.name` we substring-match against. # Paths MUST be trusted constants (interpolated into the SQL). @@ -71,5 +89,10 @@ # Union of every operator key. The parser uses this for its allowlist; # adding a new operator anywhere above is automatically known to the parser. OPERATOR_KEYS: frozenset[str] = ( - DATE_OPS | ASSET_OPS | OWNER_OPS | AFFILIATION_OPS | frozenset(CONTRIBUTOR_ROLE_OPS) + DATE_OPS + | ASSET_OPS + | OWNER_OPS + | AFFILIATION_OPS + | frozenset(CONTRIBUTOR_ROLE_OPS) + | frozenset(COUNT_OPS) ) diff --git a/dandiapi/api/tests/test_dandiset.py b/dandiapi/api/tests/test_dandiset.py index 03192b4de..d547bb6c4 100644 --- a/dandiapi/api/tests/test_dandiset.py +++ b/dandiapi/api/tests/test_dandiset.py @@ -2458,3 +2458,75 @@ def test_advanced_search_affiliation_operator(api_client): assert _search_ids(api_client, 'author:Doe affiliation:Stanford') == {ds_stanford.identifier} # Cross-key with role on the Organization side also works. assert _search_ids(api_client, 'funder:NIH affiliation:Stanford') == {ds_stanford.identifier} + + +def _seed_dandiset_with_subject_count(n: int | None) -> Dandiset: + """Create a dandiset whose draft version's `assetsSummary.numberOfSubjects` is `n`. + + Pass `None` to omit the field entirely (simulates a dandiset that hasn't + had its assets summarized — the count operator should not match these). + """ + dandiset = DandisetFactory.create() + version = DraftVersionFactory.create(dandiset=dandiset) + summary = {'schemaKey': 'AssetsSummary', 'numberOfBytes': 0, 'numberOfFiles': 0} + if n is not None: + summary['numberOfSubjects'] = n + version.metadata = {**version.metadata, 'assetsSummary': summary} + version.save() + return dandiset + + +@pytest.mark.ai_generated +@pytest.mark.django_db +def test_advanced_search_num_subjects_operator(api_client): + """`num_subjects:N` is "at least N subjects" against assetsSummary.numberOfSubjects. + + Covers the threshold (exact + above match, below doesn't), missing + field (doesn't match), case-insensitivity, and composition with another + operator on the same dandiset. + """ + ds_2 = _seed_dandiset_with_subject_count(2) + ds_10 = _seed_dandiset_with_subject_count(10) + ds_50 = _seed_dandiset_with_subject_count(50) + ds_unknown = _seed_dandiset_with_subject_count(None) # no numberOfSubjects field + + # >= threshold semantics + assert _search_ids(api_client, 'num_subjects:10') == {ds_10.identifier, ds_50.identifier} + assert _search_ids(api_client, 'num_subjects:0') == { + ds_2.identifier, + ds_10.identifier, + ds_50.identifier, + } + # ds_unknown never matches: a missing count must not be coerced to 0 + assert ds_unknown.identifier not in _search_ids(api_client, 'num_subjects:0') + + # Case-insensitive operator key + assert _search_ids(api_client, 'NUM_SUBJECTS:10') == {ds_10.identifier, ds_50.identifier} + + # Composes with another operator (here `name` free-text would compose + # via the existing path; use a date operator to stay structured-only). + # Constrain `ds_50` only via its created date. + cutoff = timezone.now() - datetime.timedelta(days=1) + Dandiset.objects.filter(pk=ds_2.pk).update(created=cutoff - datetime.timedelta(days=30)) + after_str = (cutoff + datetime.timedelta(seconds=1)).date().isoformat() + # `num_subjects:2 created_after:` → dandisets with >=2 subjects + # AND created in the last day → ds_10 + ds_50 (ds_2 is older). + assert _search_ids(api_client, f'num_subjects:2 created_after:{after_str}') == { + ds_10.identifier, + ds_50.identifier, + } + + +@pytest.mark.ai_generated +@pytest.mark.django_db +def test_advanced_search_num_subjects_invalid_value_returns_400(api_client): + # Non-integer values are rejected with a helpful message; negatives and + # decimals fail the digits-only check. `num_subjects:` alone (no value) + # doesn't reach this path — the parser treats it as free text since the + # operator regex requires at least one character after the colon. + for bad in ('abc', '-5', '5.5'): + response = api_client.get( + '/api/dandisets/', + {'draft': 'true', 'empty': 'true', 'search': f'num_subjects:{bad}'}, + ) + assert response.status_code == 400, bad diff --git a/dandiapi/api/views/serializers.py b/dandiapi/api/views/serializers.py index 01d15b43a..2bf87efe5 100644 --- a/dandiapi/api/views/serializers.py +++ b/dandiapi/api/views/serializers.py @@ -322,7 +322,9 @@ class DandisetQueryParameterSerializer(serializers.Serializer): 'data_curator, data_manager, maintainer, project_leader, funder, ' 'sponsor. ' 'affiliation (matches the nested Person.affiliation[] field by ' - 'organization name or ROR identifier — e.g. affiliation:Stanford). ' + 'organization name or ROR identifier — e.g. affiliation:Stanford); ' + 'num_subjects (matches dandisets with at least N subjects per ' + 'assetsSummary.numberOfSubjects — e.g. num_subjects:10). ' 'Invalid syntax returns HTTP 400 with the offending token; ' 'unknown operators get a "Did you mean?" suggestion.' ), diff --git a/web/src/components/DandisetSearchField.vue b/web/src/components/DandisetSearchField.vue index e21c6e74c..7c8b47a7a 100644 --- a/web/src/components/DandisetSearchField.vue +++ b/web/src/components/DandisetSearchField.vue @@ -104,6 +104,7 @@ const operatorHelp = [ { example: 'maintainer:Doe', description: 'Listed as a Maintainer' }, { example: 'project_leader:Doe', description: 'Listed as the Project Leader (also: data_collector, data_manager, sponsor)' }, { example: 'affiliation:Stanford', description: 'Has a contributor affiliated with the named organization (or ROR ID)' }, + { example: 'num_subjects:10', description: 'Has at least N subjects (assetsSummary.numberOfSubjects)' }, ]; function updateSearch(search: string) { From 9b8b9ad3f2321971072433eec84a99637b6edb3f Mon Sep 17 00:00:00 2001 From: Ben Dichter Date: Wed, 13 May 2026 20:57:44 -0400 Subject: [PATCH 19/19] ruff format Co-Authored-By: Claude Opus 4.7 (1M context) --- dandiapi/api/services/search/filters.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dandiapi/api/services/search/filters.py b/dandiapi/api/services/search/filters.py index dcad2bf86..467ed2bea 100644 --- a/dandiapi/api/services/search/filters.py +++ b/dandiapi/api/services/search/filters.py @@ -187,8 +187,7 @@ def _apply_count_filter( """ if not value.isdigit(): raise SearchSyntaxError( - f'Invalid count value {value!r}. Use a non-negative integer ' - f'(e.g. `num_subjects:10`).' + f'Invalid count value {value!r}. Use a non-negative integer (e.g. `num_subjects:10`).' ) n = int(value) jsonpath = f'{jsonpath_path} ? (@ >= $val)'