diff --git a/packages/api/src/cell_explorer_api/services/zarr_adapter.py b/packages/api/src/cell_explorer_api/services/zarr_adapter.py
index 63141d2..63064b7 100644
--- a/packages/api/src/cell_explorer_api/services/zarr_adapter.py
+++ b/packages/api/src/cell_explorer_api/services/zarr_adapter.py
@@ -17,6 +17,12 @@
 # Candidate var columns (in priority order) that hold human-readable gene symbols.
 # Mirrors the frontend's GENE_SYMBOL_COLUMNS so the CLI and browser agree on how
 # genes are named. The first column present in a given dataset wins.
+#
+# `gene` is the most ambiguous candidate (a future dataset could in theory store
+# something else under that name) so it sits at the end — a dataset that has
+# both `feature_name` and `gene` still picks `feature_name`. Per-dataset
+# overrides via a Dataset.gene_label_column field are tracked separately for
+# the rare cases where this heuristic is wrong.
 GENE_SYMBOL_COLUMNS: tuple[str, ...] = (
     "feature_name",
     "gene_symbol",
@@ -27,6 +33,7 @@
     "gene_short_name",
     "symbol",
     "name",
+    "gene",
 )
 
 
diff --git a/packages/api/tests/services/test_zarr_adapter.py b/packages/api/tests/services/test_zarr_adapter.py
index e0dd36d..eb6c8af 100644
--- a/packages/api/tests/services/test_zarr_adapter.py
+++ b/packages/api/tests/services/test_zarr_adapter.py
@@ -52,6 +52,39 @@ async def test_var_names_from_var_dataframe():
     assert await adapter.var_names() == ["CD8A", "CD4", "MS4A1"]
 
 
+@pytest.mark.asyncio
+async def test_var_names_resolves_gene_column_when_index_is_ensembl():
+    """When the var index holds Ensembl IDs and a column literally named `gene`
+    holds the symbols, var_names() returns the symbols (egfr_all_cells case)."""
+    store = _make_fake_anndata_store()
+    store.var = AsyncMock(
+        return_value=pd.DataFrame(
+            {"gene": ["CD8A", "CD4", "MS4A1"]},
+            index=["ENSG00000153563", "ENSG00000010610", "ENSG00000156738"],
+        )
+    )
+    adapter = AnnDataZarrAccess(store)
+    assert await adapter.var_names() == ["CD8A", "CD4", "MS4A1"]
+
+
+@pytest.mark.asyncio
+async def test_var_names_prefers_feature_name_over_gene():
+    """When both `feature_name` and `gene` columns are present, var_names()
+    returns the `feature_name` values, i.e. `feature_name` has priority."""
+    store = _make_fake_anndata_store()
+    store.var = AsyncMock(
+        return_value=pd.DataFrame(
+            {
+                "feature_name": ["CD8A", "CD4", "MS4A1"],
+                "gene": ["alt-CD8A", "alt-CD4", "alt-MS4A1"],
+            },
+            index=["ENSG00000153563", "ENSG00000010610", "ENSG00000156738"],
+        )
+    )
+    adapter = AnnDataZarrAccess(store)
+    assert await adapter.var_names() == ["CD8A", "CD4", "MS4A1"]
+
+
 import numpy as np