cBioPortal · hweej · May 22, 2026 · May 22, 2026
diff --git a/packages/api/src/cell_explorer_api/services/zarr_adapter.py b/packages/api/src/cell_explorer_api/services/zarr_adapter.py
@@ -17,6 +17,12 @@
 # Candidate var columns (in priority order) that hold human-readable gene symbols.
 # Mirrors the frontend's GENE_SYMBOL_COLUMNS so the CLI and browser agree on how
 # genes are named. The first column present in a given dataset wins.
+#
+# `gene` is the most ambiguous candidate (a dataset could in principle store
+# something other than gene symbols under that name), so it sits last: a
+# dataset with both `feature_name` and `gene` still picks `feature_name`.
+# Per-dataset overrides via a Dataset.gene_label_column field are tracked as
+# separate work, for the rare cases where this heuristic picks wrong.
 GENE_SYMBOL_COLUMNS: tuple[str, ...] = (
     "feature_name",
     "gene_symbol",
@@ -27,6 +33,7 @@
     "gene_short_name",
     "symbol",
     "name",
+    "gene",
 )
 
 

diff --git a/packages/api/tests/services/test_zarr_adapter.py b/packages/api/tests/services/test_zarr_adapter.py
@@ -52,6 +52,39 @@ async def test_var_names_from_var_dataframe():
     assert await adapter.var_names() == ["CD8A", "CD4", "MS4A1"]
 
 
+@pytest.mark.asyncio
+async def test_var_names_resolves_gene_column_when_index_is_ensembl():
+    """var_names() returns the symbols from a column literally named `gene`
+    when the var index holds Ensembl IDs (the egfr_all_cells case)."""
+    store = _make_fake_anndata_store()
+    store.var = AsyncMock(
+        return_value=pd.DataFrame(
+            {"gene": ["CD8A", "CD4", "MS4A1"]},
+            index=["ENSG00000153563", "ENSG00000010610", "ENSG00000156738"],
+        )
+    )
+    adapter = AnnDataZarrAccess(store)
+    assert await adapter.var_names() == ["CD8A", "CD4", "MS4A1"]
+
+
+@pytest.mark.asyncio
+async def test_var_names_prefers_feature_name_over_gene():
+    """var_names() returns the `feature_name` values, not the `gene` values,
+    when both columns are present: `feature_name` has higher priority."""
+    store = _make_fake_anndata_store()
+    store.var = AsyncMock(
+        return_value=pd.DataFrame(
+            {
+                "feature_name": ["CD8A", "CD4", "MS4A1"],
+                "gene": ["alt-CD8A", "alt-CD4", "alt-MS4A1"],
+            },
+            index=["ENSG00000153563", "ENSG00000010610", "ENSG00000156738"],
+        )
+    )
+    adapter = AnnDataZarrAccess(store)
+    assert await adapter.var_names() == ["CD8A", "CD4", "MS4A1"]
+
+
 import numpy as np