Skip to content
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
pp.impute_gaussian
pp.impute_median
pp.impute_knn
pp.impute_bpca
pp.scanpy_pycombat

```
Expand Down
17 changes: 15 additions & 2 deletions docs/references.bib
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,21 @@ @article{Oba.2003
urldate = {2025-10-24},
abstract = {Abstract Motivation: Gene expression profile analyses have been used in numerous studies covering a broad range of areas in biology. When unreliable measurements are excluded, missing values are introduced in gene expression profiles. Although existing multivariate analysis methods have difficulty with the treatment of missing values, this problem has received little attention. There are many options for dealing with missing values, each of which reaches drastically different results. Ignoring missing values is the simplest method and is frequently applied. This approach, however, has its flaws. In this article, we propose an estimation method for missing values, which is based on Bayesian principal component analysis (BPCA). Although the methodology that a probabilistic model and latent variables are estimated simultaneously within the framework of Bayes inference is not new in principle, actual BPCA implementation that makes it possible to estimate arbitrary missing variables is new in terms of statistical methodology. Results: When applied to DNA microarray data from various experimental conditions, the BPCA method exhibited markedly better estimation ability than other recently proposed methods, such as singular value decomposition and K-nearest neighbors. While the estimation performance of existing methods depends on model parameters whose determination is difficult, our BPCA method is free from this difficulty. Accordingly, the BPCA method provides accurate and convenient estimation for missing values. Availability: The software is available at http://hawaii.aist-nara.ac.jp/\textasciitilde shige-o/tools/},
langid = {english},
file = {/Users/lucas-diedrich/Zotero/storage/46SGGFWC/Oba et al. - 2003 - A Bayesian missing value estimation method for gene expression profile data.pdf}
}

@article{Stacklies.2007,
title = {{{pcaMethods}}---a Bioconductor Package Providing {{PCA}} Methods for Incomplete Data},
author = {Stacklies, Wolfram and Redestig, Henning and Scholz, Matthias and Walther, Dirk and Selbig, Joachim},
year = 2007,
month = may,
journal = {Bioinformatics},
volume = {23},
number = {9},
pages = {1164--1167},
issn = {1367-4803},
doi = {10.1093/bioinformatics/btm069},
urldate = {2025-12-21},
abstract = {Summary: pcaMethods is a Bioconductor compliant library for computing principal component analysis (PCA) on incomplete data sets. The results can be analyzed directly or used to estimate missing values to enable the use of missing value sensitive statistical methods. The package was mainly developed with microarray and metabolite data sets in mind, but can be applied to any other incomplete data set as well. Availability: http://www.bioconductor.org Contact: selbig@mpimp-golm.mpg.de Supplementary information: Please visit our webpage at http://bioinformatics.mpimp-golm.mpg.de/},
}


Expand All @@ -36,5 +50,4 @@ @inproceedings{Bishop.1998
publisher = {MIT Press},
urldate = {2025-12-21},
keywords = {Bayesian PCA},
file = {/Users/lucas-diedrich/Zotero/storage/CSN76WHH/Bishop - 1998 - Bayesian PCA.pdf}
}
2 changes: 1 addition & 1 deletion src/alphapepttools/pp/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from .batch_correction import scanpy_pycombat
from .data import add_metadata, filter_by_metadata, filter_data_completeness, scale_and_center
from .impute import impute_gaussian, impute_knn, impute_median
from .impute import impute_bpca, impute_gaussian, impute_knn, impute_median
from .norm import normalize
from .transform import detect_special_values, nanlog
129 changes: 127 additions & 2 deletions src/alphapepttools/pp/impute.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import anndata as ad
import numpy as np
import pandas as pd
from bpca import BPCA
from sklearn.impute import KNNImputer

# logging configuration
Expand All @@ -20,7 +21,7 @@ def impute_gaussian(
layer: str | None = None,
*,
copy: bool = False,
) -> ad.AnnData:
) -> ad.AnnData | None:
"""Impute missing values in each column by random sampling from a gaussian distribution.

The distribution is centered at std_offset * feature standard deviation below the
Expand Down Expand Up @@ -248,7 +249,7 @@ def impute_knn(
weights: Literal["distance", "uniform"] = "distance",
copy: bool = False,
**kwargs,
) -> ad.AnnData:
) -> ad.AnnData | None:
"""Impute missing values using median imputation

Replace missing (NaN) values for each feature in the data matrix with the estimate based on non-missing
Comment thread
lucas-diedrich marked this conversation as resolved.
Expand Down Expand Up @@ -347,3 +348,127 @@ def impute_knn(
adata.layers[layer] = data

return adata if copy else None


def _impute_bpca(data: np.ndarray, n_components: int, **kwargs) -> np.ndarray:
    r"""Impute missing values with Bayesian Principal Component Analysis (BPCA).

    The data matrix is reconstructed from the fitted principal component
    usages :math:`U`, loadings :math:`L`, and the feature-wise mean:

    .. math::

        X_{estimated} \approx U \times L + \overline{X}

    Only entries that are NaN in ``data`` are replaced by the reconstruction;
    observed values are kept unchanged.

    Parameters
    ----------
    data
        Data matrix of shape (n_obs, n_features); may contain NaNs.
    n_components
        Number of principal components used for the model fit.
    **kwargs
        Passed to :class:`bpca.BPCA`.

    Returns
    -------
    np.ndarray
        Array of the same shape as ``data`` with NaN entries replaced by their
        BPCA-based estimates.

    References
    ----------
    - :cite:p:`Stacklies.2007`
    """
    bpca = BPCA(n_components=n_components, **kwargs)

    usage = bpca.fit_transform(data)
    loadings = bpca.components_
    # Feature-wise mean over observed values, used to un-center the low-rank
    # reconstruction. NOTE(review): assumes BPCA fits on mean-centered data --
    # confirm against the bpca.BPCA implementation.
    mean = np.nanmean(data, axis=0)

    X_reconstructed = usage @ loadings + mean

    # Fill only missing entries; keep observed values as-is.
    return np.where(np.isnan(data), X_reconstructed, data)


def impute_bpca(
    adata: ad.AnnData,
    *,
    n_components: int = 50,
    layer: str | None = None,
    group_column: str | None = None,
    copy: bool = False,
    **kwargs,
) -> ad.AnnData | None:
    r"""Impute missing values using Bayesian Principal Component Analysis (BPCA)

    Estimates the latent covariance structure of the log-transformed data via Bayesian
    Principal Component Analysis. The imputation method uses the obtained principal
    component usages :math:`U` and components :math:`L` to reconstruct the full
    log-transformed data matrix

    .. math::

        X_{estimated} \approx U \times L + \overline{X}

    where :math:`\overline{X}` is the feature-wise mean of the data. Missing values are
    replaced with the estimated values from this procedure; observed values are kept
    unchanged.

    Parameters
    ----------
    adata
        AnnData object.
    n_components
        Number of components to use for the model fit. The more components are used,
        the more granular the model fits the data. This might increase model accuracy
        but also propagates more measurement noise in the data reconstruction.
    layer
        Layer to use for imputation. The data should be log transformed to match the
        noise model of the BPCA method. If `None`, uses the `adata.X` attribute.
    group_column
        Column name in `adata.obs` defining groups for group-wise imputation.
        - `None` (default, recommended), imputes across all samples jointly.
        - `str` Fits and imputes separately for each group
        If `group_column` contains NaNs, the respective observations are ignored
        (i.e. left unimputed).
    copy
        Whether to return a modified copy (True) of the anndata object. If False
        (default) modifies the object inplace
    **kwargs
        Passed to :class:`bpca.BPCA`

    Returns
    -------
    None | anndata.AnnData
        AnnData object with imputed values in layer.
        If `copy=False` modifies the anndata object at layer inplace and returns None.
        If `copy=True`, returns a modified copy.

    Raises
    ------
    ValueError
        If a feature contains only NaNs (within a group, for group-wise imputation).
    KeyError
        If `group_column` is not a column of `adata.obs`, or `layer` is not a layer
        of `adata`.

    Example
    -------

    .. code-block:: python

        # Log transform data
        at.pp.nanlog(adata)

        # Imputes .X inplace
        at.pp.impute_bpca(adata, n_components=50, layer=None)

        # Returns a new anndata object with imputed .X layer
        adata_new = at.pp.impute_bpca(adata, n_components=50, layer=None, copy=True)

    References
    ----------
    This implementation follows the reference implementation in :cite:p:`Stacklies.2007`

    See Also
    --------
    :class:`bpca.BPCA`
    """
    adata = adata.copy() if copy else adata

    data = adata.X if layer is None else adata.layers[layer]

    if group_column is None:
        # Fail early with an informative error if any feature is all-NaN.
        _check_all_nan(data)
        data = _impute_bpca(data, n_components=n_components, **kwargs)
    else:
        # dropna=True: observations with a NaN group label are excluded from
        # every group and therefore remain unimputed.
        groups = adata.obs.groupby(group_column, dropna=True).indices

        for group_indices in groups.values():
            group = data[group_indices]
            _check_all_nan(group)
            data[group_indices, :] = _impute_bpca(group, n_components=n_components, **kwargs)

    if layer is None:
        adata.X = data
    else:
        adata.layers[layer] = data

    return adata if copy else None
121 changes: 119 additions & 2 deletions tests/pp/test_impute.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
from typing import Any

import anndata as ad
import numpy as np
import pandas as pd
import pytest

from alphapepttools.pp import impute_gaussian, impute_knn, impute_median
from alphapepttools.pp.impute import _check_all_nan, _impute_knn, _impute_nanmedian
from alphapepttools.pp import impute_bpca, impute_gaussian, impute_knn, impute_median
from alphapepttools.pp.impute import _check_all_nan, _impute_bpca, _impute_knn, _impute_nanmedian


@pytest.fixture
Expand Down Expand Up @@ -67,6 +69,22 @@ def knn_imputation_dummy_data(imputation_dummy_data) -> tuple[np.ndarray, np.nda
return imputation_dummy_data, X_ref, kwargs


@pytest.fixture
def bpca_imputation_dummy_data(imputation_dummy_data: np.ndarray) -> tuple[np.ndarray, np.ndarray, dict[str, Any]]:
    """Test data for BPCA imputation.

    Returns the shared dummy matrix, the expected matrix after rank-1 BPCA
    imputation, and the keyword arguments to pass to the imputer.
    """
    X_ref = np.array(
        [
            [0.0, 0.0, 2.0, 0.94077611, 0.0],
            [1.0, 1.0, 3.0, 1.0, 10.0],
            [0.0, 2.0, 4.0, 3.03508935, 20.0],
            [0.33059225, 3.0, 5.0, 3.0, 18.02331366],
        ]
    )
    kwargs = {"n_components": 1}

    return imputation_dummy_data, X_ref, kwargs


def test___check_all_nan(dummy_data_all_nan) -> None:
with pytest.raises(ValueError, match=r"Features with index \[4\]"):
_check_all_nan(dummy_data_all_nan)
Expand All @@ -90,6 +108,15 @@ def test__impute_knn(knn_imputation_dummy_data) -> None:
assert np.all(np.isclose(X_imputed, X_ref, equal_nan=True))


def test__impute_bpca(bpca_imputation_dummy_data: tuple[np.ndarray, np.ndarray, dict[str, Any]]) -> None:
    """Test BPCA imputation for data with nan values"""
    X, X_ref, kwargs = bpca_imputation_dummy_data

    X_imputed = _impute_bpca(X, **kwargs)

    # equal_nan=False: the imputed matrix must contain no remaining NaNs.
    assert np.allclose(X_imputed, X_ref, equal_nan=False)


class TestImputeGaussian:
@pytest.fixture
def gaussian_imputation_dummy_data(self):
Expand Down Expand Up @@ -357,3 +384,93 @@ def test_impute_median__missing_layer(

with pytest.raises(KeyError):
_ = impute_knn(adata, layer="non_existent_layer", copy=True)


class TestImputeBPCAAnnData:
    """Tests for the AnnData-level `impute_bpca` wrapper."""

    @pytest.fixture
    def bpca_imputation_dummy_anndata(
        self,
        bpca_imputation_dummy_data,
    ) -> tuple[ad.AnnData, np.ndarray, np.ndarray, dict[str, Any]]:
        """Test data for BPCA imputation"""
        obs = pd.DataFrame(
            {
                "sample_id": ["A", "B", "C", "D"],
                "sample_group": ["A", "A", "B", "B"],
                "sample_group_with_nan": ["A", "A", np.nan, np.nan],
            }
        )

        X, X_ref, kwargs = bpca_imputation_dummy_data
        # Reference for grouped imputation (computed separately per group with n_components=1)
        X_ref_grouped = np.array(
            [
                [0.0, 0.0, 2.0, 1.0, 0.0],
                [1.0, 1.0, 3.0, 1.0, 10.0],
                [0.0, 2.0, 4.0, 3.0, 20.0],
                [0.0, 3.0, 5.0, 3.0, 20.0],
            ]
        )

        return ad.AnnData(X, obs=obs, layers={"layer2": X.copy()}), X_ref, X_ref_grouped, kwargs

    @pytest.fixture
    def bpca_imputation_dummy_anndata_all_nan(self, dummy_data_all_nan: np.ndarray) -> ad.AnnData:
        """AnnData object with a feature that contains only NaNs"""

        obs = pd.DataFrame(
            {
                "sample_id": ["A", "B", "C", "D"],
                "sample_group": ["A", "A", "B", "B"],
                "sample_group_with_nan": ["A", "A", np.nan, np.nan],
            }
        )

        return ad.AnnData(X=dummy_data_all_nan, obs=obs)

    @pytest.mark.parametrize("copy", [True, False])
    @pytest.mark.parametrize("layer", [None, "layer2"])
    def test_impute_bpca(self, bpca_imputation_dummy_anndata, layer: str, *, copy: bool) -> None:
        """Test BPCA imputation for data with nan values"""
        adata, X_ref, _, kwargs = bpca_imputation_dummy_anndata

        result = impute_bpca(adata, layer=layer, **kwargs, copy=copy)

        if copy:
            assert isinstance(result, ad.AnnData)
            adata_imputed = result
        else:
            assert result is None
            adata_imputed = adata

        X_imputed = adata_imputed.X if layer is None else adata_imputed.layers[layer]

        assert np.allclose(X_imputed, X_ref, equal_nan=False)

    def test_impute_bpca__grouped(self, bpca_imputation_dummy_anndata) -> None:
        """Test group-wise BPCA imputation against the per-group reference"""
        adata, _, X_ref_grouped, kwargs = bpca_imputation_dummy_anndata

        impute_bpca(adata, group_column="sample_group", **kwargs)

        assert np.allclose(adata.X, X_ref_grouped, equal_nan=False)

    @pytest.mark.parametrize("group_column", [None, "sample_group"])
    def test_impute_bpca__feature_all_nan(self, bpca_imputation_dummy_anndata_all_nan, group_column: str) -> None:
        """Test BPCA imputation raises if a feature contains all nan"""
        adata = bpca_imputation_dummy_anndata_all_nan

        with pytest.raises(ValueError, match=r"Features with index"):
            _ = impute_bpca(adata, n_components=1, group_column=group_column)

    def test_impute_bpca__missing_group_column(
        self,
        bpca_imputation_dummy_anndata,
    ) -> None:
        """Test that KeyError is raised if `group_column` does not exist in `adata.obs`"""
        adata, _, _, kwargs = bpca_imputation_dummy_anndata

        with pytest.raises(KeyError):
            impute_bpca(adata, group_column="non_existent_column", **kwargs)

    def test_impute_bpca__missing_layer(
        self,
        bpca_imputation_dummy_anndata,
    ) -> None:
        """Test that KeyError is raised if `layer` does not exist in `adata`"""
        adata, _, _, kwargs = bpca_imputation_dummy_anndata

        with pytest.raises(KeyError):
            impute_bpca(adata, layer="non_existent_layer", **kwargs)
Loading