diff --git a/docs/api.md b/docs/api.md index fa4e3a32..259638d7 100644 --- a/docs/api.md +++ b/docs/api.md @@ -19,6 +19,7 @@ pp.impute_gaussian pp.impute_median pp.impute_knn + pp.impute_bpca pp.scanpy_pycombat ``` diff --git a/docs/references.bib b/docs/references.bib index a57e4caf..bbd57366 100644 --- a/docs/references.bib +++ b/docs/references.bib @@ -23,7 +23,21 @@ @article{Oba.2003 urldate = {2025-10-24}, abstract = {Abstract Motivation: Gene expression profile analyses have been used in numerous studies covering a broad range of areas in biology. When unreliable measurements are excluded, missing values are introduced in gene expression profiles. Although existing multivariate analysis methods have difficulty with the treatment of missing values, this problem has received little attention. There are many options for dealing with missing values, each of which reaches drastically different results. Ignoring missing values is the simplest method and is frequently applied. This approach, however, has its flaws. In this article, we propose an estimation method for missing values, which is based on Bayesian principal component analysis (BPCA). Although the methodology that a probabilistic model and latent variables are estimated simultaneously within the framework of Bayes inference is not new in principle, actual BPCA implementation that makes it possible to estimate arbitrary missing variables is new in terms of statistical methodology. Results: When applied to DNA microarray data from various experimental conditions, the BPCA method exhibited markedly better estimation ability than other recently proposed methods, such as singular value decomposition and K-nearest neighbors. While the estimation performance of existing methods depends on model parameters whose determination is difficult, our BPCA method is free from this difficulty. Accordingly, the BPCA method provides accurate and convenient estimation for missing values. 
Availability: The software is available at http://hawaii.aist-nara.ac.jp/\textasciitilde shige-o/tools/}, langid = {english}, - file = {/Users/lucas-diedrich/Zotero/storage/46SGGFWC/Oba et al. - 2003 - A Bayesian missing value estimation method for gene expression profile data.pdf} +} + +@article{Stacklies.2007, + title = {{{pcaMethods}}---a Bioconductor Package Providing {{PCA}} Methods for Incomplete Data}, + author = {Stacklies, Wolfram and Redestig, Henning and Scholz, Matthias and Walther, Dirk and Selbig, Joachim}, + year = 2007, + month = may, + journal = {Bioinformatics}, + volume = {23}, + number = {9}, + pages = {1164--1167}, + issn = {1367-4803}, + doi = {10.1093/bioinformatics/btm069}, + urldate = {2025-12-21}, + abstract = {Summary: ~pcaMethods is a Bioconductor compliant library for computing principal component analysis (PCA) on incomplete data sets. The results can be analyzed directly or used to estimate missing values to enable the use of missing value sensitive statistical methods. 
The package was mainly developed with microarray and metabolite data sets in mind, but can be applied to any other incomplete data set as well.Availability: ~http://www.bioconductor.orgContact: ~selbig@mpimp-golm.mpg.deSupplementary information: Please visit our webpage at http://bioinformatics.mpimp-golm.mpg.de/}, } @@ -36,5 +50,4 @@ @inproceedings{Bishop.1998 publisher = {MIT Press}, urldate = {2025-12-21}, keywords = {Bayesian PCA}, - file = {/Users/lucas-diedrich/Zotero/storage/CSN76WHH/Bishop - 1998 - Bayesian PCA.pdf} } diff --git a/src/alphapepttools/pp/__init__.py b/src/alphapepttools/pp/__init__.py index 960215b5..dde457b0 100644 --- a/src/alphapepttools/pp/__init__.py +++ b/src/alphapepttools/pp/__init__.py @@ -1,5 +1,5 @@ from .batch_correction import scanpy_pycombat from .data import add_metadata, filter_by_metadata, filter_data_completeness, scale_and_center -from .impute import impute_gaussian, impute_knn, impute_median +from .impute import impute_bpca, impute_gaussian, impute_knn, impute_median from .norm import normalize from .transform import detect_special_values, nanlog diff --git a/src/alphapepttools/pp/impute.py b/src/alphapepttools/pp/impute.py index e2cf5811..03c210a2 100644 --- a/src/alphapepttools/pp/impute.py +++ b/src/alphapepttools/pp/impute.py @@ -6,6 +6,7 @@ import anndata as ad import numpy as np import pandas as pd +from bpca import BPCA from sklearn.impute import KNNImputer logger = logging.getLogger(__name__) @@ -137,7 +138,7 @@ def impute_gaussian( random_state: int = 42, *, copy: bool = False, -) -> ad.AnnData: +) -> ad.AnnData | None: """Impute missing values in each column by random sampling from a gaussian distribution. 
The distribution is centered at std_offset * feature standard deviation below the @@ -380,7 +381,7 @@ def impute_knn( *, copy: bool = False, **kwargs, -) -> ad.AnnData: +) -> ad.AnnData | None: """Impute missing values using median imputation Replace missing (NaN) values for each feature in the data matrix with the estimate based on non-missing @@ -479,3 +480,127 @@ def impute_knn( adata.layers[layer] = data return adata if copy else None + + +def _impute_bpca(data: np.ndarray, n_components: int, **kwargs) -> np.ndarray: + r"""Imputes missing data with Bayesian Principal Component Analysis + + .. math:: + + X_{estimated} \approx U\times L + \overline{X} + + References + ---------- + - :cite:p:`Stacklies.2007` + """ + bpca = BPCA(n_components=n_components, **kwargs) + + usage = bpca.fit_transform(data) + loadings = bpca.components_ + mean = bpca.mu + + X_reconstructed = usage @ loadings + mean + + return np.where(np.isnan(data), X_reconstructed, data) + + +def impute_bpca( + adata: ad.AnnData, + *, + n_components: int = 50, + layer: str | None = None, + group_column: str | None = None, + copy: bool = False, + **kwargs, +) -> ad.AnnData | None: + r"""Impute missing values using Bayesian Principal Component Analysis (BPCA) + + Estimates the latent covariance structure of the log-transformed data via Bayesian Principal Component Analysis. The imputation method uses the obtained + principal component usages :math:`U` and components :math:`L` to reconstruct the full log-transformed data matrix + + .. math:: + + X_{estimated} \approx U\times L + \overline{X} + + Where :math:`\overline{X}` is the mean of the data. Missing values are replaced with the estimated values from + this procedure. + + Parameters + ---------- + adata + AnnData object. + n_components + Number of components to use for the model fit. The more components are used, the more granular the model + fits the data. 
This might increase model accuracy but also propagates more measurement noise in the + data reconstruction. + layer + Layer to use for imputation. The data should be log transformed to match the noise model of the BPCA method. + If `None`, uses the `adata.X` attribute. + group_column + Column name in `adata.obs` defining groups for group-wise imputation. + - `None` (default, recommended), imputes all samples. + - `str` Fits the BPCA model and imputes separately for each group + If `group_column` contains NaNs, the respective observations are ignored. + copy + Whether to return a modified copy (True) of the anndata object. If False (default) + modifies the object inplace + **kwargs + Passed to :class:`bpca.BPCA` + + Returns + ------- + None | anndata.AnnData + AnnData object with imputed values in layer. + If `copy=False` modifies the anndata object at layer inplace and returns None. If `copy=True`, + returns a modified copy. + + Raises + ------ + Warning + If `group_column` contains NaNs + ValueError + If a feature contains only NaNs + + Example + ------- + + .. 
code-block:: python + + # Log transform data + at.pp.nanlog(adata) + + # Imputes .X inplace + at.pp.impute_bpca(adata, n_components=50, layer=None) + + # Returns a new anndata object with imputed .X layer + adata_new = at.pp.impute_bpca(adata, n_components=50, layer=None, copy=True) + + References + ---------- + This implementation follows the reference implementation in :cite:p:`Stacklies.2007` + + See Also + -------- + :class:`bpca.BPCA` + """ + adata = adata.copy() if copy else adata + + data = adata.X if layer is None else adata.layers[layer] + + if group_column is None: + _raise_on_nan_values(data, mode="all") + data = _impute_bpca(data, n_components=n_components, **kwargs) + else: + groups = adata.obs.groupby(group_column, dropna=True).indices + + for group_indices in groups.values(): + group = data[group_indices] + _raise_on_nan_values(group, mode="all") + data[group_indices, :] = _impute_bpca(group, n_components=n_components, **kwargs) + + if layer is None: + adata.X = data + else: + adata.layers[layer] = data + + return adata if copy else None diff --git a/tests/pp/test_impute.py b/tests/pp/test_impute.py index 3afb5ffe..49e64631 100644 --- a/tests/pp/test_impute.py +++ b/tests/pp/test_impute.py @@ -1,10 +1,18 @@ +from typing import Any + import anndata as ad import numpy as np import pandas as pd import pytest -from alphapepttools.pp import impute_gaussian, impute_knn, impute_median -from alphapepttools.pp.impute import _impute_gaussian, _impute_knn, _impute_nanmedian, _raise_on_nan_values +from alphapepttools.pp import impute_bpca, impute_gaussian, impute_knn, impute_median +from alphapepttools.pp.impute import ( + _impute_bpca, + _impute_gaussian, + _impute_knn, + _impute_nanmedian, + _raise_on_nan_values, +) @pytest.fixture @@ -100,7 +108,23 @@ def gaussian_imputation_dummy_data(imputation_dummy_data) -> tuple[np.ndarray, n return imputation_dummy_data, X -def test___check_all_nan(dummy_data_all_nan) -> None: +@pytest.fixture +def 
bpca_imputation_dummy_data(imputation_dummy_data: np.ndarray) -> tuple[np.ndarray, np.ndarray, dict[str, Any]]: + """Test data for BPCA imputation""" + X_ref = np.array( + [ + [0.0, 0.0, 2.0, 0.94077611, 0.0], + [1.0, 1.0, 3.0, 1.0, 10.0], + [0.0, 2.0, 4.0, 3.03508935, 20.0], + [0.33059225, 3.0, 5.0, 3.0, 18.02331366], + ] + ) + kwargs = {"n_components": 1} + + return imputation_dummy_data, X_ref, kwargs + + +def test___raise_on_nan_values(dummy_data_all_nan) -> None: with pytest.raises(ValueError, match=r"Features with index \[4\]"): _raise_on_nan_values(dummy_data_all_nan, mode="all") @@ -132,6 +156,15 @@ def test__impute_gaussian(gaussian_imputation_dummy_data) -> None: assert np.all(np.isclose(X_imputed, X_ref, equal_nan=True)) +def test__impute_bpca(bpca_imputation_dummy_data: tuple[np.ndarray, np.ndarray, dict[str, Any]]) -> None: + """Test BPCA imputation for data with nan values""" + X, X_ref, kwargs = bpca_imputation_dummy_data + + X_imputed = _impute_bpca(X, **kwargs) + + assert np.allclose(X_imputed, X_ref, equal_nan=False) + + class TestImputeGaussianAnnData: @pytest.fixture def gaussian_imputation_dummy_anndata( @@ -474,3 +507,93 @@ def test_impute_median__missing_layer( with pytest.raises(KeyError): _ = impute_knn(adata, layer="non_existent_layer", copy=True) + + +class TestImputeBPCAAnnData: + @pytest.fixture + def bpca_imputation_dummy_anndata( + self, + bpca_imputation_dummy_data, + ) -> tuple[ad.AnnData, np.ndarray, np.ndarray, dict[str, Any]]: + """Test data for BPCA imputation""" + obs = pd.DataFrame( + { + "sample_id": ["A", "B", "C", "D"], + "sample_group": ["A", "A", "B", "B"], + "sample_group_with_nan": ["A", "A", np.nan, np.nan], + } + ) + + X, X_ref, kwargs = bpca_imputation_dummy_data + # Reference for grouped imputation (computed separately per group with n_components=1) + X_ref_grouped = np.array( + [ + [0.0, 0.0, 2.0, 1.0, 0.0], + [1.0, 1.0, 3.0, 1.0, 10.0], + [0.0, 2.0, 4.0, 3.0, 20.0], + [0.0, 3.0, 5.0, 3.0, 20.0], + ] + ) + + return 
ad.AnnData(X, obs=obs, layers={"layer2": X.copy()}), X_ref, X_ref_grouped, kwargs + + @pytest.fixture + def bpca_imputation_dummy_anndata_all_nan(self, dummy_data_all_nan: np.ndarray) -> ad.AnnData: + """AnnData object with a feature that contains only NaNs""" + + obs = pd.DataFrame( + { + "sample_id": ["A", "B", "C", "D"], + "sample_group": ["A", "A", "B", "B"], + "sample_group_with_nan": ["A", "A", np.nan, np.nan], + } + ) + + return ad.AnnData(X=dummy_data_all_nan, obs=obs) + + @pytest.mark.parametrize("copy", [True, False]) + @pytest.mark.parametrize("layer", [None, "layer2"]) + def test_impute_bpca(self, bpca_imputation_dummy_anndata, layer: str, *, copy: bool) -> None: + """Test BPCA imputation for data with nan values""" + adata, X_ref, _, kwargs = bpca_imputation_dummy_anndata + + result = impute_bpca(adata, layer=layer, **kwargs, copy=copy) + + if copy: + assert isinstance(result, ad.AnnData) + adata_imputed = result + else: + assert result is None + adata_imputed = adata + + X_imputed = adata_imputed.X if layer is None else adata_imputed.layers[layer] + + assert np.allclose(X_imputed, X_ref, equal_nan=False) + + @pytest.mark.parametrize("group_column", [None, "sample_group"]) + def test_impute_bpca__feature_all_nan(self, bpca_imputation_dummy_anndata_all_nan, group_column: str) -> None: + """Test BPCA imputation raises if a feature contains all nan""" + adata = bpca_imputation_dummy_anndata_all_nan + + with pytest.raises(ValueError, match=r"Features with index"): + _ = impute_bpca(adata, n_components=1, group_column=group_column) + + def test_impute_bpca__missing_group_column( + self, + bpca_imputation_dummy_anndata, + ) -> None: + """Test that KeyError is raised if `group_column` does not exist in `adata.obs`""" + adata, _, _, kwargs = bpca_imputation_dummy_anndata + + with pytest.raises(KeyError): + impute_bpca(adata, group_column="non_existent_column", **kwargs) + + def test_impute_bpca__missing_layer( + self, + bpca_imputation_dummy_anndata, + ) -> 
None: + """Test that KeyError is raised if `layer` does not exist in `adata`""" + adata, _, _, kwargs = bpca_imputation_dummy_anndata + + with pytest.raises(KeyError): + impute_bpca(adata, layer="non_existent_layer", **kwargs)