Skip to content
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
pp.impute_gaussian
pp.impute_median
pp.impute_knn
pp.impute_bpca
pp.scanpy_pycombat

```
Expand Down
17 changes: 15 additions & 2 deletions docs/references.bib
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,21 @@ @article{Oba.2003
urldate = {2025-10-24},
abstract = {Abstract Motivation: Gene expression profile analyses have been used in numerous studies covering a broad range of areas in biology. When unreliable measurements are excluded, missing values are introduced in gene expression profiles. Although existing multivariate analysis methods have difficulty with the treatment of missing values, this problem has received little attention. There are many options for dealing with missing values, each of which reaches drastically different results. Ignoring missing values is the simplest method and is frequently applied. This approach, however, has its flaws. In this article, we propose an estimation method for missing values, which is based on Bayesian principal component analysis (BPCA). Although the methodology that a probabilistic model and latent variables are estimated simultaneously within the framework of Bayes inference is not new in principle, actual BPCA implementation that makes it possible to estimate arbitrary missing variables is new in terms of statistical methodology. Results: When applied to DNA microarray data from various experimental conditions, the BPCA method exhibited markedly better estimation ability than other recently proposed methods, such as singular value decomposition and K-nearest neighbors. While the estimation performance of existing methods depends on model parameters whose determination is difficult, our BPCA method is free from this difficulty. Accordingly, the BPCA method provides accurate and convenient estimation for missing values. Availability: The software is available at http://hawaii.aist-nara.ac.jp/\textasciitilde shige-o/tools/},
langid = {english},
file = {/Users/lucas-diedrich/Zotero/storage/46SGGFWC/Oba et al. - 2003 - A Bayesian missing value estimation method for gene expression profile data.pdf}
}

@article{Stacklies.2007,
title = {{{pcaMethods}}---a Bioconductor Package Providing {{PCA}} Methods for Incomplete Data},
author = {Stacklies, Wolfram and Redestig, Henning and Scholz, Matthias and Walther, Dirk and Selbig, Joachim},
year = 2007,
month = may,
journal = {Bioinformatics},
volume = {23},
number = {9},
pages = {1164--1167},
issn = {1367-4803},
doi = {10.1093/bioinformatics/btm069},
urldate = {2025-12-21},
abstract = {Summary: pcaMethods is a Bioconductor compliant library for computing principal component analysis (PCA) on incomplete data sets. The results can be analyzed directly or used to estimate missing values to enable the use of missing value sensitive statistical methods. The package was mainly developed with microarray and metabolite data sets in mind, but can be applied to any other incomplete data set as well. Availability: http://www.bioconductor.org Contact: selbig@mpimp-golm.mpg.de Supplementary information: Please visit our webpage at http://bioinformatics.mpimp-golm.mpg.de/},
}


Expand All @@ -36,5 +50,4 @@ @inproceedings{Bishop.1998
publisher = {MIT Press},
urldate = {2025-12-21},
keywords = {Bayesian PCA},
file = {/Users/lucas-diedrich/Zotero/storage/CSN76WHH/Bishop - 1998 - Bayesian PCA.pdf}
}
2 changes: 1 addition & 1 deletion src/alphapepttools/pp/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from .batch_correction import scanpy_pycombat
from .data import add_metadata, filter_by_metadata, filter_data_completeness, scale_and_center
from .impute import impute_gaussian, impute_knn, impute_median
from .impute import impute_bpca, impute_gaussian, impute_knn, impute_median
from .norm import normalize
from .transform import detect_special_values, nanlog
129 changes: 127 additions & 2 deletions src/alphapepttools/pp/impute.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import anndata as ad
import numpy as np
import pandas as pd
from bpca import BPCA
from sklearn.impute import KNNImputer

# logging configuration
Expand All @@ -20,7 +21,7 @@ def impute_gaussian(
layer: str | None = None,
*,
copy: bool = False,
) -> ad.AnnData:
) -> ad.AnnData | None:
"""Impute missing values in each column by random sampling from a gaussian distribution.

The distribution is centered at std_offset * feature standard deviation below the
Expand Down Expand Up @@ -248,7 +249,7 @@ def impute_knn(
weights: Literal["distance", "uniform"] = "distance",
copy: bool = False,
**kwargs,
) -> ad.AnnData:
) -> ad.AnnData | None:
"""Impute missing values using median imputation

Replace missing (NaN) values for each feature in the data matrix with the estimate based on non-missing
Comment thread
lucas-diedrich marked this conversation as resolved.
Expand Down Expand Up @@ -347,3 +348,127 @@ def impute_knn(
adata.layers[layer] = data

return adata if copy else None


def _impute_bpca(data: np.ndarray, n_components: int, **kwargs) -> np.ndarray:
    r"""Impute missing values with Bayesian Principal Component Analysis (BPCA).

    The data matrix is reconstructed from the fitted principal component
    usages :math:`U`, loadings :math:`L`, and the feature-wise mean:

    .. math::

        X_{estimated} \approx U \times L + \overline{X}

    Only entries that are NaN in ``data`` are replaced by the reconstruction;
    observed values are kept unchanged.

    Parameters
    ----------
    data
        Data matrix of shape (n_obs, n_features); may contain NaNs.
    n_components
        Number of principal components used for the model fit.
    **kwargs
        Passed to :class:`bpca.BPCA`.

    Returns
    -------
    np.ndarray
        Array of the same shape as ``data`` with NaN entries replaced by their
        BPCA-based estimates.

    References
    ----------
    - :cite:p:`Stacklies.2007`
    """
    bpca = BPCA(n_components=n_components, **kwargs)

    usage = bpca.fit_transform(data)
    loadings = bpca.components_
    # Feature-wise mean over observed values, used to un-center the low-rank
    # reconstruction. NOTE(review): assumes BPCA fits on mean-centered data --
    # confirm against the bpca.BPCA implementation.
    mean = np.nanmean(data, axis=0)

    X_reconstructed = usage @ loadings + mean

    # Fill only missing entries; keep observed values as-is.
    return np.where(np.isnan(data), X_reconstructed, data)


def impute_bpca(
    adata: ad.AnnData,
    *,
    n_components: int = 50,
    layer: str | None = None,
    group_column: str | None = None,
    copy: bool = False,
    **kwargs,
) -> ad.AnnData | None:
    r"""Impute missing values using Bayesian Principal Component Analysis (BPCA)

    Estimates the latent covariance structure of the log-transformed data via Bayesian
    Principal Component Analysis. The imputation method uses the obtained principal
    component usages :math:`U` and components :math:`L` to reconstruct the full
    log-transformed data matrix

    .. math::

        X_{estimated} \approx U \times L + \overline{X}

    where :math:`\overline{X}` is the feature-wise mean of the data. Missing values are
    replaced with the estimated values from this procedure; observed values are kept
    unchanged.

    Parameters
    ----------
    adata
        AnnData object.
    n_components
        Number of components to use for the model fit. The more components are used,
        the more granular the model fits the data. This might increase model accuracy
        but also propagates more measurement noise in the data reconstruction.
    layer
        Layer to use for imputation. The data should be log transformed to match the
        noise model of the BPCA method. If `None`, uses the `adata.X` attribute.
    group_column
        Column name in `adata.obs` defining groups for group-wise imputation.
        - `None` (default, recommended), imputes across all samples jointly.
        - `str` Fits and imputes separately for each group
        If `group_column` contains NaNs, the respective observations are ignored
        (i.e. left unimputed).
    copy
        Whether to return a modified copy (True) of the anndata object. If False
        (default) modifies the object inplace
    **kwargs
        Passed to :class:`bpca.BPCA`

    Returns
    -------
    None | anndata.AnnData
        AnnData object with imputed values in layer.
        If `copy=False` modifies the anndata object at layer inplace and returns None.
        If `copy=True`, returns a modified copy.

    Raises
    ------
    ValueError
        If a feature contains only NaNs (within a group, for group-wise imputation).
    KeyError
        If `group_column` is not a column of `adata.obs`, or `layer` is not a layer
        of `adata`.

    Example
    -------

    .. code-block:: python

        # Log transform data
        at.pp.nanlog(adata)

        # Imputes .X inplace
        at.pp.impute_bpca(adata, n_components=50, layer=None)

        # Returns a new anndata object with imputed .X layer
        adata_new = at.pp.impute_bpca(adata, n_components=50, layer=None, copy=True)

    References
    ----------
    This implementation follows the reference implementation in :cite:p:`Stacklies.2007`

    See Also
    --------
    :class:`bpca.BPCA`
    """
    adata = adata.copy() if copy else adata

    data = adata.X if layer is None else adata.layers[layer]

    if group_column is None:
        # Fail early with an informative error if any feature is all-NaN.
        _check_all_nan(data)
        data = _impute_bpca(data, n_components=n_components, **kwargs)
    else:
        # dropna=True: observations with a NaN group label are excluded from
        # every group and therefore remain unimputed.
        groups = adata.obs.groupby(group_column, dropna=True).indices

        for group_indices in groups.values():
            group = data[group_indices]
            _check_all_nan(group)
            data[group_indices, :] = _impute_bpca(group, n_components=n_components, **kwargs)

    if layer is None:
        adata.X = data
    else:
        adata.layers[layer] = data

    return adata if copy else None
121 changes: 119 additions & 2 deletions tests/pp/test_impute.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
from typing import Any

import anndata as ad
import numpy as np
import pandas as pd
import pytest

from alphapepttools.pp import impute_gaussian, impute_knn, impute_median
from alphapepttools.pp.impute import _check_all_nan, _impute_knn, _impute_nanmedian
from alphapepttools.pp import impute_bpca, impute_gaussian, impute_knn, impute_median
from alphapepttools.pp.impute import _check_all_nan, _impute_bpca, _impute_knn, _impute_nanmedian


@pytest.fixture
Expand Down Expand Up @@ -67,6 +69,22 @@ def knn_imputation_dummy_data(imputation_dummy_data) -> tuple[np.ndarray, np.nda
return imputation_dummy_data, X_ref, kwargs


@pytest.fixture
def bpca_imputation_dummy_data(imputation_dummy_data: np.ndarray) -> tuple[np.ndarray, np.ndarray, dict[str, Any]]:
    """Test data for BPCA imputation.

    Returns the shared dummy matrix, the expected matrix after rank-1 BPCA
    imputation, and the keyword arguments to pass to the imputer.
    """
    X_ref = np.array(
        [
            [0.0, 0.0, 2.0, 0.94077611, 0.0],
            [1.0, 1.0, 3.0, 1.0, 10.0],
            [0.0, 2.0, 4.0, 3.03508935, 20.0],
            [0.33059225, 3.0, 5.0, 3.0, 18.02331366],
        ]
    )
    kwargs = {"n_components": 1}

    return imputation_dummy_data, X_ref, kwargs


def test___check_all_nan(dummy_data_all_nan) -> None:
with pytest.raises(ValueError, match=r"Features with index \[4\]"):
_check_all_nan(dummy_data_all_nan)
Expand All @@ -90,6 +108,15 @@ def test__impute_knn(knn_imputation_dummy_data) -> None:
assert np.all(np.isclose(X_imputed, X_ref, equal_nan=True))


def test__impute_bpca(bpca_imputation_dummy_data: tuple[np.ndarray, np.ndarray, dict[str, Any]]) -> None:
    """Test BPCA imputation for data with nan values"""
    X, X_ref, kwargs = bpca_imputation_dummy_data

    X_imputed = _impute_bpca(X, **kwargs)

    # equal_nan=False: the imputed matrix must contain no remaining NaNs.
    assert np.allclose(X_imputed, X_ref, equal_nan=False)


class TestImputeGaussian:
@pytest.fixture
def gaussian_imputation_dummy_data(self):
Expand Down Expand Up @@ -357,3 +384,93 @@ def test_impute_median__missing_layer(

with pytest.raises(KeyError):
_ = impute_knn(adata, layer="non_existent_layer", copy=True)


class TestImputeBPCAAnnData:
    """Tests for the AnnData-level `impute_bpca` wrapper."""

    @pytest.fixture
    def bpca_imputation_dummy_anndata(
        self,
        bpca_imputation_dummy_data,
    ) -> tuple[ad.AnnData, np.ndarray, np.ndarray, dict[str, Any]]:
        """Test data for BPCA imputation"""
        obs = pd.DataFrame(
            {
                "sample_id": ["A", "B", "C", "D"],
                "sample_group": ["A", "A", "B", "B"],
                "sample_group_with_nan": ["A", "A", np.nan, np.nan],
            }
        )

        X, X_ref, kwargs = bpca_imputation_dummy_data
        # Reference for grouped imputation (computed separately per group with n_components=1)
        X_ref_grouped = np.array(
            [
                [0.0, 0.0, 2.0, 1.0, 0.0],
                [1.0, 1.0, 3.0, 1.0, 10.0],
                [0.0, 2.0, 4.0, 3.0, 20.0],
                [0.0, 3.0, 5.0, 3.0, 20.0],
            ]
        )

        return ad.AnnData(X, obs=obs, layers={"layer2": X.copy()}), X_ref, X_ref_grouped, kwargs

    @pytest.fixture
    def bpca_imputation_dummy_anndata_all_nan(self, dummy_data_all_nan: np.ndarray) -> ad.AnnData:
        """AnnData object with a feature that contains only NaNs"""

        obs = pd.DataFrame(
            {
                "sample_id": ["A", "B", "C", "D"],
                "sample_group": ["A", "A", "B", "B"],
                "sample_group_with_nan": ["A", "A", np.nan, np.nan],
            }
        )

        return ad.AnnData(X=dummy_data_all_nan, obs=obs)

    @pytest.mark.parametrize("copy", [True, False])
    @pytest.mark.parametrize("layer", [None, "layer2"])
    def test_impute_bpca(self, bpca_imputation_dummy_anndata, layer: str, *, copy: bool) -> None:
        """Test BPCA imputation for data with nan values"""
        adata, X_ref, _, kwargs = bpca_imputation_dummy_anndata

        result = impute_bpca(adata, layer=layer, **kwargs, copy=copy)

        if copy:
            assert isinstance(result, ad.AnnData)
            adata_imputed = result
        else:
            assert result is None
            adata_imputed = adata

        X_imputed = adata_imputed.X if layer is None else adata_imputed.layers[layer]

        assert np.allclose(X_imputed, X_ref, equal_nan=False)

    def test_impute_bpca__grouped(self, bpca_imputation_dummy_anndata) -> None:
        """Test group-wise BPCA imputation against the per-group reference"""
        adata, _, X_ref_grouped, kwargs = bpca_imputation_dummy_anndata

        impute_bpca(adata, group_column="sample_group", **kwargs)

        assert np.allclose(adata.X, X_ref_grouped, equal_nan=False)

    @pytest.mark.parametrize("group_column", [None, "sample_group"])
    def test_impute_bpca__feature_all_nan(self, bpca_imputation_dummy_anndata_all_nan, group_column: str) -> None:
        """Test BPCA imputation raises if a feature contains all nan"""
        adata = bpca_imputation_dummy_anndata_all_nan

        with pytest.raises(ValueError, match=r"Features with index"):
            _ = impute_bpca(adata, n_components=1, group_column=group_column)

    def test_impute_bpca__missing_group_column(
        self,
        bpca_imputation_dummy_anndata,
    ) -> None:
        """Test that KeyError is raised if `group_column` does not exist in `adata.obs`"""
        adata, _, _, kwargs = bpca_imputation_dummy_anndata

        with pytest.raises(KeyError):
            impute_bpca(adata, group_column="non_existent_column", **kwargs)

    def test_impute_bpca__missing_layer(
        self,
        bpca_imputation_dummy_anndata,
    ) -> None:
        """Test that KeyError is raised if `layer` does not exist in `adata`"""
        adata, _, _, kwargs = bpca_imputation_dummy_anndata

        with pytest.raises(KeyError):
            impute_bpca(adata, layer="non_existent_layer", **kwargs)
Loading