diff --git a/docs/api.md b/docs/api.md index 15a28bb9..5c622751 100644 --- a/docs/api.md +++ b/docs/api.md @@ -35,6 +35,7 @@ tl.get_id2gene_map tl.map_genes_to_protein_groups + tl.find_protease_cut_sites tl.nan_safe_bh_correction tl.nan_safe_ttest_ind tl.diff_exp_ttest diff --git a/src/alphapepttools/tl/__init__.py b/src/alphapepttools/tl/__init__.py index c7c42d92..e6b224a5 100644 --- a/src/alphapepttools/tl/__init__.py +++ b/src/alphapepttools/tl/__init__.py @@ -10,4 +10,4 @@ prepare_scree_data_to_plot, ) from .stats import nan_safe_bh_correction -from .tools import get_id2gene_map, map_genes_to_protein_groups +from .tools import find_protease_cut_sites, get_id2gene_map, map_genes_to_protein_groups diff --git a/src/alphapepttools/tl/tools.py b/src/alphapepttools/tl/tools.py index 872ab204..ea698334 100644 --- a/src/alphapepttools/tl/tools.py +++ b/src/alphapepttools/tl/tools.py @@ -5,7 +5,9 @@ from io import StringIO from pathlib import Path +import anndata as ad import numpy as np +import pandas as pd import regex as re from Bio import SeqIO @@ -158,3 +160,42 @@ def map_genes_to_protein_groups( out_gene_names.append(";".join(gene_names)) return out_gene_names + + +def find_protease_cut_sites( + adata: ad.AnnData, + sequence_column: str = "sequence", + cleavage_pattern: str = r"(? pd.Series: + """Find internal protease cut sites in peptide sequences to detect miscleavages + + The cleavage pattern can be defined as a regex pattern, and looks for tryptic + cleavage sites by default (K or R not followed by P). The function counts only + internal cleavage sites by excluding the C-terminal residue from the search. + + Parameters + ---------- + adata + AnnData object containing peptide-level data. Must have `sequence_column` in `adata.var`. + sequence_column + Column name in `adata.var` containing peptide sequences. Defaults to "sequence". + cleavage_pattern + Regular expression pattern defining the protease cleavage sites. Defaults to r"(?tr|ID0|ID0_HUMAN Protein1 OS=Homo sapiens OX=9606 GN=GN0 PE=1 SV=1 @@ -136,6 +136,37 @@ def test_drop_features_with_too_few_valid_values(example_adata): assert list(filtered_adata.var_names) == expected_columns +# Test find_protease_cut_sites function +@pytest.mark.parametrize( + ("sequences", "expected_counts"), + [ + # Fully tryptic peptides (0 miscleavages) + (["AAAAAA", "AAAAAAK", "AAAAAAAR"], [0, 0, 0]), + # One miscleavage + (["AAAAAKAAAAAR", "AAAAARAAAAAAK"], [1, 1]), + # Two miscleavages + (["AAAAAKAAAAAKAAAAAAR", "AAAAARAAAAARAAAAAAK"], [2, 2]), + # Proline rule: KP and RP should NOT be counted as cleavage sites + (["AAAAKPAAAAAR", "AAAARPAAAAAAK"], [0, 0]), + # Mixed cases + (["AAAAAA", "AAAAAAK", "AAAAAKAAAAAR", "AAAAAKAAAAAKAAAAAAR"], [0, 0, 1, 2]), + # Edge case: empty sequence + ([""], [0]), + # Edge case: single residue + (["K"], [0]), + ], +) +def test_find_protease_cut_sites(sequences, expected_counts): + adata = ad.AnnData( + np.zeros((1, len(sequences))), + var=pd.DataFrame({"sequence": sequences}), + ) + + counts = find_protease_cut_sites(adata, sequence_column="sequence") + + assert list(counts) == expected_counts + + # Test the find_iterable_kwargs function @pytest.fixture def example_kwargs_with_iterables():