-
Notifications
You must be signed in to change notification settings - Fork 1
Prolif class implementation #88
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 6 commits
93d39f5
5f48f14
2ae6357
976722d
abed811
47060fb
445feb8
a673636
bf4723a
3e9a2b4
76b87f4
990c86f
d964589
23e218b
2d1f22e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -8,6 +8,7 @@ dependencies: | |
| - openff-units | ||
| - pip | ||
| - tqdm | ||
| - prolif | ||
| - pyyaml | ||
| # for testing | ||
| - coverage | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||
|---|---|---|---|---|
| @@ -0,0 +1,234 @@ | ||||
| from __future__ import annotations | ||||
|
|
||||
| import numpy as np | ||||
| from typing import Any, Dict, Optional, Sequence, Tuple | ||||
|
|
||||
| import MDAnalysis as mda | ||||
| from MDAnalysis.analysis.base import AnalysisBase | ||||
|
|
||||
| import prolif as plf | ||||
|
|
||||
|
|
||||
| class ProLIFAnalysis(AnalysisBase): | ||||
| """ | ||||
| ProLIF interaction fingerprint analysis for an OpenFEReader Universe. | ||||
| """ | ||||
|
|
||||
| def __init__( | ||||
| self, | ||||
| universe: mda.Universe, | ||||
| ligand_ag: mda.AtomGroup, | ||||
| water_order: int = 3, | ||||
| interactions: Optional[Sequence[str] | str] = None, | ||||
| guess_bonds: bool = True, | ||||
| vdwradii: Optional[Dict[str, float]] = None, | ||||
| **kwargs, | ||||
| ) -> None: | ||||
| """ | ||||
| Initialize the ProLIF analysis. | ||||
|
|
||||
| Parameters | ||||
| ---------- | ||||
| universe | ||||
| MDAnalysis Universe containing topology and trajectory. | ||||
| ligand_ag | ||||
| mda.AtomGroup representing the ligand. | ||||
| water_order | ||||
| Maximum WaterBridge interaction order (water-water interaction). | ||||
| Only used if "WaterBridge" is tracked. | ||||
| interactions | ||||
| Which interactions to track: | ||||
| - None: ProLIF defaults | ||||
| - "all": all registered (non-bridged; depends on ProLIF version) | ||||
| - Sequence[str]: explicit list like ["VdWContact", "HBDonor"] | ||||
| guess_bonds | ||||
| If True, guess bonds for (protein, ligand, water) so ProLIF can | ||||
| recognize donors/acceptors and bonded hydrogens. | ||||
| vdwradii | ||||
| Optional dict of van der Waals radii used by MDAnalysis bond guesser. | ||||
| Useful when your topology contains types the guesser doesn't know | ||||
| (e.g. "Cl", "Na"). If None, uses coded defaults. | ||||
| """ | ||||
| self.universe = universe | ||||
| self.ligand_ag = ligand_ag | ||||
| self.water_order = water_order | ||||
|
|
||||
| super().__init__(universe.trajectory, **kwargs) | ||||
|
|
||||
| # --- Guess bonds once on stable selections so RDKit/ProLIF can detect HBonds --- | ||||
| if guess_bonds: | ||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There's a lot happening in init here, maybe it would make sense to put some of these into private functions (e.g. to guess bonds, to build the fingerprints) so that they are easier to test separately and for a better overview of the different things that are going on.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. agree, can split it up a little bit, since it is quite crowded currently
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @talagayev i created a small utils function for the guessing of bonds here :
Please let me know what you think about it! |
||||
| if vdwradii is None: | ||||
| # minimal overrides needed for your system (atom types include Cl/Na) | ||||
| vdwradii = { | ||||
| "Cl": 1.75, | ||||
| "CL": 1.75, | ||||
| "Br": 1.85, | ||||
| "BR": 1.85, | ||||
| "Na": 2.27, | ||||
| "NA": 2.27, | ||||
| } | ||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we use the vdwraddi from here https://github.com/MDAnalysis/mdanalysis/blob/develop/package/MDAnalysis/guesser/tables.py#L310 |
||||
|
|
||||
| # Protein: guess on the full protein so any pocket residue later has bonds | ||||
| universe.select_atoms("protein").guess_bonds(vdwradii=vdwradii) | ||||
|
|
||||
| # Ligand: stable group | ||||
| self.ligand_ag.guess_bonds(vdwradii=vdwradii) | ||||
|
|
||||
| # Water: only if you care about water-mediated interactions | ||||
| if guess_bonds: | ||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This should always be true since the outer loop goes through the same. Or did you mean to put something else here? |
||||
| wat_all = universe.select_atoms("water") | ||||
| if wat_all.n_atoms: | ||||
| wat_all.guess_bonds(vdwradii=vdwradii) | ||||
|
|
||||
| # Currently adding here but maybe as args? | ||||
| self.protein_ag = self.universe.select_atoms( | ||||
| "protein and byres around 12 group ligand", | ||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, I agree I think it would be great to have these (also for water below) as args. |
||||
| ligand=self.ligand_ag, | ||||
| updating=True, | ||||
| ) | ||||
| self.water_ag = self.universe.select_atoms( | ||||
| "water and byres around 8 (group ligand or group pocket)", | ||||
| ligand=self.ligand_ag, | ||||
| pocket=self.protein_ag, | ||||
| updating=True, | ||||
| ) | ||||
|
|
||||
| available = plf.Fingerprint.list_available() | ||||
|
|
||||
| if interactions is None: | ||||
| fp_interactions = None | ||||
|
|
||||
| elif interactions == "all": | ||||
| fp_interactions = "all" | ||||
|
|
||||
| else: | ||||
| # Cover case of false interaction | ||||
| missing = [i for i in interactions if i not in available] | ||||
| if missing: | ||||
| raise ValueError( | ||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It looks like ProLIF would also raise an error for missing interactions, so this may not be necessary. |
||||
| f"Unknown interaction(s): {missing}. " f"Available: {available}" | ||||
| ) | ||||
| fp_interactions = list(interactions) | ||||
|
|
||||
| self._parameters = None | ||||
| if ( | ||||
| fp_interactions is not None | ||||
| and fp_interactions != "all" | ||||
| and "WaterBridge" in fp_interactions | ||||
| ): | ||||
| if self.water_ag.n_atoms == 0: | ||||
| raise ValueError("WaterBridge selected but water selection is empty.") | ||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe instead raise a warning, or what would happen if there are individual frames that don't have water molecules?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes warining would be better in that case, since that is quite a possible scenario, best approach would be truly warning + remove the water_bridge interaction so that it doesn't run when there are no waters, since I would need to check, but I think currently it would possibly still try to run it 🤔 |
||||
| self._parameters = { | ||||
| "WaterBridge": {"water": self.water_ag, "order": self.water_order} | ||||
| } | ||||
|
|
||||
| if fp_interactions is None: | ||||
| self.fp = plf.Fingerprint() | ||||
| else: | ||||
| self.fp = plf.Fingerprint(interactions=fp_interactions) | ||||
|
|
||||
| def _prepare(self): | ||||
| self.results.ifp = None | ||||
| self.results.ifp_df = None | ||||
|
|
||||
| def _conclude(self): | ||||
| self.results.ifp = getattr(self.fp, "ifp", None) | ||||
|
|
||||
| def run( | ||||
|
hannahbaumann marked this conversation as resolved.
|
||||
| self, | ||||
| *, | ||||
| start: Optional[int] = None, | ||||
| stop: Optional[int] = None, | ||||
| step: Optional[int] = None, | ||||
| residues: Optional[bool] = None, | ||||
| progress: bool = True, | ||||
| n_jobs: Optional[int] = None, | ||||
| parallel_strategy: Optional[str] = None, | ||||
| converter_kwargs: Optional[Tuple[Dict[str, Any], Dict[str, Any]]] = None, | ||||
| ) -> "ProLIFAnalysis": | ||||
| """ | ||||
| Run the fingerprint calculation over a slice of the trajectory. | ||||
|
|
||||
| Parameters | ||||
| ---------- | ||||
| start, stop, step | ||||
| Trajectory slicing parameters. | ||||
| residues | ||||
| Passed to ProLIF: whether to aggregate interactions with residues. | ||||
| If None, ProLIF's default is used and interactions with atoms are identified. | ||||
| progress | ||||
| Show progress bar. | ||||
| n_jobs | ||||
| Number of workers for parallel execution.). | ||||
| parallel_strategy | ||||
| ProLIF parallel strategy. If None, this wrapper sets: | ||||
| - "chunk" for n_jobs None/1 | ||||
| - "queue" for n_jobs > 1 | ||||
| converter_kwargs | ||||
| Two dicts: (ligand_kwargs, protein_kwargs) forwarded to the MDAnalysis→RDKit | ||||
| converter. If None, we default to: | ||||
| - ligand: {"inferrer": None, "implicit_hydrogens": False} (avoid valence issues) | ||||
| - protein: {"implicit_hydrogens": False} (use topology bonds) | ||||
|
|
||||
| Returns | ||||
| ------- | ||||
| self | ||||
| Returned for fluent chaining. | ||||
| """ | ||||
| # Due to FEReader trajectory only certain strategies work with the format | ||||
| if parallel_strategy is None: | ||||
| # avoid ProLIF trying to pickle FEReader/netCDF trajectory to auto-pick strategy | ||||
| parallel_strategy = "chunk" if (n_jobs is None or n_jobs == 1) else "queue" | ||||
|
|
||||
| _slice = slice(start, stop, step) | ||||
| traj = self.universe.trajectory[_slice] | ||||
|
|
||||
| try: | ||||
| n_total = len(self.universe.trajectory) | ||||
| s0, s1, s2 = _slice.indices(n_total) | ||||
| self.frames = np.arange(s0, s1, s2, dtype=int) | ||||
| self.n_frames = len(traj) | ||||
|
|
||||
| if ( | ||||
| hasattr(self.universe.trajectory, "times") | ||||
| and self.universe.trajectory.times is not None | ||||
| ): | ||||
| self.times = np.asarray(self.universe.trajectory.times)[self.frames] | ||||
| elif getattr(self.universe.trajectory, "dt", None) is not None: | ||||
| self.times = self.frames * self.universe.trajectory.dt | ||||
| else: | ||||
| self.times = None | ||||
| except Exception: | ||||
| self.frames = None | ||||
| self.times = None | ||||
| self.n_frames = None | ||||
|
|
||||
| if converter_kwargs is None: | ||||
| # Avoid Valence errors | ||||
| converter_kwargs = ( | ||||
| {"inferrer": None, "implicit_hydrogens": False}, # ligand | ||||
| {"implicit_hydrogens": False}, # protein | ||||
| ) | ||||
|
|
||||
| self.fp.run( | ||||
| traj, | ||||
| self.ligand_ag, | ||||
| self.protein_ag, | ||||
| residues=residues, | ||||
| converter_kwargs=converter_kwargs, | ||||
| progress=progress, | ||||
| n_jobs=n_jobs, | ||||
| parallel_strategy=parallel_strategy, | ||||
| ) | ||||
|
|
||||
| self._conclude() | ||||
| return self | ||||
|
|
||||
| # For now, depending on what we do withe the data | ||||
| def to_dataframe(self, **kwargs): | ||||
| """ | ||||
| Transform fingerprint results to pd.DataFrame. | ||||
| """ | ||||
| df = self.fp.to_dataframe(**kwargs) | ||||
| self.results.ifp_df = df | ||||
| return df | ||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,66 @@ | ||
| import MDAnalysis as mda | ||
| import numpy as np | ||
| import pytest | ||
| from rdkit.Chem import Lipinski | ||
|
|
||
| from openfe_analysis.reader import FEReader | ||
| from openfe_analysis.prolif import ProLIFAnalysis | ||
|
|
||
|
|
||
| def test_prolifanalysis_runs_vdwcontact( | ||
| simulation_skipped_nc, hybrid_system_skipped_pdb | ||
| ): | ||
| """ | ||
| Test for identification of interactions | ||
| """ | ||
| u = mda.Universe( | ||
| hybrid_system_skipped_pdb, simulation_skipped_nc, format=FEReader, index=0 | ||
| ) | ||
| ligand_ag = u.select_atoms("resname UNK") | ||
|
|
||
| analysis = ProLIFAnalysis( | ||
| u, ligand_ag, interactions=["VdWContact"], guess_bonds=True | ||
| ) | ||
| analysis.run(stop=5, step=1, n_jobs=1, progress=False) | ||
|
|
||
| df = analysis.to_dataframe(dtype=np.uint8) | ||
| assert df.shape[0] == 5 | ||
| assert hasattr(analysis.fp, "ifp") | ||
| assert len(analysis.fp.ifp) == 5 | ||
|
|
||
| # Check AnalysisBase | ||
| assert hasattr(analysis, "results") | ||
| assert hasattr(analysis.results, "ifp") | ||
| assert analysis.results.ifp is analysis.fp.ifp | ||
| assert len(analysis.results.ifp) == 5 | ||
|
|
||
| assert analysis.results.ifp_df is df | ||
|
|
||
| # Ensure there is at least one detected interaction across all processed frames | ||
| assert sum(len(v) for v in analysis.fp.ifp.values()) > 0 | ||
|
|
||
|
|
||
| def test_guess_bonds_enables_protein_chemistry( | ||
| simulation_skipped_nc, hybrid_system_skipped_pdb | ||
| ): | ||
| """ | ||
| Test for protein connectivity | ||
| """ | ||
| u = mda.Universe( | ||
| hybrid_system_skipped_pdb, simulation_skipped_nc, format=FEReader, index=0 | ||
| ) | ||
| ligand_ag = u.select_atoms("resname UNK") | ||
|
|
||
| analysis = ProLIFAnalysis( | ||
| u, ligand_ag, interactions=["VdWContact"], guess_bonds=True | ||
| ) | ||
|
|
||
| # pick a residue from the pocket and check it has connectivity in RDKit | ||
| u.trajectory[0] | ||
| res_atoms = analysis.protein_ag.residues[0].atoms | ||
| res_mol = res_atoms.convert_to("RDKIT", implicit_hydrogens=False) | ||
| assert res_mol.GetNumBonds() > 0 | ||
|
|
||
| # ensure the protein donors/acceptors exist | ||
| prot_mol = analysis.protein_ag.convert_to("RDKIT", implicit_hydrogens=False) | ||
| assert Lipinski.NumHDonors(prot_mol) + Lipinski.NumHAcceptors(prot_mol) > 0 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Missing import for type check