Skip to content
234 changes: 144 additions & 90 deletions src/openfe_analysis/rmsd.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,9 @@
import MDAnalysis as mda
import netCDF4 as nc
import numpy as np
import tqdm
from MDAnalysis.analysis import rms
from MDAnalysis.lib.mdamath import make_whole
from MDAnalysis.analysis.base import AnalysisBase
from MDAnalysis.transformations import unwrap
from numpy import typing as npt

from .reader import FEReader
from .transformations import Aligner, ClosestImageShift, NoJump
Expand Down Expand Up @@ -99,6 +97,139 @@ def make_Universe(top: pathlib.Path, trj: nc.Dataset, state: int) -> mda.Univers
return u


class ProteinRMSD(AnalysisBase):
"""Calculate the 1D protein RMSD"""

def __init__(self, atomgroup, **kwargs):
super(ProteinRMSD, self).__init__(atomgroup.universe.trajectory, **kwargs)

self._ag = atomgroup

def _prepare(self):
"""
Set the first frame as the reference
"""
self.results.rmsd = []
self._reference = self._ag.positions

def _single_frame(self):
rmsd = rms.rmsd(
self._ag.positions,
self._reference,
center=False,
superposition=False,
)
self.results.rmsd.append(rmsd)

def _conclude(self):
self.results.rmsd = np.asarray(self.results.rmsd)


class Protein2DRMSD(AnalysisBase):
"""
Flattened 2D RMSD matrix

For all unique frame pairs ``(i, j)`` with ``i < j``, this function
computes the RMSD between atomic coordinates after optimal alignment.
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you maybe expand this to mention you're doing a center of geometry fit as well as a rotational and translational superposition usingg QCP?

"""

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you explicitly define _analysis_algorithm_is_parallelizable = False (it's inherited by default, but it would be good to have it explicitly defined here) in these classes and then raise an issue about looking into parallism?

def __init__(self, atomgroup, weights=None, **kwargs):
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here and everywhere else, add typing where possible?

"""
Parameters
----------
atomgroup: AtomGroup
Protein atoms (e.g. CA selection)
weights: np.ndarray, optional
Per-atom weights to use in the RMSD calculation. If ``None``,
all atoms are weighted equally.
"""
super(Protein2DRMSD, self).__init__(atomgroup.universe.trajectory, **kwargs)

self._weights = weights
self._ag = atomgroup

def _prepare(self):
self._coords = []
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you pre-allocate numpy arrays here instead?

self.results.rmsd2d = []

def _single_frame(self):
self._coords.append(self._ag.positions)

def _conclude(self):
positions = np.asarray(self._coords)
nframes, _, _ = positions.shape

output = []
for i, j in itertools.combinations(range(nframes), 2):
posi, posj = positions[i], positions[j]
rmsd = rms.rmsd(
posi,
posj,
self._weights,
center=True,
superposition=True,
)
output.append(rmsd)

self.results.rmsd2d = np.asarray(output)


class LigandRMSD(AnalysisBase):
"""
1D RMSD time series for a ligand AtomGroup.
"""

def __init__(self, atomgroup, **kwargs):
super(LigandRMSD, self).__init__(atomgroup.universe.trajectory, **kwargs)

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

_analysis_algorithm_is_parallelizable

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please remember to add.

self._ag = atomgroup

def _prepare(self):
self.results.rmsd = []
self._reference = self._ag.positions
self._weights = self._ag.masses / np.mean(self._ag.masses)

def _single_frame(self):
rmsd = rms.rmsd(
self._ag.positions,
self._reference,
self._weights,
center=False,
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you document why this is False here but not in 2D RMSD? Are there situations where you would want this to be True? (i.e. should it be a kwarg option?)

superposition=False,
)
self.results.rmsd.append(rmsd)

def _conclude(self):
self.results.rmsd = np.asarray(self.results.rmsd)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

2 initial thoughts:

  • Can we make a more general RMSD class that can be reused for protein and ligand analysis, basically it should work on any atom group and does this not already exist in MDAnalysis?
  • Do we not want to switch to use the symmetry RMSD (spyrmsd I think its called)?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jthorton : There's a difference between the RMSD class in MDAnalysis in what we've been doing, and I'm not sure yet what we want. In MDAnalysis, by default they are doing a superposition (rotational and translational), while we didn't do that. Now I'm not sure, what are we actually interested in, esp. for the ligand RMSD. Should this be the RMSD of the internal conformation of the ligand, or the RMSD of the ligand in the binding pocket/ligand pose? I think in the solvent we would definitely be more interested in the internal conformation of the ligand. In the binding pocket, I'm not sure. We also calculate the ligand COM displacement as a measure of stability in the pocket, but I'm not sure what we would want for the RMSD. What do you think?



class LigandCOMDrift(AnalysisBase):
"""
Ligand center-of-mass displacement from initial position.
"""

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As above re: parallelizable

def __init__(self, atomgroup, **kwargs):
super(LigandCOMDrift, self).__init__(atomgroup.universe.trajectory, **kwargs)

self._ag = atomgroup

def _prepare(self):
self.results.com_drift = []
self._initial_com = self._ag.center_of_mass()

def _single_frame(self):
# distance between start and current ligand position
# ignores PBC, but we've already centered the traj
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why ignore PBC? Could you not just pass the box kwarg argument along? Or is the box distorted because of the transformation?

Please document this in the docstring.

drift = mda.lib.distances.calc_bonds(
self._ag.center_of_mass(),
self._initial_com,
)
self.results.com_drift.append(drift)

def _conclude(self):
self.results.com_drift = np.asarray(self.results.com_drift)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: it may be more ever so slightly efficient to just pre-allocate the array ahead of time in _prepare by defining a numpy array of length self.n_frames. This also has the nice side effect of not needing a _conclude definition.



def gather_rms_data(
pdb_topology: pathlib.Path, dataset: pathlib.Path, skip: Optional[int] = None
) -> dict[str, list[float]]:
Expand Down Expand Up @@ -159,8 +290,6 @@ def gather_rms_data(
# max against 1 to avoid skip=0 case
skip = max(n_frames // 500, 1)

pb = tqdm.tqdm(total=int(n_frames / skip) * n_lambda)

u_top = mda.Universe(pdb_topology)

for i in range(n_lambda):
Expand All @@ -171,93 +300,18 @@ def gather_rms_data(
prot = u.select_atoms("protein and name CA")
ligand = u.select_atoms("resname UNK")

# save coordinates for 2D RMSD matrix
# TODO: Some smart guard to avoid allocating a silly amount of memory?
prot2d = np.empty((len(u.trajectory[::skip]), len(prot), 3), dtype=np.float32)

prot_start = prot.positions
ligand_start = ligand.positions
ligand_initial_com = ligand.center_of_mass()
ligand_weights = ligand.masses / np.mean(ligand.masses)

this_protein_rmsd = []
this_ligand_rmsd = []
this_ligand_wander = []

for ts_i, ts in enumerate(u.trajectory[::skip]):
pb.update()

if prot:
prot2d[ts_i, :, :] = prot.positions
this_protein_rmsd.append(
rms.rmsd(
prot.positions,
prot_start,
None, # prot_weights,
center=False,
superposition=False,
)
)
if ligand:
this_ligand_rmsd.append(
rms.rmsd(
ligand.positions,
ligand_start,
ligand_weights,
center=False,
superposition=False,
)
)
this_ligand_wander.append(
# distance between start and current ligand position
# ignores PBC, but we've already centered the traj
mda.lib.distances.calc_bonds(ligand.center_of_mass(), ligand_initial_com)
)

if prot:
# can ignore weights here as it's all Ca
rmsd2d = twoD_RMSD(prot2d, w=None) # prot_weights)
output["protein_RMSD"].append(this_protein_rmsd)
output["protein_2D_RMSD"].append(rmsd2d)
if ligand:
output["ligand_RMSD"].append(this_ligand_rmsd)
output["ligand_wander"].append(this_ligand_wander)

output["time(ps)"] = list(np.arange(len(u.trajectory))[::skip] * u.trajectory.dt)

return output


def twoD_RMSD(positions, w: Optional[npt.NDArray]) -> list[float]:
"""
Compute a flattened 2D RMSD matrix from a trajectory.

For all unique frame pairs ``(i, j)`` with ``i < j``, this function
computes the RMSD between atomic coordinates after optimal alignment.

Parameters
----------
positions : np.ndarray
Atomic coordinates for all frames in the trajectory.
w : np.ndarray, optional
Per-atom weights to use in the RMSD calculation. If ``None``,
all atoms are weighted equally.
prot_rmsd = ProteinRMSD(prot).run(step=skip)
output["protein_RMSD"].append(prot_rmsd.results.rmsd)
prot_rmsd2d = Protein2DRMSD(prot).run(step=skip)
output["protein_2D_RMSD"].append(prot_rmsd2d.results.rmsd2d)

Returns
-------
list of float
Flattened list of RMSD values corresponding to all frame pairs
``(i, j)`` with ``i < j``.
"""
nframes, _, _ = positions.shape

output = []

for i, j in itertools.combinations(range(nframes), 2):
posi, posj = positions[i], positions[j]

rmsd = rms.rmsd(posi, posj, w, center=True, superposition=True)
if ligand:
lig_rmsd = LigandRMSD(ligand).run(step=skip)
output["ligand_RMSD"].append(lig_rmsd.results.rmsd)
lig_com_drift = LigandCOMDrift(ligand).run(step=skip)
output["ligand_wander"].append(lig_com_drift.results.com_drift)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I know this is historical, so it doesn't have to be here, but can we please renamed this to ligand_com_drift or anything else? wander is such an unspecific name 😅


output.append(rmsd)
output["time(ps)"] = np.arange(len(u.trajectory))[::skip] * u.trajectory.dt

return output