# Source code for camparitraj.cttrajectory

"""
cttrajectory.py

cttrajectory is the main class through which simulation trajectories are read in.

"""

##
##                                       _ _              _
##   ___ __ _ _ __ ___  _ __   __ _ _ __(_) |_ _ __ __ _ (_)
##  / __/ _ | '_  _ \| '_ \ / _ | '__| | __| '__/ _ || |
## | (_| (_| | | | | | | |_) | (_| | |  | | |_| | | (_| || |
##  \___\__,_|_| |_| |_| .__/ \__,_|_|  |_|\__|_|  \__,_|/ |
##                     |_|                             |__/
##
##
## Alex Holehouse (Pappu Lab)
## Simulation analysis package
##

import mdtraj as md
import numpy as np
from .configs import *
from .ctdata  import ALL_VALID_RESIDUE_NAMES

from .ctprotein import CTProtein
from .ctexceptions import CTException
from . import ctutils
from . import ctio

[docs]class CTTrajectory:
"""
CTrajectory class that holds a single simulation trajectory object.

"""

#oxoxoxoxoxooxoxoxoxoxoxoxoxoxoxoxooxoxoxoxoxoxoxoxoxoxoxooxoxoxoxoxoxoxoxoxoxoxooxoxo
#
#
[docs]    def __init__(self, trajectory_filename=None, pdb_filename=None, TRJ=None, protein_grouping=None, pdblead=False, debug=False):
"""
CAMPARITraj trajectory object initializer.

CAMPARITraj will, by default, extract out the protein component from
protein only (i.e. without salt ions getting in the way).

Note that by default the mechanism by which individual proteins are
identified is by cycling through the unique chains and determining
if they are protein or not. You can also provide manual grouping
via the protein_grouping option, which lets you define which
residues should make up an individual protein. This can be useful
if you have multiple proteins associated with the same chain, which
happens in CAMPARI if you have more than 26 separate chains (i.e.
every protein after the 26th is the 'Z' chain).

Parameters
----------
trajectory_filename : str
Filename which contains the trajectory file of interest. Normally \
this is __traj.xtc or __traj.dcd.

pdb_filename : str
Filename which contains the pdb file associated with the trajectory \
of interest. Normally this is __START.pdb.

TRJ : mdtraj.Trajectory
It is sometimes useful to re-defined a trajectory and create a new CTTraj \
object from that trajectory. This could be done by writing that new trajectory \
to file, but this is extremely slow due to the I/O impact of reading/writing \
from disk. If an mdtraj trajectory objected is passed, this is used as the \
new trajectory from which the CTTrajectory object is constructed.

Default = None

protein_grouping : list of lists of ints
Lets you manually define protein groups to be considered independently.

Default = None

Lets you set the PDB file (which is normally ONLY used as a topology \
file) to be the first frame of the trajectory. This is useful when \
the first PDB file holds some specific reference information which \
you want to use (e.g. RMSD or Q).

Default = False

debug : book
Prints warning/help information to help debug weird stuff during initial trajectory read-in.
Default = False.
"""

# first we decide if we're reading from file or from an existing trajectory
if (trajectory_filename is None) and (pdb_filename is None):
if TRJ is None:
raise CTException('No input provided! Please provide ether a pdb and trajectory file OR a pre-formed traj object')

# note the [:] means this is a COPY!
self.traj = TRJ[:]
else:
if (trajectory_filename is None):
raise CTException('No trajectory file provided!')
if (pdb_filename is None):
raise CTException('No PDB file provided!')

# read in the raw trajectory

# Next, having read in the trajectory we parse out into proteins
# extract a list of protein trajectories where each protein is assumed
# to be in its own chain
if protein_grouping == None:
(self.proteinTrajectoryList, self.resid_offset_list, self.atom_offset_list) = self.__get_proteins(self.traj, debug)
else:
(self.proteinTrajectoryList, self.resid_offset_list, self.atom_offset_list)  = self.__get_proteins_by_residue(self.traj, protein_grouping, debug)

self.num_proteins = len(self.proteinTrajectoryList)
self.n_frames = len(self.traj)

def  __repr__(self):
return "CTTrajectory (%s): %i proteins and %i frames" % (hex(id(self)), self.num_proteins, self.n_frames)

def __len__(self):
return (self.num_proteins, self.n_frames)

#oxoxoxoxoxooxoxoxoxoxoxoxoxoxoxoxooxoxoxoxoxoxoxoxoxoxoxooxoxoxoxoxoxoxoxoxoxoxooxoxo
#
#
"""
Internal function which parses and reads in a CAMPARI trajectory

Read a trajectory file. This was separated out into its own

Notably older versions of CAMPARI mess up the unitcell length
vectors, so will cause problems, but you can get around this by
having GROMACS rebuild the trajectory. If this happens an error
pops up but instructions on how to use GROMACS to fix it are
presented.

Parameters
-----------
trajectory_filename : str
Filename which contains the trajectory file of interest. File type
is automatically detected and dealt with mdtraj' 'load' command

pdb_filename : str
Filename which contains the pdb file associated with the trajectory
of interest. This defines the topology of the system and must match
the trajectory in terms of number of atomas

Also extract the coordinates from the PDB file and append it to
the front of the trajectory. This is useful if you are starting
an analysis where that first structure should be a reference frame
but it's not actually included in the trajectory file.

Returns
--------
mdtraj.traj
Returns an mdtraj trajectory object

"""

# straight up read the trajectory first using mdtraj's awesome

# check unit cell lengths
try:
uc_lengths = traj.unitcell_lengths[0]

# this is s custom warning for a specific edge-case we encounter a lot
if (uc_lengths[0] == 0 or uc_lengths[1] == 0 or uc_lengths[2] == 0):
ctio.warning_message("Trajectory file unit cell lengths are zero for at least one dimension. This is a probably a bug with an FRC generated __START.pdb file, because in the old version of CAMPARI used to do grid based FRC calculations the unit cell dimensions are not written correctly. This may cause issues but we're going to assume everything is OK for now. Check the validity of any analysis output. If you're worried, you can use the following workaround.\n\n:::: WORK AROUNDS ::::\nSimply run\n\ntrjconv -f __traj.xtc -s __START.pdb -box a b c -o frc.xtc \n\nAn then \n\ntrjconv -f frc.xtc -s __START.pdb -box a b c -o start.pdb -dump 0\n\n\nHere\n-f n__traj.xtc   : defines the trajectory file\n-s __start.pdb   : defines the pdb file used to parse the topology\n-box a b c       : defines the box lengths **in nanometers**\n-o frc.xtc       : is the name of the new trajectory file with updated box lengths\nSelect 0 (system) when asked to 'Select group for output'.The second step creates the equivalent PDB file with the header-line correctly defining the box unit cell lengths and angles. These two new files should then be used for analysis.\n\nAs an example, if my FRC simulation had a sphere radius of 100 angstroms then my correction command would look something like \n\ntrjconv -f __traj.xtc -s __START.pdb -box 20 20 20 -o frc.xtc\ntrjconv -f frc.xtc -s __START.pdb -box 20 20 20 -o start.pdb -dump 0")

except TypeError:
ctio.warning_message("Warning: UnitCell lengths were not provided... This may cause issues but we're going to assume everything is OK for now...")

# if pdbLead is true then load the pdb_filename as a trajectory
# and then add it to the front (the PDB file is its own topology
# file so no need to specificy the top= file here!
traj = pdbtraj+traj

# having added the PDB file now check all the unit-cells match up!
try:
uc_lengths_1 = traj.unitcell_lengths[0]
uc_lengths_2 = traj.unitcell_lengths[1]

if (uc_lengths_1[0] != uc_lengths_2[0]) or (uc_lengths_1[2] != uc_lengths_2[2]) or (uc_lengths_1[2] != uc_lengths_2[2]):
ctio.warning_message('........................\nWARNING:\nThe unit cell dimensions of the PDB file and trajectory file did not match, specifically\nPDB file = [ %s ]\nXTC file = [ %s ]\nThis may cause issues if native MDTraj untilities are used (and potentially for CTraj utilities that are based on these. It is not necessarily an issue, but PLEASE sanity check your outcome. To be save we reeommend editing the PDB-file untilcell dimenions to match.')

except TypeError:
ctio.warning_message("Warning: UnitCell lengths were not provided... This may cause issues but we're going to assume everything is OK for now...")

return traj

#oxoxoxoxoxooxoxoxoxoxoxoxoxoxoxoxooxoxoxoxoxoxoxoxoxoxoxooxoxoxoxoxoxoxoxoxoxoxooxoxo
#
#
def __get_proteins(self, trajectory, debug):
"""
Internal function that takes an MDTraj trajectory and returns a list of mdtraj
trajectory objects corresponding to each protein in the system, ASSUMING that
each protein is in its own chain.

The way this works is to cycle through each chain, identify if that chain
contains protein or not, and if it does grab all the atoms in that chain
and perform an atomslice using those atoms on the main trajectory.

Parameters
-----------
trajectory : mdtraj.Trajectory
An already parsed trajectory object (i.e. checked for CAMPARI-
relevant defects such as unitcell issues etc)

Returns
---------
tuple :
Returns a tuple with three lists:

proteinTrajectoryList - contains a list of 0 or more CTProtein objcts
resid_offset_list     - contains a list of 0 or more integers which are
resid offset values
atom_offset_list      - contains a list of 0 or more integers which are
atom offset values

Note all three lists must be the same length (by definition)

"""

atom_offset_list  = []
resid_offset_list = []

# extract full system topology
topology = trajectory.topology

chainAtoms = []

# for each chain in this toplogy determine if the
# first residue is protein or not
for chain in topology.chains:

# if the first residue in the chain is protein
# note that a formic acid cap ('FOR') is not recognized as protein
# so we include an edgecase here for that
if chain.residue(0).name in ALL_VALID_RESIDUE_NAMES:

# intialize an empty list of atoms
local_atoms = []

# save that index value associated with
# the first residue in the chain.
resid_offset_list.append(chain.residue(0).index)

for atom in chain.atoms:
local_atoms.append(atom.index)

chainAtoms.append(local_atoms)

# save first atom associated with this chain for offset
atom_offset_list.append(local_atoms[0])

## Code below was the old way of builing the atom sets which
## as of mdtraj 1.9.3 the above seems more efficient
"""
# for each residue in the chain
for residue in chain.residues:

# for each atom in the residue
for atom in residue.atoms:

# append all those atoms to our list of atom indicies
atoms.append(atom.index)

# now add all the atom indices for this protein
# chain as a single list to the super-list of
# chain atoms
chainAtoms.append(atoms)
"""
else:
if debug:
ctio.debug_message('Skipping residue %s from %s' %(chain.residue(0).name, chain))

# for each protein chain that we have atomic indices
# for (hopefully all of them!) cycle through and create
# sub-trajectories
proteinTrajectoryList = []
for local_chain_atoms in chainAtoms:

# generate a trajectory composed of *JUST* the
# $chain atoms. The PT object created now is self # consistent and contains an associated and fully # correct .topology object (NOTE this fixes a # previous bug in CAMPARITraj 0.1.4) PT = trajectory.atom_slice(local_chain_atoms) # gets the resid offset in a way that is ensures internal # consistency for the CTProtein object resid_offset = PT.topology.chain(0).residue(0).index # add that trajectory, along the index value associated with # the resid offset proteinTrajectoryList.append(CTProtein(PT, resid_offset)) if len(proteinTrajectoryList) == 0: ctio.warning_message('No protein chains found in the trajectory') return (proteinTrajectoryList, resid_offset_list, atom_offset_list) #oxoxoxoxoxooxoxoxoxoxoxoxoxoxoxoxooxoxoxoxoxoxoxoxoxoxoxooxoxoxoxoxoxoxoxoxoxoxooxoxo # # def __get_proteins_by_residue(self, trajectory, residue_grouping, debug): """ Internal function which returns a list of mdtraj trajectory objects corresponding to each protein where we *explicitly* define the residues in each protein. Unlike the __get_proteins() function, which doesn't require any manual input in identifying the proteins, here we provide a list of groups, where each group is the set of residues associated with a protein. The way this works is to cycle through each group, and for each residue in each group grabs all the atoms and uses these to carry out an atomslice on the full trajectory. Parameters ----------- trajectory : mdtraj.Trajectory An already parsed trajectory object (i.e. checked for CAMPARI- relevant defects such as unitcell issues etc) residue_grouping : list of list of integers Must be a list containing one or more lists, where each internal list contains a set of monotonically increasing residues (which correspond to the full protein trajectory). In other words, each sub-list defines a single protein. The integer indexing here - importantly - uses the CAMPARITraj internal residue indexing, meaning that indexing begins at 0 from the first residue in the PDB file. """ atom_offset_list = [] resid_offset_list = [] group_atoms = [] # extract full system topology topology = trajectory.topology # for each chain in this toplogy determine if the # first residue is protein or not for group in residue_grouping: local_atoms = [] for res_index in group: # get residue object residue = topology.residue(res_index) resid_offset_list.append(residue.index) # for each atom in the residue for atom in residue.atoms: # append all those atoms to our list of atom indicies local_atoms.append(atom.index) # save first atom associated with this group for offset atom_offset_list.append(local_atoms[0]) # now add all the atom indices for this protein # chain as a single list to the superlist of # chain atoms group_atoms.append(local_atoms) # for each protein group that we have atomic indices # for cycle through and create sub-trajectories proteinTrajectoryList=[] # cycle through each group of atoms as was defined by the residue_grouping # indices for local_group_atoms in group_atoms: # generate a trajectory composed of *JUST* the #$chain atoms. The PT object created now is self
# consistent and contains an associated and fully
# correct .topology object (NOTE this fixes a
# previous bug in CAMPARITraj 0.1.4)
PT = trajectory.atom_slice(local_group_atoms)

# gets the resid offset in a way that is ensures internal
# consistency for the CTProtein object
resid_offset = PT.topology.chain(0).residue(0).index

proteinTrajectoryList.append(CTProtein(PT, resid_offset))

if len(proteinTrajectoryList) == 0:
ctio.warning_message('No protein chains found in the trajectory')

return (proteinTrajectoryList, resid_offset_list, atom_offset_list)

#oxoxoxoxoxooxoxoxoxoxoxoxoxoxoxoxooxoxoxoxoxoxoxoxoxoxoxooxoxoxoxoxoxoxoxoxoxoxooxoxo
#
#
[docs]    def get_interchain_distance_map(self, proteinID1, proteinID2, resID1=None, resID2=None):
"""
Function which returns two matrices with the mean and standard deviation distances
between the residues in resID1 from proteinID1 and resID2 from proteinID2

This computes the (full) intramolecular distance map, where the "distancemap"
function computes the intermolecular distance map.

Specifically, this allows the user to define two distinct chains (i.e. an "interchain"
distance map).

Obviously this only makes sense if your system has two separate protein objects
defined, but in principle the output from:

intra_chain_distance_Map(0,0)

would be the same as:

proteinTrajectoryList[0].get_distance_map()

This is actually a useful sanity check!

Parameters
----------

proteinID1 : int
The ID of the first protein of the two being considered, where the ID is the proteins position in the
self.proteinTrajectoryList list.

proteinID2 : int
The ID of the second protein of the two being considered, where the ID is the proteins position in the
self.proteinTrajectoryList list

resID1 : list of integers, default=None
Is the list of residues from protein 1 we're considering. If this is left as None (default), then it
is assumed that all residues in proteinID1 should be used

resID2 : list of integers, default=None
Is the list of residues from protein 2 we're considering.If this is left as None (default), then it
is assumed that all residues in proteinID2 should be used

Returns
-------
tuple : tuple containing distanceMap and STDMap

distanceMap : numpy matrix
Is an [n x m] matrix where n and m are the number of proteinID1 residues
and proteinID2 residues. Each position in the matrix corresponds to the
mean distance between those two residues over the course of the simulation.

stdMap : numpy matrix
Is an [n x m] matrix where n and m are the number of proteinID1 residues
and proteinID2 residues. Each position in the matrix corresponds to the
standard devaiation associated with the distances between those two
residues.

"""

# get CTProtein objects for the two IDs passed (could be the same)
P1 = self.proteinTrajectoryList[proteinID1]
P2 = self.proteinTrajectoryList[proteinID2]

P1_atom_offset = self.atom_offset_list[proteinID1]
P2_atom_offset = self.atom_offset_list[proteinID2]

# get all the raw CA atom indices we're gonna be working with
CA_p1_raw = P1.get_multiple_CA_index(resID1)
CA_p2_raw = P2.get_multiple_CA_index(resID2)

# atom indices in subtrajectories (i.e. in the trajectories found in proteinTrajectory)
# always start from 0 from their own trajectory! To get around this and relate atom
# numbers in subtrajectories back to the FULL trajectory we have to add the atomic
# offset associated with each subtrajectory
CA_p1 = []
CA_p2 = []
for atom in CA_p1_raw:
CA_p1.append(atom + P1_atom_offset)

for atom in CA_p2_raw:
CA_p2.append(atom + P2_atom_offset)

# create the empty distance maps
distanceMap = np.zeros([len(CA_p1),len(CA_p2),])
stdMap      = np.zeros([len(CA_p1),len(CA_p2),])

# calculate the FULL distance map (so 2N rather than N time).
# note this is actually the non-redundant map, because only in the limit
# of perfect sampling is it necesessarily true that
#
# P1-R5 ::: P2-R10 == P1-R10 :::: P2-R5
#
#

count = 0
for CA1 in CA_p1:
pairs = []
for CA2 in CA_p2:

pairs.append([CA1, CA2])

data = 10*md.compute_distances(self.traj, np.array(pairs))
print("On residue %i (protein 1) out of %i" % (count+1, len(CA_p1)))

# calculate mean and standard deviation over the trajectory data
mean_data = np.mean(data,0)
std_data = np.std(data,0)

# assign rows in the matrix
distanceMap[count] = mean_data
stdMap[count]      = std_data

# increment the outercounter
count = count + 1

return (distanceMap, stdMap)

#oxoxoxoxoxooxoxoxoxoxoxoxoxoxoxoxooxoxoxoxoxoxoxoxoxoxoxooxoxoxoxoxoxoxoxoxoxoxooxoxo
#
#
[docs]    def get_interchain_distance(self, proteinID1, proteinID2, R1, R2, A1='CA', A2='CA', stride=1, mode='atom'):
"""
Function which returns the distance between two specific atoms on two residues, or between
two residues based on mdtraj' atomselection mode rules (discussed below). Required input are protein
ID selectors and the resid being used. Resids should be used as would be normally used for the CTProtein
objects associated with proteinID1 and proteinID2.

For inter-atomic distances, atoms are selected from the passed residue and their 'name' field from the topology
selection language (e.g. "CA", "CB" "NH" etc). By default CA atoms are used, but one can define
any residue of interest. We do not perform any sanity checking on the atom name - this gets
really hard - so have an explicit try/except block which will warn you that you've probably
selected an illegal atom name from the residues.

For inter-residue distances the associated rules are defined by the 'mode' selector. By default
mode is set to 'atom', which means the variables A1 and A2 are used (with CA as default) to define
inter-residue distance. However, if one of the other modes are used the A1/A2 parameters are ignored
and alternative rules for computing inter-residue distance are used. These modes are detailed below.

Distance is returned in Angstroms.

Parameters
----------
R1 : int
Residue index of first residue

R2 : int
Residue index of second residue

A1 : str
Atom name of the atom in R1 we're looking at. Default = 'CA'

A2 : str
Atom name of the atom in R2 we're looking at. Default='CA'

stride : int
Defines the spacing between frames to compare with - i.e. take every \$stride-th frame.
Setting stride=1 would mean every frame is used, which would mean you're doing an
all vs. all comparisons, which would be ideal BUT may be slow. Default = 1

mode : str
Mode allows the user to define different modes for computing atomic distance.

The default is 'atom' whereby a pair of atoms (A1 and A2) are provided. Other options
are detailed below and are identical to those offered by mdtraj in compute_contacts.

Note that if modes other than 'atom' are used the A1 and A2 options are ignored.

+ 'ca' - same as setting 'atom' and A1='CA' and A2='CA', this uses the C-alpha atoms.
+ 'closest' - closest atom associated with each of the residues, i.e. the is the point \
of closest approach between the two residues.
+ 'closest-heavy' - same as 'closest', except only non-hydrogen atoms are considered.
+ 'sidechain' - closest atom where that atom is in the sidechain. Note this requires \
mdtraj version 1.8.0 or higher.
+ 'sidechain-heavy' - closest atom where that atom is in the sidechain and is heavy. \
Note this requires mdtraj version 1.8.0 or higher.

Returns
-----------
np.array
Returns a 1D numpy array with the distance-per-frame betwee the specified residues

"""

# check mode keyword is valid
if mode in [ 'closest', 'ca',  'closest-heavy',  'sidechain', 'sidechain-heavy',  'atom']:
pass
else:
raise CTException("Provided mode keyword must be one of 'closest', 'ca', 'closest-heavy', 'sidechain', sidechain-heavy', or 'atom'. Provided keyword was [%s]" % (mode))

# get CTProtein objects for the two IDs passed (could be the same)
P1 = self.proteinTrajectoryList[proteinID1]
P2 = self.proteinTrajectoryList[proteinID2]

# get resids that correspond to the FULL trajectory
R1_re_referenced = R1 + self.resid_offset_list[proteinID1]
R2_re_referenced = R2 + self.resid_offset_list[proteinID2]

try:
# if atom mode was used
if mode == 'atom':
if A1 == 'CA' and A2 == 'CA':
if float(stride) != float(1):
subtraj = self.traj.slice(list(range(0, self.n_frames, stride)))
else:
subtraj = self.traj

distances = 10*md.compute_contacts(subtraj, [[R1_re_referenced, R2_re_referenced]], scheme='ca')[0].ravel()

else:

atom1 = P1._CTProtein__residue_atom_lookup(R1,A1)
if len(atom1) == 0:
raise CTException('Unable to find atom [%s] in residue R1 (%i)' % (A1, R1))

TRJ_1 = self.traj.atom_slice(atom1)
TRJ_1 = TRJ_1.slice(list(range(0, self.traj.n_frames, stride)))

atom2 = P2._CTProtein__residue_atom_lookup(R2,A2)
if len(atom2) == 0:
raise CTException('Unable to find atom [%s] in residue R1 (%i)' % (A2, R2))

TRJ_2 = self.traj.atom_slice(atom1)
TRJ_2 = TRJ_2.slice(list(range(0, self.traj.n_frames, stride)))

COM_1 = md.compute_center_of_mass(TRJ_1)
COM_2 = md.compute_center_of_mass(TRJ_2)

distances = 10*np.sqrt(np.square(np.transpose(COM_1)[0] - np.transpose(COM_2)[0]) + np.square(np.transpose(COM_1)[1] - np.transpose(COM_2)[1])+np.square(np.transpose(COM_1)[2] - np.transpose(COM_2)[2]))

except IndexError as e:

print("<>@<>@<>@<>@<>@<>@<>@<>@<>@<>@<>@<>@<>@<>@<>@<>@<>@<>@<>@<>@<>@<>@<>@<>@<>@<>@<>@<>@<>@<>@<>@<>@<>@<>@")
print("")
print("This is likely because one of [%s] or [%s] is not a valid atom type for the residue in question. Full error printed below" %( A1,A2))
print("")
print("<>@<>@<>@<>@<>@<>@<>@<>@<>@<>@<>@<>@<>@<>@<>@<>@<>@<>@<>@<>@<>@<>@<>@<>@<>@<>@<>@<>@<>@<>@<>@<>@<>@<>@")
print(e)
raise e

# parse any of the allowed modes in compute_contacts (see http://mdtraj.org/1.8.0/api/generated/mdtraj.compute_contacts.html
# for more details!)
if mode == 'closest' or mode == 'ca' or mode == 'closest-heavy' or mode == 'sidechain' or mode == 'sidechain-heavy':
if float(stride) != float(1):
subtraj = self.traj.slice(list(range(0, self.traj.n_frames, stride)))
else:
subtraj = self.traj

try:
distances = 10*md.compute_contacts(subtraj, [[R1,R2]], scheme=mode)[0].ravel()
except ValueError as e:
print(e)
raise CTException('Your current version of mdtraj does not support [%s] - please update mdtraj to 1.8.0 or later to facilitate support. Alternatively this may be because residue %i or %i is not parsed correctly by mdtraj' % (mode, R1, R2))

# note 10* to get Angstroms
return distances