# Source code for adaptivemd.analysis.pyemma.emma

##############################################################################
# adaptiveMD: A Python Framework to Run Adaptive Molecular Dynamics (MD)
#             Simulations on HPC Resources
# Copyright 2017 FU Berlin and the Authors
#
# Authors: Jan-Hendrik Prinz
# Contributors:
#
# `adaptiveMD` is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as
# published by the Free Software Foundation, either version 2.1
# of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with MDTraj. If not, see <http://www.gnu.org/licenses/>.
##############################################################################


import os

from adaptivemd import PythonTask
from adaptivemd.analysis import Analysis
from adaptivemd.mongodb import DataDict
from adaptivemd.model import Model

from _remote import remote_analysis


class PyEMMAAnalysis(Analysis):
    """
    Common computation of correlations between features using PyEmma

    Attributes
    ----------
    engine : `Engine`
        reference to an engine that knows about the topology
    outtype : str
        name of the output description to pick the frames from
    features : dict or list or None
        a feature descriptor in the format. A dict has exactly one entry:
        ``functionname: [attr1, attr2, ...]``. attributes can be results of
        function calls. All function calls are to the featurizer object!
        If a list is given each element is considered to be a feature
        descriptor. If None (default) all coordinates will be added as
        features ``.add_all()``

        Examples

        ::code

            # feat.add_backbone_torsions()
            {'add_backbone_torsions': None}

            # feat.add_distances([[0,10], [2,20]])
            {'add_distances': [ [[0,10], [2,20]] ]}

            # feat.add_inverse_distances(select_backbone())
            {'add_inverse_distances': {'select_backbone': None}}

    """

    def __init__(self, engine, outtype='master', features=None):
        """
        Set up the analysis and schedule staging of the engine's pdb file

        Parameters
        ----------
        engine : `Engine`
            engine whose ``'pdb_file'`` entry is staged and later linked
            into each task as the topology
        outtype : str
            name of the output description to pick the frames from
        features : dict or list or None
            feature descriptor, see the class documentation

        """

        super(PyEMMAAnalysis, self).__init__()

        pdb_file = engine['pdb_file']

        # todo: reuse the engines staged pdb_file if possible

        self['pdb_file'] = pdb_file

        # copy the pdb to the staging area once; tasks created by
        # `execute` link to the staged target
        stage = pdb_file.transfer('staging:///')
        self['pdb_file_stage'] = stage.target
        self.initial_staging.append(stage)

        self.outtype = outtype
        self.engine = engine
        self.features = features

    @classmethod
    def from_dict(cls, dct):
        """
        Rebuild an instance from its dictionary representation

        Parameters
        ----------
        dct : dict
            dictionary as produced by `to_dict`; must contain the keys
            ``'outtype'``, ``'engine'`` and ``'features'``

        Returns
        -------
        `PyEMMAAnalysis`
            the reconstructed object

        """
        # NOTE(review): ``super(Analysis, cls)`` starts the lookup *after*
        # `Analysis` in the MRO, so any `Analysis.from_dict` is skipped.
        # Kept as in the original -- confirm this is intended.
        obj = super(Analysis, cls).from_dict(dct)
        for key in ['outtype', 'engine', 'features']:
            setattr(obj, key, dct[key])

        return obj

    def to_dict(self):
        """
        Return a dictionary representation of this object

        Returns
        -------
        dict
            the parent representation extended by the ``'outtype'``,
            ``'engine'`` and ``'features'`` attributes

        """
        # NOTE(review): same ``super(Analysis, ...)`` skip as in `from_dict`
        dct = super(Analysis, self).to_dict()
        for key in ['outtype', 'engine', 'features']:
            dct[key] = getattr(self, key)
        return dct

    @staticmethod
    def then_func(project, task, data, inputs):
        """
        Store the returned analysis data as a `Model` in the project

        Parameters
        ----------
        project : `Project`
            the project whose ``models`` receives the new model
        task : `Task`
            the task that produced ``data``; its ``generator`` is recorded
        data : dict
            the returned data; must already contain an ``'input'`` dict,
            which is extended in place
        inputs : dict
            the call arguments; ``'trajectories'`` and ``'topfile'`` are
            copied into ``data['input']``

        """
        # add the input arguments for later reference
        data['input']['trajectories'] = inputs['trajectories']
        data['input']['pdb'] = inputs['topfile']

        # from the task we get the used generator and then its outtype
        data['input']['modeller'] = task.generator

        # wrapping in a DataDict allows storage of large files!
        model = Model(DataDict(data))
        project.models.add(model)

    def execute(
            self,
            trajectories,
            tica_lag=2,
            tica_dim=2,
            msm_states=5,
            msm_lag=2,
            stride=1):
        """
        Create a task that computes an msm using a given set of trajectories

        Parameters
        ----------
        trajectories : list of `Trajectory`
            the list of trajectory references to be used in the computation
        tica_lag : int
            the lag-time used for tICA
        tica_dim : int
            number of dimensions using in tICA. This refers to the number of tIC used
        msm_states : int
            number of micro-states used for the MSM
        msm_lag : int
            lagtime used for the MSM construction
        stride : int
            a stride to be used on the data. Can speed up computation at reduced accuracy

        Returns
        -------
        `Task` or None
            a task object describing a simple python RPC call using pyemma.
            None if ``trajectories`` is empty or if any trajectory does not
            provide the output type ``self.outtype``

        """

        trajs = list(trajectories)

        # validate the input first so that no task is built (and then
        # thrown away) when there is nothing that can be analyzed
        if not trajs:
            # nothing to analyze
            return None

        outtype = self.outtype

        if any(outtype not in traj.types for traj in trajs):
            # ups, one of the trajectories does not have the required type!
            return None

        # we call the PythonTask with self to tell him about the generator used
        # this will fire the then_func from the generator once finished
        t = PythonTask(self)

        # we handle the returned output ourselves -> its stored as a model
        # so do not store the returned JSON also
        t.store_output = False

        # copy the output.json to a models/model.{uuid}.json file
        t.backup_output_json(
            os.path.join('project:///models', 'model.' + hex(t.__uuid__) + '.json'))

        input_pdb = t.link(self['pdb_file_stage'], 'input.pdb')

        # filename and atom selection are taken from the first trajectory
        # and passed on as a single value for the whole call
        ty = trajs[0].types[outtype]

        t.call(
            remote_analysis,
            trajectories=trajs,
            traj_name=ty.filename,  # we need the filename in the traj folder
            selection=ty.selection,  # tell pyemma the subsets of atoms
            features=self.features,
            topfile=input_pdb,
            tica_lag=tica_lag,
            tica_dim=tica_dim,
            msm_states=msm_states,
            msm_lag=msm_lag,
            stride=stride
        )

        return t