Source code for mlcg_tk.input_generator.raw_data_loader

import numpy as np
import os
from natsort import natsorted
from glob import glob
import h5py
from typing import Tuple, Optional, List
import mdtraj as md
import warnings
from pathlib import Path
from tqdm import tqdm
from .utils import chunker


class DatasetLoader:
    r"""
    Base loader object for a dataset. Defines the interface that all loaders
    should have.

    As there is no standard way of saving the output of molecular dynamics
    simulations, the objective of this class is to be an interface which will
    load the MD data from a given dataset stored in a local path. It will
    further process the data to remove things that are not relevant to our
    system (for example, solvent coordinates and forces) and return the clean
    coordinates and forces as np.ndarrays, together with an mdtraj object that
    carries the topology.

    The loader should be flexible enough to handle datasets with different
    molecules, multiple independent trajectories of different lengths, and
    other variations.
    """

    def get_traj_top(self, name: str, pdb_fn: str):
        r"""
        Function to get the topology associated with a trajectory in the dataset.

        PDB files represent molecules by giving the position of every atom in
        cartesian space. From the distances between atoms, covalent bonds and
        other properties can be deduced. Note that, by convention, raw PDB files
        use angstroms as the unit of the coordinates. Engines such as PyMOL and
        mdtraj convert these values to nanometers.

        Parameters
        ----------
        name:
            Name of input sample (i.e. the molecule or object in the data)
        pdb_fn:
            String representing a path to a PDB file which has the topology for
            the sample
        """
        raise NotImplementedError(f"Base class {self.__class__} has no implementation")

    def load_coords_forces(
        self,
        base_dir: str,
        name: str,
        stride: int = 1,
        batch: Optional[int] = None,
        n_batches: Optional[int] = 1,
    ) -> Tuple[np.ndarray, np.ndarray]:
        r"""
        Method to load the coordinate and force data.

        This requires the base directory where the files are saved, the name of
        the molecule we are trying to load, and a stride parameter. The stride
        is useful for cases where the data is saved too frequently.

        Parameters
        ----------
        base_dir:
            Path to coordinate and force files
        name:
            Name of input sample
        stride : int
            Interval by which to stride loaded data
        batch: int or None
            if trajectories are loaded by batch, indicates the batch index to
            load; must be set if n_batches > 1
        n_batches: int
            if greater than 1, divide the total trajectories to load into
            n_batches chunks
        """
        raise NotImplementedError(f"Base class {self.__class__} has no implementation")

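# A minimal usage sketch of the interface defined above, using one of the
# concrete loaders defined below. The dataset path and sample name
# ("/data/cath", "1bal") are hypothetical placeholders, not values prescribed
# by this module:
#
#     loader = CATH_loader()
#     aa_traj, top_df = loader.get_traj_top("1bal", "/data/cath/{}.pdb")
#     coords, forces = loader.load_coords_forces("/data/cath", "1bal", stride=10)
#     # both arrays are expected to have shape (n_frames, n_atoms, 3)
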
class CATH_loader(DatasetLoader):
    """
    Loader object for the original 50 CATH domain proteins
    """

    def get_traj_top(self, name: str, pdb_fn: str):
        """
        For a given CATH domain name, returns a loaded MDTraj object at the
        input resolution (generally atomistic) as well as the dataframe
        associated with its topology.

        Parameters
        ----------
        name:
            Name of input sample
        pdb_fn:
            Path to pdb structure file
        """
        pdb = md.load(pdb_fn.format(name))
        aa_traj = pdb.atom_slice(
            [a.index for a in pdb.topology.atoms if a.residue.is_protein]
        )
        top_dataframe = aa_traj.topology.to_dataframe()[0]
        return aa_traj, top_dataframe

    def load_coords_forces(
        self,
        base_dir: str,
        name: str,
        stride: int = 1,
        batch: Optional[int] = None,
        n_batches: Optional[int] = 1,
    ) -> Tuple[np.ndarray, np.ndarray]:
        """
        For a given CATH domain name, returns np.ndarrays of its coordinates
        and forces at the input resolution (generally atomistic).

        Parameters
        ----------
        base_dir:
            Path to coordinate and force files
        name:
            Name of input sample
        stride : int
            Interval by which to stride loaded data
        batch: int or None
            if trajectories are loaded by batch, indicates the batch index to
            load; must be set if n_batches > 1
        n_batches: int
            if greater than 1, divide the total trajectories to load into
            n_batches chunks
        """
        outputs_fns = natsorted(glob(os.path.join(base_dir, f"output/{name}/*_part_*")))
        if n_batches > 1:
            raise NotImplementedError(
                "mol_num_batches can only be used for single-protein datasets for now"
            )
        aa_coord_list = []
        aa_force_list = []
        # load the files, checking against the mol dictionary
        for fn in outputs_fns:
            output = np.load(fn)
            coord = output["coords"]
            coord = 10.0 * coord  # convert from nm to angstroms
            force = output["Fs"]
            force = force / 41.84  # convert from kJ/mol/nm to kcal/mol/angstrom
            assert coord.shape == force.shape
            aa_coord_list.append(coord)
            aa_force_list.append(force)
        aa_coords = np.concatenate(aa_coord_list)[::stride]
        aa_forces = np.concatenate(aa_force_list)[::stride]
        return aa_coords, aa_forces

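# The conversion factors above follow from 1 nm = 10 angstrom and
# 1 kcal = 4.184 kJ, so 1 kJ/mol/nm = 1 / (4.184 * 10) kcal/mol/angstrom.
# A small sanity-check sketch (variable names are illustrative only):
#
#     KJ_PER_KCAL = 4.184
#     ANG_PER_NM = 10.0
#     coord_angstrom = coord_nm * ANG_PER_NM
#     force_kcal_mol_ang = force_kj_mol_nm / (KJ_PER_KCAL * ANG_PER_NM)  # i.e. / 41.84
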
class CATH_ext_loader(DatasetLoader):
    """
    Loader object for the extended dataset of CATH domain proteins
    """

    def get_traj_top(self, name: str, pdb_fn: str):
        """
        For a given CATH domain name, returns a loaded MDTraj object at the
        input resolution (generally atomistic) as well as the dataframe
        associated with its topology.

        Parameters
        ----------
        name:
            Name of input sample
        pdb_fn:
            Path to pdb structure file
        """
        pdb_fns = glob(pdb_fn.format(name))
        pdb = md.load(pdb_fns[0])
        aa_traj = pdb.atom_slice(
            [a.index for a in pdb.topology.atoms if a.residue.is_protein]
        )
        top_dataframe = aa_traj.topology.to_dataframe()[0]
        return aa_traj, top_dataframe

    def load_coords_forces(
        self,
        base_dir: str,
        name: str,
        stride: int = 1,
        batch: Optional[int] = None,
        n_batches: Optional[int] = 1,
    ) -> Tuple[np.ndarray, np.ndarray]:
        """
        For a given CATH domain name, returns np.ndarrays of its coordinates
        and forces at the input resolution (generally atomistic).

        Parameters
        ----------
        base_dir:
            Path to coordinate and force files
        name:
            Name of input sample
        """
        if n_batches > 1:
            raise NotImplementedError(
                "mol_num_batches can only be used for single-protein datasets for now"
            )
        traj_dirs = glob(os.path.join(base_dir, f"group_*/{name}_*/"))
        all_coords = []
        all_forces = []
        for traj_dir in tqdm(traj_dirs):
            traj_coords = []
            traj_forces = []
            fns = glob(os.path.join(traj_dir, "prod_out_full_output/*.npz"))
            fns.sort(key=lambda file: int(file.split("_")[-2]))
            last_parent_id = None
            for fn in fns:
                np_dict = np.load(fn, allow_pickle=True)
                current_id = np_dict["id"]
                parent_id = np_dict["parent_id"]
                if parent_id is not None:
                    # each chunk must continue from the previously loaded chunk
                    assert parent_id == last_parent_id
                traj_coords.append(np_dict["coords"])
                traj_forces.append(np_dict["Fs"])
                last_parent_id = current_id
            traj_full_coords = np.concatenate(traj_coords)
            traj_full_forces = np.concatenate(traj_forces)
            # skip incomplete trajectories
            if traj_full_coords.shape[0] != 25000:
                continue
            all_coords.append(traj_full_coords[::stride])
            all_forces.append(traj_full_forces[::stride])
        full_coords = np.concatenate(all_coords)
        full_forces = np.concatenate(all_forces)
        return full_coords, full_forces

class DIMER_loader(DatasetLoader):
    """
    Loader object for original dataset of mono- and dipeptide pairwise
    umbrella sampling simulations
    """

    def get_traj_top(self, name: str, pdb_fn: str):
        """
        For a given DIMER pair name, returns a loaded MDTraj object at the
        input resolution (generally atomistic) as well as the dataframe
        associated with its topology.

        Parameters
        ----------
        name:
            Name of input sample
        pdb_fn:
            Path to pdb structure file
        """
        pdb = md.load(pdb_fn.format(name))
        aa_traj = pdb.atom_slice(
            [a.index for a in pdb.topology.atoms if a.residue.is_protein]
        )
        top_dataframe = aa_traj.topology.to_dataframe()[0]
        return aa_traj, top_dataframe

    def load_coords_forces(
        self,
        base_dir: str,
        name: str,
        stride: int = 1,
        batch: Optional[int] = None,
        n_batches: Optional[int] = 1,
    ) -> Tuple[np.ndarray, np.ndarray]:
        """
        For a given DIMER pair name, returns np.ndarrays of its coordinates
        and forces at the input resolution (generally atomistic).

        Parameters
        ----------
        base_dir:
            Path to coordinate and force files
        name:
            Name of input sample
        stride : int
            Interval by which to stride loaded data
        batch: int or None
            if trajectories are loaded by batch, indicates the batch index to
            load; must be set if n_batches > 1
        n_batches: int
            if greater than 1, divide the total trajectories to load into
            n_batches chunks
        """
        if n_batches > 1:
            raise NotImplementedError(
                "mol_num_batches can only be used for single-protein datasets for now"
            )
        with h5py.File(os.path.join(base_dir, "allatom.h5"), "r") as data:
            coord = data["MINI"][name]["aa_coords"][:][::stride]
            force = data["MINI"][name]["aa_forces"][:][::stride]
        # convert to kcal/mol/angstrom and angstrom
        # from kJ/mol/nm and nm
        coord = coord * 10
        force = force / 41.84
        return coord, force

class DIMER_ext_loader(DatasetLoader):
    """
    Loader object for extended dataset of mono- and dipeptide pairwise
    umbrella sampling simulations
    """

    def get_traj_top(self, name: str, pdb_fn: str):
        """
        For a given DIMER pair name, returns a loaded MDTraj object at the
        input resolution (generally atomistic) as well as the dataframe
        associated with its topology.

        Parameters
        ----------
        name:
            Name of input sample
        pdb_fn:
            Path to pdb structure file
        """
        pdb_fns = glob(pdb_fn.format(name))
        if len(pdb_fns) == 1:
            pdb = md.load(pdb_fns[0])
        elif len(pdb_fns) > 1:
            warnings.warn(
                f"Pattern {pdb_fn.format(name)} has more than one result",
                category=UserWarning,
            )
            # fall back to the first match so a topology is still returned
            pdb = md.load(pdb_fns[0])
        else:
            return None, None
        aa_traj = pdb.atom_slice(
            [a.index for a in pdb.topology.atoms if a.residue.is_protein]
        )
        top_dataframe = aa_traj.topology.to_dataframe()[0]
        return aa_traj, top_dataframe

    def load_coords_forces(
        self,
        base_dir: str,
        name: str,
        stride: int = 1,
        batch: Optional[int] = None,
        n_batches: Optional[int] = 1,
    ) -> Tuple[np.ndarray, np.ndarray]:
        """
        For a given DIMER pair name, returns np.ndarrays of its coordinates
        and forces at the input resolution (generally atomistic).

        Parameters
        ----------
        base_dir:
            Path to coordinate and force files
        name:
            Name of input sample
        """
        if n_batches > 1:
            raise NotImplementedError(
                "mol_num_batches can only be used for single-protein datasets for now"
            )
        coord = np.load(
            glob(os.path.join(base_dir, f"dip_dimers_*/data/{name}_coord.npy"))[0],
            allow_pickle=True,
        )[::stride]
        force = np.load(
            glob(os.path.join(base_dir, f"dip_dimers_*/data/{name}_force.npy"))[0],
            allow_pickle=True,
        )[::stride]
        # convert to kcal/mol/angstrom and angstrom
        # from kJ/mol/nm and nm
        coord = coord * 10
        force = force / 41.84
        return coord, force

class Trpcage_loader(DatasetLoader):
    """
    Loader object for Trpcage simulation dataset
    """

    def get_traj_top(self, name: str, pdb_fn: str):
        """
        For a given name, returns a loaded MDTraj object at the input
        resolution (generally atomistic) as well as the dataframe associated
        with its topology.

        Parameters
        ----------
        name:
            Name of input sample
        pdb_fn:
            Path to pdb structure file
        """
        pdb = md.load(pdb_fn.format(name))
        aa_traj = pdb.atom_slice(
            [a.index for a in pdb.topology.atoms if a.residue.is_protein]
        )
        top_dataframe = aa_traj.topology.to_dataframe()[0]
        return aa_traj, top_dataframe

    def load_coords_forces(
        self,
        base_dir: str,
        name: str,
        stride: int = 1,
        batch: Optional[int] = None,
        n_batches: Optional[int] = 1,
    ) -> Tuple[np.ndarray, np.ndarray]:
        """
        For a given name, returns np.ndarrays of its coordinates and forces at
        the input resolution (generally atomistic).

        Parameters
        ----------
        base_dir:
            Path to coordinate and force files
        name:
            Name of input sample
        stride : int
            Interval by which to stride loaded data
        batch: int or None
            if trajectories are loaded by batch, indicates the batch index to
            load; must be set if n_batches > 1
        n_batches: int
            if greater than 1, divide the total trajectories to load into
            n_batches chunks
        """
        coords_fns = natsorted(
            glob(
                os.path.join(base_dir, "coords_nowater/trp_coor_folding-trpcage_*.npy")
            )
        )
        forces_fns = [
            fn.replace(
                "coords_nowater/trp_coor_folding", "forces_nowater/trp_force_folding"
            )
            for fn in coords_fns
        ]
        coords_fns = np.array(coords_fns)
        forces_fns = np.array(forces_fns)
        if n_batches > 1:
            assert batch is not None, "batch id must be set if more than 1 batch"
            chunk_ids = chunker(
                [i for i in range(len(coords_fns))], n_batches=n_batches
            )
            coords_fns = coords_fns[np.array(chunk_ids[batch])]
            forces_fns = forces_fns[np.array(chunk_ids[batch])]
        aa_coord_list = []
        aa_force_list = []
        # load the files, checking against the mol dictionary
        for cfn, ffn in tqdm(zip(coords_fns, forces_fns), total=len(coords_fns)):
            force = np.load(ffn)  # in kcal/mol/angstrom
            coord = np.load(cfn)  # in angstrom
            assert coord.shape == force.shape
            aa_coord_list.append(coord[::stride])
            aa_force_list.append(force[::stride])
        aa_coords = np.concatenate(aa_coord_list)
        aa_forces = np.concatenate(aa_force_list)
        return aa_coords, aa_forces

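# In the loaders above, n_batches > 1 splits the list of trajectory files into
# roughly equal chunks and loads only the chunk at index `batch`, so that a
# large dataset can be processed across several jobs. `chunker` is imported
# from .utils; assuming it returns per-chunk index lists, the selection is
# equivalent to the following sketch (this equivalence is an assumption, not
# taken from this module):
#
#     chunks = np.array_split(np.arange(len(coords_fns)), n_batches)
#     coords_fns = coords_fns[chunks[batch]]
#     forces_fns = forces_fns[chunks[batch]]
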
class Cln_loader(DatasetLoader):
    """
    Loader object for chignolin (CLN) simulation dataset
    """

    def get_traj_top(self, name: str, pdb_fn: str):
        pdb = md.load(pdb_fn.format(name))
        aa_traj = pdb.atom_slice(
            [a.index for a in pdb.topology.atoms if a.residue.is_protein]
        )
        top_dataframe = aa_traj.topology.to_dataframe()[0]
        return aa_traj, top_dataframe

    def load_coords_forces(
        self,
        base_dir: str,
        name: str,
        stride: int = 1,
        batch: Optional[int] = None,
        n_batches: Optional[int] = 1,
    ) -> Tuple[np.ndarray, np.ndarray]:
        """
        For a given name, returns np.ndarrays of its coordinates and forces at
        the input resolution (generally atomistic).

        Parameters
        ----------
        base_dir:
            Path to coordinate and force files
        name:
            Name of input sample
        stride : int
            Interval by which to stride loaded data
        batch: int or None
            if trajectories are loaded by batch, indicates the batch index to
            load; must be set if n_batches > 1
        n_batches: int
            if greater than 1, divide the total trajectories to load into
            n_batches chunks
        """
        coords_fns = natsorted(
            glob(os.path.join(base_dir, "coords_nowater/chig_coor_*.npy"))
        )
        forces_fns = [
            fn.replace("coords_nowater/chig_coor_", "forces_nowater/chig_force_")
            for fn in coords_fns
        ]
        coords_fns = np.array(coords_fns)
        forces_fns = np.array(forces_fns)
        if n_batches > 1:
            assert batch is not None, "batch id must be set if more than 1 batch"
            chunk_ids = chunker(
                [i for i in range(len(coords_fns))], n_batches=n_batches
            )
            coords_fns = coords_fns[np.array(chunk_ids[batch])]
            forces_fns = forces_fns[np.array(chunk_ids[batch])]
        aa_coord_list = []
        aa_force_list = []
        # load the files, checking against the mol dictionary
        for cfn, ffn in tqdm(zip(coords_fns, forces_fns), total=len(coords_fns)):
            force = np.load(ffn)  # in kcal/mol/angstrom
            coord = np.load(cfn)  # in angstrom
            assert coord.shape == force.shape
            aa_coord_list.append(coord[::stride])
            aa_force_list.append(force[::stride])
        aa_coords = np.concatenate(aa_coord_list)
        aa_forces = np.concatenate(aa_force_list)
        return aa_coords, aa_forces

class BBA_loader(DatasetLoader):
    """
    Loader object for CHARMM22* BBA simulation dataset
    """

    def get_traj_top(self, name: str, pdb_fn: str):
        """
        For a given name, returns a loaded MDTraj object at the input
        resolution (generally atomistic) as well as the dataframe associated
        with its topology.

        Parameters
        ----------
        name:
            Name of input sample
        pdb_fn:
            Path to pdb structure file
        """
        pdb = md.load(pdb_fn.format(name))
        aa_traj = pdb.atom_slice(
            [a.index for a in pdb.topology.atoms if a.residue.is_protein]
        )
        top_dataframe = aa_traj.topology.to_dataframe()[0]
        return aa_traj, top_dataframe

    def load_coords_forces(
        self,
        base_dir: str,
        name: str,
        stride: int = 1,
        batch: Optional[int] = None,
        n_batches: Optional[int] = 1,
    ) -> Tuple[np.ndarray, np.ndarray]:
        """
        For a given name, returns np.ndarrays of its coordinates and forces at
        the input resolution (generally atomistic).

        Parameters
        ----------
        base_dir:
            Path to coordinate and force files
        name:
            Name of input sample
        stride : int
            Interval by which to stride loaded data
        batch: int or None
            if trajectories are loaded by batch, indicates the batch index to
            load; must be set if n_batches > 1
        n_batches: int
            if greater than 1, divide the total trajectories to load into
            n_batches chunks
        """
        coords_fns = natsorted(
            glob(os.path.join(base_dir, "coords_nowater/bba_coor_folding-bba_*.npy"))
        )
        forces_fns = [
            fn.replace(
                "coords_nowater/bba_coor_folding", "forces_nowater/bba_force_folding"
            )
            for fn in coords_fns
        ]
        coords_fns = np.array(coords_fns)
        forces_fns = np.array(forces_fns)
        if n_batches > 1:
            assert batch is not None, "batch id must be set if more than 1 batch"
            chunk_ids = chunker(
                [i for i in range(len(coords_fns))], n_batches=n_batches
            )
            coords_fns = coords_fns[np.array(chunk_ids[batch])]
            forces_fns = forces_fns[np.array(chunk_ids[batch])]
        aa_coord_list = []
        aa_force_list = []
        # load the files, checking against the mol dictionary
        for cfn, ffn in tqdm(zip(coords_fns, forces_fns), total=len(coords_fns)):
            force = np.load(ffn)  # in kcal/mol/angstrom
            coord = np.load(cfn)  # in angstrom
            assert coord.shape == force.shape
            aa_coord_list.append(coord[::stride])
            aa_force_list.append(force[::stride])
        aa_coords = np.concatenate(aa_coord_list)
        aa_forces = np.concatenate(aa_force_list)
        return aa_coords, aa_forces

class Villin_loader(DatasetLoader):
    """
    Loader object for villin simulation dataset
    """

    def get_traj_top(self, name: str, pdb_fn: str):
        pdb = md.load(pdb_fn.format(name))
        aa_traj = pdb.atom_slice(
            [a.index for a in pdb.topology.atoms if a.residue.is_protein]
        )
        top_dataframe = aa_traj.topology.to_dataframe()[0]
        return aa_traj, top_dataframe

    def load_coords_forces(
        self,
        base_dir: str,
        name: str,
        stride: int = 1,
        batch: Optional[int] = None,
        n_batches: Optional[int] = 1,
    ) -> Tuple[np.ndarray, np.ndarray]:
        """
        For a given name, returns np.ndarrays of its coordinates and forces at
        the input resolution (generally atomistic).

        Parameters
        ----------
        base_dir:
            Path to coordinate and force files
        name:
            Name of input sample
        stride : int
            Interval by which to stride loaded data
        batch: int or None
            if trajectories are loaded by batch, indicates the batch index to
            load; must be set if n_batches > 1
        n_batches: int
            if greater than 1, divide the total trajectories to load into
            n_batches chunks
        """
        pdb_top = md.load_pdb(os.path.join(base_dir, "topology.pdb"))
        coords_fns = sorted(
            glob(os.path.join(base_dir, "coords_nowater/villin*_coor*.xtc"))
        )  # combining all trajectories from single starting structure
        forces_fns = [
            fn.replace(
                "coords_nowater/villin_coor", "forces_nowater/villin_force"
            ).replace(".xtc", ".dcd")
            for fn in coords_fns
        ]
        coords_fns = np.array(coords_fns)
        forces_fns = np.array(forces_fns)
        if n_batches > 1:
            assert batch is not None, "batch id must be set if more than 1 batch"
            chunk_ids = chunker(
                [i for i in range(len(coords_fns))], n_batches=n_batches
            )
            coords_fns = coords_fns[np.array(chunk_ids[batch])]
            forces_fns = forces_fns[np.array(chunk_ids[batch])]
        aa_coord_list = []
        aa_force_list = []
        for c, f in tqdm(zip(coords_fns, forces_fns), total=len(coords_fns)):
            coords = md.load_xtc(c, top=pdb_top).xyz * 10
            forces = md.load_dcd(f, top=pdb_top).xyz * 10
            if coords.shape == forces.shape:
                aa_coord_list.append(coords[::stride])
                aa_force_list.append(forces[::stride])
            else:
                warnings.warn(
                    f"file {c} and \n file {f} \n have different shapes. Skipping"
                )
        aa_coords = np.concatenate(aa_coord_list)
        aa_forces = np.concatenate(aa_force_list)
        return aa_coords, aa_forces

class NTL9_loader(DatasetLoader):
    r"""
    Loader object for CHARMM22* NTL9 simulation dataset
    """

    def get_traj_top(self, name: str, pdb_fn: str):
        r"""
        For a given name, returns a loaded MDTraj object at the input
        resolution (generally atomistic) as well as the dataframe associated
        with its topology.

        Parameters
        ----------
        name:
            Name of input sample
        pdb_fn:
            Path to pdb structure file
        """
        pdb = md.load(pdb_fn.format(name))
        aa_traj = pdb.atom_slice(
            [a.index for a in pdb.topology.atoms if a.residue.is_protein]
        )
        top_dataframe = aa_traj.topology.to_dataframe()[0]
        return aa_traj, top_dataframe

    def load_coords_forces(
        self,
        base_dir: str,
        name: str,
        stride: int = 1,
        batch: Optional[int] = None,
        n_batches: Optional[int] = 1,
        chop: int = 5,
    ) -> Tuple[np.ndarray, np.ndarray]:
        r"""
        For a given name, returns np.ndarrays of its coordinates and forces at
        the input resolution (generally atomistic).

        Parameters
        ----------
        base_dir:
            Path to coordinate and force files
        name:
            Name of input sample
        stride : int
            Interval by which to stride loaded data
        batch: int or None
            if trajectories are loaded by batch, indicates the batch index to
            load; must be set if n_batches > 1
        n_batches: int
            if greater than 1, divide the total trajectories to load into
            n_batches chunks
        chop: int
            number of initial frames to discard from every simulation;
            necessary due to some problematic frames
        """
        traj_name_pat = "ntl9_coor_"
        coord_pattern = Path(base_dir) / "coords_nowater" / f"{traj_name_pat}*.npy"
        coords_fns = natsorted(glob(str(coord_pattern)))
        forces_fns = [
            fn.replace("coords_nowater", "forces_nowater").replace("coor", "force")
            for fn in coords_fns
        ]
        coords_fns = np.array(coords_fns)
        forces_fns = np.array(forces_fns)
        if n_batches > 1:
            assert batch is not None, "batch id must be set if more than 1 batch"
            chunk_ids = chunker(
                [i for i in range(len(coords_fns))], n_batches=n_batches
            )
            coords_fns = coords_fns[np.array(chunk_ids[batch])]
            forces_fns = forces_fns[np.array(chunk_ids[batch])]
        aa_coord_list = []
        aa_force_list = []
        # load the files, checking against the mol dictionary
        for cfn, ffn in tqdm(zip(coords_fns, forces_fns), total=len(coords_fns)):
            force = np.load(ffn)[chop::stride]  # in kcal/mol/angstrom
            coord = np.load(cfn)[chop::stride]  # in angstrom
            assert coord.shape == force.shape
            aa_coord_list.append(coord)
            aa_force_list.append(force)
        aa_coords = np.concatenate(aa_coord_list)
        aa_forces = np.concatenate(aa_force_list)
        return aa_coords, aa_forces

class ProteinG_loader(DatasetLoader):
    """
    Loader object for CHARMM22* Protein G simulation dataset
    """

    def get_traj_top(self, name: str, pdb_fn: str):
        """
        For a given name, returns a loaded MDTraj object at the input
        resolution (generally atomistic) as well as the dataframe associated
        with its topology.

        Parameters
        ----------
        name:
            Name of input sample
        pdb_fn:
            Path to pdb structure file
        """
        pdb = md.load(pdb_fn.format(name))
        aa_traj = pdb.atom_slice(
            [a.index for a in pdb.topology.atoms if a.residue.is_protein]
        )
        top_dataframe = aa_traj.topology.to_dataframe()[0]
        return aa_traj, top_dataframe

    def load_coords_forces(
        self,
        base_dir: str,
        name: str,
        stride: int = 1,
        batch: Optional[int] = None,
        n_batches: Optional[int] = 1,
    ) -> Tuple[np.ndarray, np.ndarray]:
        """
        For a given name, returns np.ndarrays of its coordinates and forces at
        the input resolution (generally atomistic).

        Parameters
        ----------
        base_dir:
            Path to coordinate and force files
        name:
            Name of input sample
        stride : int
            Interval by which to stride loaded data
        batch: int or None
            if trajectories are loaded by batch, indicates the batch index to
            load; must be set if n_batches > 1
        n_batches: int
            if greater than 1, divide the total trajectories to load into
            n_batches chunks
        """
        coords_fns = natsorted(
            glob(
                os.path.join(
                    base_dir, "coords_3_nowater/proteing_coor_folding-proteing-*.npy"
                )
            )
        )
        forces_fns = [
            fn.replace(
                "coords_3_nowater/proteing_coor_folding",
                "forces_3_nowater/proteing_force_folding",
            )
            for fn in coords_fns
        ]
        coords_fns = np.array(coords_fns)
        forces_fns = np.array(forces_fns)
        if n_batches > 1:
            assert batch is not None, "batch id must be set if more than 1 batch"
            chunk_ids = chunker(
                [i for i in range(len(coords_fns))], n_batches=n_batches
            )
            coords_fns = coords_fns[np.array(chunk_ids[batch])]
            forces_fns = forces_fns[np.array(chunk_ids[batch])]
        aa_coord_list = []
        aa_force_list = []
        # load the files, checking against the mol dictionary
        for cfn, ffn in tqdm(zip(coords_fns, forces_fns), total=len(coords_fns)):
            force = np.load(ffn)  # in kcal/mol/angstrom
            coord = np.load(cfn)  # in angstrom
            assert coord.shape == force.shape
            aa_coord_list.append(coord[::stride])
            aa_force_list.append(force[::stride])
        aa_coords = np.concatenate(aa_coord_list)
        aa_forces = np.concatenate(aa_force_list)
        return aa_coords, aa_forces

class A3D_loader(DatasetLoader):
    """
    Loader object for CHARMM22* A3D simulation dataset
    """

    def get_traj_top(self, name: str, pdb_fn: str):
        """
        For a given name, returns a loaded MDTraj object at the input
        resolution (generally atomistic) as well as the dataframe associated
        with its topology.

        Parameters
        ----------
        name:
            Name of input sample
        pdb_fn:
            Path to pdb structure file
        """
        pdb = md.load(pdb_fn.format(name))
        aa_traj = pdb.atom_slice(
            [a.index for a in pdb.topology.atoms if a.residue.is_protein]
        )
        top_dataframe = aa_traj.topology.to_dataframe()[0]
        return aa_traj, top_dataframe

    def load_coords_forces(
        self,
        base_dir: str,
        name: str,
        stride: int = 1,
        batch: Optional[int] = None,
        n_batches: Optional[int] = 1,
    ) -> Tuple[np.ndarray, np.ndarray]:
        """
        For a given name, returns np.ndarrays of its coordinates and forces at
        the input resolution (generally atomistic).

        Parameters
        ----------
        base_dir:
            Path to coordinate and force files
        name:
            Name of input sample
        stride : int
            Interval by which to stride loaded data
        batch: int or None
            if trajectories are loaded by batch, indicates the batch index to
            load; must be set if n_batches > 1
        n_batches: int
            if greater than 1, divide the total trajectories to load into
            n_batches chunks
        """
        coords_fns = natsorted(
            glob(os.path.join(base_dir, "coords_*_nowater/a3D_coor_folding-a3d-*.npy"))
        )
        forces_fns = []
        for coord_fn in coords_fns:
            if "coords_1_nowater" in coord_fn:
                forces_fns.append(
                    coord_fn.replace(
                        "coords_1_nowater/a3D_coor_folding",
                        "forces_1_nowater/a3D_force_folding",
                    )
                )
            elif "coords_2_nowater" in coord_fn:
                forces_fns.append(
                    coord_fn.replace(
                        "coords_2_nowater/a3D_coor_folding",
                        "forces_2_nowater/a3D_force_folding",
                    )
                )
            else:
                raise RuntimeError("wrong path in A3D loader")
        coords_fns = np.array(coords_fns)
        forces_fns = np.array(forces_fns)
        if n_batches > 1:
            assert batch is not None, "batch id must be set if more than 1 batch"
            chunk_ids = chunker(
                [i for i in range(len(coords_fns))], n_batches=n_batches
            )
            coords_fns = coords_fns[np.array(chunk_ids[batch])]
            forces_fns = forces_fns[np.array(chunk_ids[batch])]
        aa_coord_list = []
        aa_force_list = []
        # load the files, checking against the mol dictionary
        for cfn, ffn in tqdm(zip(coords_fns, forces_fns), total=len(coords_fns)):
            force = np.load(ffn)  # in kcal/mol/angstrom
            coord = np.load(cfn)  # in angstrom
            assert coord.shape == force.shape
            aa_coord_list.append(coord[::stride])
            aa_force_list.append(force[::stride])
        aa_coords = np.concatenate(aa_coord_list)
        aa_forces = np.concatenate(aa_force_list)
        return aa_coords, aa_forces

class OPEP_loader(DatasetLoader):
    """
    Loader for octapeptides dataset
    """

    def get_traj_top(self, name: str, pdb_fn: str):
        """
        For a given name, returns a loaded MDTraj object at the input
        resolution (generally atomistic) as well as the dataframe associated
        with its topology.

        Parameters
        ----------
        name:
            Name of input sample
        pdb_fn:
            Path to pdb structure file
        """
        pdb = md.load(pdb_fn.format(name))
        aa_traj = pdb.atom_slice(
            [a.index for a in pdb.topology.atoms if a.residue.is_protein]
        )
        top_dataframe = aa_traj.topology.to_dataframe()[0]
        return aa_traj, top_dataframe

    def load_coords_forces(
        self,
        base_dir: str,
        name: str,
        stride: int = 1,
        batch: Optional[int] = None,
        n_batches: Optional[int] = 1,
    ) -> Tuple[np.ndarray, np.ndarray]:
        """
        For a given name, returns np.ndarrays of its coordinates and forces at
        the input resolution (generally atomistic).

        Parameters
        ----------
        base_dir:
            Path to coordinate and force files
        name:
            Name of input sample
        stride : int
            Interval by which to stride loaded data
        batch: int or None
            if trajectories are loaded by batch, indicates the batch index to
            load; must be set if n_batches > 1
        n_batches: int
            if greater than 1, divide the total trajectories to load into
            n_batches chunks
        """
        if n_batches > 1:
            raise NotImplementedError(
                "mol_num_batches can only be used for single-protein datasets for now"
            )
        coord_files = sorted(
            glob(os.path.join(base_dir, f"coords_nowater/opep_{name}/*.npy"))
        )
        if len(coord_files) == 0:
            coord_files = sorted(
                glob(os.path.join(base_dir, f"coords_nowater/coor_opep_{name}_*.npy"))
            )
        force_files = sorted(
            glob(os.path.join(base_dir, f"forces_nowater/opep_{name}/*.npy"))
        )
        if len(force_files) == 0:
            force_files = sorted(
                glob(os.path.join(base_dir, f"forces_nowater/force_opep_{name}_*.npy"))
            )
        aa_coord_list = []
        aa_force_list = []
        for c, f in zip(coord_files, force_files):
            coords = np.load(c)
            forces = np.load(f)
            aa_coord_list.append(coords[::stride])
            aa_force_list.append(forces[::stride])
        aa_coords = np.concatenate(aa_coord_list)
        aa_forces = np.concatenate(aa_force_list)
        return aa_coords, aa_forces

class WWdomain_loader(DatasetLoader):
    """
    Loader object for CHARMM22* WW domain simulation dataset
    """

    def get_traj_top(self, name: str, pdb_fn: str):
        """
        For a given name, returns a loaded MDTraj object at the input
        resolution (generally atomistic) as well as the dataframe associated
        with its topology.

        Parameters
        ----------
        name:
            Name of input sample
        pdb_fn:
            Path to pdb structure file
        """
        pdb = md.load(pdb_fn.format(name))
        aa_traj = pdb.atom_slice(
            [a.index for a in pdb.topology.atoms if a.residue.is_protein]
        )
        top_dataframe = aa_traj.topology.to_dataframe()[0]
        return aa_traj, top_dataframe

    def load_coords_forces(
        self,
        base_dir: str,
        name: str,
        stride: int = 1,
        batch: Optional[int] = None,
        n_batches: Optional[int] = 1,
    ) -> Tuple[np.ndarray, np.ndarray]:
        """
        For a given name, returns np.ndarrays of its coordinates and forces at
        the input resolution (generally atomistic).

        Parameters
        ----------
        base_dir:
            Path to coordinate and force files
        name:
            Name of input sample
        stride : int
            Interval by which to stride loaded data
        batch: int or None
            if trajectories are loaded by batch, indicates the batch index to
            load; must be set if n_batches > 1
        n_batches: int
            if greater than 1, divide the total trajectories to load into
            n_batches chunks
        """
        coords_fns = natsorted(
            glob(
                os.path.join(
                    base_dir, "coords_nowater/wwdomain_coor_folding-WWdomain_*.npy"
                )
            )
        )
        forces_fns = [
            fn.replace(
                "coords_nowater/wwdomain_coor_folding",
                "forces_nowater/wwdomain_force_folding",
            )
            for fn in coords_fns
        ]
        coords_fns = np.array(coords_fns)
        forces_fns = np.array(forces_fns)
        if n_batches > 1:
            assert batch is not None, "batch id must be set if more than 1 batch"
            chunk_ids = chunker(
                [i for i in range(len(coords_fns))], n_batches=n_batches
            )
            coords_fns = coords_fns[np.array(chunk_ids[batch])]
            forces_fns = forces_fns[np.array(chunk_ids[batch])]
        aa_coord_list = []
        aa_force_list = []
        # load the files, checking against the mol dictionary
        for cfn, ffn in tqdm(zip(coords_fns, forces_fns), total=len(coords_fns)):
            force = np.load(ffn)  # in kcal/mol/angstrom
            coord = np.load(cfn)  # in angstrom
            assert coord.shape == force.shape
            aa_coord_list.append(coord[::stride])
            aa_force_list.append(force[::stride])
        aa_coords = np.concatenate(aa_coord_list)
        aa_forces = np.concatenate(aa_force_list)
        return aa_coords, aa_forces

class HDF5_loader(DatasetLoader):
    r"""
    Base class for loading data stored in an HDF5 data format. The file can
    have any structure as long as all the independent trajectories are
    represented by groups that have keys "coords" and "Fs" with arrays of
    shape (n_frames, n_atoms, 3).
    """

    @staticmethod
    def _get_all_traj_groups(h5file: h5py.File) -> List[str]:
        r"""
        Base function to get all the possible trajectory groups.
        """
        traj_group_arr = []

        def visit_func(name, node):
            r"""
            Helper function to collect names of internal groups.
            """
            if isinstance(node, h5py.Group):
                node_keys = node.keys()
                if "coords" in node_keys and "Fs" in node_keys:
                    traj_group_arr.append(name)

        h5file.visititems(visit_func)
        return traj_group_arr

    @staticmethod
    def get_traj_groups(h5file: h5py.File) -> List[str]:
        r"""
        Function to get the trajectory groups of this dataset. This is a
        wrapper over `_get_all_traj_groups` so that subclasses can filter out
        certain trajectories that are undesirable.
        """
        return HDF5_loader._get_all_traj_groups(h5file)

    def get_traj_top(self, name: str, pdb_fn: str):
        """
        For a given name, returns a loaded MDTraj object at the input
        resolution (generally atomistic) as well as the dataframe associated
        with its topology.

        Parameters
        ----------
        name:
            Name of input sample
        pdb_fn:
            Path to pdb structure file
        """
        pdb = md.load(pdb_fn.format(name)).remove_solvent()
        aa_traj = pdb.atom_slice(
            [a.index for a in pdb.topology.atoms if a.residue.is_protein]
        )
        top_dataframe = aa_traj.topology.to_dataframe()[0]
        return aa_traj, top_dataframe

    def load_coords_forces(
        self,
        base_dir: str,
        name: str,
        stride: int = 1,
        batch: Optional[int] = None,
        n_batches: Optional[int] = 1,
    ) -> Tuple[np.ndarray, np.ndarray]:
        aa_coord_list = []
        aa_force_list = []
        # here base_dir is expected to point directly at the HDF5 file
        with h5py.File(os.path.join(base_dir), "r") as dataset:
            traj_group_arr = self.get_traj_groups(dataset)
            traj_group_arr = np.array(traj_group_arr)
            if n_batches > 1:
                assert batch is not None, "batch id must be set if more than 1 batch"
                chunk_ids = chunker(
                    [i for i in range(len(traj_group_arr))], n_batches=n_batches
                )
                traj_group_arr = traj_group_arr[np.array(chunk_ids[batch])]
            if len(traj_group_arr) == 0:
                raise ValueError("No trajectories found in supplied h5")
            for traj_path in tqdm(traj_group_arr, total=len(traj_group_arr)):
                traj = dataset.get(traj_path)
                if traj is None:
                    raise ValueError(f"Failed to find trajectory in group {traj_path}")
                coord = traj["coords"][:]
                force = traj["Fs"][:]
                aa_coord_list.append(coord[::stride])
                aa_force_list.append(force[::stride])
        aa_coords = np.concatenate(aa_coord_list)
        aa_forces = np.concatenate(aa_force_list)
        return aa_coords, aa_forces

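# A minimal sketch of an HDF5 file that HDF5_loader would accept, written with
# h5py. The group name "traj_0" and the array sizes are illustrative only; any
# nesting works as long as each trajectory group carries "coords" and "Fs"
# datasets of shape (n_frames, n_atoms, 3):
#
#     import h5py, numpy as np
#     with h5py.File("example.h5", "w") as f:
#         g = f.create_group("traj_0")
#         g.create_dataset("coords", data=np.zeros((100, 22, 3)))
#         g.create_dataset("Fs", data=np.zeros((100, 22, 3)))
#
# A subclass can restrict which groups are used by overriding get_traj_groups,
# for example keeping only groups whose names start with a given prefix.
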
class SimInput_loader(DatasetLoader):
    """
    Loader for protein structures to be used in CG simulations
    """

    def get_traj_top(self, name: str, raw_data_dir: str):
        """
        For a given name, returns a loaded MDTraj object at the input
        resolution (generally atomistic) as well as the dataframe associated
        with its topology.

        Parameters
        ----------
        name:
            Name of input sample
        raw_data_dir:
            Directory prefix (including any trailing separator) that contains
            the pdb structure file
        """
        pdb = md.load(f"{raw_data_dir}{name}.pdb")
        input_traj = pdb.atom_slice(
            [a.index for a in pdb.topology.atoms if a.residue.is_protein]
        )
        top_dataframe = input_traj.topology.to_dataframe()[0]
        return input_traj, top_dataframe

class MHC_loader(DatasetLoader):
    """Loader for the MHC protein dataset."""

    def get_traj_top(self, name: str, pdb_fn: str):
        pdb_files = glob(pdb_fn.format(name))
        if not pdb_files:
            raise FileNotFoundError(f"No PDB file found for {name}")
        pdb = md.load(pdb_files[0])
        aa_traj = pdb.atom_slice(
            [a.index for a in pdb.topology.atoms if a.residue.is_protein]
        )
        top_dataframe = aa_traj.topology.to_dataframe()[0]
        return aa_traj, top_dataframe

    def load_coords_forces(
        self,
        base_dir: str,
        name: str,
        stride: int = 1,
        batch: Optional[int] = None,
        n_batches: Optional[int] = 1,
    ) -> Tuple[np.ndarray, np.ndarray]:
        traj_dirs = glob(os.path.join(base_dir, f"group_*/{name}_*/"))
        coords_all, forces_all = [], []
        for traj_dir in tqdm(traj_dirs, desc=f"Loading {name}"):
            traj_coords, traj_forces = [], []
            files = sorted(
                glob(os.path.join(traj_dir, "prod_0_full_output/*.npz")),
                key=lambda f: int(f.split("_")[-2]) if "_" in f else 0,
            )
            files = np.array(files)
            if n_batches > 1:
                assert batch is not None, "batch id must be set if more than 1 batch"
                chunk_ids = chunker([i for i in range(len(files))], n_batches=n_batches)
                files = files[np.array(chunk_ids[batch])]
            for fn in tqdm(files):
                data = np.load(fn, allow_pickle=True)
                traj_coords.append(data["coords"])
                traj_forces.append(data["Fs"])
            if traj_coords:
                coords_all.append(np.concatenate(traj_coords)[::stride])
                forces_all.append(np.concatenate(traj_forces)[::stride])
        if not coords_all:
            raise RuntimeError(f"No trajectory data for {name}")
        coords_all = np.concatenate(coords_all)
        forces_all = np.concatenate(forces_all)
        return coords_all, forces_all