Source code for mlcg_tk.scripts.gen_sim_input
import os.path as osp
import sys
from mlcg_tk.input_generator.raw_dataset import SampleCollection, RawDataset, SimInput
from mlcg_tk.input_generator.embedding_maps import (
CGEmbeddingMap,
)
from mlcg_tk.input_generator.raw_data_loader import DatasetLoader, SimInput_loader
from mlcg_tk.input_generator.prior_gen import Bonds, PriorBuilder
from mlcg_tk.input_generator.utils import get_output_tag
from tqdm import tqdm
from time import ctime
from typing import Dict, List, Union, Callable, Optional, Type
from jsonargparse import CLI
import pickle as pck
import numpy as np
from mlcg.data import AtomicData
import torch
from copy import deepcopy
[docs]
def process_sim_input(
    dataset_name: str,
    raw_data_dir: str,
    save_dir: str,
    tag: str,
    pdb_fns: List[str],
    cg_atoms: List[str],
    embedding_map: CGEmbeddingMap,
    embedding_func: Callable,
    skip_residues: List[str],
    copies: int,
    prior_tag: str,
    prior_builders: List[PriorBuilder],
    mass_scale: Optional[float] = 418.4,
    collection_cls: Type[SampleCollection] = SampleCollection,
    smpl_loader: Type[DatasetLoader] = SimInput_loader,
):
    r"""
    Generates input AtomicData objects for coarse-grained simulations

    For every structure in `pdb_fns` the CG mapping is applied, the prior
    neighbour lists are built, and one AtomicData object is created per
    frame and per requested copy. The resulting list is written with
    `torch.save` to
    ``<save_dir>/<output tag>configurations.pt``.

    Parameters
    ----------
    dataset_name : str
        Name given to specific dataset
    raw_data_dir : str
        Path to location of input structures
    save_dir : str
        Path to directory in which output will be saved
    tag : str
        Label given to all output files produced from dataset
    pdb_fns : List[str]
        List of pdb filenames from which input will be generated
    cg_atoms : List[str]
        List of atom names to preserve in coarse-grained resolution
    embedding_map : CGEmbeddingMap
        Mapping object
    embedding_func : Callable
        Function which will be used to apply CG mapping
    skip_residues : List[str]
        List of residues to skip, can be None
    copies : int
        Copies that will be produced of each structure listing in pdb_fns
    prior_tag : str
        String identifying the specific combination of prior terms
    prior_builders : List[PriorBuilder]
        List of PriorBuilder objects and their corresponding parameters
    mass_scale : Optional[float]
        Factor by which the atomic masses are divided; if None, the masses
        are left unscaled
    collection_cls : Type[SampleCollection]
        Class type for sample collection
    smpl_loader : Type[DatasetLoader]
        Loader class for dataset
    """
    # The signature allows ``mass_scale=None``; interpret it as "no scaling"
    # instead of failing on the division below.
    mass_divisor = 1.0 if mass_scale is None else mass_scale

    data_list = []
    dataset = SimInput(dataset_name, tag, pdb_fns, collection_cls=collection_cls)
    for samples in tqdm(dataset, f"Processing CG data for {dataset_name} dataset..."):
        sample_loader = smpl_loader()
        samples.input_traj, samples.top_dataframe = sample_loader.get_traj_top(
            name=samples.name, raw_data_dir=raw_data_dir
        )
        samples.apply_cg_mapping(
            cg_atoms=cg_atoms,
            embedding_function=embedding_func,
            embedding_dict=embedding_map,
            skip_residues=skip_residues,
        )

        # Keep only the atoms retained by the CG mapping.
        cg_trajs = samples.input_traj.atom_slice(samples.cg_atom_indices)
        cg_masses = (
            np.array([atom.element.mass for atom in cg_trajs[0].topology.atoms])
            / mass_divisor
        )
        prior_nls = samples.get_prior_nls(
            prior_builders=prior_builders,
            save_nls=False,
            save_dir=save_dir,
            prior_tag=prior_tag,
        )
        cg_types = samples.cg_dataframe["type"].to_list()

        for frame_idx in range(cg_trajs.n_frames):
            # NOTE(review): the factor 10 presumably converts nanometres to
            # Angstrom -- confirm against the trajectory loader's units.
            # Slicing a single frame keeps a leading frame axis, hence [0].
            frame_coords = (cg_trajs[frame_idx].xyz * 10)[0]
            for _ in range(copies):
                data = AtomicData.from_points(
                    pos=torch.tensor(frame_coords),
                    atom_types=torch.tensor(cg_types),
                    masses=torch.tensor(cg_masses),
                )
                # Every configuration gets its own neighbour-list copy so
                # later in-place changes cannot leak between them.
                data.neighbor_list = deepcopy(prior_nls)
                data_list.append(data)

    # Join with the directory explicitly so the file ends up inside
    # `save_dir` whether or not the caller passed a trailing separator.
    output_name = (
        f"{get_output_tag([dataset_name, tag], placement='before')}configurations.pt"
    )
    torch.save(data_list, osp.join(save_dir, output_name))
def main():
    """Run `process_sim_input` through the jsonargparse CLI.

    A timestamped line is printed before and after the CLI call.
    """
    start_stamp = ctime()
    print("Start gen_sim_input.py: {}".format(start_stamp))

    CLI([process_sim_input])

    finish_stamp = ctime()
    print("Finish gen_sim_input.py: {}".format(finish_stamp))
# Only start the CLI when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()