import os.path as osp
import sys
from mlcg_tk.input_generator.raw_dataset import RawDataset
from mlcg_tk.input_generator.utils import get_output_tag
from tqdm import tqdm
from time import ctime
import numpy as np
import pickle as pck
from typing import List, Union, Optional
from sklearn.model_selection import train_test_split
from jsonargparse import CLI
from copy import deepcopy
import h5py
import yaml
def package_training_data(
dataset_name: str,
names: List[str],
dataset_tag: str,
force_tag: str,
training_data_dir: str,
save_dir: str,
save_h5: Optional[bool] = True,
save_partition: Optional[bool] = True,
single_protein: Optional[bool] = False,
batch_size: int = 256,
stride: int = 1,
train_size: Union[float, int, None] = 0.8,
train_mols: Optional[List[str]] = None,
val_mols: Optional[List[str]] = None,
random_state: Optional[int] = None,
mol_num_batches: Optional[int] = 1,
keep_batches: Optional[bool] = False,
):
"""
Computes structural features and accumulates statistics on dataset samples
Parameters
----------
dataset_name : str
Name given to specific dataset
dataset_tag : str
Label given to all output files produced from dataset
names : List[str]
List of sample names
force_tag : str
Label given to produced delta forces and saved packaged data
training_data_dir : str
Path to directory from which input will be loaded
save_dir : str
Path to directory to which output will be saved
save_h5 : bool
Whether to save dataset h5 file(s)
save_partition : bool
Whether to save dataset partition file(s)
single_protein : bool
Whether the produced partition file should be for a single-molecule model
Will be ignored if save_partition is False
batch_size : int
Number of samples of dataset to include in each training batch
stride : int
Integer by which to stride frames
train_size : Union[float, int]
Either the proportion (if float) or number of samples (if int) of molecules in training data
If None, lists should be supplied for training and validation samples
train_mols : Optional[List]
Molecules to be used for training set
val_mols : Optional[List]
Molecules to be used for validation set
random_state : Optional[str]
Controls shuffling applied to the data before applying the split
mol_num_batches : int
If greater than 1, will load each molecule data from the specified number of batches
that were be treated as different samples
keep_batches : bool
If set to True, batches will be put as individual molecules in the h5 dataset and
the partition file will be built accordingly. Otherwise, if batches exist, they will be
accumulated into one single molecule.
"""
if keep_batches and mol_num_batches > 1:
dataset = RawDataset(
dataset_name, names, dataset_tag, n_batches=mol_num_batches
)
else:
dataset = RawDataset(dataset_name, names, dataset_tag)
output_tag = get_output_tag([dataset_name, force_tag], placement="after")
non_empty_names = []
if save_h5:
# Create H5 of training data
fnout_h5 = osp.join(save_dir, f"{output_tag[1:]}.h5")
with h5py.File(fnout_h5, "w") as f:
metaset = f.create_group(dataset_name)
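# The h5 file is laid out with one group per dataset and one subgroup per molecule,
# filled in by the loop below (frames are assumed to be the leading array axis):
#   /<dataset_name>/<tag><name>/cg_coords        float32
#   /<dataset_name>/<tag><name>/cg_delta_forces  float32
#   subgroup attrs: cg_embeds, N_frames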
for samples in tqdm(dataset, f"Packaging {dataset_name} dataset..."):
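# Skip molecules without delta-force output on disk; remember the rest so the
# partition file only references molecules that were actually packaged.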
if not samples.has_delta_forces_output(
training_data_dir=training_data_dir,
force_tag=force_tag,
mol_num_batches=mol_num_batches,
):
continue
else:
non_empty_names.append(samples.mol_name)
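# Either concatenate all of a molecule's batches into single arrays, or load the
# training inputs for this sample directly (one molecule, or one kept batch).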
if mol_num_batches > 1 and not keep_batches:
(
cg_coords,
cg_delta_forces,
cg_embeds,
) = samples.load_all_batches_training_inputs(
training_data_dir=training_data_dir,
force_tag=force_tag,
mol_num_batches=mol_num_batches,
)
else:
(
cg_coords,
cg_delta_forces,
cg_embeds,
) = samples.load_training_inputs(
training_data_dir=training_data_dir,
force_tag=force_tag,
)
name = f"{samples.tag}{samples.name}"
hdf_group = metaset.create_group(name)
hdf_group.create_dataset("cg_coords", data=cg_coords.astype(np.float32))
hdf_group.create_dataset(
"cg_delta_forces", data=cg_delta_forces.astype(np.float32)
)
hdf_group.attrs["cg_embeds"] = cg_embeds
hdf_group.attrs["N_frames"] = cg_coords.shape[0]
if save_partition:
# Create partition file
fnout_part = osp.join(save_dir, f"partition{output_tag}.yaml")
if single_protein:
if keep_batches and mol_num_batches > 1:
train_mols = [
f"{dataset_tag}{name}_batch_{b}"
for b in range(mol_num_batches)
for name in non_empty_names
]
val_mols = [
f"{dataset_tag}{name}_batch_{b}"
for b in range(mol_num_batches)
for name in non_empty_names
]
else:
train_mols = [f"{dataset_tag}{name}" for name in non_empty_names]
val_mols = [f"{dataset_tag}{name}" for name in non_empty_names]
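# Single-protein partitions list every molecule in both train and val; the actual
# split happens over frame indices below, so train_size must be a float ratio.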
if train_size is None:
raise ValueError(
"For single-protein partitions, a train size has to be specified"
)
if not isinstance(train_size, float):
raise ValueError(
"For single-protein partitions, train_size has to be a float corresponding to the ratio of frames for training"
)
assert train_size <= 1.0, "train_size has to be a ratio of frames below 1.0"
else:
if train_mols is None and val_mols is None:
if train_size is None:
raise ValueError(
"Either a train size or predefined lists for training and validation samples must be specified."
)
train_mols, val_mols = train_test_split(
[f"{dataset_tag}{name}" for name in non_empty_names],
train_size=train_size,
shuffle=True,
random_state=random_state,
)
elif train_mols is not None:
# list.remove() mutates in place and returns None; build the complement instead
val_mols = [name for name in non_empty_names if name not in train_mols]
elif val_mols is not None:
train_mols = [name for name in non_empty_names if name not in val_mols]
partition_opts = {"train": {}, "val": {}}
# make training data partition
partition_opts["train"]["metasets"] = {}
partition_opts["train"]["metasets"][dataset_name] = {
"molecules": train_mols,
"stride": stride,
}
partition_opts["train"]["batch_sizes"] = {dataset_name: batch_size}
# make validation data partition
partition_opts["val"]["metasets"] = {}
partition_opts["val"]["metasets"][dataset_name] = {
"molecules": val_mols,
"stride": stride,
}
partition_opts["val"]["batch_sizes"] = {dataset_name: batch_size}
if single_protein:
partition_opts["train"]["metasets"][dataset_name]["detailed_indices"] = {}
partition_opts["val"]["metasets"][dataset_name]["detailed_indices"] = {}
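# Split each molecule's frames into train/val index arrays saved as .npy files and
# referenced through detailed_indices. Note that the frame counts are read from the
# h5 written above, so save_h5 is expected to be True for single-protein partitions.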
for mol in train_mols:
with h5py.File(fnout_h5, "r") as f:
n_frames = f[dataset_name][mol]["cg_coords"].shape[0]
train_frames, val_frames = train_test_split(
np.arange(n_frames),
train_size=train_size,
shuffle=True,
random_state=random_state,
)
mol_output_tag = get_output_tag([mol, force_tag], placement="after")
train_fnout = osp.join(save_dir, f"train_idx{mol_output_tag}.npy")
val_fnout = osp.join(save_dir, f"val_idx{mol_output_tag}.npy")
np.save(train_fnout, train_frames)
np.save(val_fnout, val_frames)
partition_opts["train"]["metasets"][dataset_name]["detailed_indices"][
mol
] = train_fnout
partition_opts["val"]["metasets"][dataset_name]["detailed_indices"][
mol
] = val_fnout
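# The resulting partition file has roughly this shape (values illustrative):
# train:
#   metasets:
#     <dataset_name>: {molecules: [...], stride: 1, detailed_indices: {...}}
#   batch_sizes: {<dataset_name>: 256}
# val:
#   metasets:
#     <dataset_name>: {molecules: [...], stride: 1, detailed_indices: {...}}
#   batch_sizes: {<dataset_name>: 256}
# (detailed_indices is only present for single-protein runs.)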
with open(fnout_part, "w") as ofile:
yaml.dump(partition_opts, ofile)
def combine_datasets(
dataset_names: List[str],
save_dir: str,
force_tag: Optional[str],
save_h5: Optional[bool] = True,
save_partition: Optional[bool] = True,
):
"""
Computes structural features and accumulates statistics on dataset samples
Parameters
----------
dataset_names : List[str]
List of dataset name to combine
save_dir : str
Path to directory from which datasets will be loaded and to which output will be saved
force_tag : str
Label given to produced delta forces and saved packaged data
save_h5 : bool
Whether to save dataset h5 file(s)
save_partition : bool
Whether to save dataset partition file(s)
"""
datasets_label = "_".join(dataset_names)
output_tag = get_output_tag([datasets_label, force_tag], placement="after")
if save_h5:
fnout_h5 = osp.join(save_dir, f"combined{output_tag}.h5")
with h5py.File(fnout_h5, "w") as f:
for dataset in dataset_names:
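# Link (rather than copy) each per-dataset h5; the relative path is resolved next
# to the combined file, so the per-dataset h5 files are assumed to sit in save_dir.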
f[dataset] = h5py.ExternalLink(
f"{dataset}{get_output_tag(force_tag, placement='after')}.h5",
f"/{dataset}",
)
if save_partition:
fnout_part = osp.join(save_dir, f"partition{output_tag}.yaml")
partition_opts = {"train": {}, "val": {}}
partition_opts["train"]["metasets"] = {}
partition_opts["train"]["batch_sizes"] = {}
partition_opts["val"]["metasets"] = {}
partition_opts["val"]["batch_sizes"] = {}
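# Merge each dataset's own partition file (written by package_training_data)
# into the combined partition.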
for dataset in dataset_names:
data_fn = osp.join(
save_dir,
f"partition_{dataset}{get_output_tag(force_tag, placement='after')}.yaml",
)
with open(data_fn, "r") as ifile:
data_partition = yaml.safe_load(ifile)
# make training data partition
partition_opts["train"]["metasets"][dataset] = data_partition["train"][
"metasets"
][dataset]
# accumulate per-dataset batch sizes instead of overwriting the mapping
partition_opts["train"]["batch_sizes"][dataset] = data_partition["train"][
"batch_sizes"
][dataset]
# make validation data partition
partition_opts["val"]["metasets"][dataset] = data_partition["val"][
"metasets"
][dataset]
partition_opts["val"]["batch_sizes"][dataset] = data_partition["val"][
"batch_sizes"
][dataset]
with open(fnout_part, "w") as ofile:
yaml.dump(partition_opts, ofile)
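# Rough usage sketch (subcommand names follow the function names above; the exact
# flag spelling depends on how jsonargparse renders the signatures, and all paths
# and names below are placeholders):
#   python package_training_data.py package_training_data \
#       --dataset_name my_dataset --names "[mol_a, mol_b]" --dataset_tag my_tag_ \
#       --force_tag delta_forces --training_data_dir ./inputs --save_dir ./packaged
#   python package_training_data.py combine_datasets \
#       --dataset_names "[dataset_a, dataset_b]" --save_dir ./packaged --force_tag delta_forces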
def main():
print("Start package_training_data.py: {}".format(ctime()))
CLI([package_training_data, combine_datasets], as_positional=False)
print("Finish package_training_data.py: {}".format(ctime()))
if __name__ == "__main__":
main()