diff --git a/examples/alexandria/find_json_files.py b/examples/alexandria/find_json_files.py index 0801efb5e..acbaee0ef 100644 --- a/examples/alexandria/find_json_files.py +++ b/examples/alexandria/find_json_files.py @@ -24,7 +24,7 @@ def find_json_files(url): url_root = "https://alexandria.icams.rub.de/data" # Replace with the actual URL -dirpath = "dataset/compressed_data" +dirpath = "datasets/compressed_data" if os.path.exists(dirpath) and os.path.isdir(dirpath): shutil.rmtree(dirpath) diff --git a/examples/alexandria/train.py b/examples/alexandria/train.py index 15624c03d..82969111d 100644 --- a/examples/alexandria/train.py +++ b/examples/alexandria/train.py @@ -15,17 +15,23 @@ from torch_geometric.transforms import Distance, Spherical, LocalCartesian import hydragnn -from hydragnn.utils.time_utils import Timer +from hydragnn.utils.profiling_and_tracing.time_utils import Timer from hydragnn.utils.model import print_model -from hydragnn.utils.abstractbasedataset import AbstractBaseDataset -from hydragnn.utils.distdataset import DistDataset -from hydragnn.utils.pickledataset import SimplePickleWriter, SimplePickleDataset -from hydragnn.preprocess.utils import gather_deg -from hydragnn.preprocess.utils import RadiusGraph, RadiusGraphPBC +from hydragnn.utils.datasets.abstractbasedataset import AbstractBaseDataset +from hydragnn.utils.datasets.distdataset import DistDataset +from hydragnn.utils.datasets.pickledataset import ( + SimplePickleWriter, + SimplePickleDataset, +) +from hydragnn.preprocess.graph_samples_checks_and_updates import gather_deg +from hydragnn.preprocess.graph_samples_checks_and_updates import ( + RadiusGraph, + RadiusGraphPBC, +) from hydragnn.preprocess.load_data import split_dataset -import hydragnn.utils.tracer as tr -from hydragnn.utils.print_utils import iterate_tqdm, log +import hydragnn.utils.profiling_and_tracing.tracer as tr +from hydragnn.utils.print.print_utils import iterate_tqdm, log from generate_dictionaries_pure_elements import ( generate_dictionary_bulk_energies, @@ -38,7 +44,7 @@ pass import subprocess -from hydragnn.utils import nsplit +from hydragnn.utils.distributed import nsplit def info(*args, logtype="info", sep=" "): @@ -244,7 +250,7 @@ def get_magmoms_array_from_structure(structure): def process_file_content(self, filepath): """ Download a file from a dataset of the Alexandria database with the respective index and write it to the LMDB file with the respective index.
Parameters @@ -311,7 +317,7 @@ def get(self, idx): type=bool, default=True, ) - parser.add_argument("--ddstore", action="store_true", help="ddstore dataset") + parser.add_argument("--ddstore", action="store_true", help="ddstore datasets") parser.add_argument("--ddstore_width", type=int, help="ddstore width", default=None) parser.add_argument("--shmem", action="store_true", help="shmem") parser.add_argument("--log", help="log name") @@ -321,14 +327,14 @@ def get(self, idx): group = parser.add_mutually_exclusive_group() group.add_argument( "--adios", - help="Adios dataset", + help="Adios datasets", action="store_const", dest="format", const="adios", ) group.add_argument( "--pickle", - help="Pickle dataset", + help="Pickle datasets", action="store_const", dest="format", const="pickle", @@ -341,7 +347,7 @@ def get(self, idx): node_feature_names = ["atomic_number", "cartesian_coordinates", "forces"] node_feature_dims = [1, 3, 3] dirpwd = os.path.dirname(os.path.abspath(__file__)) - datadir = os.path.join(dirpwd, "dataset") + datadir = os.path.join(dirpwd, "datasets") ################################################################################################################## input_filename = os.path.join(dirpwd, args.inputfile) ################################################################################################################## @@ -403,7 +409,7 @@ def get(self, idx): ## adios if args.format == "adios": fname = os.path.join( - os.path.dirname(__file__), "./dataset/%s.bp" % modelname + os.path.dirname(__file__), "./datasets/%s.bp" % modelname ) adwriter = AdiosWriter(fname, comm) adwriter.add("trainset", trainset) @@ -417,7 +423,7 @@ def get(self, idx): ## pickle elif args.format == "pickle": basedir = os.path.join( - os.path.dirname(__file__), "dataset", "%s.pickle" % modelname + os.path.dirname(__file__), "datasets", "%s.pickle" % modelname ) attrs = dict() attrs["pna_deg"] = deg @@ -462,14 +468,14 @@ def get(self, idx): "ddstore": args.ddstore, "ddstore_width": args.ddstore_width, } - fname = os.path.join(os.path.dirname(__file__), "./dataset/%s.bp" % modelname) + fname = os.path.join(os.path.dirname(__file__), "./datasets/%s.bp" % modelname) trainset = AdiosDataset(fname, "trainset", comm, **opt, var_config=var_config) valset = AdiosDataset(fname, "valset", comm, **opt, var_config=var_config) testset = AdiosDataset(fname, "testset", comm, **opt, var_config=var_config) elif args.format == "pickle": info("Pickle load") basedir = os.path.join( - os.path.dirname(__file__), "dataset", "%s.pickle" % modelname + os.path.dirname(__file__), "datasets", "%s.pickle" % modelname ) trainset = SimplePickleDataset( basedir=basedir, label="trainset", var_config=var_config diff --git a/examples/ani1_x/train.py b/examples/ani1_x/train.py index d4426ec8d..8fd55e4e4 100644 --- a/examples/ani1_x/train.py +++ b/examples/ani1_x/train.py @@ -1,41 +1,42 @@ -import os, re, json +import os, json import logging import sys from mpi4py import MPI import argparse -import glob - -import random import numpy as np import torch -from torch import tensor from torch_geometric.data import Data from torch_geometric.transforms import Distance, Spherical, LocalCartesian import hydragnn -from hydragnn.utils.time_utils import Timer +from hydragnn.utils.profiling_and_tracing.time_utils import Timer from hydragnn.utils.model import print_model -from hydragnn.utils.abstractbasedataset import AbstractBaseDataset -from hydragnn.utils.distdataset import DistDataset -from hydragnn.utils.pickledataset import 
SimplePickleWriter, SimplePickleDataset -from hydragnn.preprocess.utils import gather_deg -from hydragnn.preprocess.utils import RadiusGraph, RadiusGraphPBC +from hydragnn.utils.datasets.abstractbasedataset import AbstractBaseDataset +from hydragnn.utils.datasets.distdataset import DistDataset +from hydragnn.utils.datasets.pickledataset import ( + SimplePickleWriter, + SimplePickleDataset, +) +from hydragnn.preprocess.graph_samples_checks_and_updates import gather_deg +from hydragnn.preprocess.graph_samples_checks_and_updates import ( + RadiusGraph, + RadiusGraphPBC, +) from hydragnn.preprocess.load_data import split_dataset -import hydragnn.utils.tracer as tr +import hydragnn.utils.profiling_and_tracing.tracer as tr -from hydragnn.utils.print_utils import iterate_tqdm, log +from hydragnn.utils.print.print_utils import log try: from hydragnn.utils.adiosdataset import AdiosWriter, AdiosDataset except ImportError: pass -import subprocess -from hydragnn.utils import nsplit +from hydragnn.utils.distributed import nsplit import h5py @@ -189,7 +190,7 @@ def get(self, idx): type=bool, default=True, ) - parser.add_argument("--ddstore", action="store_true", help="ddstore dataset") + parser.add_argument("--ddstore", action="store_true", help="ddstore datasets") parser.add_argument("--ddstore_width", type=int, help="ddstore width", default=None) parser.add_argument("--shmem", action="store_true", help="shmem") parser.add_argument("--log", help="log name") @@ -199,14 +200,14 @@ def get(self, idx): group = parser.add_mutually_exclusive_group() group.add_argument( "--adios", - help="Adios dataset", + help="Adios datasets", action="store_const", dest="format", const="adios", ) group.add_argument( "--pickle", - help="Pickle dataset", + help="Pickle datasets", action="store_const", dest="format", const="pickle", @@ -219,7 +220,7 @@ def get(self, idx): node_feature_names = ["atomic_number", "cartesian_coordinates", "forces"] node_feature_dims = [1, 3, 3] dirpwd = os.path.dirname(os.path.abspath(__file__)) - datadir = os.path.join(dirpwd, "dataset") + datadir = os.path.join(dirpwd, "datasets") ################################################################################################################## input_filename = os.path.join(dirpwd, args.inputfile) ################################################################################################################## @@ -281,7 +282,7 @@ def get(self, idx): ## adios if args.format == "adios": fname = os.path.join( - os.path.dirname(__file__), "./dataset/%s.bp" % modelname + os.path.dirname(__file__), "./datasets/%s.bp" % modelname ) adwriter = AdiosWriter(fname, comm) adwriter.add("trainset", trainset) @@ -295,7 +296,7 @@ def get(self, idx): ## pickle elif args.format == "pickle": basedir = os.path.join( - os.path.dirname(__file__), "dataset", "%s.pickle" % modelname + os.path.dirname(__file__), "datasets", "%s.pickle" % modelname ) attrs = dict() attrs["pna_deg"] = deg @@ -340,14 +341,14 @@ def get(self, idx): "ddstore": args.ddstore, "ddstore_width": args.ddstore_width, } - fname = os.path.join(os.path.dirname(__file__), "./dataset/%s.bp" % modelname) + fname = os.path.join(os.path.dirname(__file__), "./datasets/%s.bp" % modelname) trainset = AdiosDataset(fname, "trainset", comm, **opt, var_config=var_config) valset = AdiosDataset(fname, "valset", comm, **opt, var_config=var_config) testset = AdiosDataset(fname, "testset", comm, **opt, var_config=var_config) elif args.format == "pickle": info("Pickle load") basedir = os.path.join( - 
os.path.dirname(__file__), "dataset", "%s.pickle" % modelname + os.path.dirname(__file__), "datasets", "%s.pickle" % modelname ) trainset = SimplePickleDataset( basedir=basedir, label="trainset", var_config=var_config diff --git a/examples/csce/train_gap.py b/examples/csce/train_gap.py index 8d6298437..f5c5c084d 100644 --- a/examples/csce/train_gap.py +++ b/examples/csce/train_gap.py @@ -16,17 +16,20 @@ import time import hydragnn -from hydragnn.utils.print_utils import print_distributed, iterate_tqdm, log -from hydragnn.utils.time_utils import Timer -from hydragnn.utils.distdataset import DistDataset -from hydragnn.utils.pickledataset import SimplePickleWriter, SimplePickleDataset -from hydragnn.utils.smiles_utils import ( +from hydragnn.utils.print.print_utils import print_distributed, iterate_tqdm, log +from hydragnn.utils.profiling_and_tracing.time_utils import Timer +from hydragnn.utils.datasets.distdataset import DistDataset +from hydragnn.utils.datasets.pickledataset import ( + SimplePickleWriter, + SimplePickleDataset, +) +from hydragnn.utils.descriptors_and_embeddings.smiles_utils import ( get_node_attribute_name, generate_graphdata_from_smilestr, ) -from hydragnn.preprocess.utils import gather_deg -from hydragnn.utils import nsplit -import hydragnn.utils.tracer as tr +from hydragnn.preprocess.graph_samples_checks_and_updates import gather_deg +from hydragnn.utils.distributed import nsplit +import hydragnn.utils.profiling_and_tracing.tracer as tr import numpy as np @@ -163,42 +166,42 @@ def __getitem__(self, idx): group = parser.add_mutually_exclusive_group() group.add_argument( "--adios", - help="Adios dataset", + help="Adios datasets", action="store_const", dest="format", const="adios", ) group.add_argument( "--pickle", - help="Pickle dataset", + help="Pickle datasets", action="store_const", dest="format", const="pickle", ) group.add_argument( - "--csv", help="CSV dataset", action="store_const", dest="format", const="csv" + "--csv", help="CSV datasets", action="store_const", dest="format", const="csv" ) parser.set_defaults(format="adios") group1 = parser.add_mutually_exclusive_group() group1.add_argument( "--shmem", - help="shmem dataset", + help="shmem datasets", action="store_const", - dest="dataset", + dest="datasets", const="shmem", ) group1.add_argument( "--ddstore", - help="ddstore dataset", + help="ddstore datasets", action="store_const", - dest="dataset", + dest="datasets", const="ddstore", ) group1.add_argument( "--simple", - help="no special dataset", + help="no special datasets", action="store_const", - dest="dataset", + dest="datasets", const="simple", ) parser.set_defaults(dataset="simple") @@ -208,7 +211,7 @@ def __getitem__(self, idx): graph_feature_names = ["GAP"] graph_feature_dim = [1] dirpwd = os.path.dirname(os.path.abspath(__file__)) - datafile = os.path.join(dirpwd, "dataset/csce_gap_synth.csv") + datafile = os.path.join(dirpwd, "datasets/csce_gap_synth.csv") ################################################################################################################## inputfilesubstr = args.inputfilesubstr input_filename = os.path.join(dirpwd, "csce_" + inputfilesubstr + ".json") @@ -295,7 +298,7 @@ def __getitem__(self, idx): config["pna_deg"] = deg ## pickle - basedir = os.path.join(os.path.dirname(__file__), "dataset", "pickle") + basedir = os.path.join(os.path.dirname(__file__), "datasets", "pickle") attrs = dict() attrs["pna_deg"] = deg SimplePickleWriter( @@ -318,7 +321,7 @@ def __getitem__(self, idx): use_subdir=True, ) - fname = 
os.path.join(os.path.dirname(__file__), "dataset", "csce_gap.bp") + fname = os.path.join(os.path.dirname(__file__), "datasets", "csce_gap.bp") adwriter = AdiosWriter(fname, comm) adwriter.add("trainset", trainset) adwriter.add("valset", valset) @@ -346,20 +349,22 @@ def __getitem__(self, idx): opt = {"preload": False, "shmem": shmem, "ddstore": ddstore} fname = fname = os.path.join( - os.path.dirname(__file__), "dataset", "csce_gap.bp" + os.path.dirname(__file__), "datasets", "csce_gap.bp" ) trainset = AdiosDataset(fname, "trainset", comm, **opt) valset = AdiosDataset(fname, "valset", comm) testset = AdiosDataset(fname, "testset", comm) comm.Barrier() elif args.format == "csv": - fname = os.path.join(os.path.dirname(__file__), "dataset", "csce_gap_synth.csv") + fname = os.path.join( + os.path.dirname(__file__), "datasets", "csce_gap_synth.csv" + ) fact = CSCEDatasetFactory(fname, args.sampling, var_config=var_config) trainset = CSCEDataset(fact, "trainset") valset = CSCEDataset(fact, "valset") testset = CSCEDataset(fact, "testset") elif args.format == "pickle": - basedir = os.path.join(os.path.dirname(__file__), "dataset", "pickle") + basedir = os.path.join(os.path.dirname(__file__), "datasets", "pickle") trainset = SimplePickleDataset(basedir, "trainset") valset = SimplePickleDataset(basedir, "valset") testset = SimplePickleDataset(basedir, "testset") diff --git a/examples/dftb_uv_spectrum/train_discrete_uv_spectrum.py b/examples/dftb_uv_spectrum/train_discrete_uv_spectrum.py index b81fd5b9b..2540893ae 100644 --- a/examples/dftb_uv_spectrum/train_discrete_uv_spectrum.py +++ b/examples/dftb_uv_spectrum/train_discrete_uv_spectrum.py @@ -29,7 +29,7 @@ from hydragnn.preprocess.load_data import split_dataset from hydragnn.utils.distdataset import DistDataset from hydragnn.utils.pickledataset import SimplePickleWriter, SimplePickleDataset -from hydragnn.preprocess.utils import gather_deg +from hydragnn.preprocess.graph_samples_checks_and_updates import gather_deg import numpy as np @@ -75,7 +75,7 @@ def dftb_to_graph(moldir, dftb_node_types, var_config): class DFTBDataset(AbstractBaseDataset): """DFTBDataset dataset class""" def __init__(self, dirpath, dftb_node_types, var_config, dist=False, sampling=None): super().__init__() @@ -138,7 +138,7 @@ def get(self, idx): help="preprocess only (no training)", ) parser.add_argument("--mae", action="store_true", help="do mae calculation") - parser.add_argument("--ddstore", action="store_true", help="ddstore dataset") + parser.add_argument("--ddstore", action="store_true", help="ddstore datasets") parser.add_argument("--ddstore_width", type=int, help="ddstore width", default=None) parser.add_argument("--shmem", action="store_true", help="shmem") parser.add_argument("--log", help="log name") @@ -148,14 +148,14 @@ def get(self, idx): group = parser.add_mutually_exclusive_group() group.add_argument( "--adios", - help="Adios dataset", + help="Adios datasets", action="store_const", dest="format", const="adios", ) group.add_argument( "--pickle", - help="Pickle dataset", + help="Pickle datasets", action="store_const", dest="format", const="pickle", @@ -166,7 +166,7 @@ def get(self, idx): graph_feature_names = ["frequencies", "intensities"] graph_feature_dim = [50, 50] dirpwd = os.path.dirname(os.path.abspath(__file__)) - datafile = os.path.join(dirpwd, "dataset/dftb_aisd_electronic_excitation_spectrum") + datafile = os.path.join(dirpwd, "datasets/dftb_aisd_electronic_excitation_spectrum")
################################################################################################################## input_filename = os.path.join(dirpwd, "dftb_discrete_uv_spectrum.json") ################################################################################################################## @@ -227,7 +227,7 @@ def get(self, idx): config["pna_deg"] = deg ## adios - fname = os.path.join(os.path.dirname(__file__), "./dataset/%s.bp" % modelname) + fname = os.path.join(os.path.dirname(__file__), "./datasets/%s.bp" % modelname) adwriter = AdiosWriter(fname, comm) adwriter.add("trainset", trainset) adwriter.add("valset", valset) @@ -239,7 +239,7 @@ def get(self, idx): ## pickle basedir = os.path.join( - os.path.dirname(__file__), "dataset", "%s.pickle" % modelname + os.path.dirname(__file__), "datasets", "%s.pickle" % modelname ) attrs = dict() attrs["pna_deg"] = deg @@ -283,14 +283,14 @@ def get(self, idx): "ddstore": args.ddstore, "ddstore_width": args.ddstore_width, } - fname = os.path.join(os.path.dirname(__file__), "./dataset/%s.bp" % modelname) + fname = os.path.join(os.path.dirname(__file__), "./datasets/%s.bp" % modelname) trainset = AdiosDataset(fname, "trainset", comm, **opt) valset = AdiosDataset(fname, "valset", comm, **opt) testset = AdiosDataset(fname, "testset", comm, **opt) elif args.format == "pickle": info("Pickle load") basedir = os.path.join( - os.path.dirname(__file__), "dataset", "%s.pickle" % modelname + os.path.dirname(__file__), "datasets", "%s.pickle" % modelname ) trainset = SimplePickleDataset(basedir, "trainset") valset = SimplePickleDataset(basedir, "valset") diff --git a/examples/dftb_uv_spectrum/train_smooth_uv_spectrum.py b/examples/dftb_uv_spectrum/train_smooth_uv_spectrum.py index 5af612c4a..fe721822e 100644 --- a/examples/dftb_uv_spectrum/train_smooth_uv_spectrum.py +++ b/examples/dftb_uv_spectrum/train_smooth_uv_spectrum.py @@ -31,7 +31,7 @@ from hydragnn.preprocess.load_data import split_dataset from hydragnn.utils.distdataset import DistDataset from hydragnn.utils.pickledataset import SimplePickleWriter, SimplePickleDataset -from hydragnn.preprocess.utils import gather_deg +from hydragnn.preprocess.graph_samples_checks_and_updates import gather_deg import numpy as np @@ -75,7 +75,7 @@ def dftb_to_graph(moldir, dftb_node_types, var_config): class DFTBDataset(AbstractBaseDataset): """DFTBDataset dataset class""" def __init__(self, dirpath, dftb_node_types, var_config, dist=False, sampling=None): super().__init__() @@ -138,7 +138,7 @@ def get(self, idx): help="preprocess only (no training)", ) parser.add_argument("--mae", action="store_true", help="do mae calculation") - parser.add_argument("--ddstore", action="store_true", help="ddstore dataset") + parser.add_argument("--ddstore", action="store_true", help="ddstore datasets") parser.add_argument("--ddstore_width", type=int, help="ddstore width", default=None) parser.add_argument("--shmem", action="store_true", help="shmem") parser.add_argument("--log", help="log name") @@ -148,14 +148,14 @@ def get(self, idx): group = parser.add_mutually_exclusive_group() group.add_argument( "--adios", - help="Adios dataset", + help="Adios datasets", action="store_const", dest="format", const="adios", ) group.add_argument( "--pickle", - help="Pickle dataset", + help="Pickle datasets", action="store_const", dest="format", const="pickle", @@ -166,7 +166,7 @@ def get(self, idx): graph_feature_names = ["spectrum"] graph_feature_dim = [37500] dirpwd =
os.path.dirname(os.path.abspath(__file__)) - datafile = os.path.join(dirpwd, "dataset/dftb_aisd_electronic_excitation_spectrum") + datafile = os.path.join(dirpwd, "datasets/dftb_aisd_electronic_excitation_spectrum") ################################################################################################################## input_filename = os.path.join(dirpwd, "dftb_smooth_uv_spectrum.json") ################################################################################################################## @@ -227,7 +227,7 @@ def get(self, idx): config["pna_deg"] = deg ## adios - fname = os.path.join(os.path.dirname(__file__), "./dataset/%s.bp" % modelname) + fname = os.path.join(os.path.dirname(__file__), "./datasets/%s.bp" % modelname) adwriter = AdiosWriter(fname, comm) adwriter.add("trainset", trainset) adwriter.add("valset", valset) @@ -239,7 +239,7 @@ def get(self, idx): ## pickle basedir = os.path.join( - os.path.dirname(__file__), "dataset", "%s.pickle" % modelname + os.path.dirname(__file__), "datasets", "%s.pickle" % modelname ) attrs = dict() attrs["pna_deg"] = deg @@ -283,14 +283,14 @@ def get(self, idx): "ddstore": args.ddstore, "ddstore_width": args.ddstore_width, } - fname = os.path.join(os.path.dirname(__file__), "./dataset/%s.bp" % modelname) + fname = os.path.join(os.path.dirname(__file__), "./datasets/%s.bp" % modelname) trainset = AdiosDataset(fname, "trainset", comm, **opt) valset = AdiosDataset(fname, "valset", comm, **opt) testset = AdiosDataset(fname, "testset", comm, **opt) elif args.format == "pickle": info("Pickle load") basedir = os.path.join( - os.path.dirname(__file__), "dataset", "%s.pickle" % modelname + os.path.dirname(__file__), "datasets", "%s.pickle" % modelname ) trainset = SimplePickleDataset(basedir, "trainset") valset = SimplePickleDataset(basedir, "valset") diff --git a/examples/eam/eam.py b/examples/eam/eam.py index 64a8b804e..7c6340c0c 100644 --- a/examples/eam/eam.py +++ b/examples/eam/eam.py @@ -5,13 +5,15 @@ import argparse import hydragnn -from hydragnn.utils.time_utils import Timer -from hydragnn.utils.config_utils import get_log_name_config +from hydragnn.utils.profiling_and_tracing.time_utils import Timer +from hydragnn.utils.input_config_parsing.config_utils import get_log_name_config from hydragnn.utils.model import print_model -from hydragnn.utils.cfgdataset import CFGDataset -from hydragnn.utils.serializeddataset import SerializedWriter, SerializedDataset +from hydragnn.utils.datasets.cfgdataset import CFGDataset +from hydragnn.utils.datasets.serializeddataset import ( + SerializedWriter, + SerializedDataset, +) from hydragnn.preprocess.load_data import split_dataset -from hydragnn.utils.print_utils import log try: from hydragnn.utils.adiosdataset import AdiosWriter, AdiosDataset @@ -44,14 +46,14 @@ def info(*args, logtype="info", sep=" "): group = parser.add_mutually_exclusive_group() group.add_argument( "--adios", - help="Adios dataset", + help="Adios datasets", action="store_const", dest="format", const="adios", ) group.add_argument( "--pickle", - help="Pickle dataset", + help="Pickle datasets", action="store_const", dest="format", const="pickle", @@ -77,9 +79,9 @@ def info(*args, logtype="info", sep=" "): datefmt="%H:%M:%S", ) - os.environ["SERIALIZED_DATA_PATH"] = dirpwd + "/dataset" + os.environ["SERIALIZED_DATA_PATH"] = dirpwd + "/datasets" datasetname = config["Dataset"]["name"] - fname_adios = dirpwd + "/dataset/%s.bp" % (datasetname) + fname_adios = dirpwd + "/datasets/%s.bp" % (datasetname) config["Dataset"]["name"] = 
"%s_%d" % (datasetname, rank) if not args.loadexistingsplit: total = CFGDataset(config) @@ -93,7 +95,7 @@ def info(*args, logtype="info", sep=" "): if args.format == "adios": fname = os.path.join( - os.path.dirname(__file__), "./dataset/%s.bp" % datasetname + os.path.dirname(__file__), "./datasets/%s.bp" % datasetname ) adwriter = AdiosWriter(fname, MPI.COMM_SELF) adwriter.add("trainset", trainset) @@ -104,7 +106,7 @@ def info(*args, logtype="info", sep=" "): adwriter.save() elif args.format == "pickle": basedir = os.path.join( - os.path.dirname(__file__), "dataset", "serialized_dataset" + os.path.dirname(__file__), "datasets", "serialized_dataset" ) SerializedWriter( trainset, @@ -138,14 +140,16 @@ def info(*args, logtype="info", sep=" "): "preload": True, "shmem": False, } - fname = os.path.join(os.path.dirname(__file__), "./dataset/%s.bp" % datasetname) + fname = os.path.join( + os.path.dirname(__file__), "./datasets/%s.bp" % datasetname + ) trainset = AdiosDataset(fname, "trainset", comm, **opt) valset = AdiosDataset(fname, "valset", comm, **opt) testset = AdiosDataset(fname, "testset", comm, **opt) elif args.format == "pickle": info("Pickle load") basedir = os.path.join( - os.path.dirname(__file__), "dataset", "serialized_dataset" + os.path.dirname(__file__), "datasets", "serialized_dataset" ) trainset = SerializedDataset(basedir, datasetname, "trainset") valset = SerializedDataset(basedir, datasetname, "valset") diff --git a/examples/ising_model/train_ising.py b/examples/ising_model/train_ising.py index 71f3ea135..baa594d19 100644 --- a/examples/ising_model/train_ising.py +++ b/examples/ising_model/train_ising.py @@ -11,15 +11,18 @@ import argparse import hydragnn -from hydragnn.utils.print_utils import print_distributed, iterate_tqdm, log -from hydragnn.utils.time_utils import Timer -from hydragnn.utils.config_utils import get_log_name_config +from hydragnn.utils.print.print_utils import print_distributed, iterate_tqdm, log +from hydragnn.utils.profiling_and_tracing.time_utils import Timer +from hydragnn.utils.input_config_parsing.config_utils import get_log_name_config from hydragnn.preprocess.load_data import split_dataset from hydragnn.utils.model import print_model -from hydragnn.utils.lsmsdataset import LSMSDataset -from hydragnn.utils.distdataset import DistDataset -from hydragnn.utils.pickledataset import SimplePickleWriter, SimplePickleDataset -from hydragnn.preprocess.utils import gather_deg +from hydragnn.utils.datasets.lsmsdataset import LSMSDataset +from hydragnn.utils.datasets.distdataset import DistDataset +from hydragnn.utils.datasets.pickledataset import ( + SimplePickleWriter, + SimplePickleDataset, +) +from hydragnn.preprocess.graph_samples_checks_and_updates import gather_deg import numpy as np @@ -31,8 +34,6 @@ import torch import torch.distributed as dist -import warnings - ## For create_configurations import shutil from sympy.utilities.iterables import multiset_permutations @@ -41,8 +42,8 @@ from create_configurations import E_dimensionless -import hydragnn.utils.tracer as tr -from hydragnn.utils import nsplit +import hydragnn.utils.profiling_and_tracing.tracer as tr +from hydragnn.utils.distributed import nsplit def write_to_file(total_energy, atomic_features, count_config, dir, prefix): @@ -86,7 +87,7 @@ def create_dataset_mpi( os.makedirs(subdir, exist_ok=True) for num_downs in iterate_tqdm( - range(rx.start, rx.stop), verbosity_level=2, desc="Creating dataset" + range(rx.start, rx.stop), verbosity_level=2, desc="Creating datasets" ): prefix = "output_%d_" % 
num_downs subdir = os.path.join(dir, str(num_downs)) @@ -155,21 +156,21 @@ def info(*args, logtype="info", sep=" "): ) parser.add_argument("--seed", type=int, help="seed", default=43) parser.add_argument("--sampling", type=float, help="sampling ratio", default=None) - parser.add_argument("--ddstore", action="store_true", help="ddstore dataset") + parser.add_argument("--ddstore", action="store_true", help="ddstore datasets") parser.add_argument("--ddstore_width", type=int, help="ddstore width", default=None) parser.add_argument("--log", help="log name") parser.add_argument("--everyone", action="store_true", help="gptimer") group = parser.add_mutually_exclusive_group() group.add_argument( "--adios", - help="Adios dataset", + help="Adios datasets", action="store_const", dest="format", const="adios", ) group.add_argument( "--pickle", - help="Pickle dataset", + help="Pickle datasets", action="store_const", dest="format", const="pickle", @@ -219,12 +220,12 @@ def info(*args, logtype="info", sep=" "): """ Parallel ising data generation step: 1. Generate ising data (*.txt) in parallel (create_dataset_mpi) 2. Read raw dataset in parallel (*.txt) (RawDataset) 3. Split into a train, valid, and test set (split_dataset) 4. Save as Adios file in parallel """ sys.setrecursionlimit(1000000) - dir = os.path.join(os.path.dirname(__file__), "./dataset/%s" % modelname) + dir = os.path.join(os.path.dirname(__file__), "./datasets/%s" % modelname) if rank == 0: if os.path.exists(dir): shutil.rmtree(dir) @@ -263,7 +264,7 @@ def info(*args, logtype="info", sep=" "): config["pna_deg"] = deg basedir = os.path.join( - os.path.dirname(__file__), "dataset", "%s.pickle" % modelname + os.path.dirname(__file__), "datasets", "%s.pickle" % modelname ) attrs = dict() attrs["minmax_node_feature"] = total.minmax_node_feature @@ -289,7 +290,7 @@ def info(*args, logtype="info", sep=" "): use_subdir=True, ) - fname = os.path.join(os.path.dirname(__file__), "./dataset/%s.bp" % modelname) + fname = os.path.join(os.path.dirname(__file__), "./datasets/%s.bp" % modelname) adwriter = AdiosWriter(fname, comm) adwriter.add("trainset", trainset) adwriter.add("valset", valset) @@ -314,14 +315,14 @@ def info(*args, logtype="info", sep=" "): "ddstore": args.ddstore, "ddstore_width": args.ddstore_width, } - fname = os.path.join(os.path.dirname(__file__), "./dataset/%s.bp" % modelname) + fname = os.path.join(os.path.dirname(__file__), "./datasets/%s.bp" % modelname) trainset = AdiosDataset(fname, "trainset", comm, **opt) valset = AdiosDataset(fname, "valset", comm, **opt) testset = AdiosDataset(fname, "testset", comm, **opt) elif args.format == "pickle": info("Pickle load") basedir = os.path.join( - os.path.dirname(__file__), "dataset", "%s.pickle" % modelname + os.path.dirname(__file__), "datasets", "%s.pickle" % modelname ) trainset = SimplePickleDataset(basedir, "trainset") valset = SimplePickleDataset(basedir, "valset") diff --git a/examples/lsms/lsms.py b/examples/lsms/lsms.py index 8d6654e1b..dd91069be 100644 --- a/examples/lsms/lsms.py +++ b/examples/lsms/lsms.py @@ -5,13 +5,15 @@ import argparse import hydragnn -from hydragnn.utils.time_utils import Timer -from hydragnn.utils.config_utils import get_log_name_config +from hydragnn.utils.profiling_and_tracing.time_utils import Timer +from hydragnn.utils.input_config_parsing.config_utils import get_log_name_config from hydragnn.utils.model import print_model -from hydragnn.utils.lsmsdataset import LSMSDataset -from
hydragnn.utils.serializeddataset import SerializedWriter, SerializedDataset +from hydragnn.utils.datasets.lsmsdataset import LSMSDataset +from hydragnn.utils.datasets.serializeddataset import ( + SerializedWriter, + SerializedDataset, +) from hydragnn.preprocess.load_data import split_dataset -from hydragnn.utils.print_utils import log try: from hydragnn.utils.adiosdataset import AdiosWriter, AdiosDataset @@ -42,14 +44,14 @@ def info(*args, logtype="info", sep=" "): group = parser.add_mutually_exclusive_group() group.add_argument( "--adios", - help="Adios dataset", + help="Adios datasets", action="store_const", dest="format", const="adios", ) group.add_argument( "--pickle", - help="Pickle dataset", + help="Pickle datasets", action="store_const", dest="format", const="pickle", @@ -92,7 +94,7 @@ def info(*args, logtype="info", sep=" "): if args.format == "adios": fname = os.path.join( - os.path.dirname(__file__), "./dataset/%s.bp" % datasetname + os.path.dirname(__file__), "./datasets/%s.bp" % datasetname ) adwriter = AdiosWriter(fname, MPI.COMM_SELF) adwriter.add("trainset", trainset) @@ -137,7 +139,9 @@ def info(*args, logtype="info", sep=" "): "preload": True, "shmem": False, } - fname = os.path.join(os.path.dirname(__file__), "./dataset/%s.bp" % datasetname) + fname = os.path.join( + os.path.dirname(__file__), "./datasets/%s.bp" % datasetname + ) trainset = AdiosDataset(fname, "trainset", comm, **opt) valset = AdiosDataset(fname, "valset", comm, **opt) testset = AdiosDataset(fname, "testset", comm, **opt) diff --git a/examples/md17/md17.py b/examples/md17/md17.py index 26cf60b98..ca44b5fb9 100644 --- a/examples/md17/md17.py +++ b/examples/md17/md17.py @@ -49,17 +49,17 @@ def md17_pre_filter(data): # Enable print to log file. hydragnn.utils.setup_log(log_name) # Use built-in torch_geometric dataset. # Filter function above used to run quick example. # NOTE: data is moved to the device in the pre-transform. # NOTE: transforms/filters will NOT be re-run unless the qm9/processed/ directory is removed. compute_edges = hydragnn.preprocess.get_radius_graph_config(arch_config) # Fix for MD17 dataset torch_geometric.datasets.MD17.file_names["uracil"] = "md17_uracil.npz" dataset = torch_geometric.datasets.MD17( - root="dataset/md17", + root="datasets/md17", name="uracil", pre_transform=md17_pre_transform, pre_filter=md17_pre_filter, @@ -85,7 +85,7 @@ def md17_pre_filter(data): optimizer, mode="min", factor=0.5, patience=5, min_lr=0.00001 ) -# Run training with the given model and qm9 dataset. +# Run training with the given model and md17 dataset.
writer = hydragnn.utils.get_summary_writer(log_name) hydragnn.utils.save_config(config, log_name) diff --git a/examples/mptrj/train.py b/examples/mptrj/train.py index bd209bd05..5b0f0468c 100644 --- a/examples/mptrj/train.py +++ b/examples/mptrj/train.py @@ -16,18 +16,24 @@ from torch_geometric.transforms import Distance, Spherical, LocalCartesian import hydragnn -from hydragnn.utils.time_utils import Timer +from hydragnn.utils.profiling_and_tracing.time_utils import Timer from hydragnn.utils.model import print_model -from hydragnn.utils.abstractbasedataset import AbstractBaseDataset -from hydragnn.utils.distdataset import DistDataset -from hydragnn.utils.pickledataset import SimplePickleWriter, SimplePickleDataset -from hydragnn.preprocess.utils import gather_deg -from hydragnn.preprocess.utils import RadiusGraph, RadiusGraphPBC +from hydragnn.utils.datasets.abstractbasedataset import AbstractBaseDataset +from hydragnn.utils.datasets.distdataset import DistDataset +from hydragnn.utils.datasets.pickledataset import ( + SimplePickleWriter, + SimplePickleDataset, +) +from hydragnn.preprocess.graph_samples_checks_and_updates import gather_deg +from hydragnn.preprocess.graph_samples_checks_and_updates import ( + RadiusGraph, + RadiusGraphPBC, +) from hydragnn.preprocess.load_data import split_dataset -import hydragnn.utils.tracer as tr +import hydragnn.utils.profiling_and_tracing.tracer as tr -from hydragnn.utils.print_utils import iterate_tqdm, log +from hydragnn.utils.print.print_utils import iterate_tqdm, log from jarvis.db.jsonutils import loadjson, dumpjson from pymatgen.core.structure import Structure @@ -42,7 +48,7 @@ pass import subprocess -from hydragnn.utils import nsplit +from hydragnn.utils.distributed import nsplit def info(*args, logtype="info", sep=" "): @@ -199,7 +205,7 @@ def get(self, idx): type=bool, default=True, ) - parser.add_argument("--ddstore", action="store_true", help="ddstore dataset") + parser.add_argument("--ddstore", action="store_true", help="ddstore datasets") parser.add_argument("--ddstore_width", type=int, help="ddstore width", default=None) parser.add_argument("--shmem", action="store_true", help="shmem") parser.add_argument("--log", help="log name") @@ -214,14 +220,14 @@ def get(self, idx): group = parser.add_mutually_exclusive_group() group.add_argument( "--adios", - help="Adios dataset", + help="Adios datasets", action="store_const", dest="format", const="adios", ) group.add_argument( "--pickle", - help="Pickle dataset", + help="Pickle datasets", action="store_const", dest="format", const="pickle", @@ -234,7 +240,7 @@ def get(self, idx): node_feature_names = ["atomic_number", "cartesian_coordinates", "forces"] node_feature_dims = [1, 3, 3] dirpwd = os.path.dirname(os.path.abspath(__file__)) - datadir = os.path.join(dirpwd, "dataset") + datadir = os.path.join(dirpwd, "datasets") ################################################################################################################## input_filename = os.path.join(dirpwd, args.inputfile) ################################################################################################################## @@ -297,7 +303,7 @@ def get(self, idx): ## adios if args.format == "adios": fname = os.path.join( - os.path.dirname(__file__), "./dataset/%s.bp" % modelname + os.path.dirname(__file__), "./datasets/%s.bp" % modelname ) adwriter = AdiosWriter(fname, comm) adwriter.add("trainset", trainset) @@ -311,7 +317,7 @@ def get(self, idx): ## pickle elif args.format == "pickle": basedir = os.path.join( - 
os.path.dirname(__file__), "dataset", "%s.pickle" % modelname + os.path.dirname(__file__), "datasets", "%s.pickle" % modelname ) attrs = dict() attrs["pna_deg"] = deg @@ -356,14 +362,14 @@ def get(self, idx): "ddstore": args.ddstore, "ddstore_width": args.ddstore_width, } - fname = os.path.join(os.path.dirname(__file__), "./dataset/%s.bp" % modelname) + fname = os.path.join(os.path.dirname(__file__), "./datasets/%s.bp" % modelname) trainset = AdiosDataset(fname, "trainset", comm, **opt, var_config=var_config) valset = AdiosDataset(fname, "valset", comm, **opt, var_config=var_config) testset = AdiosDataset(fname, "testset", comm, **opt, var_config=var_config) elif args.format == "pickle": info("Pickle load") basedir = os.path.join( - os.path.dirname(__file__), "dataset", "%s.pickle" % modelname + os.path.dirname(__file__), "datasets", "%s.pickle" % modelname ) trainset = SimplePickleDataset( basedir=basedir, label="trainset", var_config=var_config diff --git a/examples/multidataset/energy_linear_regression.py b/examples/multidataset/energy_linear_regression.py index e65b5e993..73a15ed3c 100644 --- a/examples/multidataset/energy_linear_regression.py +++ b/examples/multidataset/energy_linear_regression.py @@ -14,13 +14,13 @@ def subset(i): - # sz = len(dataset) + # sz = len(datasets) # chunk = sz // C.procs # left = sz % C.procs # a = i*chunk + min(i, left) # b = (i+1)*chunk + min(i+1, left) # print(f"Rank {i}/{C.procs} converting subset [{a},{b})") - # return np.array([np.array(x) for x in dataset[a:b]["image"]]) + # return np.array([np.array(x) for x in datasets[a:b]["image"]]) return np.random.random((100, 4)) @@ -101,7 +101,7 @@ def solve_least_squares_svd(A, b): comm_rank = comm.Get_rank() comm_size = comm.Get_size() - fname = os.path.join(os.path.dirname(__file__), "./dataset/%s.bp" % args.modelname) + fname = os.path.join(os.path.dirname(__file__), "./datasets/%s.bp" % args.modelname) print("fname:", fname) trainset = AdiosDataset( fname, @@ -123,7 +123,7 @@ def solve_least_squares_svd(A, b): ) pna_deg = trainset.pna_deg - ## Iterate over local dataset + ## Iterate over local datasets energy_list = list() feature_list = list() for dataset in [trainset, valset, testset]: @@ -205,7 +205,7 @@ def solve_least_squares_svd(A, b): ## Writing fname = os.path.join( - os.path.dirname(__file__), "./dataset/%s-v2.bp" % args.modelname + os.path.dirname(__file__), "./datasets/%s-v2.bp" % args.modelname ) if comm_rank == 0: print("Saving:", fname) diff --git a/examples/multidataset/train.py b/examples/multidataset/train.py index 0f5c347a9..210f2a0ae 100644 --- a/examples/multidataset/train.py +++ b/examples/multidataset/train.py @@ -8,15 +8,15 @@ import numpy as np import hydragnn -from hydragnn.utils.time_utils import Timer +from hydragnn.utils.profiling_and_tracing.time_utils import Timer from hydragnn.utils.model import print_model -from hydragnn.utils.distdataset import DistDataset -from hydragnn.utils.pickledataset import SimplePickleDataset +from hydragnn.utils.datasets.distdataset import DistDataset +from hydragnn.utils.datasets.pickledataset import SimplePickleDataset -import hydragnn.utils.tracer as tr +import hydragnn.utils.profiling_and_tracing.tracer as tr -from hydragnn.utils.print_utils import log, log0 -from hydragnn.utils import nsplit +from hydragnn.utils.print.print_utils import log, log0 +from hydragnn.utils.distributed import nsplit try: from hydragnn.utils.adiosdataset import AdiosDataset @@ -41,7 +41,7 @@ def info(*args, logtype="info", sep=" "): parser.add_argument( 
"--inputfile", help="input file", type=str, default="gfm_multitasking.json" ) - parser.add_argument("--ddstore", action="store_true", help="ddstore dataset") + parser.add_argument("--ddstore", action="store_true", help="ddstore datasets") parser.add_argument("--ddstore_width", type=int, help="ddstore width", default=None) parser.add_argument("--shmem", action="store_true", help="shmem") parser.add_argument("--log", help="log name") @@ -68,21 +68,21 @@ def info(*args, logtype="info", sep=" "): group = parser.add_mutually_exclusive_group() group.add_argument( "--adios", - help="Adios dataset", + help="Adios datasets", action="store_const", dest="format", const="adios", ) group.add_argument( "--pickle", - help="Pickle dataset", + help="Pickle datasets", action="store_const", dest="format", const="pickle", ) group.add_argument( "--multi", - help="Multi dataset", + help="Multi datasets", action="store_const", dest="format", const="multi", @@ -95,7 +95,7 @@ def info(*args, logtype="info", sep=" "): node_feature_names = ["atomic_number", "cartesian_coordinates", "forces"] node_feature_dims = [1, 3, 3] dirpwd = os.path.dirname(os.path.abspath(__file__)) - datadir = os.path.join(dirpwd, "dataset") + datadir = os.path.join(dirpwd, "datasets") ################################################################################################################## input_filename = os.path.join(dirpwd, args.inputfile) ################################################################################################################## @@ -151,14 +151,14 @@ def info(*args, logtype="info", sep=" "): "ddstore": args.ddstore, "ddstore_width": args.ddstore_width, } - fname = os.path.join(os.path.dirname(__file__), "./dataset/%s.bp" % modelname) + fname = os.path.join(os.path.dirname(__file__), "./datasets/%s.bp" % modelname) trainset = AdiosDataset(fname, "trainset", comm, **opt, var_config=var_config) valset = AdiosDataset(fname, "valset", comm, **opt, var_config=var_config) testset = AdiosDataset(fname, "testset", comm, **opt, var_config=var_config) elif args.format == "pickle": info("Pickle load") basedir = os.path.join( - os.path.dirname(__file__), "dataset", "%s.pickle" % modelname + os.path.dirname(__file__), "datasets", "%s.pickle" % modelname ) trainset = SimplePickleDataset( basedir=basedir, label="trainset", var_config=var_config @@ -189,7 +189,7 @@ def info(*args, logtype="info", sep=" "): pna_deg_list = list() for model in modellist: fname = os.path.join( - os.path.dirname(__file__), "./dataset/%s.bp" % model + os.path.dirname(__file__), "./datasets/%s.bp" % model ) with ad2.open(fname, "r", MPI.COMM_SELF) as f: f.__next__() @@ -254,7 +254,7 @@ def info(*args, logtype="info", sep=" "): "pos", "y", ] - fname = os.path.join(os.path.dirname(__file__), "./dataset/%s.bp" % mymodel) + fname = os.path.join(os.path.dirname(__file__), "./datasets/%s.bp" % mymodel) trainset = AdiosDataset( fname, "trainset", diff --git a/examples/multidataset_hpo/gfm.py b/examples/multidataset_hpo/gfm.py index 25792daf0..9d555d1d4 100644 --- a/examples/multidataset_hpo/gfm.py +++ b/examples/multidataset_hpo/gfm.py @@ -8,15 +8,15 @@ import numpy as np import hydragnn -from hydragnn.utils.time_utils import Timer +from hydragnn.utils.profiling_and_tracing.time_utils import Timer from hydragnn.utils.model import print_model -from hydragnn.utils.distdataset import DistDataset -from hydragnn.utils.pickledataset import SimplePickleDataset +from hydragnn.utils.datasets.distdataset import DistDataset +from 
hydragnn.utils.datasets.pickledataset import SimplePickleDataset -import hydragnn.utils.tracer as tr +import hydragnn.utils.profiling_and_tracing.tracer as tr -from hydragnn.utils.print_utils import log -from hydragnn.utils import nsplit +from hydragnn.utils.print.print_utils import log +from hydragnn.utils.distributed import nsplit try: from hydragnn.utils.adiosdataset import AdiosDataset @@ -49,7 +49,7 @@ def main(): parser.add_argument("--num_headlayers", type=int, help="num_headlayers", default=2) parser.add_argument("--dim_headlayers", type=int, help="dim_headlayers", default=10) - parser.add_argument("--ddstore", action="store_true", help="ddstore dataset") + parser.add_argument("--ddstore", action="store_true", help="ddstore datasets") parser.add_argument("--ddstore_width", type=int, help="ddstore width", default=None) parser.add_argument("--shmem", action="store_true", help="shmem") parser.add_argument("--log", help="log name", default="gfm_test") @@ -70,21 +70,21 @@ def main(): group = parser.add_mutually_exclusive_group() group.add_argument( "--adios", - help="Adios dataset", + help="Adios datasets", action="store_const", dest="format", const="adios", ) group.add_argument( "--pickle", - help="Pickle dataset", + help="Pickle datasets", action="store_const", dest="format", const="pickle", ) group.add_argument( "--multi", - help="Multi dataset", + help="Multi datasets", action="store_const", dest="format", const="multi", @@ -98,7 +98,7 @@ def main(): node_feature_names = ["atomic_number", "cartesian_coordinates", "forces"] node_feature_dims = [1, 3, 3] dirpwd = os.path.dirname(os.path.abspath(__file__)) - datadir = os.path.join(dirpwd, "dataset") + datadir = os.path.join(dirpwd, "datasets") ################################################################################################################## input_filename = os.path.join(dirpwd, args.inputfile) ################################################################################################################## @@ -181,14 +181,14 @@ def main(): "ddstore": args.ddstore, "ddstore_width": args.ddstore_width, } - fname = os.path.join(os.path.dirname(__file__), "./dataset/%s.bp" % modelname) + fname = os.path.join(os.path.dirname(__file__), "./datasets/%s.bp" % modelname) trainset = AdiosDataset(fname, "trainset", comm, **opt, var_config=var_config) valset = AdiosDataset(fname, "valset", comm, **opt, var_config=var_config) testset = AdiosDataset(fname, "testset", comm, **opt, var_config=var_config) elif args.format == "pickle": info("Pickle load") basedir = os.path.join( - os.path.dirname(__file__), "dataset", "%s.pickle" % modelname + os.path.dirname(__file__), "datasets", "%s.pickle" % modelname ) trainset = SimplePickleDataset( basedir=basedir, label="trainset", var_config=var_config @@ -219,7 +219,7 @@ def main(): pna_deg_list = list() for model in modellist: fname = os.path.join( - os.path.dirname(__file__), "./dataset/%s.bp" % model + os.path.dirname(__file__), "./datasets/%s.bp" % model ) with ad2.open(fname, "r", MPI.COMM_SELF) as f: f.__next__() @@ -284,7 +284,7 @@ def main(): "pos", "y", ] - fname = os.path.join(os.path.dirname(__file__), "./dataset/%s.bp" % mymodel) + fname = os.path.join(os.path.dirname(__file__), "./datasets/%s.bp" % mymodel) trainset = AdiosDataset( fname, "trainset", diff --git a/examples/ogb/train_gap.py b/examples/ogb/train_gap.py index 78898e4c4..1ac6f309c 100644 --- a/examples/ogb/train_gap.py +++ b/examples/ogb/train_gap.py @@ -8,25 +8,26 @@ import sys from tqdm import tqdm from mpi4py import 
MPI -from itertools import chain import argparse -import time import math import hydragnn from hydragnn.preprocess.load_data import split_dataset -from hydragnn.utils.print_utils import print_distributed, iterate_tqdm -from hydragnn.utils.time_utils import Timer -from hydragnn.utils.pickledataset import SimplePickleWriter, SimplePickleDataset -from hydragnn.preprocess.utils import gather_deg +from hydragnn.utils.print.print_utils import print_distributed +from hydragnn.utils.profiling_and_tracing.time_utils import Timer +from hydragnn.utils.datasets.pickledataset import ( + SimplePickleWriter, + SimplePickleDataset, +) +from hydragnn.preprocess.graph_samples_checks_and_updates import gather_deg from hydragnn.utils.model import print_model -from hydragnn.utils.smiles_utils import ( +from hydragnn.utils.descriptors_and_embeddings.smiles_utils import ( get_node_attribute_name, generate_graphdata_from_smilestr, ) -from hydragnn.utils.config_utils import parse_deepspeed_config +from hydragnn.utils.input_config_parsing.config_utils import parse_deepspeed_config from hydragnn.utils.distributed import get_deepspeed_init_args -from hydragnn.utils import nsplit +from hydragnn.utils.distributed import nsplit import numpy as np @@ -132,7 +133,7 @@ def smiles_to_graph(datadir, files_list): class OGBDataset(AbstractBaseDataset): """OGBDataset dataset class""" def __init__(self, dirpath, var_config, dist=False): super().__init__() @@ -260,20 +261,20 @@ def __getitem__(self, idx): group = parser.add_mutually_exclusive_group() group.add_argument( "--adios", - help="Adios dataset", + help="Adios datasets", action="store_const", dest="format", const="adios", ) group.add_argument( "--pickle", - help="Pickle dataset", + help="Pickle datasets", action="store_const", dest="format", const="pickle", ) group.add_argument( - "--csv", help="CSV dataset", action="store_const", dest="format", const="csv" + "--csv", help="CSV datasets", action="store_const", dest="format", const="csv" ) parser.add_argument( "--use_deepspeed", @@ -287,7 +288,7 @@ def __getitem__(self, idx): graph_feature_names = ["GAP"] graph_feature_dim = [1] dirpwd = os.path.dirname(os.path.abspath(__file__)) - datadir = os.path.join(dirpwd, "dataset/") + datadir = os.path.join(dirpwd, "datasets/") ################################################################################################################## inputfilesubstr = args.inputfilesubstr input_filename = os.path.join(dirpwd, "ogb_" + inputfilesubstr + ".json") @@ -355,7 +356,7 @@ def __getitem__(self, idx): ## pickle basedir = os.path.join( - os.path.dirname(__file__), "dataset", "%s.pickle" % modelname + os.path.dirname(__file__), "datasets", "%s.pickle" % modelname ) attrs = dict() attrs["pna_deg"] = deg @@ -386,7 +387,7 @@ def __getitem__(self, idx): ) if args.format == "adios": - fname = os.path.join(os.path.dirname(__file__), "dataset", "ogb_gap.bp") + fname = os.path.join(os.path.dirname(__file__), "datasets", "ogb_gap.bp") adwriter = AdiosWriter(fname, comm) adwriter.add("trainset", trainset) adwriter.add("valset", valset) @@ -402,12 +403,12 @@ def __getitem__(self, idx): opt = {"preload": True, "shmem": False} if args.shmem: opt = {"preload": False, "shmem": True} - fname = os.path.join(os.path.dirname(__file__), "dataset", "ogb_gap.bp") + fname = os.path.join(os.path.dirname(__file__), "datasets", "ogb_gap.bp") trainset = AdiosDataset(fname, "trainset", comm, **opt) valset = AdiosDataset(fname, "valset", comm, **opt) testset = AdiosDataset(fname,
"testset", comm, **opt) elif args.format == "csv": - fname = os.path.join(os.path.dirname(__file__), "dataset", "pcqm4m_gap.csv") + fname = os.path.join(os.path.dirname(__file__), "datasets", "pcqm4m_gap.csv") fact = OGBRawDatasetFactory( fname, var_config=var_config, sampling=args.sampling ) @@ -417,7 +418,7 @@ def __getitem__(self, idx): elif args.format == "pickle": info("Pickle load") basedir = os.path.join( - os.path.dirname(__file__), "dataset", "%s.pickle" % modelname + os.path.dirname(__file__), "datasets", "%s.pickle" % modelname ) trainset = SimplePickleDataset( basedir=basedir, label="trainset", var_config=var_config diff --git a/examples/open_catalyst_2020/download_dataset.py b/examples/open_catalyst_2020/download_dataset.py index 99865f3ae..c938a07dc 100644 --- a/examples/open_catalyst_2020/download_dataset.py +++ b/examples/open_catalyst_2020/download_dataset.py @@ -140,8 +140,8 @@ def cleanup(filename, dirname): parser.add_argument( "--data-path", type=str, - default="./dataset", - help="Specify path to save dataset. Defaults to './dataset'", + default="./datasets", + help="Specify path to save datasets. Defaults to './datasets'", ) args, _ = parser.parse_known_args() diff --git a/examples/open_catalyst_2020/train.py b/examples/open_catalyst_2020/train.py index 07540a5ea..45c54704c 100644 --- a/examples/open_catalyst_2020/train.py +++ b/examples/open_catalyst_2020/train.py @@ -14,17 +14,20 @@ from torch_geometric.data import Data import hydragnn -from hydragnn.utils.time_utils import Timer +from hydragnn.utils.profiling_and_tracing.time_utils import Timer from hydragnn.utils.model import print_model -from hydragnn.utils.abstractbasedataset import AbstractBaseDataset -from hydragnn.utils.distdataset import DistDataset -from hydragnn.utils.pickledataset import SimplePickleWriter, SimplePickleDataset -from hydragnn.preprocess.utils import gather_deg +from hydragnn.utils.datasets.abstractbasedataset import AbstractBaseDataset +from hydragnn.utils.datasets.distdataset import DistDataset +from hydragnn.utils.datasets.pickledataset import ( + SimplePickleWriter, + SimplePickleDataset, +) +from hydragnn.preprocess.graph_samples_checks_and_updates import gather_deg from hydragnn.preprocess.load_data import split_dataset -import hydragnn.utils.tracer as tr +import hydragnn.utils.profiling_and_tracing.tracer as tr -from hydragnn.utils.print_utils import iterate_tqdm, log +from hydragnn.utils.print.print_utils import iterate_tqdm, log from utils.atoms_to_graphs import AtomsToGraphs from utils.preprocess import write_images_to_adios @@ -151,7 +154,7 @@ def get(self, idx): type=bool, default=True, ) - parser.add_argument("--ddstore", action="store_true", help="ddstore dataset") + parser.add_argument("--ddstore", action="store_true", help="ddstore datasets") parser.add_argument("--ddstore_width", type=int, help="ddstore width", default=None) parser.add_argument("--shmem", action="store_true", help="shmem") parser.add_argument("--log", help="log name") @@ -163,14 +166,14 @@ def get(self, idx): group = parser.add_mutually_exclusive_group() group.add_argument( "--adios", - help="Adios dataset", + help="Adios datasets", action="store_const", dest="format", const="adios", ) group.add_argument( "--pickle", - help="Pickle dataset", + help="Pickle datasets", action="store_const", dest="format", const="pickle", @@ -255,7 +258,7 @@ def get(self, idx): ## adios if args.format == "adios": fname = os.path.join( - os.path.dirname(__file__), "./dataset/%s.bp" % modelname + os.path.dirname(__file__), 
"./datasets/%s.bp" % modelname ) adwriter = AdiosWriter(fname, comm) adwriter.add("trainset", trainset) @@ -314,7 +317,7 @@ def get(self, idx): "ddstore": args.ddstore, "ddstore_width": args.ddstore_width, } - fname = os.path.join(os.path.dirname(__file__), "./dataset/%s.bp" % modelname) + fname = os.path.join(os.path.dirname(__file__), "./datasets/%s.bp" % modelname) trainset = AdiosDataset(fname, "trainset", comm, **opt, var_config=var_config) valset = AdiosDataset(fname, "valset", comm, **opt, var_config=var_config) testset = AdiosDataset(fname, "testset", comm, **opt, var_config=var_config) diff --git a/examples/open_catalyst_2020/uncompress.py b/examples/open_catalyst_2020/uncompress.py index 49f223c81..8f7a92328 100644 --- a/examples/open_catalyst_2020/uncompress.py +++ b/examples/open_catalyst_2020/uncompress.py @@ -28,7 +28,7 @@ def decompress_list_of_files(ip_op_pair: Tuple[str, str]) -> None: def get_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser() parser.add_argument( - "--ipdir", type=str, help="Path to compressed dataset directory" + "--ipdir", type=str, help="Path to compressed datasets directory" ) parser.add_argument( "--opdir", type=str, help="Directory path to uncompress files to" diff --git a/examples/open_catalyst_2020/utils/atoms_to_graphs.py b/examples/open_catalyst_2020/utils/atoms_to_graphs.py index 10222757c..a0edc97ce 100644 --- a/examples/open_catalyst_2020/utils/atoms_to_graphs.py +++ b/examples/open_catalyst_2020/utils/atoms_to_graphs.py @@ -15,7 +15,10 @@ from torch_geometric.data import Data from torch_geometric.transforms import Distance, Spherical, LocalCartesian -from hydragnn.preprocess.utils import RadiusGraph, RadiusGraphPBC +from hydragnn.preprocess.graph_samples_checks_and_updates import ( + RadiusGraph, + RadiusGraphPBC, +) # transform_coordinates = Spherical(norm=False, cat=False) # transform_coordinates = LocalCartesian(norm=False, cat=False) diff --git a/examples/open_catalyst_2022/train.py b/examples/open_catalyst_2022/train.py index 83a8f80e8..f4304bb48 100644 --- a/examples/open_catalyst_2022/train.py +++ b/examples/open_catalyst_2022/train.py @@ -16,19 +16,25 @@ from torch_geometric.transforms import Distance, Spherical, LocalCartesian import hydragnn -from hydragnn.utils.time_utils import Timer +from hydragnn.utils.profiling_and_tracing.time_utils import Timer from hydragnn.utils.model import print_model -from hydragnn.utils.abstractbasedataset import AbstractBaseDataset -from hydragnn.utils.distdataset import DistDataset -from hydragnn.utils.pickledataset import SimplePickleWriter, SimplePickleDataset -from hydragnn.preprocess.utils import gather_deg +from hydragnn.utils.datasets.abstractbasedataset import AbstractBaseDataset +from hydragnn.utils.datasets.distdataset import DistDataset +from hydragnn.utils.datasets.pickledataset import ( + SimplePickleWriter, + SimplePickleDataset, +) +from hydragnn.preprocess.graph_samples_checks_and_updates import gather_deg from hydragnn.preprocess.load_data import split_dataset -import hydragnn.utils.tracer as tr +import hydragnn.utils.profiling_and_tracing.tracer as tr -from hydragnn.utils.print_utils import iterate_tqdm, log +from hydragnn.utils.print.print_utils import iterate_tqdm, log -from hydragnn.preprocess.utils import RadiusGraph, RadiusGraphPBC +from hydragnn.preprocess.graph_samples_checks_and_updates import ( + RadiusGraph, + RadiusGraphPBC, +) from ase.io import read @@ -37,8 +43,7 @@ except ImportError: pass -import subprocess -from hydragnn.utils import nsplit +from 
hydragnn.utils.distributed import nsplit def info(*args, logtype="info", sep=" "): @@ -204,7 +209,7 @@ def get(self, idx): type=bool, default=True, ) - parser.add_argument("--ddstore", action="store_true", help="ddstore dataset") + parser.add_argument("--ddstore", action="store_true", help="ddstore dataset") parser.add_argument("--ddstore_width", type=int, help="ddstore width", default=None) parser.add_argument("--shmem", action="store_true", help="shmem") parser.add_argument("--log", help="log name") @@ -215,14 +220,14 @@ def get(self, idx): group = parser.add_mutually_exclusive_group() group.add_argument( "--adios", - help="Adios dataset", + help="Adios dataset", action="store_const", dest="format", const="adios", ) group.add_argument( "--pickle", - help="Pickle dataset", + help="Pickle dataset", action="store_const", dest="format", const="pickle", @@ -235,7 +240,7 @@ def get(self, idx): node_feature_names = ["atomic_number", "cartesian_coordinates", "forces"] node_feature_dims = [1, 3, 3] dirpwd = os.path.dirname(os.path.abspath(__file__)) - datadir = os.path.join(dirpwd, "dataset") + datadir = os.path.join(dirpwd, "datasets") ################################################################################################################## input_filename = os.path.join(dirpwd, args.inputfile) ################################################################################################################## @@ -311,7 +316,7 @@ def get(self, idx): ## adios if args.format == "adios": fname = os.path.join( - os.path.dirname(__file__), "./dataset/%s.bp" % modelname + os.path.dirname(__file__), "./datasets/%s.bp" % modelname ) adwriter = AdiosWriter(fname, comm) adwriter.add("trainset", trainset) @@ -325,7 +330,7 @@ def get(self, idx): ## pickle elif args.format == "pickle": basedir = os.path.join( - os.path.dirname(__file__), "dataset", "%s.pickle" % modelname + os.path.dirname(__file__), "datasets", "%s.pickle" % modelname ) attrs = dict() attrs["pna_deg"] = deg @@ -370,14 +375,14 @@ def get(self, idx): "ddstore": args.ddstore, "ddstore_width": args.ddstore_width, } - fname = os.path.join(os.path.dirname(__file__), "./dataset/%s.bp" % modelname) + fname = os.path.join(os.path.dirname(__file__), "./datasets/%s.bp" % modelname) trainset = AdiosDataset(fname, "trainset", comm, **opt, var_config=var_config) valset = AdiosDataset(fname, "valset", comm, **opt, var_config=var_config) testset = AdiosDataset(fname, "testset", comm, **opt, var_config=var_config) elif args.format == "pickle": info("Pickle load") basedir = os.path.join( - os.path.dirname(__file__), "dataset", "%s.pickle" % modelname + os.path.dirname(__file__), "datasets", "%s.pickle" % modelname ) trainset = SimplePickleDataset( basedir=basedir, label="trainset", var_config=var_config diff --git a/examples/qm7x/train.py b/examples/qm7x/train.py index 936c29464..e6d93f3da 100644 --- a/examples/qm7x/train.py +++ b/examples/qm7x/train.py @@ -14,14 +14,17 @@ import argparse import hydragnn -from hydragnn.utils.print_utils import iterate_tqdm, log -from hydragnn.utils.time_utils import Timer +from hydragnn.utils.print.print_utils import iterate_tqdm, log +from hydragnn.utils.profiling_and_tracing.time_utils import Timer from hydragnn.utils.distributed import get_device from hydragnn.preprocess.load_data import split_dataset -from hydragnn.utils.distdataset import DistDataset -from hydragnn.utils.pickledataset import SimplePickleWriter, SimplePickleDataset -from hydragnn.preprocess.utils import gather_deg +from 
hydragnn.utils.datasets.distdataset import DistDataset +from hydragnn.utils.datasets.pickledataset import ( + SimplePickleWriter, + SimplePickleDataset, +) +from hydragnn.preprocess.graph_samples_checks_and_updates import gather_deg import numpy as np @@ -35,8 +38,8 @@ except ImportError: pass -from hydragnn.utils import nsplit -import hydragnn.utils.tracer as tr +from hydragnn.utils.distributed import nsplit +import hydragnn.utils.profiling_and_tracing.tracer as tr # FIXME: this works fine for now because we train on QM7-X molecules # for larger chemical spaces, the following atom representation has to be properly expanded @@ -66,7 +69,7 @@ def info(*args, logtype="info", sep=" "): class QM7XDataset(AbstractBaseDataset): - """QM7-XDataset dataset class""" + """QM7-X dataset class""" def __init__(self, dirpath, var_config, energy_per_atom=True, dist=False): super().__init__() @@ -192,7 +195,7 @@ def hdf5_to_graph(self, fMOL, molid): # check forces values assert self.check_forces_values( forces - ), f"qm7x dataset - molid:{molid} - confid:{confid} - L2-norm of atomic forces exceeds {self.forces_norm_threshold}" + ), f"qm7x dataset - molid:{molid} - confid:{confid} - L2-norm of atomic forces exceeds {self.forces_norm_threshold}" if self.energy_per_atom: energy = EPBE0 / natoms @@ -241,7 +244,7 @@ def get(self, idx): default=True, ) - parser.add_argument("--ddstore", action="store_true", help="ddstore dataset") + parser.add_argument("--ddstore", action="store_true", help="ddstore dataset") parser.add_argument("--ddstore_width", type=int, help="ddstore width", default=None) parser.add_argument("--shmem", action="store_true", help="shmem") parser.add_argument("--log", help="log name") @@ -251,14 +254,14 @@ def get(self, idx): group = parser.add_mutually_exclusive_group() group.add_argument( "--adios", - help="Adios dataset", + help="Adios dataset", action="store_const", dest="format", const="adios", ) group.add_argument( "--pickle", - help="Pickle dataset", + help="Pickle dataset", action="store_const", dest="format", const="pickle", @@ -341,7 +344,7 @@ def get(self, idx): ## adios if args.format == "adios": fname = os.path.join( - os.path.dirname(__file__), "./dataset/%s.bp" % modelname + os.path.dirname(__file__), "./datasets/%s.bp" % modelname ) adwriter = AdiosWriter(fname, comm) adwriter.add("trainset", trainset) @@ -400,7 +403,7 @@ def get(self, idx): "ddstore": args.ddstore, "ddstore_width": args.ddstore_width, } - fname = os.path.join(os.path.dirname(__file__), "./dataset/%s.bp" % modelname) + fname = os.path.join(os.path.dirname(__file__), "./datasets/%s.bp" % modelname) trainset = AdiosDataset(fname, "trainset", comm, **opt, var_config=var_config) valset = AdiosDataset(fname, "valset", comm, **opt, var_config=var_config) testset = AdiosDataset(fname, "testset", comm, **opt, var_config=var_config) diff --git a/examples/qm9/qm9.py b/examples/qm9/qm9.py index 9a9de7830..b5c970793 100644 --- a/examples/qm9/qm9.py +++ b/examples/qm9/qm9.py @@ -48,12 +48,12 @@ def qm9_pre_filter(data): # Enable print to log file. hydragnn.utils.setup_log(log_name) -# Use built-in torch_geometric dataset. +# Use built-in torch_geometric dataset. # Filter function above used to run quick example. # NOTE: data is moved to the device in the pre-transform. # NOTE: transforms/filters will NOT be re-run unless the qm9/processed/ directory is removed. 
dataset = torch_geometric.datasets.QM9( - root="dataset/qm9", pre_transform=qm9_pre_transform, pre_filter=qm9_pre_filter + root="datasets/qm9", pre_transform=qm9_pre_transform, pre_filter=qm9_pre_filter ) train, val, test = hydragnn.preprocess.split_dataset( dataset, config["NeuralNetwork"]["Training"]["perc_train"], False @@ -76,7 +76,7 @@ def qm9_pre_filter(data): optimizer, mode="min", factor=0.5, patience=5, min_lr=0.00001 ) -# Run training with the given model and qm9 dataset. +# Run training with the given model and qm9 dataset. writer = hydragnn.utils.get_summary_writer(log_name) hydragnn.utils.save_config(config, log_name) diff --git a/examples/qm9_hpo/qm9.py b/examples/qm9_hpo/qm9.py index 61ef2376e..83bdf1a83 100644 --- a/examples/qm9_hpo/qm9.py +++ b/examples/qm9_hpo/qm9.py @@ -75,12 +75,12 @@ def qm9_pre_filter(data): # Enable print to log file. hydragnn.utils.setup_log(log_name) -# Use built-in torch_geometric dataset. +# Use built-in torch_geometric dataset. # Filter function above used to run quick example. # NOTE: data is moved to the device in the pre-transform. # NOTE: transforms/filters will NOT be re-run unless the qm9/processed/ directory is removed. dataset = torch_geometric.datasets.QM9( - root="dataset/qm9", pre_transform=qm9_pre_transform, pre_filter=qm9_pre_filter + root="datasets/qm9", pre_transform=qm9_pre_transform, pre_filter=qm9_pre_filter ) train, val, test = hydragnn.preprocess.split_dataset( dataset, config["NeuralNetwork"]["Training"]["perc_train"], False @@ -103,7 +103,7 @@ def qm9_pre_filter(data): optimizer, mode="min", factor=0.5, patience=5, min_lr=0.00001 ) -# Run training with the given model and qm9 dataset. +# Run training with the given model and qm9 dataset. writer = hydragnn.utils.get_summary_writer(log_name) hydragnn.utils.save_config(config, log_name) diff --git a/examples/qm9_hpo/qm9_deephyper.py b/examples/qm9_hpo/qm9_deephyper.py index cb4019cf4..f8ab91826 100644 --- a/examples/qm9_hpo/qm9_deephyper.py +++ b/examples/qm9_hpo/qm9_deephyper.py @@ -135,12 +135,12 @@ def run(trial): log_name = "qm9" - # Use built-in torch_geometric dataset. + # Use built-in torch_geometric dataset. # Filter function above used to run quick example. # NOTE: data is moved to the device in the pre-transform. # NOTE: transforms/filters will NOT be re-run unless the qm9/processed/ directory is removed. dataset = torch_geometric.datasets.QM9( - root="dataset/qm9", pre_transform=qm9_pre_transform + root="datasets/qm9", pre_transform=qm9_pre_transform ) trainset, valset, testset = hydragnn.preprocess.split_dataset(dataset, 0.8, False) diff --git a/examples/qm9_hpo/qm9_optuna.py b/examples/qm9_hpo/qm9_optuna.py index 64403ac29..07057bf60 100644 --- a/examples/qm9_hpo/qm9_optuna.py +++ b/examples/qm9_hpo/qm9_optuna.py @@ -171,12 +171,12 @@ def objective(trial): # Enable print to log file. hydragnn.utils.setup_log(log_name) - # Use built-in torch_geometric dataset. + # Use built-in torch_geometric dataset. # Filter function above used to run quick example. # NOTE: data is moved to the device in the pre-transform. # NOTE: transforms/filters will NOT be re-run unless the qm9/processed/ directory is removed. 
dataset = torch_geometric.datasets.QM9( - root="dataset/qm9", pre_transform=qm9_pre_transform + root="datasets/qm9", pre_transform=qm9_pre_transform ) trainset, valset, testset = hydragnn.preprocess.split_dataset( dataset, config["NeuralNetwork"]["Training"]["perc_train"], False diff --git a/hydragnn/__init__.py b/hydragnn/__init__.py index acbfaa786..b008f952e 100644 --- a/hydragnn/__init__.py +++ b/hydragnn/__init__.py @@ -1,3 +1,2 @@ -from . import preprocess, models, train, postprocess, utils from .run_training import run_training from .run_prediction import run_prediction diff --git a/hydragnn/models/Base.py b/hydragnn/models/Base.py index 254461b76..51e1c3cda 100644 --- a/hydragnn/models/Base.py +++ b/hydragnn/models/Base.py @@ -263,7 +263,7 @@ def _multihead(self): assert ( self.num_nodes is not None ), "num_nodes must be positive integer for MLP" - # """if different graphs in the dataset have different size, one MLP is shared across all nodes """ + # """if different graphs in the dataset have different sizes, one MLP is shared across all nodes """ head_NN = MLPNode( self.hidden_dim, self.head_dims[ihead] * (1 + self.var_output), diff --git a/hydragnn/preprocess/__init__.py b/hydragnn/preprocess/__init__.py index c5ce6ac20..ccc0d2bb0 100644 --- a/hydragnn/preprocess/__init__.py +++ b/hydragnn/preprocess/__init__.py @@ -1,6 +1,6 @@ from .dataset_descriptors import AtomFeatures, StructureFeatures -from .utils import ( +from .graph_samples_checks_and_updates import ( check_if_graph_size_variable, check_if_graph_size_variable_dist, get_radius_graph, @@ -10,9 +10,10 @@ RadiusGraphPBC, update_predicted_values, update_atom_features, - stratified_sampling, ) +from .stratified_sampling import stratified_sampling + from .load_data import ( dataset_loading_and_splitting, create_dataloaders, @@ -26,4 +27,3 @@ ) from .lsms_raw_dataset_loader import LSMS_RawDataLoader from .cfg_raw_dataset_loader import CFG_RawDataLoader -from .compositional_data_splitting import compositional_stratified_splitting diff --git a/hydragnn/preprocess/cfg_raw_dataset_loader.py b/hydragnn/preprocess/cfg_raw_dataset_loader.py index b5043abb1..32f44c867 100644 --- a/hydragnn/preprocess/cfg_raw_dataset_loader.py +++ b/hydragnn/preprocess/cfg_raw_dataset_loader.py @@ -12,7 +12,6 @@ import os import numpy as np -import torch from torch_geometric.data import Data from torch import tensor diff --git a/hydragnn/preprocess/utils.py b/hydragnn/preprocess/graph_samples_checks_and_updates.py similarity index 85% rename from hydragnn/preprocess/utils.py rename to hydragnn/preprocess/graph_samples_checks_and_updates.py index 3533756d2..b4162d742 100644 --- a/hydragnn/preprocess/utils.py +++ b/hydragnn/preprocess/graph_samples_checks_and_updates.py @@ -20,7 +20,7 @@ from .dataset_descriptors import AtomFeatures -## This function can be slow if dataset is too large. Use with caution. +## This function can be slow if the dataset is too large. Use with caution. 
## Recommend to use check_if_graph_size_variable_dist def check_if_graph_size_variable(train_loader, val_loader, test_loader): backend = os.getenv("HYDRAGNN_AGGR_BACKEND", "torch") @@ -175,7 +175,7 @@ def __repr__(self) -> str: def gather_deg(dataset): - from hydragnn.utils.print_utils import iterate_tqdm + from hydragnn.utils.print.print_utils import iterate_tqdm backend = os.getenv("HYDRAGNN_AGGR_BACKEND", "torch") if backend == "torch": @@ -197,7 +197,7 @@ def gather_deg(dataset): def gather_deg_dist(dataset): import torch.distributed as dist - from hydragnn.utils.print_utils import iterate_tqdm + from hydragnn.utils.print.print_utils import iterate_tqdm from hydragnn.utils.distributed import get_device max_deg = 0 @@ -218,7 +218,7 @@ def gather_deg_dist(dataset): def gather_deg_mpi(dataset): from mpi4py import MPI - from hydragnn.utils.print_utils import iterate_tqdm + from hydragnn.utils.print.print_utils import iterate_tqdm max_deg = 0 for data in iterate_tqdm(dataset, 2, desc="Degree max"): @@ -290,47 +290,3 @@ def update_atom_features(atom_features: [AtomFeatures], data: Data): """ feature_indices = [i for i in atom_features] data.x = data.x[:, feature_indices] - - -def stratified_sampling(dataset: [Data], subsample_percentage: float, verbosity=0): - """Given the dataset and the percentage of data you want to extract from it, method will - apply stratified sampling where X is the dataset and Y is are the category values for each datapoint. - In the case of the structures dataset where each structure contains 2 types of atoms, the category will - be constructed in a way: number of atoms of type 1 + number of protons of type 2 * 100. - - Parameters - ---------- - dataset: [Data] - A list of Data objects representing a structure that has atoms. - subsample_percentage: float - Percentage of the dataset. - - Returns - ---------- - [Data] - Subsample of the original dataset constructed using stratified sampling. 
- """ - dataset_categories = [] - print_distributed(verbosity, "Computing the categories for the whole dataset.") - for data in iterate_tqdm(dataset, verbosity): - frequencies = torch.bincount(data.x[:, 0].int()) - frequencies = sorted(frequencies[frequencies > 0].tolist()) - category = 0 - for index, frequency in enumerate(frequencies): - category += frequency * (100 ** index) - dataset_categories.append(category) - - subsample_indices = [] - subsample = [] - - sss = StratifiedShuffleSplit( - n_splits=1, train_size=subsample_percentage, random_state=0 - ) - - for subsample_index, rest_of_data_index in sss.split(dataset, dataset_categories): - subsample_indices = subsample_index.tolist() - - for index in subsample_indices: - subsample.append(dataset[index]) - - return subsample diff --git a/hydragnn/preprocess/load_data.py b/hydragnn/preprocess/load_data.py index 59a119f08..eeca9004d 100644 --- a/hydragnn/preprocess/load_data.py +++ b/hydragnn/preprocess/load_data.py @@ -14,7 +14,6 @@ import torch import torch.distributed as dist -import torch_geometric # FIXME: deprecated in torch_geometric 2.0 try: @@ -25,23 +24,21 @@ from hydragnn.preprocess.serialized_dataset_loader import SerializedDataLoader from hydragnn.preprocess.lsms_raw_dataset_loader import LSMS_RawDataLoader from hydragnn.preprocess.cfg_raw_dataset_loader import CFG_RawDataLoader -from hydragnn.preprocess.compositional_data_splitting import ( +from hydragnn.utils.datasets.compositional_data_splitting import ( compositional_stratified_splitting, ) from hydragnn.utils.distributed import get_comm_size_and_rank -from hydragnn.utils.time_utils import Timer +from hydragnn.utils.profiling_and_tracing.time_utils import Timer import pickle -from hydragnn.utils.print_utils import print_master, log +from hydragnn.utils.print.print_utils import log -from torch_geometric.data import Batch, Dataset +from torch_geometric.data import Batch from torch.utils.data.dataloader import _DatasetKind -from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor +from concurrent.futures import ThreadPoolExecutor import multiprocessing as mp import queue -import time -import sys import re @@ -209,7 +206,7 @@ def dataset_loading_and_splitting(config: {}): if not list(config["Dataset"]["path"].values())[0].endswith(".pkl"): transform_raw_data_to_serialized(config["Dataset"]) - ##if total dataset is provided, split the dataset and save them to pkl files and update config with pkl file locations + ##if total datasets is provided, split the datasets and save them to pkl files and update config with pkl file locations if "total" in config["Dataset"]["path"].keys(): total_to_train_val_test_pkls(config) @@ -370,7 +367,7 @@ def total_to_train_val_test_pkls(config, isdist=False): file_dir = config["Dataset"]["path"]["total"] else: file_dir = f"{os.environ['SERIALIZED_DATA_PATH']}/serialized_dataset/{config['Dataset']['name']}.pkl" - # if "total" raw dataset is provided, generate train/val/test pkl files and update config dict. + # if "total" raw datasets is provided, generate train/val/test pkl files and update config dict. 
with open(file_dir, "rb") as f: minmax_node_feature = pickle.load(f) minmax_graph_feature = pickle.load(f) diff --git a/hydragnn/preprocess/raw_dataset_loader.py b/hydragnn/preprocess/raw_dataset_loader.py index c0443bf2a..702e0ef92 100644 --- a/hydragnn/preprocess/raw_dataset_loader.py +++ b/hydragnn/preprocess/raw_dataset_loader.py @@ -14,12 +14,11 @@ import pickle import torch -from torch_geometric.data import Data -from torch import tensor from hydragnn.utils.distributed import get_device -from hydragnn.utils.print_utils import log -from hydragnn.utils import nsplit, tensor_divide, comm_reduce +from hydragnn.utils.print.print_utils import log +from hydragnn.utils.distributed import nsplit, comm_reduce +from hydragnn.utils.model.model import tensor_divide import random @@ -38,7 +37,7 @@ class AbstractRawDataLoader: def __init__(self, config, dist=False): """ config: - shows the dataset path the target variables information, e.g, location and dimension, in data file + shows the datasets path the target variables information, e.g, location and dimension, in data file ########### dataset_list: list of datasets read from self.path_dictionary @@ -193,7 +192,7 @@ def scale_features_by_num_nodes(self, dataset): def normalize_dataset(self): - """Performs the normalization on Data objects and returns the normalized dataset.""" + """Performs the normalization on Data objects and returns the normalized datasets.""" num_node_features = len(self.node_feature_dim) num_graph_features = len(self.graph_feature_dim) diff --git a/hydragnn/preprocess/serialized_dataset_loader.py b/hydragnn/preprocess/serialized_dataset_loader.py index bef054edf..3b385f936 100644 --- a/hydragnn/preprocess/serialized_dataset_loader.py +++ b/hydragnn/preprocess/serialized_dataset_loader.py @@ -23,8 +23,8 @@ from hydragnn.preprocess import update_predicted_values, update_atom_features from hydragnn.utils.distributed import get_device -from hydragnn.utils.print_utils import print_distributed, iterate_tqdm -from hydragnn.preprocess.utils import ( +from hydragnn.utils.print.print_utils import print_distributed, iterate_tqdm +from hydragnn.preprocess.graph_samples_checks_and_updates import ( get_radius_graph, get_radius_graph_pbc, ) @@ -194,9 +194,9 @@ def load_serialized_data(self, dataset_path: str): return dataset def __stratified_sampling(self, dataset: [Data], subsample_percentage: float): - """Given the dataset and the percentage of data you want to extract from it, method will - apply stratified sampling where X is the dataset and Y is are the category values for each datapoint. - In the case of the structures dataset where each structure contains 2 types of atoms, the category will + """Given the datasets and the percentage of data you want to extract from it, method will + apply stratified sampling where X is the datasets and Y is are the category values for each datapoint. + In the case of the structures datasets where each structure contains 2 types of atoms, the category will be constructed in a way: number of atoms of type 1 + number of protons of type 2 * 100. Parameters @@ -204,16 +204,16 @@ def __stratified_sampling(self, dataset: [Data], subsample_percentage: float): dataset: [Data] A list of Data objects representing a structure that has atoms. subsample_percentage: float - Percentage of the dataset. + Percentage of the datasets. Returns ---------- [Data] - Subsample of the original dataset constructed using stratified sampling. + Subsample of the original datasets constructed using stratified sampling. 
""" dataset_categories = [] print_distributed( - self.verbosity, "Computing the categories for the whole dataset." + self.verbosity, "Computing the categories for the whole datasets." ) for data in iterate_tqdm(dataset, self.verbosity): frequencies = torch.bincount(data.x[:, 0].int()) diff --git a/hydragnn/preprocess/stratified_sampling.py b/hydragnn/preprocess/stratified_sampling.py new file mode 100644 index 000000000..3072ff4d5 --- /dev/null +++ b/hydragnn/preprocess/stratified_sampling.py @@ -0,0 +1,48 @@ +import torch +from torch_geometric.data import Data +from hydragnn.utils.print.print_utils import print_distributed, iterate_tqdm +from sklearn.model_selection import StratifiedShuffleSplit + + +def stratified_sampling(dataset: [Data], subsample_percentage: float, verbosity=0): + """Given the datasets and the percentage of data you want to extract from it, method will + apply stratified sampling where X is the datasets and Y is are the category values for each datapoint. + In the case of the structures datasets where each structure contains 2 types of atoms, the category will + be constructed in a way: number of atoms of type 1 + number of protons of type 2 * 100. + + Parameters + ---------- + dataset: [Data] + A list of Data objects representing a structure that has atoms. + subsample_percentage: float + Percentage of the datasets. + + Returns + ---------- + [Data] + Subsample of the original datasets constructed using stratified sampling. + """ + dataset_categories = [] + print_distributed(verbosity, "Computing the categories for the whole datasets.") + for data in iterate_tqdm(dataset, verbosity): + frequencies = torch.bincount(data.x[:, 0].int()) + frequencies = sorted(frequencies[frequencies > 0].tolist()) + category = 0 + for index, frequency in enumerate(frequencies): + category += frequency * (100 ** index) + dataset_categories.append(category) + + subsample_indices = [] + subsample = [] + + sss = StratifiedShuffleSplit( + n_splits=1, train_size=subsample_percentage, random_state=0 + ) + + for subsample_index, rest_of_data_index in sss.split(dataset, dataset_categories): + subsample_indices = subsample_index.tolist() + + for index in subsample_indices: + subsample.append(dataset[index]) + + return subsample diff --git a/hydragnn/run_prediction.py b/hydragnn/run_prediction.py index b1b7bf3de..3d15f26a3 100755 --- a/hydragnn/run_prediction.py +++ b/hydragnn/run_prediction.py @@ -15,7 +15,7 @@ from hydragnn.preprocess.load_data import dataset_loading_and_splitting from hydragnn.utils.distributed import setup_ddp, get_distributed_model from hydragnn.utils.model import load_existing_model -from hydragnn.utils.config_utils import ( +from hydragnn.utils.input_config_parsing.config_utils import ( update_config, get_log_name_config, parse_deepspeed_config, diff --git a/hydragnn/run_training.py b/hydragnn/run_training.py index c702074f9..a02d85cb8 100644 --- a/hydragnn/run_training.py +++ b/hydragnn/run_training.py @@ -12,7 +12,6 @@ import os, json from functools import singledispatch -import torch import torch.distributed as dist from torch.optim.lr_scheduler import ReduceLROnPlateau @@ -20,16 +19,16 @@ from hydragnn.utils.distributed import ( setup_ddp, get_distributed_model, - print_peak_memory, ) +from hydragnn.utils.distributed import print_peak_memory from hydragnn.utils.model import ( save_model, get_summary_writer, load_existing_model_config, ) -from hydragnn.utils.print_utils import print_distributed, setup_log -from hydragnn.utils.time_utils import print_timers -from 
hydragnn.utils.config_utils import ( +from hydragnn.utils.print.print_utils import print_distributed, setup_log +from hydragnn.utils.profiling_and_tracing.time_utils import print_timers +from hydragnn.utils.input_config_parsing.config_utils import ( update_config, get_log_name_config, save_config, diff --git a/hydragnn/utils/__init__.py b/hydragnn/utils/__init__.py deleted file mode 100644 index c1e23d2e1..000000000 --- a/hydragnn/utils/__init__.py +++ /dev/null @@ -1,40 +0,0 @@ -from .print_utils import print_distributed, iterate_tqdm, setup_log -from .distributed import ( - get_comm_size_and_rank, - get_device_list, - get_device, - get_device_name, - get_device_from_name, - is_model_distributed, - get_distributed_model, - setup_ddp, - nsplit, - comm_reduce, -) -from .model import ( - save_model, - get_summary_writer, - unsorted_segment_mean, - load_existing_model, - load_existing_model_config, - loss_function_selection, - tensor_divide, - EarlyStopping, -) -from .time_utils import Timer, print_timers -from .config_utils import ( - update_config, - update_config_minmax, - get_log_name_config, - save_config, -) -from .deephyper import ( - master_from_host, - read_node_list, - create_ds_config, - read_job_node_list, - create_launch_command, -) - -from .optimizer import select_optimizer -from .atomicdescriptors import atomicdescriptors diff --git a/hydragnn/utils/datasets/__init__.py b/hydragnn/utils/datasets/__init__.py new file mode 100644 index 000000000..8f7028fba --- /dev/null +++ b/hydragnn/utils/datasets/__init__.py @@ -0,0 +1,19 @@ +from .abstractbasedataset import AbstractBaseDataset +from .abstractrawdataset import AbstractRawDataset +from .adiosdataset import AdiosDataset, AdiosWriter +from .cfgdataset import CFGDataset +from .compositional_data_splitting import ( + get_keys, + get_elements_list, + get_max_graph_size, + create_dictionary_from_elements_list, + create_dataset_categories, + duplicate_unique_data_samples, + generate_partition, + compositional_stratified_splitting, +) +from .distdataset import DistDataset +from .lsmsdataset import LSMSDataset +from .pickledataset import SimplePickleDataset, SimplePickleWriter +from .serializeddataset import SerializedDataset, SerializedWriter +from .xyzdataset import XYZDataset diff --git a/hydragnn/utils/abstractbasedataset.py b/hydragnn/utils/datasets/abstractbasedataset.py similarity index 87% rename from hydragnn/utils/abstractbasedataset.py rename to hydragnn/utils/datasets/abstractbasedataset.py index 556ead164..7e73e0859 100644 --- a/hydragnn/utils/abstractbasedataset.py +++ b/hydragnn/utils/datasets/abstractbasedataset.py @@ -5,7 +5,7 @@ class AbstractBaseDataset(torch.utils.data.Dataset, ABC): """ - HydraGNN's base dataset. This is abstract class. + HydraGNN's base dataset. This is an abstract class. """ def __init__(self): @@ -15,14 +15,14 @@ def __init__(self): @abstractmethod def get(self, idx): """ - Return a dataset at idx + Return the data sample at idx """ pass @abstractmethod def len(self): """ - Total number of dataset. + Total number of samples in the dataset. If data is distributed, it should be the global total size. 
""" pass diff --git a/hydragnn/utils/abstractrawdataset.py b/hydragnn/utils/datasets/abstractrawdataset.py similarity index 95% rename from hydragnn/utils/abstractrawdataset.py rename to hydragnn/utils/datasets/abstractrawdataset.py index 657863ea4..d81d4bfd5 100644 --- a/hydragnn/utils/abstractrawdataset.py +++ b/hydragnn/utils/datasets/abstractrawdataset.py @@ -3,8 +3,6 @@ import random import torch -from torch import tensor -from torch_geometric.data import Data from torch_geometric.transforms import ( Distance, NormalizeRotation, @@ -12,38 +10,31 @@ PointPairFeatures, ) -from hydragnn.utils import nsplit, tensor_divide, comm_reduce -from hydragnn.utils.print_utils import print_distributed, iterate_tqdm, log +from hydragnn.utils.distributed import nsplit, comm_reduce +from hydragnn.utils.model.model import tensor_divide +from hydragnn.utils.print.print_utils import iterate_tqdm, log from hydragnn.utils.distributed import get_device -from hydragnn.utils.abstractbasedataset import AbstractBaseDataset -from hydragnn.preprocess.utils import ( +from hydragnn.utils.datasets.abstractbasedataset import AbstractBaseDataset +from hydragnn.preprocess.graph_samples_checks_and_updates import ( get_radius_graph, get_radius_graph_pbc, - get_radius_graph_config, - get_radius_graph_pbc_config, ) from hydragnn.preprocess import ( - update_predicted_values, - update_atom_features, stratified_sampling, ) -from sklearn.model_selection import StratifiedShuffleSplit - -from hydragnn.preprocess.dataset_descriptors import AtomFeatures - from abc import ABC, abstractmethod class AbstractRawDataset(AbstractBaseDataset, ABC): - """Raw dataset class""" + """Raw datasets class""" def __init__(self, config, dist=False, sampling=None): super().__init__() """ config: - shows the dataset path the target variables information, e.g, location and dimension, in data file + shows the datasets path the target variables information, e.g, location and dimension, in data file ########### dataset_list: list of datasets read from self.path_dictionary @@ -215,7 +206,7 @@ def __load_raw_data(self): def __normalize_dataset(self): - """Performs the normalization on Data objects and returns the normalized dataset.""" + """Performs the normalization on Data objects and returns the normalized datasets.""" num_node_features = len(self.node_feature_dim) num_graph_features = len(self.graph_feature_dim) diff --git a/hydragnn/utils/adiosdataset.py b/hydragnn/utils/datasets/adiosdataset.py similarity index 98% rename from hydragnn/utils/adiosdataset.py rename to hydragnn/utils/datasets/adiosdataset.py index c366cea86..d32661ac1 100644 --- a/hydragnn/utils/adiosdataset.py +++ b/hydragnn/utils/datasets/adiosdataset.py @@ -2,9 +2,8 @@ import pickle import time import os -import glob -from .print_utils import print_distributed, log, log0, iterate_tqdm +from hydragnn.utils.print.print_utils import log, log0, iterate_tqdm import numpy as np @@ -23,10 +22,10 @@ except ImportError: pass -import hydragnn.utils.tracer as tr +import hydragnn.utils.profiling_and_tracing.tracer as tr -from hydragnn.utils.abstractbasedataset import AbstractBaseDataset -from hydragnn.utils import nsplit +from hydragnn.utils.datasets.abstractbasedataset import AbstractBaseDataset +from hydragnn.utils.distributed import nsplit from hydragnn.preprocess import update_predicted_values, update_atom_features @@ -279,7 +278,7 @@ def save(self): class AdiosDataset(AbstractBaseDataset): - """Adios dataset class""" + """Adios datasets class""" def __init__( self, @@ -306,7 +305,7 @@ 
def __init__( comm: MPI_comm MPI communicator preload: bool, optional - Option to preload all the dataset into a memory + Option to preload the entire dataset into memory shmem: bool, optional Option to use shmem to share data between processes in the same node enable_cache: bool, optional diff --git a/hydragnn/utils/cfgdataset.py b/hydragnn/utils/datasets/cfgdataset.py similarity index 97% rename from hydragnn/utils/cfgdataset.py rename to hydragnn/utils/datasets/cfgdataset.py index 5e7c59e7d..8df40217b 100644 --- a/hydragnn/utils/cfgdataset.py +++ b/hydragnn/utils/datasets/cfgdataset.py @@ -3,7 +3,7 @@ from torch import tensor from torch_geometric.data import Data -from hydragnn.utils.abstractrawdataset import AbstractRawDataset +from hydragnn.utils.datasets.abstractrawdataset import AbstractRawDataset from ase.io.cfg import read_cfg diff --git a/hydragnn/preprocess/compositional_data_splitting.py b/hydragnn/utils/datasets/compositional_data_splitting.py similarity index 94% rename from hydragnn/preprocess/compositional_data_splitting.py rename to hydragnn/utils/datasets/compositional_data_splitting.py index 574c10dcf..4805c1245 100644 --- a/hydragnn/preprocess/compositional_data_splitting.py +++ b/hydragnn/utils/datasets/compositional_data_splitting.py @@ -115,8 +115,8 @@ def generate_partition( def compositional_stratified_splitting(dataset, perc_train): - """Given the dataset and the percentage of data you want to extract from it, method will - apply stratified sampling where X is the dataset and Y is are the category values for each datapoint. + """Given the dataset and the percentage of data you want to extract from it, the method will + apply stratified sampling where X is the dataset and Y are the category values for each datapoint. In the case each structure contains 2 types of atoms, the category will be constructed as such: number of atoms of type 1 + number of atoms of type 2 * 100. Parameters @@ -124,11 +124,11 @@ def compositional_stratified_splitting(dataset, perc_train): dataset: [Data] A list of Data objects representing a structure that has atoms. subsample_percentage: float - Percentage of the dataset. + Percentage of the dataset. Returns ---------- [Data] - Subsample of the original dataset constructed using stratified sampling. + Subsample of the original dataset constructed using stratified sampling. 
""" dataset_categories = create_dataset_categories(dataset) dataset, dataset_categories = duplicate_unique_data_samples( diff --git a/hydragnn/utils/distdataset.py b/hydragnn/utils/datasets/distdataset.py similarity index 95% rename from hydragnn/utils/distdataset.py rename to hydragnn/utils/datasets/distdataset.py index 5732a6cd8..80c282e67 100644 --- a/hydragnn/utils/distdataset.py +++ b/hydragnn/utils/datasets/distdataset.py @@ -4,23 +4,23 @@ import torch import torch_geometric.data -from hydragnn.utils.abstractbasedataset import AbstractBaseDataset +from hydragnn.utils.datasets.abstractbasedataset import AbstractBaseDataset try: import pyddstore as dds except ImportError: pass -from hydragnn.utils.print_utils import log, log0 -from hydragnn.utils import nsplit +from hydragnn.utils.print.print_utils import log0 +from hydragnn.utils.distributed import nsplit from hydragnn.preprocess import update_predicted_values, update_atom_features -import hydragnn.utils.tracer as tr +import hydragnn.utils.profiling_and_tracing.tracer as tr from tqdm import tqdm class DistDataset(AbstractBaseDataset): - """Distributed dataset class""" + """Distributed datasets class""" def __init__( self, diff --git a/hydragnn/utils/lsmsdataset.py b/hydragnn/utils/datasets/lsmsdataset.py similarity index 97% rename from hydragnn/utils/lsmsdataset.py rename to hydragnn/utils/datasets/lsmsdataset.py index a1314938d..99a121644 100644 --- a/hydragnn/utils/lsmsdataset.py +++ b/hydragnn/utils/datasets/lsmsdataset.py @@ -1,6 +1,6 @@ from torch import tensor from torch_geometric.data import Data -from hydragnn.utils.abstractrawdataset import AbstractRawDataset +from hydragnn.utils.datasets.abstractrawdataset import AbstractRawDataset class LSMSDataset(AbstractRawDataset): diff --git a/hydragnn/utils/pickledataset.py b/hydragnn/utils/datasets/pickledataset.py similarity index 95% rename from hydragnn/utils/pickledataset.py rename to hydragnn/utils/datasets/pickledataset.py index 8b99f0f9d..48da3d06b 100644 --- a/hydragnn/utils/pickledataset.py +++ b/hydragnn/utils/datasets/pickledataset.py @@ -1,15 +1,14 @@ import os import pickle -import torch from mpi4py import MPI -from .print_utils import print_distributed, log, iterate_tqdm +from hydragnn.utils.print.print_utils import log, iterate_tqdm -from hydragnn.utils.abstractbasedataset import AbstractBaseDataset +from hydragnn.utils.datasets.abstractbasedataset import AbstractBaseDataset from hydragnn.preprocess import update_predicted_values, update_atom_features -import hydragnn.utils.tracer as tr +import hydragnn.utils.profiling_and_tracing.tracer as tr class SimplePickleDataset(AbstractBaseDataset): @@ -119,7 +118,7 @@ def __init__( """ Parameters ---------- - dataset: locally owned dataset (should be iterable) + dataset: locally owned datasets (should be iterable) basedir: basedir label: label nmax: nmax in case of subdir diff --git a/hydragnn/utils/serializeddataset.py b/hydragnn/utils/datasets/serializeddataset.py similarity index 86% rename from hydragnn/utils/serializeddataset.py rename to hydragnn/utils/datasets/serializeddataset.py index c469e3cab..70f71076f 100644 --- a/hydragnn/utils/serializeddataset.py +++ b/hydragnn/utils/datasets/serializeddataset.py @@ -1,10 +1,10 @@ import os import pickle -from .print_utils import log +from hydragnn.utils.print.print_utils import log -from hydragnn.utils import get_comm_size_and_rank -from hydragnn.utils.abstractbasedataset import AbstractBaseDataset +from hydragnn.utils.distributed import get_comm_size_and_rank +from 
hydragnn.utils.datasets.abstractbasedataset import AbstractBaseDataset class SerializedDataset(AbstractBaseDataset): @@ -15,7 +15,7 @@ def __init__(self, basedir, datasetname, label, dist=False): Parameters ---------- basedir: basedir - datasetname: dataset name + datasetname: dataset name label: label """ super().__init__() @@ -62,9 +62,9 @@ def __init__( """ Parameters ---------- - dataset: locally owned dataset (should be iterable) + dataset: locally owned dataset (should be iterable) basedir: basedir - datasetname: dataset name + datasetname: dataset name label: label nmax: nmax in case of subdir minmax_node_feature: minmax_node_feature diff --git a/hydragnn/utils/xyzdataset.py b/hydragnn/utils/datasets/xyzdataset.py similarity index 95% rename from hydragnn/utils/xyzdataset.py rename to hydragnn/utils/datasets/xyzdataset.py index b7c89be30..e3b57c29b 100644 --- a/hydragnn/utils/xyzdataset.py +++ b/hydragnn/utils/datasets/xyzdataset.py @@ -3,9 +3,8 @@ from torch import tensor from torch_geometric.data import Data -from hydragnn.utils.abstractrawdataset import AbstractRawDataset +from hydragnn.utils.datasets.abstractrawdataset import AbstractRawDataset -from ase.io.cfg import read_cfg from ase.io import read diff --git a/hydragnn/utils/descriptors_and_embeddings/__init__.py b/hydragnn/utils/descriptors_and_embeddings/__init__.py new file mode 100644 index 000000000..13b70bc6c --- /dev/null +++ b/hydragnn/utils/descriptors_and_embeddings/__init__.py @@ -0,0 +1,6 @@ +from .atomicdescriptors import atomicdescriptors +from .smiles_utils import ( + get_node_attribute_name, + generate_graphdata_from_smilestr, + generate_graphdata_from_rdkit_molecule, +) diff --git a/hydragnn/utils/atomicdescriptors.py b/hydragnn/utils/descriptors_and_embeddings/atomicdescriptors.py similarity index 100% rename from hydragnn/utils/atomicdescriptors.py rename to hydragnn/utils/descriptors_and_embeddings/atomicdescriptors.py diff --git a/hydragnn/utils/smiles_utils.py b/hydragnn/utils/descriptors_and_embeddings/smiles_utils.py similarity index 100% rename from hydragnn/utils/smiles_utils.py rename to hydragnn/utils/descriptors_and_embeddings/smiles_utils.py diff --git a/hydragnn/utils/distributed/__init__.py b/hydragnn/utils/distributed/__init__.py new file mode 100644 index 000000000..063642124 --- /dev/null +++ b/hydragnn/utils/distributed/__init__.py @@ -0,0 +1,16 @@ +from .distributed import ( + get_comm_size_and_rank, + get_device_list, + get_device, + get_device_name, + get_device_from_name, + is_model_distributed, + get_distributed_model, + setup_ddp, + nsplit, + comm_reduce, + get_deepspeed_init_args, + init_comm_size_and_rank, + check_remaining, + print_peak_memory, +) diff --git a/hydragnn/utils/distributed.py b/hydragnn/utils/distributed/distributed.py similarity index 99% rename from hydragnn/utils/distributed.py rename to hydragnn/utils/distributed/distributed.py index 50c853776..1cc86d3a4 100644 --- a/hydragnn/utils/distributed.py +++ b/hydragnn/utils/distributed/distributed.py @@ -15,14 +15,13 @@ import torch import torch.distributed as dist -from .print_utils import print_distributed +from hydragnn.utils.print.print_utils import print_distributed import psutil import socket from datetime import timedelta import time import subprocess -from mpi4py import MPI deepspeed_available = True try: diff --git a/hydragnn/utils/hpo/__init__.py b/hydragnn/utils/hpo/__init__.py new file mode 100644 index 000000000..d0cf926bd --- /dev/null +++ b/hydragnn/utils/hpo/__init__.py @@ -0,0 +1,7 @@ +from .deephyper 
import ( + master_from_host, + read_node_list, + create_ds_config, + read_job_node_list, + create_launch_command, +) diff --git a/hydragnn/utils/deephyper.py b/hydragnn/utils/hpo/deephyper.py similarity index 100% rename from hydragnn/utils/deephyper.py rename to hydragnn/utils/hpo/deephyper.py diff --git a/hydragnn/utils/input_config_parsing/__init__.py b/hydragnn/utils/input_config_parsing/__init__.py new file mode 100644 index 000000000..50e3e5176 --- /dev/null +++ b/hydragnn/utils/input_config_parsing/__init__.py @@ -0,0 +1,6 @@ +from .config_utils import ( + update_config, + update_config_minmax, + get_log_name_config, + save_config, +) diff --git a/hydragnn/utils/config_utils.py b/hydragnn/utils/input_config_parsing/config_utils.py similarity index 97% rename from hydragnn/utils/config_utils.py rename to hydragnn/utils/input_config_parsing/config_utils.py index 3331952c8..50771b402 100644 --- a/hydragnn/utils/config_utils.py +++ b/hydragnn/utils/input_config_parsing/config_utils.py @@ -10,15 +10,14 @@ ############################################################################## import pickle import os -from hydragnn.preprocess.utils import check_if_graph_size_variable, gather_deg -from hydragnn.utils.model import calculate_PNA_degree -from hydragnn.utils import get_comm_size_and_rank -import time +from hydragnn.preprocess.graph_samples_checks_and_updates import ( + check_if_graph_size_variable, + gather_deg, +) +from hydragnn.utils.distributed import get_comm_size_and_rank from copy import deepcopy import json -from torch_geometric.utils import degree import torch -import torch.distributed as dist def update_config(config, train_loader, val_loader, test_loader): @@ -47,7 +46,7 @@ ... if config["NeuralNetwork"]["Architecture"]["model_type"] == "PNA": if hasattr(train_loader.dataset, "pna_deg"): - ## Use max neighbours used in the dataset. + ## Use max neighbours used in the dataset. deg = torch.tensor(train_loader.dataset.pna_deg) else: deg = gather_deg(train_loader.dataset) diff --git a/hydragnn/utils/model/__init__.py b/hydragnn/utils/model/__init__.py new file mode 100644 index 000000000..078ba616b --- /dev/null +++ b/hydragnn/utils/model/__init__.py @@ -0,0 +1,11 @@ +from .model import ( + save_model, + get_summary_writer, + unsorted_segment_mean, + load_existing_model, + load_existing_model_config, + loss_function_selection, + tensor_divide, + EarlyStopping, + print_model, +) diff --git a/hydragnn/utils/model.py b/hydragnn/utils/model/model.py similarity index 98% rename from hydragnn/utils/model.py rename to hydragnn/utils/model/model.py index 67253fcae..6b6d3eb56 100644 --- a/hydragnn/utils/model.py +++ b/hydragnn/utils/model/model.py @@ -15,9 +15,8 @@ import torch import torch.distributed as dist from torch.utils.tensorboard import SummaryWriter -from torch_geometric.data import Data from torch_geometric.utils import degree -from .print_utils import print_master, iterate_tqdm +from hydragnn.utils.print.print_utils import print_master, iterate_tqdm from hydragnn.utils.distributed import ( get_comm_size_and_rank, @@ -123,7 +122,7 @@ def load_existing_model( model.load_checkpoint(os.path.join(path, model_name), model_name) -## This function may cause OOM if dataset is too large +## This function may cause OOM if the dataset is too large ## to fit in a single GPU (i.e., with DDP). Use with caution. 
## Recommend to use calculate_PNA_degree_dist def calculate_PNA_degree(loader, max_neighbours): diff --git a/hydragnn/utils/optimizer/__init__.py b/hydragnn/utils/optimizer/__init__.py new file mode 100644 index 000000000..9d9dce433 --- /dev/null +++ b/hydragnn/utils/optimizer/__init__.py @@ -0,0 +1 @@ +from .optimizer import select_optimizer diff --git a/hydragnn/utils/optimizer.py b/hydragnn/utils/optimizer/optimizer.py similarity index 98% rename from hydragnn/utils/optimizer.py rename to hydragnn/utils/optimizer/optimizer.py index 6950e1146..af2fdcc32 100644 --- a/hydragnn/utils/optimizer.py +++ b/hydragnn/utils/optimizer/optimizer.py @@ -1,5 +1,5 @@ import torch -from .distributed import get_device_name +from hydragnn.utils.distributed import get_device_name from torch.distributed.optim import ZeroRedundancyOptimizer deepspeed_available = True diff --git a/hydragnn/utils/print/__init__.py b/hydragnn/utils/print/__init__.py new file mode 100644 index 000000000..8093611dc --- /dev/null +++ b/hydragnn/utils/print/__init__.py @@ -0,0 +1 @@ +from .print_utils import print_distributed, iterate_tqdm, setup_log diff --git a/hydragnn/utils/print_utils.py b/hydragnn/utils/print/print_utils.py similarity index 95% rename from hydragnn/utils/print_utils.py rename to hydragnn/utils/print/print_utils.py index f01facaf8..eb7329ecd 100644 --- a/hydragnn/utils/print_utils.py +++ b/hydragnn/utils/print/print_utils.py @@ -64,7 +64,7 @@ def setup_log(prefix): """ Setup logging to print messages for both screen and file. """ - from .distributed import init_comm_size_and_rank + from hydragnn.utils.distributed import init_comm_size_and_rank world_size, world_rank = init_comm_size_and_rank() @@ -100,7 +100,7 @@ def log(*args, sep=" ", rank=None): if rank is None: logger.info(sep.join(map(str, args))) else: - from .distributed import init_comm_size_and_rank + from hydragnn.utils.distributed import init_comm_size_and_rank world_size, world_rank = init_comm_size_and_rank() if rank == world_rank: diff --git a/hydragnn/utils/profiling_and_tracing/__init__.py b/hydragnn/utils/profiling_and_tracing/__init__.py new file mode 100644 index 000000000..55a3ee102 --- /dev/null +++ b/hydragnn/utils/profiling_and_tracing/__init__.py @@ -0,0 +1,3 @@ +from .profile import Profiler, ProfilerActivity +from .time_utils import Timer, TimerError +from .tracer import Tracer, GPTLTracer, SCOREPTracer diff --git a/hydragnn/utils/gptl4py_dummy.py b/hydragnn/utils/profiling_and_tracing/gptl4py_dummy.py similarity index 97% rename from hydragnn/utils/gptl4py_dummy.py rename to hydragnn/utils/profiling_and_tracing/gptl4py_dummy.py index 67cafe45f..4f8fd91e5 100644 --- a/hydragnn/utils/gptl4py_dummy.py +++ b/hydragnn/utils/profiling_and_tracing/gptl4py_dummy.py @@ -12,7 +12,6 @@ from __future__ import absolute_import from functools import wraps from contextlib import contextmanager -import torch.cuda.nvtx as nvtx def initialize(): diff --git a/hydragnn/utils/profile.py b/hydragnn/utils/profiling_and_tracing/profile.py similarity index 96% rename from hydragnn/utils/profile.py rename to hydragnn/utils/profiling_and_tracing/profile.py index a0e113324..0e15027c8 100644 --- a/hydragnn/utils/profile.py +++ b/hydragnn/utils/profiling_and_tracing/profile.py @@ -1,7 +1,7 @@ import torch import contextlib from unittest.mock import MagicMock -from torch.profiler import profile, record_function, ProfilerActivity +from torch.profiler import ProfilerActivity from hydragnn.utils.distributed import get_device_name diff --git 
a/hydragnn/utils/time_utils.py b/hydragnn/utils/profiling_and_tracing/time_utils.py similarity index 97% rename from hydragnn/utils/time_utils.py rename to hydragnn/utils/profiling_and_tracing/time_utils.py index f30bb9b11..ddd7b0251 100644 --- a/hydragnn/utils/time_utils.py +++ b/hydragnn/utils/profiling_and_tracing/time_utils.py @@ -11,8 +11,8 @@ import time import torch -from .distributed import get_comm_size_and_rank, get_device -from .print_utils import print_distributed +from hydragnn.utils.distributed import get_comm_size_and_rank, get_device +from hydragnn.utils.print.print_utils import print_distributed class TimerError(Exception): diff --git a/hydragnn/utils/tracer.py b/hydragnn/utils/profiling_and_tracing/tracer.py similarity index 98% rename from hydragnn/utils/tracer.py rename to hydragnn/utils/profiling_and_tracing/tracer.py index df2b5285f..a156f9cf6 100644 --- a/hydragnn/utils/tracer.py +++ b/hydragnn/utils/profiling_and_tracing/tracer.py @@ -6,10 +6,6 @@ from functools import wraps from contextlib import contextmanager -import os -import sys -from collections import OrderedDict - from abc import ABC, abstractmethod import torch from mpi4py import MPI diff --git a/tests/test_datasetclass_inheritance.py b/tests/test_datasetclass_inheritance.py index ba2e76e28..a742e0eca 100644 --- a/tests/test_datasetclass_inheritance.py +++ b/tests/test_datasetclass_inheritance.py @@ -20,10 +20,13 @@ import hydragnn, tests -from hydragnn.utils.config_utils import get_log_name_config +from hydragnn.utils.input_config_parsing.config_utils import get_log_name_config from hydragnn.utils.model import print_model -from hydragnn.utils.lsmsdataset import LSMSDataset -from hydragnn.utils.serializeddataset import SerializedWriter, SerializedDataset +from hydragnn.utils.datasets.lsmsdataset import LSMSDataset +from hydragnn.utils.datasets.serializeddataset import ( + SerializedWriter, + SerializedDataset, +) from hydragnn.preprocess.load_data import split_dataset diff --git a/tests/test_deepspeed.py b/tests/test_deepspeed.py index 2cd89b4e9..f111b282c 100644 --- a/tests/test_deepspeed.py +++ b/tests/test_deepspeed.py @@ -1,9 +1,5 @@ -import os, json import pytest -import torch -import random -import hydragnn from tests.test_graphs import unittest_train_model diff --git a/tests/test_graphs.py b/tests/test_graphs.py index e2b36be60..7d29cd9bb 100755 --- a/tests/test_graphs.py +++ b/tests/test_graphs.py @@ -18,7 +18,7 @@ import shutil import hydragnn, tests -from hydragnn.utils.config_utils import merge_config +from hydragnn.utils.input_config_parsing.config_utils import merge_config # Main unit test function called by pytest wrappers. 
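Note on migration: the renames above change only import paths, not behavior. A minimal, illustrative sketch of how a downstream script would update its imports (old locations in comments; all new paths are taken directly from the renames in this patch): # Old: from hydragnn.utils.time_utils import Timer from hydragnn.utils.profiling_and_tracing.time_utils import Timer # Old: from hydragnn.utils.pickledataset import SimplePickleDataset from hydragnn.utils.datasets.pickledataset import SimplePickleDataset # Old: from hydragnn.preprocess.utils import RadiusGraph, RadiusGraphPBC from hydragnn.preprocess.graph_samples_checks_and_updates import ( RadiusGraph, RadiusGraphPBC, ) # Old: from hydragnn.utils import nsplit from hydragnn.utils.distributed import nsplit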
diff --git a/tests/test_model_loadpred.py b/tests/test_model_loadpred.py index b301962a2..7e13fefda 100755 --- a/tests/test_model_loadpred.py +++ b/tests/test_model_loadpred.py @@ -68,7 +68,7 @@ def pytest_model_loadpred(): # get the directory of trained model log_name = hydragnn.utils.config_utils.get_log_name_config(config) modelfile = os.path.join("./logs/", log_name, log_name + ".pk") - # check if pretrained model and pkl dataset files exists + # check if pretrained model and pkl dataset files exist case_exist = True config_file = os.path.join("./logs/", log_name, "config.json") if not (os.path.isfile(modelfile) and os.path.isfile(config_file)): @@ -79,7 +79,7 @@ def pytest_model_loadpred(): config = json.load(f) for dataset_name, raw_data_path in config["Dataset"]["path"].items(): if not os.path.isfile(raw_data_path): - print(dataset_name, "dataset not found: ", raw_data_path) + print(dataset_name, "dataset not found: ", raw_data_path) case_exist = False break if not case_exist: diff --git a/tests/test_periodic_boundary_conditions.py b/tests/test_periodic_boundary_conditions.py index 43d92b46c..a81e0b9f6 100644 --- a/tests/test_periodic_boundary_conditions.py +++ b/tests/test_periodic_boundary_conditions.py @@ -9,12 +9,11 @@ # SPDX-License-Identifier: BSD-3-Clause # ############################################################################## -import sys, os, json, numpy as np -import pytest +import json, numpy as np import torch from torch_geometric.data import Data -from hydragnn.preprocess.utils import ( +from hydragnn.preprocess.graph_samples_checks_and_updates import ( get_radius_graph_config, get_radius_graph_pbc_config, ) diff --git a/tests/test_rotational_invariance.py b/tests/test_rotational_invariance.py index 6dfcf377b..f7ac970a2 100644 --- a/tests/test_rotational_invariance.py +++ b/tests/test_rotational_invariance.py @@ -9,15 +9,15 @@ # SPDX-License-Identifier: BSD-3-Clause # ############################################################################## -import sys, os, json +import json import pytest import torch from torch_geometric.data import Data from torch_geometric.transforms import Distance, NormalizeRotation -from hydragnn.preprocess.utils import get_radius_graph_config +from hydragnn.preprocess.graph_samples_checks_and_updates import get_radius_graph_config -from hydragnn.preprocess.utils import ( +from hydragnn.preprocess.graph_samples_checks_and_updates import ( check_data_samples_equivalence, )
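For the relocated stratified_sampling helper (hydragnn/preprocess/stratified_sampling.py above), a minimal usage sketch; the toy graphs and the 0.5 subsample percentage are invented for illustration, and it assumes hydragnn, torch_geometric, and scikit-learn are installed:

import torch
from torch_geometric.data import Data

from hydragnn.preprocess.stratified_sampling import stratified_sampling

# Toy dataset: data.x[:, 0] holds atomic numbers, which stratified_sampling
# bins with torch.bincount to derive one composition category per graph.
dataset = []
for i in range(10):
    if i % 2 == 0:
        z = torch.tensor([[1.0], [1.0], [8.0]])  # water-like composition
    else:
        z = torch.tensor([[6.0], [6.0], [6.0]])  # pure-carbon cluster
    dataset.append(Data(x=z))

# Keep about half of the graphs while preserving the 50/50 composition split.
subsample = stratified_sampling(dataset, subsample_percentage=0.5)
print(len(subsample))  # expected: 5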