You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
I am trying to train m3gnet with my own dataset of structures, energies and forces. For some reason, the training crashes immediately because the loss function returns the following error:
Traceback (most recent call last):
File "train.py", line 41, in <module>
trainer.train(
File "/home/rapplet/.conda/envs/cent7/2020.11-py38/my_tf_env/lib/python3.8/site-packages/m3gnet/trainers/_potential.py", line 210, in train
lossval, grads, pred_list, emae, fmae, smae = train_one_step(
File "/home/rapplet/.conda/envs/cent7/2020.11-py38/my_tf_env/lib/python3.8/site-packages/tensorflow/python/util/traceback_utils.py", line 153, in error_handler
raise e.with_traceback(filtered_tb) from None
File "/tmp/autograph_generated_fileeztfkyaw.py", line 14, in tf__train_one_step
(loss_val, emae, fmae, smae) = ag__.converted_call(ag__.ld(_loss), (ag__.ld(target_list), ag__.ld(pred_list), ag__.ld(graph_list)[ag__.ld(Index).N_ATOMS]), None, fscope)
File "/tmp/autograph_generated_fileac34350v.py", line 19, in tf___loss
e_loss = ag__.converted_call(ag__.ld(_flat_loss), (ag__.ld(e_target), ag__.ld(e_pred)), None, fscope)
File "/tmp/autograph_generated_filema59lezf.py", line 13, in tf___flat_loss
retval_ = ag__.converted_call(ag__.ld(loss), (ag__.converted_call(ag__.ld(tf).reshape, (ag__.ld(x), ((- 1),)), None, fscope), ag__.converted_call(ag__.ld(tf).reshape, (ag__.ld(y), ((- 1),)), None, fscope)), None, fscope)
File "/home/rapplet/.conda/envs/cent7/2020.11-py38/my_tf_env/lib/python3.8/site-packages/keras/losses.py", line 1486, in mean_squared_error
return backend.mean(tf.math.squared_difference(y_pred, y_true), axis=-1)
ValueError: in user code:
File "/home/rapplet/.conda/envs/cent7/2020.11-py38/my_tf_env/lib/python3.8/site-packages/m3gnet/trainers/_potential.py", line 192, in train_one_step *
loss_val, emae, fmae, smae = _loss(target_list, pred_list, graph_list[Index.N_ATOMS])
File "/home/rapplet/.conda/envs/cent7/2020.11-py38/my_tf_env/lib/python3.8/site-packages/m3gnet/trainers/_potential.py", line 139, in _loss *
e_loss = _flat_loss(e_target, e_pred)
File "/home/rapplet/.conda/envs/cent7/2020.11-py38/my_tf_env/lib/python3.8/site-packages/m3gnet/trainers/_potential.py", line 128, in _flat_loss *
return loss(tf.reshape(x, (-1,)), tf.reshape(y, (-1,)))
File "/home/rapplet/.conda/envs/cent7/2020.11-py38/my_tf_env/lib/python3.8/site-packages/keras/losses.py", line 1486, in mean_squared_error
return backend.mean(tf.math.squared_difference(y_pred, y_true), axis=-1)
ValueError: Dimensions must be equal, but are 32 and 1024 for '{{node SquaredDifference}} = SquaredDifference[T=DT_FLOAT](Reshape_1, Reshape)' with input shapes: [32], [1024].
I have tried different batch sizes and get the same error each time; however, the first dimension is always equal to the batch size and the second dimension is always equal to the batch size squared.
I have provided the script I use to call the training function below:
from pymatgen.core import Lattice, Structure, Molecule
from m3gnet.models import M3GNet, Potential
from m3gnet.trainers import PotentialTrainer
import tensorflow as tf
import pickle
import json
import warnings
from tensorflow import keras
import numpy as np
# Training data: `traindict` is not defined in this excerpt and must already
# hold the loaded dataset. Each entry under 'structures' is converted to a
# pymatgen Structure via Structure.from_dict.
structs = traindict['structures']
energies = traindict['energies']
forces = traindict['forces']
structures = [Structure.from_dict(s) for s in structs]

# Validation data: same layout, read from `valdict` (also defined elsewhere).
val_structs = valdict['structures']
val_energies = valdict['energies']
val_forces = valdict['forces']
val_structures = [Structure.from_dict(v) for v in val_structs]
I am trying to train m3gnet with my own dataset of structures, energies and forces. For some reason, the training crashes immediately because the loss function returns the following error:
Traceback (most recent call last):
File "train.py", line 41, in <module>
trainer.train(
File "/home/rapplet/.conda/envs/cent7/2020.11-py38/my_tf_env/lib/python3.8/site-packages/m3gnet/trainers/_potential.py", line 210, in train
lossval, grads, pred_list, emae, fmae, smae = train_one_step(
File "/home/rapplet/.conda/envs/cent7/2020.11-py38/my_tf_env/lib/python3.8/site-packages/tensorflow/python/util/traceback_utils.py", line 153, in error_handler
raise e.with_traceback(filtered_tb) from None
File "/tmp/autograph_generated_fileeztfkyaw.py", line 14, in tf__train_one_step
(loss_val, emae, fmae, smae) = ag__.converted_call(ag__.ld(_loss), (ag__.ld(target_list), ag__.ld(pred_list), ag__.ld(graph_list)[ag__.ld(Index).N_ATOMS]), None, fscope)
File "/tmp/autograph_generated_fileac34350v.py", line 19, in tf___loss
e_loss = ag__.converted_call(ag__.ld(_flat_loss), (ag__.ld(e_target), ag__.ld(e_pred)), None, fscope)
File "/tmp/autograph_generated_filema59lezf.py", line 13, in tf___flat_loss
retval_ = ag__.converted_call(ag__.ld(loss), (ag__.converted_call(ag__.ld(tf).reshape, (ag__.ld(x), ((- 1),)), None, fscope), ag__.converted_call(ag__.ld(tf).reshape, (ag__.ld(y), ((- 1),)), None, fscope)), None, fscope)
File "/home/rapplet/.conda/envs/cent7/2020.11-py38/my_tf_env/lib/python3.8/site-packages/keras/losses.py", line 1486, in mean_squared_error
return backend.mean(tf.math.squared_difference(y_pred, y_true), axis=-1)
ValueError: in user code:
I have tried different batch sizes and get the same error each time; however, the first dimension is always equal to the batch size and the second dimension is always equal to the batch size squared.
I have provided the script I use to call the training function below:
from pymatgen.core import Lattice, Structure, Molecule
from m3gnet.models import M3GNet, Potential
from m3gnet.trainers import PotentialTrainer
import tensorflow as tf
import pickle
import json
import warnings
from tensorflow import keras
import numpy as np
# Training script: loads training/validation data from JSON, builds pymatgen
# structures, and trains an M3GNet potential on them.
warnings.filterwarnings("ignore")

# Read both datasets; the context managers close the files once parsed
# instead of leaving the handles open for the lifetime of the run.
with open('traindict.json') as trainf:
    traindict = json.load(trainf)
with open('valdict.json') as valf:
    valdict = json.load(valf)

# Training data: each entry under 'structures' is converted to a pymatgen
# Structure via Structure.from_dict.
structs = traindict['structures']
energies = traindict['energies']
forces = traindict['forces']
structures = [Structure.from_dict(s) for s in structs]

# Validation data: same layout as the training dictionary.
val_structs = valdict['structures']
val_energies = valdict['energies']
val_forces = valdict['forces']
val_structures = [Structure.from_dict(v) for v in val_structs]

# NOTE(review): the reported loss failure compares shapes [batch] and
# [batch**2]. That pattern is consistent with broadcasting between a
# (batch, 1) array and a (batch,) array, so it may be worth checking that
# `energies` / `val_energies` are flat sequences with one scalar per
# structure -- TODO confirm against the m3gnet trainer.

# Wrap the model returned by M3GNet.load() in a Potential for training.
m3gnet = M3GNet.load()
potential = Potential(model=m3gnet)
trainer = PotentialTrainer(
    potential=potential, optimizer=tf.keras.optimizers.Adam(1e-3)
)

# Log training output to a CSV file, overwriting any earlier log.
callbacks = [tf.keras.callbacks.CSVLogger('./training.log', separator=',', append=False)]

trainer.train(
    structures,
    energies,
    forces,
    validation_graphs_or_structures=val_structures,
    val_energies=val_energies,
    val_forces=val_forces,
    epochs=2000,
    fit_per_element_offset=False,
    batch_size=1024,
    early_stop_patience=200,
    save_checkpoint=True,
    callbacks=callbacks,
)

# Save the trained model.
potential.model.save('./gstpot')
The text was updated successfully, but these errors were encountered: