Commit 953cc91

make it work on ml engine, google cloud storage
seahrh committed Aug 31, 2018
1 parent 26926b0 commit 953cc91
Showing 13 changed files with 180 additions and 108 deletions.
5 changes: 5 additions & 0 deletions config.yaml
@@ -0,0 +1,5 @@
trainingInput:
scaleTier: BASIC #BASIC_GPU
pythonVersion: "3.5"
runtimeVersion: "1.9"
region: "us-east1"
15 changes: 15 additions & 0 deletions eval_local.sh
@@ -0,0 +1,15 @@
#!/usr/bin/env bash
set -x #echo on

source ./venv/bin/activate

gcloud ml-engine local train \
--module-name trainer.task \
--package-path trainer/ \
--job-dir /Users/me/googledrive/suma/log/huat \
-- \
--mode eval \
--data_path "/Users/me/googledrive/suma/finished_files/chunked/val_*" \
--vocab_path /Users/me/googledrive/suma/finished_files/vocab

deactivate
15 changes: 15 additions & 0 deletions infer_local.sh
@@ -0,0 +1,15 @@
#!/usr/bin/env bash
set -x #echo on

source ./venv/bin/activate

gcloud ml-engine local train \
--module-name trainer.task \
--package-path trainer/ \
--job-dir /Users/me/googledrive/suma/log/huat \
-- \
--mode infer \
--data_path "/Users/me/googledrive/suma/finished_files/chunked/val_*" \
--vocab_path /Users/me/googledrive/suma/finished_files/vocab

deactivate
16 changes: 16 additions & 0 deletions infer_single_pass_local.sh
@@ -0,0 +1,16 @@
#!/usr/bin/env bash
set -x #echo on

source ./venv/bin/activate

gcloud ml-engine local train \
--module-name trainer.task \
--package-path trainer/ \
--job-dir /Users/me/googledrive/suma/log/huat \
-- \
--mode infer \
--data_path "/Users/me/googledrive/suma/finished_files/chunked/val_*" \
--vocab_path /Users/me/googledrive/suma/finished_files/vocab \
--single_pass 1

deactivate
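
A side note on the mode strings these scripts pass, not part of the commit: in TF 1.x, tensorflow.python.estimator.model_fn.ModeKeys (imported as Modes in trainer/task.py below) defines PREDICT as the string 'infer', which is why --mode infer satisfies the Modes.PREDICT checks in task.py. A minimal check:

from tensorflow.python.estimator.model_fn import ModeKeys as Modes

# The ModeKeys constants are plain strings in TF 1.x:
assert Modes.TRAIN == 'train'
assert Modes.EVAL == 'eval'
assert Modes.PREDICT == 'infer'  # hence --mode infer in the scripts above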
4 changes: 2 additions & 2 deletions setup.py
@@ -5,9 +5,9 @@
 setup(
     name='sgcharts-pointer-generator',
     version=__version__,
-    python_requires='>=3.6.0',
+    python_requires='>=3.5.0',
     install_requires=[
-        'tensorflow==1.10.0',
+        'tensorflow==1.9.0',
         'pyrouge==0.1.3'
     ],
     packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]),
23 changes: 23 additions & 0 deletions train.sh
@@ -0,0 +1,23 @@
#!/usr/bin/env bash
set -x #echo on

DATE=`date '+%Y%m%d_%H%M%S'`
MAX_STEP=400
BUCKET_NAME="mybucket"
MODEL_NAME="mymodel"
JOB_NAME="${MODEL_NAME}_${DATE}"
JOB_DIR="gs://${BUCKET_NAME}/models/${MODEL_NAME}"
DATA_PATH="gs://${BUCKET_NAME}/data/finished_files/chunked/train_*"
VOCAB_PATH="gs://${BUCKET_NAME}/data/finished_files/vocab"


gcloud ml-engine jobs submit training ${JOB_NAME} \
--module-name trainer.task \
--package-path trainer/ \
--job-dir ${JOB_DIR} \
--config config.yaml \
-- \
--mode train \
--data_path ${DATA_PATH} \
--vocab_path ${VOCAB_PATH} \
--max_step ${MAX_STEP}
16 changes: 16 additions & 0 deletions train_local.sh
@@ -0,0 +1,16 @@
#!/usr/bin/env bash
set -x #echo on

source ./venv/bin/activate

gcloud ml-engine local train \
--module-name trainer.task \
--package-path trainer/ \
--job-dir /Users/me/googledrive/suma/log/huat \
-- \
--mode train \
--data_path "/Users/my/googledrive/suma/finished_files/chunked/train_*" \
--vocab_path /Users/my/googledrive/suma/finished_files/vocab \
--max_step 30

deactivate
25 changes: 13 additions & 12 deletions trainer/data.py
@@ -16,12 +16,13 @@
 
 """This file contains code to read the train/eval/test data from file and process it, and read the vocab data from file and process it"""
 
-import glob
+from tensorflow.gfile import Glob
 import random
 import struct
 import csv
 from tensorflow.core.example import example_pb2
 from tensorflow import logging as log
+from tensorflow.python.lib.io import file_io
 
 # <s> and </s> are used in the data files to segment the abstracts into sentences. They don't receive vocab ids.
 SENTENCE_START = '<s>'
@@ -56,7 +57,7 @@ def __init__(self, vocab_file, max_size):
             self._count += 1
 
         # Read the vocab file and add words up to max_size
-        with open(vocab_file, 'r', encoding='utf8') as vocab_f:
+        with file_io.FileIO(vocab_file, 'r') as vocab_f:
             for line in vocab_f:
                 pieces = line.split()
                 if len(pieces) != 2:
@@ -102,8 +103,8 @@ def write_metadata(self, fpath):
         Args:
             fpath: place to write the metadata file
         """
-        log.info("Writing word embedding metadata file to %s..." % (fpath))
-        with open(fpath, "w", encoding='utf8') as f:
+        log.info("Writing word embedding metadata file to %s..." % fpath)
+        with file_io.FileIO(fpath, "w") as f:
             fieldnames = ['word']
             writer = csv.DictWriter(f, delimiter="\t", fieldnames=fieldnames)
             for i in range(self.size()):
@@ -127,20 +128,20 @@ def example_generator(data_path, single_pass):
         Deserialized tf.Example.
     """
     while True:
-        filelist = glob.glob(data_path)  # get the list of datafiles
+        filelist = Glob(data_path)  # get the list of datafiles
         assert filelist, ('Error: Empty filelist at %s' % data_path)  # check filelist isn't empty
         if single_pass:
            filelist = sorted(filelist)
         else:
             random.shuffle(filelist)
         for f in filelist:
-            reader = open(f, 'rb')
-            while True:
-                len_bytes = reader.read(8)
-                if not len_bytes: break  # finished reading this file
-                str_len = struct.unpack('q', len_bytes)[0]
-                example_str = struct.unpack('%ds' % str_len, reader.read(str_len))[0]
-                yield example_pb2.Example.FromString(example_str)
+            with file_io.FileIO(f, 'rb') as reader:
+                while True:
+                    len_bytes = reader.read(8)
+                    if not len_bytes: break  # finished reading this file
+                    str_len = struct.unpack('q', len_bytes)[0]
+                    example_str = struct.unpack('%ds' % str_len, reader.read(str_len))[0]
+                    yield example_pb2.Example.FromString(example_str)
         if single_pass:
             log.info("example_generator completed reading all datafiles. No more data.")
             break
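
For reference, example_generator above expects records framed as an 8-byte length header (struct 'q', native byte order) followed by the serialized tf.Example. A minimal writer sketch, not part of this commit; write_example and the output path are illustrative:

import struct

from tensorflow.core.example import example_pb2
from tensorflow.python.lib.io import file_io


def write_example(writer, example):
    # Frame the record the way example_generator reads it back:
    # an 8-byte (struct 'q') length header, then the serialized proto.
    example_str = example.SerializeToString()
    writer.write(struct.pack('q', len(example_str)))
    writer.write(example_str)


# file_io works for local paths and gs:// URLs alike; the path is illustrative.
with file_io.FileIO('/tmp/train_000.bin', 'wb') as w:
    write_example(w, example_pb2.Example())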
7 changes: 4 additions & 3 deletions trainer/decode.py
@@ -28,6 +28,7 @@
 import trainer.util as util
 import logging
 from tensorflow.python.estimator.model_fn import ModeKeys as Modes
+from tensorflow.python.lib.io import file_io
 
 SECS_UNTIL_NEW_CKPT = 60  # max number of seconds before loading new checkpoint
 
@@ -192,10 +193,10 @@ def write_for_rouge(self, reference_sents, decoded_words, ex_index):
         ref_file = os.path.join(self._rouge_ref_dir, "%06d_reference.txt" % ex_index)
         decoded_file = os.path.join(self._rouge_dec_dir, "%06d_decoded.txt" % ex_index)
 
-        with open(ref_file, "w") as f:
+        with file_io.FileIO(ref_file, "w") as f:
             for idx, sent in enumerate(reference_sents):
                 f.write(sent) if idx == len(reference_sents) - 1 else f.write(sent + "\n")
-        with open(decoded_file, "w") as f:
+        with file_io.FileIO(decoded_file, "w") as f:
             for idx, sent in enumerate(decoded_sents):
                 f.write(sent) if idx == len(decoded_sents) - 1 else f.write(sent + "\n")
 
@@ -251,7 +252,7 @@ def rouge_log(results_dict, dir_to_write):
     log.info(log_str)  # log to screen
     results_file = os.path.join(dir_to_write, "ROUGE_results.txt")
     log.info("Writing final ROUGE results to %s...", results_file)
-    with open(results_file, "w") as f:
+    with file_io.FileIO(results_file, "w") as f:
         f.write(log_str)
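
The motivation for swapping open for file_io.FileIO throughout this commit: file_io goes through TensorFlow's filesystem layer, so the same call handles local paths and Google Cloud Storage gs:// paths, which plain open cannot. A sketch, with illustrative paths:

from tensorflow.python.lib.io import file_io

# The same call accepts '/tmp/...' and 'gs://mybucket/...' paths; plain
# open() would fail on the latter.
with file_io.FileIO('/tmp/ROUGE_results.txt', 'w') as f:
    f.write('ROUGE-1 F: ...')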
23 changes: 11 additions & 12 deletions trainer/model.py
@@ -356,18 +356,17 @@ def build_graph(self):
         """Add the placeholders, model, global step, train_op and summaries to the graph"""
         log.info('Building graph...')
         t0 = time.time()
-        with tf.device(tf.train.replica_device_setter(cluster=self._conf.cluster_spec)):
-            self._add_placeholders()
-            self._add_seq2seq()
-            self.global_step = tf.get_variable(
-                'global_step',
-                dtype=tf.int32,
-                initializer=tf.constant(0),
-                trainable=False
-            )
-            if self._mode == Modes.TRAIN:
-                self._add_train_op()
-            self._summaries = tf.summary.merge_all()
+        self._add_placeholders()
+        self._add_seq2seq()
+        self.global_step = tf.get_variable(
+            'global_step',
+            dtype=tf.int32,
+            initializer=tf.constant(0),
+            trainable=False
+        )
+        if self._mode == Modes.TRAIN:
+            self._add_train_op()
+        self._summaries = tf.summary.merge_all()
         t1 = time.time()
         log.info('Time to build graph: %i seconds', t1 - t0)
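
A side note on why the tf.device wrapper can go, not part of the commit: with the single-worker BASIC scale tier in config.yaml there is no cluster and no parameter servers, and in that case tf.train.replica_device_setter returns None, so the wrapper was a no-op:

import tensorflow as tf

# With no cluster and ps_tasks=0, replica_device_setter returns None,
# and tf.device(None) would leave placement untouched anyway.
assert tf.train.replica_device_setter(cluster=None, ps_tasks=0) is None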
2 changes: 1 addition & 1 deletion trainer/my_model.py
@@ -6,7 +6,7 @@ def generate_model_fn():
     def _model_fn(features, labels, mode, params, config):
         modes = [Modes.TRAIN, Modes.EVAL, Modes.PREDICT]
         if mode not in modes:
-            raise ValueError(f'mode must be one of {repr(modes)} but mode={mode}')
+            raise ValueError('mode must be one of {} but mode={}'.format(repr(modes), mode))
         if mode == Modes.TRAIN:
             loss = 0  # TODO filler
             grads = [0]  # TODO filler
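
The f-string rewrites here and in trainer/task.py below follow from the runtime pinned in config.yaml: ML Engine runtime 1.9 provides Python 3.5 (hence python_requires='>=3.5.0' in setup.py), and f-strings need Python 3.6+. Illustration only, with made-up values:

modes = ['train', 'eval', 'infer']  # illustrative values
mode = 'decode'
# str.format works on Python 3.5; the f-string form would be a SyntaxError there.
message = 'mode must be one of {} but mode={}'.format(repr(modes), mode)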
49 changes: 19 additions & 30 deletions trainer/task.py
@@ -140,17 +140,21 @@ def setup_training(
 
 
 def __train_session(train_dir, debug, conf):
+    log.info('RunConfig is_chief={}, master={}, task_id={}'.format(
+        repr(conf.is_chief),
+        repr(conf.master),
+        repr(conf.task_id)))
     sess = tf.train.MonitoredTrainingSession(
         checkpoint_dir=train_dir,  # required to restore variables!
         summary_dir=train_dir,
         master=conf.master,
         is_chief=conf.is_chief,
         save_summaries_secs=60,
-        save_checkpoint_secs=60,
-        max_wait_secs=60,
+        save_checkpoint_secs=conf.save_checkpoints_secs,
+        max_wait_secs=120,
         stop_grace_period_secs=60,
         config=conf.session_config,
         scaffold=tf.train.Scaffold(
-            saver=tf.train.Saver(max_to_keep=3)
+            saver=tf.train.Saver(max_to_keep=conf.keep_checkpoint_max)
         )
     )
     if debug:  # start the tensorflow debugger
@@ -164,6 +168,7 @@ def run_training(model, batcher, train_dir, coverage, debug, max_step, conf):
     log.debug("starting run_training")
     summary_writer = tf.summary.FileWriterCache.get(train_dir)
     with __train_session(train_dir=train_dir, debug=debug, conf=conf) as sess:
+        log.debug('after MonitoredTrainingSession')
         train_step = 0
         # repeats until max_step is reached
         while not sess.should_stop() and train_step <= max_step:
@@ -172,6 +177,7 @@ def run_training(model, batcher, train_dir, coverage, debug, max_step, conf):
             t0 = time.time()
             results = model.run_train_step(sess, batch)
             t1 = time.time()
+            log.debug('after session.run')
             loss = results['loss']
             if not np.isfinite(loss):
                 raise Exception("Loss is not finite. Stopping.")
@@ -260,42 +266,29 @@ def __log_verbosity(level):
         log.set_verbosity(log.DEBUG)
 
 
-def __log_root(base_path, exp_name, mode):
-    """Change log_root to FLAGS.log_root/FLAGS.exp_name and create the dir if necessary"""
-    res = os.path.join(base_path, exp_name)
-    if not os.path.exists(res):
-        if mode == "train":
-            os.makedirs(res)
-        else:
-            raise Exception("Logdir %s doesn't exist. Run in train mode to create it." % res)
-    return res
-
-
 def __main(
+        job_dir,
         mode,
         data_path,
         vocab_path,
         vocab_size,
-        log_root,
         beam_size,
         pointer_gen,
         convert_to_coverage_model,
         coverage,
         restore_best_model,
         log_level,
-        exp_name,
         single_pass,
         random_seed,
        debug,
         **hparams
 ):
     __log_verbosity(log_level)
     log.info('Starting seq2seq_attention in %s mode...', mode)
-    log_root = __log_root(log_root, exp_name, mode)
     vocab = Vocab(vocab_path, vocab_size)  # create a vocabulary
     hps = __hparams(**hparams)
-    conf = util.run_config(model_dir=log_root, random_seed=random_seed)
-    log.info(f'hps={repr(hps)}\nconf={util.repr_run_config(conf)}')
+    conf = util.run_config(model_dir=job_dir, random_seed=random_seed)
+    log.info('hps={}\nconf={}'.format(repr(hps), util.repr_run_config(conf)))
 
     # Create a batcher object that will create minibatches of data
     batcher = Batcher(
@@ -375,15 +368,11 @@ def __main(
     and log the results to screen, indefinitely.\
     """)
     parser.add_argument(
-        '--log_root',
-        type=str,
-        required=True,
-        help='Root directory for all logging.')
-    parser.add_argument(
-        '--exp_name',
+        '--job-dir',
         type=str,
-        default='default_exp_name',
-        help='Name for experiment. Logs will be saved in a directory with this name, under log_root.')
+        help='GCS location to write checkpoints and export models',
+        required=True
+    )
     parser.add_argument(
         '--hidden_dim',
         type=int,
@@ -521,9 +510,9 @@ def __main(
     args = parser.parse_args()
     modes = [Modes.TRAIN, Modes.EVAL, Modes.PREDICT]
     if args.mode not in modes:
-        raise ValueError(f'--mode flag must be one of {repr(modes)}')
+        raise ValueError('--mode flag must be one of {}'.format(repr(modes)))
     if args.single_pass and args.mode != Modes.PREDICT:
-        raise ValueError(f'--single_pass flag should only be True in {repr(Modes.PREDICT)} mode')
+        raise ValueError('--single_pass flag should only be True in {} mode'.format(repr(Modes.PREDICT)))
     # If in decode mode, set batch_size = beam_size
     # Reason: in decode mode, we decode one example at a time.
     # On each step, we have beam_size-many hypotheses in the beam,
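
__train_session now takes its checkpoint cadence from the RunConfig built by util.run_config, whose implementation is not shown in this commit. A plausible sketch of that helper, assuming a tf.estimator.RunConfig; the concrete values are illustrative:

import tensorflow as tf


def run_config(model_dir, random_seed):
    # Supplies conf.save_checkpoints_secs and conf.keep_checkpoint_max as read
    # in __train_session; conf.master, conf.is_chief and conf.task_id are
    # derived by RunConfig from the TF_CONFIG environment on ML Engine.
    return tf.estimator.RunConfig(
        model_dir=model_dir,
        tf_random_seed=random_seed,
        save_checkpoints_secs=60,
        keep_checkpoint_max=3,
    )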