Commit 953cc91

make it work on ml engine, google cloud storage
seahrh committed Aug 31, 2018
1 parent 26926b0 commit 953cc91
Showing 13 changed files with 180 additions and 108 deletions.
5 changes: 5 additions & 0 deletions config.yaml
@@ -0,0 +1,5 @@
trainingInput:
scaleTier: BASIC #BASIC_GPU
pythonVersion: "3.5"
runtimeVersion: "1.9"
region: "us-east1"
15 changes: 15 additions & 0 deletions eval_local.sh
@@ -0,0 +1,15 @@
#!/usr/bin/env bash
set -x #echo on

source ./venv/bin/activate

gcloud ml-engine local train \
--module-name trainer.task \
--package-path trainer/ \
--job-dir /Users/me/googledrive/suma/log/huat \
-- \
--mode eval \
--data_path "/Users/me/googledrive/suma/finished_files/chunked/val_*" \
--vocab_path /Users/me/googledrive/suma/finished_files/vocab

deactivate
15 changes: 15 additions & 0 deletions infer_local.sh
@@ -0,0 +1,15 @@
#!/usr/bin/env bash
set -x #echo on

source ./venv/bin/activate

gcloud ml-engine local train \
--module-name trainer.task \
--package-path trainer/ \
--job-dir /Users/me/googledrive/suma/log/huat \
-- \
--mode infer \
--data_path "/Users/me/googledrive/suma/finished_files/chunked/val_*" \
--vocab_path /Users/me/googledrive/suma/finished_files/vocab

deactivate
16 changes: 16 additions & 0 deletions infer_single_pass_local.sh
@@ -0,0 +1,16 @@
#!/usr/bin/env bash
set -x #echo on

source ./venv/bin/activate

gcloud ml-engine local train \
--module-name trainer.task \
--package-path trainer/ \
--job-dir /Users/me/googledrive/suma/log/huat \
-- \
--mode infer \
--data_path "/Users/me/googledrive/suma/finished_files/chunked/val_*" \
--vocab_path /Users/me/googledrive/suma/finished_files/vocab \
--single_pass 1

deactivate
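
A side note on the mode strings these scripts pass, not part of the commit: in TF 1.x, tensorflow.python.estimator.model_fn.ModeKeys (imported as Modes in trainer/task.py below) defines PREDICT as the string 'infer', which is why --mode infer satisfies the Modes.PREDICT checks in task.py. A minimal check:

from tensorflow.python.estimator.model_fn import ModeKeys as Modes

# The ModeKeys constants are plain strings in TF 1.x:
assert Modes.TRAIN == 'train'
assert Modes.EVAL == 'eval'
assert Modes.PREDICT == 'infer'  # hence --mode infer in the scripts above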
4 changes: 2 additions & 2 deletions setup.py
@@ -5,9 +5,9 @@
 setup(
     name='sgcharts-pointer-generator',
     version=__version__,
-    python_requires='>=3.6.0',
+    python_requires='>=3.5.0',
     install_requires=[
-        'tensorflow==1.10.0',
+        'tensorflow==1.9.0',
         'pyrouge==0.1.3'
     ],
     packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]),
23 changes: 23 additions & 0 deletions train.sh
@@ -0,0 +1,23 @@
#!/usr/bin/env bash
set -x #echo on

DATE=`date '+%Y%m%d_%H%M%S'`
MAX_STEP=400
BUCKET_NAME="mybucket"
MODEL_NAME="mymodel"
JOB_NAME="${MODEL_NAME}_${DATE}"
JOB_DIR="gs://${BUCKET_NAME}/models/${MODEL_NAME}"
DATA_PATH="gs://${BUCKET_NAME}/data/finished_files/chunked/train_*"
VOCAB_PATH="gs://${BUCKET_NAME}/data/finished_files/vocab"


gcloud ml-engine jobs submit training ${JOB_NAME} \
--module-name trainer.task \
--package-path trainer/ \
--job-dir ${JOB_DIR} \
--config config.yaml \
-- \
--mode train \
--data_path ${DATA_PATH} \
--vocab_path ${VOCAB_PATH} \
--max_step ${MAX_STEP}
16 changes: 16 additions & 0 deletions train_local.sh
@@ -0,0 +1,16 @@
#!/usr/bin/env bash
set -x #echo on

source ./venv/bin/activate

gcloud ml-engine local train \
--module-name trainer.task \
--package-path trainer/ \
--job-dir /Users/me/googledrive/suma/log/huat \
-- \
--mode train \
--data_path "/Users/my/googledrive/suma/finished_files/chunked/train_*" \
--vocab_path /Users/my/googledrive/suma/finished_files/vocab \
--max_step 30

deactivate
25 changes: 13 additions & 12 deletions trainer/data.py
@@ -16,12 +16,13 @@
 
 """This file contains code to read the train/eval/test data from file and process it, and read the vocab data from file and process it"""
 
-import glob
+from tensorflow.gfile import Glob
 import random
 import struct
 import csv
 from tensorflow.core.example import example_pb2
 from tensorflow import logging as log
+from tensorflow.python.lib.io import file_io
 
 # <s> and </s> are used in the data files to segment the abstracts into sentences. They don't receive vocab ids.
 SENTENCE_START = '<s>'
@@ -56,7 +57,7 @@ def __init__(self, vocab_file, max_size):
             self._count += 1
 
         # Read the vocab file and add words up to max_size
-        with open(vocab_file, 'r', encoding='utf8') as vocab_f:
+        with file_io.FileIO(vocab_file, 'r') as vocab_f:
             for line in vocab_f:
                 pieces = line.split()
                 if len(pieces) != 2:
@@ -102,8 +103,8 @@ def write_metadata(self, fpath):
         Args:
             fpath: place to write the metadata file
         """
-        log.info("Writing word embedding metadata file to %s..." % (fpath))
-        with open(fpath, "w", encoding='utf8') as f:
+        log.info("Writing word embedding metadata file to %s..." % fpath)
+        with file_io.FileIO(fpath, "w") as f:
             fieldnames = ['word']
             writer = csv.DictWriter(f, delimiter="\t", fieldnames=fieldnames)
             for i in range(self.size()):
@@ -127,20 +128,20 @@ def example_generator(data_path, single_pass):
         Deserialized tf.Example.
     """
     while True:
-        filelist = glob.glob(data_path)  # get the list of datafiles
+        filelist = Glob(data_path)  # get the list of datafiles
         assert filelist, ('Error: Empty filelist at %s' % data_path)  # check filelist isn't empty
         if single_pass:
            filelist = sorted(filelist)
         else:
             random.shuffle(filelist)
         for f in filelist:
-            reader = open(f, 'rb')
-            while True:
-                len_bytes = reader.read(8)
-                if not len_bytes: break  # finished reading this file
-                str_len = struct.unpack('q', len_bytes)[0]
-                example_str = struct.unpack('%ds' % str_len, reader.read(str_len))[0]
-                yield example_pb2.Example.FromString(example_str)
+            with file_io.FileIO(f, 'rb') as reader:
+                while True:
+                    len_bytes = reader.read(8)
+                    if not len_bytes: break  # finished reading this file
+                    str_len = struct.unpack('q', len_bytes)[0]
+                    example_str = struct.unpack('%ds' % str_len, reader.read(str_len))[0]
+                    yield example_pb2.Example.FromString(example_str)
         if single_pass:
             log.info("example_generator completed reading all datafiles. No more data.")
             break
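
For reference, example_generator above expects records framed as an 8-byte length header (struct 'q', native byte order) followed by the serialized tf.Example. A minimal writer sketch, not part of this commit; write_example and the output path are illustrative:

import struct

from tensorflow.core.example import example_pb2
from tensorflow.python.lib.io import file_io


def write_example(writer, example):
    # Frame the record the way example_generator reads it back:
    # an 8-byte (struct 'q') length header, then the serialized proto.
    example_str = example.SerializeToString()
    writer.write(struct.pack('q', len(example_str)))
    writer.write(example_str)


# file_io works for local paths and gs:// URLs alike; the path is illustrative.
with file_io.FileIO('/tmp/train_000.bin', 'wb') as w:
    write_example(w, example_pb2.Example())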
7 changes: 4 additions & 3 deletions trainer/decode.py
@@ -28,6 +28,7 @@
 import trainer.util as util
 import logging
 from tensorflow.python.estimator.model_fn import ModeKeys as Modes
+from tensorflow.python.lib.io import file_io
 
 SECS_UNTIL_NEW_CKPT = 60  # max number of seconds before loading new checkpoint
 
@@ -192,10 +193,10 @@ def write_for_rouge(self, reference_sents, decoded_words, ex_index):
         ref_file = os.path.join(self._rouge_ref_dir, "%06d_reference.txt" % ex_index)
         decoded_file = os.path.join(self._rouge_dec_dir, "%06d_decoded.txt" % ex_index)
 
-        with open(ref_file, "w") as f:
+        with file_io.FileIO(ref_file, "w") as f:
             for idx, sent in enumerate(reference_sents):
                 f.write(sent) if idx == len(reference_sents) - 1 else f.write(sent + "\n")
-        with open(decoded_file, "w") as f:
+        with file_io.FileIO(decoded_file, "w") as f:
             for idx, sent in enumerate(decoded_sents):
                 f.write(sent) if idx == len(decoded_sents) - 1 else f.write(sent + "\n")
 
@@ -251,7 +252,7 @@ def rouge_log(results_dict, dir_to_write):
     log.info(log_str)  # log to screen
     results_file = os.path.join(dir_to_write, "ROUGE_results.txt")
     log.info("Writing final ROUGE results to %s...", results_file)
-    with open(results_file, "w") as f:
+    with file_io.FileIO(results_file, "w") as f:
         f.write(log_str)
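
The motivation for swapping open for file_io.FileIO throughout this commit: file_io goes through TensorFlow's filesystem layer, so the same call handles local paths and Google Cloud Storage gs:// paths, which plain open cannot. A sketch, with illustrative paths:

from tensorflow.python.lib.io import file_io

# The same call accepts '/tmp/...' and 'gs://mybucket/...' paths; plain
# open() would fail on the latter.
with file_io.FileIO('/tmp/ROUGE_results.txt', 'w') as f:
    f.write('ROUGE-1 F: ...')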
23 changes: 11 additions & 12 deletions trainer/model.py
@@ -356,18 +356,17 @@ def build_graph(self):
         """Add the placeholders, model, global step, train_op and summaries to the graph"""
         log.info('Building graph...')
         t0 = time.time()
-        with tf.device(tf.train.replica_device_setter(cluster=self._conf.cluster_spec)):
-            self._add_placeholders()
-            self._add_seq2seq()
-            self.global_step = tf.get_variable(
-                'global_step',
-                dtype=tf.int32,
-                initializer=tf.constant(0),
-                trainable=False
-            )
-            if self._mode == Modes.TRAIN:
-                self._add_train_op()
-            self._summaries = tf.summary.merge_all()
+        self._add_placeholders()
+        self._add_seq2seq()
+        self.global_step = tf.get_variable(
+            'global_step',
+            dtype=tf.int32,
+            initializer=tf.constant(0),
+            trainable=False
+        )
+        if self._mode == Modes.TRAIN:
+            self._add_train_op()
+        self._summaries = tf.summary.merge_all()
         t1 = time.time()
         log.info('Time to build graph: %i seconds', t1 - t0)
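
A side note on why the tf.device wrapper can go, not part of the commit: with the single-worker BASIC scale tier in config.yaml there is no cluster and no parameter servers, and in that case tf.train.replica_device_setter returns None, so the wrapper was a no-op:

import tensorflow as tf

# With no cluster and ps_tasks=0, replica_device_setter returns None,
# and tf.device(None) would leave placement untouched anyway.
assert tf.train.replica_device_setter(cluster=None, ps_tasks=0) is None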
2 changes: 1 addition & 1 deletion trainer/my_model.py
@@ -6,7 +6,7 @@ def generate_model_fn():
     def _model_fn(features, labels, mode, params, config):
         modes = [Modes.TRAIN, Modes.EVAL, Modes.PREDICT]
         if mode not in modes:
-            raise ValueError(f'mode must be one of {repr(modes)} but mode={mode}')
+            raise ValueError('mode must be one of {} but mode={}'.format(repr(modes), mode))
         if mode == Modes.TRAIN:
             loss = 0  # TODO filler
             grads = [0]  # TODO filler
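
The f-string rewrites here and in trainer/task.py below follow from the runtime pinned in config.yaml: ML Engine runtime 1.9 provides Python 3.5 (hence python_requires='>=3.5.0' in setup.py), and f-strings need Python 3.6+. Illustration only, with made-up values:

modes = ['train', 'eval', 'infer']  # illustrative values
mode = 'decode'
# str.format works on Python 3.5; the f-string form would be a SyntaxError there.
message = 'mode must be one of {} but mode={}'.format(repr(modes), mode)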
49 changes: 19 additions & 30 deletions trainer/task.py
@@ -140,17 +140,21 @@ def setup_training(
 
 
 def __train_session(train_dir, debug, conf):
+    log.info('RunConfig is_chief={}, master={}, task_id={}'.format(
+        repr(conf.is_chief),
+        repr(conf.master),
+        repr(conf.task_id)))
     sess = tf.train.MonitoredTrainingSession(
         checkpoint_dir=train_dir,  # required to restore variables!
         summary_dir=train_dir,
         master=conf.master,
         is_chief=conf.is_chief,
         save_summaries_secs=60,
-        save_checkpoint_secs=60,
-        max_wait_secs=60,
+        save_checkpoint_secs=conf.save_checkpoints_secs,
+        max_wait_secs=120,
         stop_grace_period_secs=60,
         config=conf.session_config,
         scaffold=tf.train.Scaffold(
-            saver=tf.train.Saver(max_to_keep=3)
+            saver=tf.train.Saver(max_to_keep=conf.keep_checkpoint_max)
         )
     )
     if debug:  # start the tensorflow debugger
@@ -164,6 +168,7 @@ def run_training(model, batcher, train_dir, coverage, debug, max_step, conf):
     log.debug("starting run_training")
     summary_writer = tf.summary.FileWriterCache.get(train_dir)
     with __train_session(train_dir=train_dir, debug=debug, conf=conf) as sess:
+        log.debug('after MonitoredTrainingSession')
         train_step = 0
         # repeats until max_step is reached
         while not sess.should_stop() and train_step <= max_step:
@@ -172,6 +177,7 @@ def run_training(model, batcher, train_dir, coverage, debug, max_step, conf):
             t0 = time.time()
             results = model.run_train_step(sess, batch)
             t1 = time.time()
+            log.debug('after session.run')
             loss = results['loss']
             if not np.isfinite(loss):
                 raise Exception("Loss is not finite. Stopping.")
@@ -260,42 +266,29 @@ def __log_verbosity(level):
         log.set_verbosity(log.DEBUG)
 
 
-def __log_root(base_path, exp_name, mode):
-    """Change log_root to FLAGS.log_root/FLAGS.exp_name and create the dir if necessary"""
-    res = os.path.join(base_path, exp_name)
-    if not os.path.exists(res):
-        if mode == "train":
-            os.makedirs(res)
-        else:
-            raise Exception("Logdir %s doesn't exist. Run in train mode to create it." % res)
-    return res
-
-
 def __main(
+        job_dir,
         mode,
         data_path,
         vocab_path,
         vocab_size,
-        log_root,
         beam_size,
         pointer_gen,
         convert_to_coverage_model,
         coverage,
         restore_best_model,
         log_level,
-        exp_name,
         single_pass,
         random_seed,
        debug,
         **hparams
 ):
     __log_verbosity(log_level)
     log.info('Starting seq2seq_attention in %s mode...', mode)
-    log_root = __log_root(log_root, exp_name, mode)
     vocab = Vocab(vocab_path, vocab_size)  # create a vocabulary
     hps = __hparams(**hparams)
-    conf = util.run_config(model_dir=log_root, random_seed=random_seed)
-    log.info(f'hps={repr(hps)}\nconf={util.repr_run_config(conf)}')
+    conf = util.run_config(model_dir=job_dir, random_seed=random_seed)
+    log.info('hps={}\nconf={}'.format(repr(hps), util.repr_run_config(conf)))
 
     # Create a batcher object that will create minibatches of data
     batcher = Batcher(
@@ -375,15 +368,11 @@ def __main(
     and log the results to screen, indefinitely.\
     """)
     parser.add_argument(
-        '--log_root',
-        type=str,
-        required=True,
-        help='Root directory for all logging.')
-    parser.add_argument(
-        '--exp_name',
+        '--job-dir',
         type=str,
-        default='default_exp_name',
-        help='Name for experiment. Logs will be saved in a directory with this name, under log_root.')
+        help='GCS location to write checkpoints and export models',
+        required=True
+    )
     parser.add_argument(
         '--hidden_dim',
         type=int,
@@ -521,9 +510,9 @@ def __main(
     args = parser.parse_args()
     modes = [Modes.TRAIN, Modes.EVAL, Modes.PREDICT]
     if args.mode not in modes:
-        raise ValueError(f'--mode flag must be one of {repr(modes)}')
+        raise ValueError('--mode flag must be one of {}'.format(repr(modes)))
     if args.single_pass and args.mode != Modes.PREDICT:
-        raise ValueError(f'--single_pass flag should only be True in {repr(Modes.PREDICT)} mode')
+        raise ValueError('--single_pass flag should only be True in {} mode'.format(repr(Modes.PREDICT)))
     # If in decode mode, set batch_size = beam_size
     # Reason: in decode mode, we decode one example at a time.
     # On each step, we have beam_size-many hypotheses in the beam,
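
__train_session now takes its checkpoint cadence from the RunConfig built by util.run_config, whose implementation is not shown in this commit. A plausible sketch of that helper, assuming a tf.estimator.RunConfig; the concrete values are illustrative:

import tensorflow as tf


def run_config(model_dir, random_seed):
    # Supplies conf.save_checkpoints_secs and conf.keep_checkpoint_max as read
    # in __train_session; conf.master, conf.is_chief and conf.task_id are
    # derived by RunConfig from the TF_CONFIG environment on ML Engine.
    return tf.estimator.RunConfig(
        model_dir=model_dir,
        tf_random_seed=random_seed,
        save_checkpoints_secs=60,
        keep_checkpoint_max=3,
    )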