BFloat16 Integration/System Test Update (#1079)
mjdenkowski authored Jan 3, 2023
1 parent a08216c · commit 13c63be
Showing 5 changed files with 33 additions and 12 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
@@ -11,6 +11,13 @@ Note that Sockeye has checks in place to not translate with an old model that was
 
 Each version section may have subsections for: _Added_, _Changed_, _Removed_, _Deprecated_, and _Fixed_.
 
+## [3.1.31]
+
+### Fixed
+
+- Fixed sequence copying integration tests to correctly specify that scoring/translation outputs should not be checked.
+- Enabled `bfloat16` integration and system testing on all platforms.
+
 ## [3.1.30]
 
 ### Added
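For context on the `bfloat16` bullet: bfloat16 keeps float32's 8 exponent bits but only 7 explicit mantissa bits, so scores computed in bfloat16 can differ slightly from float32 and beam search may pick different outputs. A minimal sketch of the rounding behavior, assuming PyTorch is installed (not part of this commit):

import torch

# Round-trip through bfloat16 to show its coarser mantissa resolution.
x = torch.tensor([0.1, 1.001, 123.456], dtype=torch.float32)
print(x.to(torch.bfloat16).to(torch.float32))
# roughly tensor([  0.1001,   1.0000, 123.5000])

This is why the test changes below stop asserting exact output equality for bfloat16 runs and rely on score and quality checks instead.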
2 changes: 1 addition & 1 deletion sockeye/__init__.py
@@ -11,4 +11,4 @@
 # express or implied. See the License for the specific language governing
 # permissions and limitations under the License.
 
-__version__ = '3.1.30'
+__version__ = '3.1.31'
4 changes: 2 additions & 2 deletions test/common.py
@@ -53,13 +53,13 @@ def check_train_translate(train_params: str,
     # may differ.
     if 'greedy' not in translate_params and 'neural-vocab-selection' not in train_params:
         translate_params_batch = translate_params + " --batch-size 2"
-        test_translate_equivalence(data, translate_params_batch, compare_output=True)
+        test_translate_equivalence(data, translate_params_batch, compare_output=compare_output)
 
     # Run translate with restrict-lexicon
     if 'neural-vocab-selection ' not in train_params:
         data = run_translate_restrict(data, translate_params)
 
-    test_translate_equivalence(data, translate_params, compare_output=True)
+    test_translate_equivalence(data, translate_params, compare_output=compare_output)
 
     # Test scoring by ensuring that the sockeye.scoring module produces the same scores when scoring the output
     # of sockeye.translate. However, since this training is on very small datasets, the output of sockeye.translate
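The hunk above threads the caller's compare_output flag through instead of hard-coding True. A minimal sketch of the pattern (check_outputs is a hypothetical name; the real assertions live in test_translate_equivalence):

def check_outputs(outputs_a, outputs_b, compare_output: bool = True):
    # Output counts must always agree; exact string equality is only
    # asserted when the caller opts in (e.g. not for bfloat16 runs).
    assert len(outputs_a) == len(outputs_b)
    if compare_output:
        for a, b in zip(outputs_a, outputs_b):
            assert a == b, f"translations diverged: {a!r} vs {b!r}"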
5 changes: 2 additions & 3 deletions test/integration/test_seq_copy_int.py
@@ -133,8 +133,7 @@
      " --checkpoint-interval 2 --optimizer adam --initial-learning-rate 0.01 --clamp-to-dtype",
      "--beam-size 2 --clamp-to-dtype",
      False, 0, 0),
-    # Basic transformer, training only the decoder with bfloat16 inference when
-    # running on Linux
+    # Basic transformer, training only the decoder with bfloat16 inference
     ("--encoder transformer --decoder {decoder}"
      " --num-layers 2 --transformer-attention-heads 2 --transformer-model-size 8 --num-embed 8"
      " --transformer-feed-forward-num-hidden 16"
@@ -143,7 +142,7 @@
      " --batch-size 2 --max-updates 2 --batch-type sentence --decode-and-evaluate 2"
      " --checkpoint-interval 2 --optimizer adam --initial-learning-rate 0.01"
      " --fixed-param-strategy " + C.FIXED_PARAM_STRATEGY_ALL_EXCEPT_DECODER,
-     "--beam-size 2" + (" --dtype bfloat16" if platform.system() == "Linux" else ""),
+     "--beam-size 2 --dtype bfloat16",
      False, 0, 0),
 ]
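The removed expression only appended --dtype bfloat16 when platform.system() returned "Linux"; the commit now exercises bfloat16 decoding on every platform. If a runtime guard were ever needed again, probing the backend is an alternative to OS gating — a sketch assuming PyTorch, with a hypothetical helper name:

import torch

def bfloat16_available(device: str = "cpu") -> bool:
    # Try a tiny bfloat16 op on the target device instead of
    # guessing support from the operating system name.
    try:
        (torch.zeros(2, dtype=torch.bfloat16, device=device) + 1).sum()
        return True
    except (RuntimeError, TypeError):
        return False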

27 changes: 21 additions & 6 deletions test/system/test_seq_copy_sys.py
@@ -166,7 +166,8 @@ def test_seq_copy(name, train_params, translate_params, use_prepared_data, perpl
      "--beam-size 1 --prevent-unk",
      True, 0, 0,
      1.03,
-     0.97),
+     0.97,
+     True),
     ("Sort:transformer:transformer:source_factors:target_factors:batch_max_word",
      "--encoder transformer --decoder transformer"
      " --max-seq-len 10 --batch-size 70 --update-interval 2 --batch-type max-word --batch-sentences-multiple-of 1"
@@ -179,7 +180,8 @@
      "--beam-size 1",
      True, 3, 1,
      1.03,
-     0.96),
+     0.96,
+     True),
     ("Sort:transformer:ssru_transformer:batch_word",
      "--encoder transformer --decoder ssru_transformer"
      " --max-seq-len 10 --batch-size 90 --update-interval 1 --batch-type word --batch-sentences-multiple-of 1"
@@ -190,14 +192,27 @@
      "--beam-size 1",
      True, 0, 0,
      1.03,
-     0.97)
+     0.97,
+     True),
+    ("Sort:transformer:batch_word:bfloat16",
+     "--encoder transformer --decoder transformer"
+     " --max-seq-len 10 --batch-size 90 --update-interval 1 --batch-type word --batch-sentences-multiple-of 1"
+     " --max-updates 6000"
+     " --num-layers 2 --transformer-attention-heads 2 --transformer-model-size 32 --num-embed 32"
+     " --transformer-dropout-attention 0.0 --transformer-dropout-act 0.0 --transformer-dropout-prepost 0.0"
+     " --transformer-feed-forward-num-hidden 64" + COMMON_TRAINING_PARAMS,
+     "--beam-size 1 --dtype bfloat16",
+     True, 0, 0,
+     1.03,
+     0.97,
+     False)
 ]
 
 
 @pytest.mark.parametrize("name, train_params, translate_params, use_prepared_data, n_source_factors, "
-                         "n_target_factors, perplexity_thresh, bleu_thresh", SORT_CASES)
+                         "n_target_factors, perplexity_thresh, bleu_thresh, compare_output", SORT_CASES)
 def test_seq_sort(name, train_params, translate_params, use_prepared_data,
-                  n_source_factors, n_target_factors, perplexity_thresh, bleu_thresh):
+                  n_source_factors, n_target_factors, perplexity_thresh, bleu_thresh, compare_output):
     """Task: sort short sequences of digits"""
     with tmp_digits_dataset("test_seq_sort.",
                             _TRAIN_LINE_COUNT, _TRAIN_LINE_COUNT_EMPTY, _LINE_MAX_LENGTH,
@@ -211,7 +226,7 @@ def test_seq_sort(name, train_params, translate_params, use_prepared_data,
                                      data=data,
                                      use_prepared_data=use_prepared_data,
                                      max_seq_len=_LINE_MAX_LENGTH,
-                                     compare_output=True,
+                                     compare_output=compare_output,
                                      seed=seed)
 
         # get best validation perplexity
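The new Sort:transformer:batch_word:bfloat16 case trains in float32 and decodes with --dtype bfloat16, passing compare_output=False so the run is judged by the perplexity and BLEU thresholds rather than exact output strings. Illustratively, bfloat16 inference in PyTorch amounts to casting parameters and inputs — a sketch, not Sockeye's actual model-loading code:

import torch
import torch.nn as nn

model = nn.Linear(8, 8).to(torch.bfloat16)  # cast parameters to bfloat16
x = torch.randn(2, 8).to(torch.bfloat16)    # inputs must match the dtype
with torch.inference_mode():
    y = model(x)
print(y.dtype)  # torch.bfloat16

With pytest's -k filter, this case can be run on its own, e.g. pytest test/system/test_seq_copy_sys.py -k bfloat16.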
