diff --git a/dockerfiles/ml/scikit-learn/rcv1_svm/Dockerfile b/dockerfiles/ml/scikit-learn/rcv1_svm/Dockerfile
index b9c289a4a..0024f0375 100755
--- a/dockerfiles/ml/scikit-learn/rcv1_svm/Dockerfile
+++ b/dockerfiles/ml/scikit-learn/rcv1_svm/Dockerfile
@@ -1,59 +1,59 @@
-# Copyright (c) 2020 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-#
-# THIS IS A GENERATED DOCKERFILE.
-#
-# This file was assembled from multiple pieces, whose use is documented
-# throughout. Please refer to the TensorFlow dockerfiles documentation
-# for more information.
-
-FROM ubuntu:18.04
-
-RUN apt-get update --yes \
- && apt-get install wget --yes && \
- rm -rf /var/lib/apt/lists/*
-
-ENV USE_DAAL4PY_SKLEARN=YES
-ENV USER modin
-ENV UID 1000
-ENV HOME /home/$USER
-
-RUN adduser --disabled-password \
- --gecos "Non-root user" \
- --uid $UID \
- --home $HOME \
- $USER
-
-ENV CONDA_DIR ${HOME}/miniconda
-
-SHELL ["/bin/bash", "--login", "-c"]
-
-RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda3.sh && \
- bash /tmp/miniconda3.sh -b -p "${CONDA_DIR}" -f -u && \
- "${CONDA_DIR}/bin/conda" init bash && \
- rm -f /tmp/miniconda3.sh && \
- echo ". '${CONDA_DIR}/etc/profile.d/conda.sh'" >> "${HOME}/.profile"
-
-RUN conda update -n base -c defaults conda -y && \
- conda create --name intel_sklearn --yes -c intel python=3.7 scikit-learn
-
-COPY rcv1_svm.py "${HOME}/rcv1_svm.py"
-COPY rcv1_loader.py "${HOME}/rcv1_loader.py"
-
-RUN conda activate intel_sklearn && python ${HOME}/rcv1_loader.py
-
-ENTRYPOINT ["/bin/bash", "--login", "-c", "conda run \"$@\"", "/bin/bash", "-n", "intel_sklearn", "/usr/bin/env", "--"]
-
-CMD ["python", "${HOME}/rcv1_svm.py"]
+# Copyright (c) 2020 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+#
+# THIS IS A GENERATED DOCKERFILE.
+#
+# This file was assembled from multiple pieces, whose use is documented
+# throughout. Please refer to the TensorFlow dockerfiles documentation
+# for more information.
+
+FROM ubuntu:18.04
+
+RUN apt-get update --yes \
+ && apt-get install wget --yes && \
+ rm -rf /var/lib/apt/lists/*
+
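+# Ask Intel's scikit-learn build to apply its daal4py (oneDAL) optimizations
+# automatically when scikit-learn is imported.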
+ENV USE_DAAL4PY_SKLEARN=YES
+ENV USER modin
+ENV UID 1000
+ENV HOME /home/$USER
+
+RUN adduser --disabled-password \
+ --gecos "Non-root user" \
+ --uid $UID \
+ --home $HOME \
+ $USER
+
+ENV CONDA_DIR ${HOME}/miniconda
+
+SHELL ["/bin/bash", "--login", "-c"]
+
+RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda3.sh && \
+ bash /tmp/miniconda3.sh -b -p "${CONDA_DIR}" -f -u && \
+ "${CONDA_DIR}/bin/conda" init bash && \
+ rm -f /tmp/miniconda3.sh && \
+ echo ". '${CONDA_DIR}/etc/profile.d/conda.sh'" >> "${HOME}/.profile"
+
+RUN conda update -n base -c defaults conda -y && \
+ conda create --name intel_sklearn --yes -c intel python=3.7 scikit-learn
+
+COPY rcv1_svm.py "${HOME}/rcv1_svm.py"
+COPY rcv1_loader.py "${HOME}/rcv1_loader.py"
+
+RUN conda activate intel_sklearn && python ${HOME}/rcv1_loader.py
+
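+# Run the container command inside the "intel_sklearn" conda environment:
+# bash -c executes 'conda run "$@"', and the remaining ENTRYPOINT elements plus
+# the CMD become its positional parameters, so the default command resolves to
+# "conda run -n intel_sklearn /usr/bin/env -- python ${HOME}/rcv1_svm.py".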
+ENTRYPOINT ["/bin/bash", "--login", "-c", "conda run \"$@\"", "/bin/bash", "-n", "intel_sklearn", "/usr/bin/env", "--"]
+
+CMD ["python", "${HOME}/rcv1_svm.py"]
diff --git a/dockerfiles/ml/scikit-learn/rcv1_svm/rcv1_loader.py b/dockerfiles/ml/scikit-learn/rcv1_svm/rcv1_loader.py
index cd937cd48..e9c02f6db 100755
--- a/dockerfiles/ml/scikit-learn/rcv1_svm/rcv1_loader.py
+++ b/dockerfiles/ml/scikit-learn/rcv1_svm/rcv1_loader.py
@@ -1,23 +1,23 @@
-# Copyright (c) 2020 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-#
-# THIS IS A GENERATED DOCKERFILE.
-#
-# This file was assembled from multiple pieces, whose use is documented
-# throughout. Please refer to the TensorFlow dockerfiles documentation
-# for more information.
-
-from sklearn.datasets import fetch_rcv1
-rcv1 = fetch_rcv1()
+# Copyright (c) 2020 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+#
+# THIS IS A GENERATED DOCKERFILE.
+#
+# This file was assembled from multiple pieces, whose use is documented
+# throughout. Please refer to the TensorFlow dockerfiles documentation
+# for more information.
+
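+# Fetch the RCV1 dataset once at image build time so that it is already cached
+# in the default scikit-learn data directory (~/scikit_learn_data) at run time.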
+from sklearn.datasets import fetch_rcv1
+rcv1 = fetch_rcv1()
diff --git a/dockerfiles/ml/scikit-learn/rcv1_svm/rcv1_svm.py b/dockerfiles/ml/scikit-learn/rcv1_svm/rcv1_svm.py
index f55c2a3cb..60ddd9e21 100755
--- a/dockerfiles/ml/scikit-learn/rcv1_svm/rcv1_svm.py
+++ b/dockerfiles/ml/scikit-learn/rcv1_svm/rcv1_svm.py
@@ -1,86 +1,86 @@
-# Copyright (c) 2020 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-#
-# THIS IS A GENERATED DOCKERFILE.
-#
-# This file was assembled from multiple pieces, whose use is documented
-# throughout. Please refer to the TensorFlow dockerfiles documentation
-# for more information.
-
-from sklearn.metrics import accuracy_score, f1_score
-from sklearn.multiclass import OneVsRestClassifier
-from sklearn.model_selection import train_test_split
-from sklearn.datasets import fetch_rcv1
-import timeit
-import os
-
-t0 = timeit.default_timer()
-
-rcv1 = fetch_rcv1()
-
-rcv1_data = rcv1.data
-rcv1_target = rcv1.target
-
-t1 = timeit.default_timer()
-time_load = t1 - t0
-
-t0 = timeit.default_timer()
-
-x_train, x_test, y_train, y_test = train_test_split(
- rcv1_data, rcv1_target, test_size=0.05, random_state=42)
-
-from daal4py.sklearn import patch_sklearn
-patch_sklearn()
-
-from sklearn.svm import SVC
-
-t1 = timeit.default_timer()
-time_train_test_split = t1 - t0
-
-print('[Data] train: {} test: {}'.format(x_train.shape, x_test.shape))
-print('[Target] train: {} test: {}'.format(y_train.shape, y_test.shape))
-
-
-print('[Time] Load time {} sec'.format(time_load))
-print('[Time] train_test_split time {} sec'.format(time_train_test_split))
-
-t0 = timeit.default_timer()
-
-clf = SVC(C=100.0, kernel='rbf', cache_size=8*1024)
-svm = OneVsRestClassifier(clf, n_jobs=4)
-svm.fit(x_train, y_train)
-
-t1 = timeit.default_timer()
-time_fit_train_run = t1 - t0
-
-print('[Time] Fit time {} sec'.format(time_fit_train_run))
-
-t0 = timeit.default_timer()
-svm_prediction = svm.predict(x_test)
-t1 = timeit.default_timer()
-time_predict_test_run = t1 - t0
-
-print('[Time] Predict time {} sec'.format(time_predict_test_run))
-
-t0 = timeit.default_timer()
-print('Accuracy score is {}'.format(accuracy_score(y_test, svm_prediction)))
-print('F1 samples score is {}'.format(
- f1_score(y_test, svm_prediction, average='samples')))
-print('F1 micro score is {}'.format(
- f1_score(y_test, svm_prediction, average='micro')))
-t1 = timeit.default_timer()
-time_metric_run = t1 - t0
-
-print('[Time] Metric time {} sec'.format(time_metric_run))
+# Copyright (c) 2020 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+#
+# THIS IS A GENERATED DOCKERFILE.
+#
+# This file was assembled from multiple pieces, whose use is documented
+# throughout. Please refer to the TensorFlow dockerfiles documentation
+# for more information.
+
+from sklearn.metrics import accuracy_score, f1_score
+from sklearn.multiclass import OneVsRestClassifier
+from sklearn.model_selection import train_test_split
+from sklearn.datasets import fetch_rcv1
+import timeit
+import os
+
+t0 = timeit.default_timer()
+
+rcv1 = fetch_rcv1()
+
+rcv1_data = rcv1.data
+rcv1_target = rcv1.target
+
+t1 = timeit.default_timer()
+time_load = t1 - t0
+
+t0 = timeit.default_timer()
+
+x_train, x_test, y_train, y_test = train_test_split(
+ rcv1_data, rcv1_target, test_size=0.05, random_state=42)
+
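+# Patch scikit-learn with the daal4py (Intel oneDAL) optimizations before
+# importing SVC so the accelerated implementation is used below.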
+from daal4py.sklearn import patch_sklearn
+patch_sklearn()
+
+from sklearn.svm import SVC
+
+t1 = timeit.default_timer()
+time_train_test_split = t1 - t0
+
+print('[Data] train: {} test: {}'.format(x_train.shape, x_test.shape))
+print('[Target] train: {} test: {}'.format(y_train.shape, y_test.shape))
+
+
+print('[Time] Load time {} sec'.format(time_load))
+print('[Time] train_test_split time {} sec'.format(time_train_test_split))
+
+t0 = timeit.default_timer()
+
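+# Train one RBF-kernel SVM per RCV1 category (103 labels) in a one-vs-rest
+# scheme, with 4 parallel jobs and an 8 GB kernel cache per classifier.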
+clf = SVC(C=100.0, kernel='rbf', cache_size=8*1024)
+svm = OneVsRestClassifier(clf, n_jobs=4)
+svm.fit(x_train, y_train)
+
+t1 = timeit.default_timer()
+time_fit_train_run = t1 - t0
+
+print('[Time] Fit time {} sec'.format(time_fit_train_run))
+
+t0 = timeit.default_timer()
+svm_prediction = svm.predict(x_test)
+t1 = timeit.default_timer()
+time_predict_test_run = t1 - t0
+
+print('[Time] Predict time {} sec'.format(time_predict_test_run))
+
+t0 = timeit.default_timer()
+print('Accuracy score is {}'.format(accuracy_score(y_test, svm_prediction)))
+print('F1 samples score is {}'.format(
+ f1_score(y_test, svm_prediction, average='samples')))
+print('F1 micro score is {}'.format(
+ f1_score(y_test, svm_prediction, average='micro')))
+t1 = timeit.default_timer()
+time_metric_run = t1 - t0
+
+print('[Time] Metric time {} sec'.format(time_metric_run))
diff --git a/docs/image_recognition/tensorflow/Tutorial.md b/docs/image_recognition/tensorflow/Tutorial.md
index 4d202245e..72845dbe3 100644
--- a/docs/image_recognition/tensorflow/Tutorial.md
+++ b/docs/image_recognition/tensorflow/Tutorial.md
@@ -1,434 +1,427 @@
-# Image Recognition with ResNet50, ResNet101 and InceptionV3
-
-
-## Goal
-This tutorial will introduce CPU performance considerations for three image recognition deep learning models, and how to use Intel® Optimizations for TensorFlow to improve inference time on CPUs.
-This tutorial will also provide code examples to use with Model Zoo's pretrained model that can be copy/pasted for quick off-the-ground implementation with synthetic and real data.
-
-## Background
-Image recognition with deep learning is a computationally expensive endeavor.
-This tutorial will show you how to reduce the inference runtime of your network.
-Convolutional neural networks (CNNs) have been shown to learn and extract usable features by layering many convolution filters. ResNet50, ResNet101 and InceptionV3 are among the popular topologies for image recognition in the industry today.
-There are 2 main setbacks for CNNs for performance:
-1. Deeply layering convolutions causes the number of training parameters to increase drastically.
-2. Linear convolution filters cannot learn size-invariant features without using separate filter for each size regime.
-
-ResNet models use gate and skip logic to address issue 1 and lower the number of parameters, similar to a recurrent neural network (RNN). The InceptionV3 model utilizes “network in network” mini perceptrons to convert linear convolutions into non-linear convolutions in a compact step, addressing issue 2. InceptionV3 also includes optimization that factor and vectorize the convolutions, further increasing the speed of the network.
-
-## Recommended Settings
-
-In addition to TensorFlow optimizations that use the [Intel® oneAPI Deep Neural Network Library (Intel® oneDNN)](https://github.com/oneapi-src/oneDNN) to utilize instruction sets appropriately, runtime settings also significantly contribute to improved performance.
-Tuning these options for CPU workloads is vital to optimize performance of TensorFlow on Intel® processors.
-Below are the set of run-time options recommended by Intel on ResNet50, ResNet101 and InceptionV3 through empirical testing.
-
-
-
- Run-time options |
- Recommendations |
-
-
- ResNet50 |
- InceptionV3 |
- ResNet101 |
-
-
- Batch Size |
- 128. Regardless of the hardware |
-
-
- Hyperthreading |
- Enabled. Turn on in BIOS. Requires a restart. |
-
-
- intra_op_parallelism_threads |
- #physical cores per socket |
- #physical cores per socket |
- # all physical cores |
-
-
- inter_op_parallelism_threads |
- 1 |
- 1 |
- 2 |
-
-
- Data Layout |
- NCHW |
-
-
- NUMA Controls |
- numactl --cpunodebind=0 --membind=0 |
- |
-
-
- KMP_AFFINITY |
- KMP_AFFINITY=granularity=fine,verbose,compact,1,0 |
-
-
- KMP_BLOCKTIME |
- 1 |
-
-
- OMP_NUM_THREADS |
- # intra_op_parallelism_threads |
- #physical cores per socket |
-
-
-
-
-*Note: Refer to the [link](https://software.intel.com/en-us/articles/maximize-tensorflow-performance-on-cpu-considerations-and-recommendations-for-inference) here to learn more about the run time options.*
-
-Run the following commands to get your processor information
-
-a. #physical cores per socket :
-`lscpu | grep "Core(s) per socket" | cut -d':' -f2 | xargs`
-
-b. #all physical cores:
-`lscpu -b -p=Core,Socket | grep -v '^#' | sort -u | wc -l`
-
-Below is a code snippet you can incorporate into your existing ResNet50 or ResNet101 or InceptionV3 TensorFlow application to set the best settings.
-You can either set them in the CLI or in the Python script. Note that inter and intra_op_parallelism_threads settings can only be set
-in the Python script
-
-```bash
-export OMP_NUM_THREADS=physical cores
-export KMP_AFFINITY="granularity=fine,verbose,compact,1,0"
-export KMP_BLOCKTIME=1
-export KMP_SETTINGS=1
-```
-(or)
-```
-import os
-os.environ["KMP_BLOCKTIME"] = "1"
-os.environ["KMP_SETTINGS"] = "1"
-os.environ["KMP_AFFINITY"]= "granularity=fine,verbose,compact,1,0"
-if FLAGS.num_intra_threads > 0:
- os.environ["OMP_NUM_THREADS"]= #
-tf.config.threading.set_inter_op_parallelism_threads(1)
-# tf.config.threading.set_inter_op_parallelism_threads(2) # for ResNet101
-tf.config.threading.set_intra_op_parallelism_threads(<# physical cores>)
-```
-
-## Hands-on Tutorial
-This section shows how to measure inference performance on Intel's Model Zoo pretrained model (or your pretrained model) by setting the above-discussed run time flags.
-### FP32 inference
-
-### Initial Setup
-1. Clone IntelAI models and download into your home directory
-
-```bash
-git clone https://github.com/IntelAI/models.git
-```
-
-2. (Skip to the next step if you already have a pretrained model) Download the pretrained models ```resnet50_fp32_pretrained_model.pb```, ```resnet101_fp32_pretrained_model.pb``` and
-```inceptionv3_fp32_pretrained_model.pb``` into your home directory or
-any other directory of your choice.
-
-```
-wget https://storage.googleapis.com/intel-optimized-tensorflow/models/v1_8/resnet50_fp32_pretrained_model.pb
-
-wget https://storage.googleapis.com/intel-optimized-tensorflow/models/v1_8/resnet101_fp32_pretrained_model.pb
-
-wget https://storage.googleapis.com/intel-optimized-tensorflow/models/v1_8/inceptionv3_fp32_pretrained_model.pb
-```
-Refer to following Readme files to get the latest locations of pretrained models
-a. [ResNet50](/benchmarks/image_recognition/tensorflow/resnet50)
-b. [ResNet101](/benchmarks/image_recognition/tensorflow/resnet101)
-c. [InceptionV3](/benchmarks/image_recognition/tensorflow/inceptionv3)
-
-3. (optional) Download and setup a data directory that has image files in TFRecord format if you are inferring on a real dataset.
-You can refer to [ImageNet](/datasets/imagenet) or [Coco Dataset](http://cocodataset.org/#home) which have images converted to TFRecords, or you can run the [build_image_data.py](https://github.com/tensorflow/models/blob/f87a58cd96d45de73c9a8330a06b2ab56749a7fa/research/inception/inception/data/build_image_data.py) script to convert raw images into TFRecords.
-
-4. Install [Docker](https://docs.docker.com/install/) since the tutorial runs on a Docker container.
-
-### Run inference
-
-1. Pull the relevant Intel-optimized TensorFlow Docker image. We'll be running the pretrained model to infer on Docker container.
-[Click here](https://software.intel.com/en-us/articles/intel-optimization-for-tensorflow-installation-guide) to find all the available Docker images.
-```bash
-docker pull intel/intel-optimized-tensorflow:latest
-```
-2. cd to the inference script directory
-```bash
-cd ~/models/benchmarks
-```
-3. Run the Python script ``` launch_benchmark.py``` with the pretrained model.
-```launch_benchmark.py``` script can be treated as an entry point to conveniently perform out-of-box high performance
-inference on pretrained models trained of popular topologies.
-The script will automatically set the recommended run-time options for supported topologies,
-but if you choose to set your own options, refer to the full list of available flags and a detailed
-explanation of the ```launch_benchmark.py``` script [here](/docs/general/tensorflow/LaunchBenchmark.md).
-This step will automatically launch a new container on every run and terminate. Go to the [Step 4](#step_4) to interactively run the script on the container.
-
-3.1. *Online inference*(or real-time inference, batch_size=1)
-
-3.1.1 ResNet50
-
-Note: As per the recommended settings `socket-id` is set to 0 for ResNet50. The workload will run on a single socket with `numactl` enabled. Remove the flag or set it to -1 to disable it.
-
-
- *Synthetic data*
-
- python launch_benchmark.py \
- --in-graph /home//resnet50_fp32_pretrained_model.pb \
- --model-name resnet50 \
- --framework tensorflow \
- --precision fp32 \
- --mode inference \
- --batch-size 1 \
- --benchmark-only \
- --socket-id 0 \
- --docker-image intel/intel-optimized-tensorflow:latest
-
-*Real data*
-
- python launch_benchmark.py \
- --data-location /home// \
- --in-graph /home//resnet50_fp32_pretrained_model.pb \
- --model-name resnet50 \
- --framework tensorflow \
- --precision fp32 \
- --mode inference \
- --batch-size 1 \
- --benchmark-only \
- --socket-id 0 \
- --docker-image intel/intel-optimized-tensorflow:latest
-
-3.1.2 ResNet101
-
-
-*Synthetic data*
-
- python launch_benchmark.py \
- --in-graph /home//resnet101_fp32_pretrained_model.pb \
- --model-name resnet101 \
- --framework tensorflow \
- --precision fp32 \
- --mode inference \
- --batch-size 1 \
- --benchmark-only \
- --docker-image intel/intel-optimized-tensorflow:latest
-
-*Real data*
-
- python launch_benchmark.py \
- --data-location /home// \
- --in-graph /home//resnet101_fp32_pretrained_model.pb \
- --model-name resnet101 \
- --framework tensorflow \
- --precision fp32 \
- --mode inference \
- --batch-size 1 \
- --benchmark-only \
- --docker-image intel/intel-optimized-tensorflow:latest
-
-3.1.3 InceptionV3
-
-Note: As per the recommended settings `socket-id` is set to 0 for InceptionV3. The workload will run on a single socket with `numactl` enabled. Remove the flag or set it to -1 to disable it.
-
-*Synthetic data*
-
- python launch_benchmark.py \
- --in-graph /home//inceptionv3_fp32_pretrained_model.pb \
- --model-name inceptionv3 \
- --framework tensorflow \
- --precision fp32 \
- --mode inference \
- --batch-size 1 \
- --benchmark-only \
- --socket-id 0 \
- --docker-image intel/intel-optimized-tensorflow:latest
-
-*Real data*
-
- python launch_benchmark.py \
- --data-location /home// \
- --in-graph /home//inceptionv3_fp32_pretrained_model.pb \
- --model-name inceptionv3 \
- --framework tensorflow \
- --precision fp32 \
- --mode inference \
- --batch-size 1 \
- --benchmark-only \
- --socket-id 0 \
- --docker-image intel/intel-optimized-tensorflow:latest
-
-3.2. *Best Batch inference*(batch_size=128)
-
-3.2.1 ResNet50
-
-Note: As per the recommended settings `socket-id` is set to 0 for ResNet50. The workload will run on a single socket with `numactl` enabled. Remove the flag or set it to -1 to disable it.
-
-
- *Synthetic data*
-
- python launch_benchmark.py \
- --in-graph /home//resnet50_fp32_pretrained_model.pb \
- --model-name resnet50 \
- --framework tensorflow \
- --precision fp32 \
- --mode inference \
- --batch-size 128 \
- --benchmark-only \
- --socket-id 0 \
- --docker-image intel/intel-optimized-tensorflow:latest
-
-*Real data*
-
- python launch_benchmark.py \
- --data-location /home// \
- --in-graph /home//resnet50_fp32_pretrained_model.pb \
- --model-name resnet50 \
- --framework tensorflow \
- --precision fp32 \
- --mode inference \
- --batch-size 128 \
- --benchmark-only \
- --socket-id 0 \
- --docker-image intel/intel-optimized-tensorflow:latest
-
-3.2.2 ResNet101
-
-
-*Synthetic data*
-
- python launch_benchmark.py \
- --in-graph /home//resnet101_fp32_pretrained_model.pb \
- --model-name resnet101 \
- --framework tensorflow \
- --precision fp32 \
- --mode inference \
- --batch-size 128 \
- --benchmark-only \
- --docker-image intel/intel-optimized-tensorflow:latest
-
-*Real data*
-
- python launch_benchmark.py \
- --data-location /home// \
- --in-graph /home//resnet101_fp32_pretrained_model.pb \
- --model-name resnet101 \
- --framework tensorflow \
- --precision fp32 \
- --mode inference \
- --batch-size 128 \
- --benchmark-only \
- --docker-image intel/intel-optimized-tensorflow:latest
-
-3.2.3 InceptionV3
-
-Note: As per the recommended settings `socket-id` is set to 0 for InceptionV3. The workload will run on a single socket with `numactl` enabled. Remove the flag or set it to -1 to disable it.
-
-*Synthetic data*
-
- python launch_benchmark.py \
- --in-graph /home//inceptionv3_fp32_pretrained_model.pb \
- --model-name inceptionv3 \
- --framework tensorflow \
- --precision fp32 \
- --mode inference \
- --batch-size 128 \
- --benchmark-only \
- --socket-id 0 \
- --docker-image intel/intel-optimized-tensorflow:latest
-
-*Real data*
-
- python launch_benchmark.py \
- --data-location /home// \
- --in-graph /home//inceptionv3_fp32_pretrained_model.pb \
- --model-name inceptionv3 \
- --framework tensorflow \
- --precision fp32 \
- --mode inference \
- --batch-size 128 \
- --benchmark-only \
- --socket-id 0 \
- --docker-image intel/intel-optimized-tensorflow:latest
-
-
-Example Output
-
- [Running warmup steps...]
- steps = 10, ... images/sec
- [Running benchmark steps...]
- steps = 10, ... images/sec
- steps = 20, ... images/sec
- steps = 30, ... images/sec
- steps = 40, ... images/sec
- steps = 50, ... images/sec
- Ran inference with batch size 128
- Log location outside container: {--output-dir value}/benchmark_resnet50
-
-
-The logs are captured in a directory outside of the container.
-
-
-4. If you want to run the model script interactively within the docker container, run ```launch_benchmark.py``` with ```--debug``` flag. This will launch a docker container based on the ```--docker-image```,
-performs necessary installs, runs the ```launch_benchmark.py``` script and does not terminate the container process. As an example, this step will demonstrate ResNet50 Real Time inference on Synthetic Data use case,
-you can implement the same strategy on different use cases demoed in Step 3.
-
- python launch_benchmark.py \
- --in-graph /home//resnet50_fp32_pretrained_model.pb \
- --model-name resnet50 \
- --framework tensorflow \
- --precision fp32 \
- --mode inference \
- --batch-size 1 \
- --benchmark-only \
- --docker-image intel/intel-optimized-tensorflow:latest \
- --debug
-
-Example Output
-
- root@a78677f56d69:/workspace/benchmarks/common/tensorflow#
-
-To rerun the bechmarking script, execute the ```start.sh``` bash script from your existing directory with additional or modified flags. For e.g to rerun with the best batch inference (batch size=128) settings run with ```BATCH_SIZE```
-and to skip the run from reinstalling packages pass ```True``` to ```NOINSTALL```.
-
- chmod +x ./start.sh
-
-
- NOINSTALL=True BATCH_SIZE=128 ./start.sh
-
-All other flags will be defaulted to values passed in the first ```launch_benchmark.py``` that starts the container. [See here](/docs/general/tensorflow/LaunchBenchmark.md) to get the full list of flags.
-
-Example Output
-
- USE_CASE: image_recognition
- FRAMEWORK: tensorflow
- WORKSPACE: /workspace/benchmarks/common/tensorflow
- DATASET_LOCATION: /dataset
- CHECKPOINT_DIRECTORY: /checkpoints
- IN_GRAPH: /in_graph/freezed_resnet50.pb
- Mounted volumes:
- /localdisk//models/benchmarks mounted on: /workspace/benchmarks
- None mounted on: /workspace/models
- /localdisk//models/benchmarks/../models/image_recognition/tensorflow/resnet50 mounted on: /workspace/intelai_models
- None mounted on: /dataset
- None mounted on: /checkpoints
- SOCKET_ID: -1
- MODEL_NAME: resnet50
- MODE: inference
- PRECISION: fp32
- BATCH_SIZE: 128
- NUM_CORES: -1
- BENCHMARK_ONLY: True
- ACCURACY_ONLY: False
- NOINSTALL: True
- .
- .
- .
- .
- .
- Batch size = 128
- Throughput: ... images/sec
- Ran inference with batch size 128
- Log location outside container: {--output-dir value}/benchmark_resnet50_inference_fp32_20190205_201632.log
-
-
-
-
-
-
-
+# Image Recognition with ResNet50, ResNet101 and InceptionV3
+
+
+## Goal
+This tutorial will introduce CPU performance considerations for three image recognition deep learning models and show how to use Intel® Optimizations for TensorFlow to improve inference time on CPUs.
+It also provides code examples that use the Model Zoo's pretrained models and can be copied for a quick implementation with synthetic or real data.
+
+## Background
+Image recognition with deep learning is a computationally expensive endeavor.
+This tutorial will show you how to reduce the inference runtime of your network.
+Convolutional neural networks (CNNs) have been shown to learn and extract usable features by layering many convolution filters. ResNet50, ResNet101 and InceptionV3 are among the popular topologies for image recognition in the industry today.
+There are two main performance drawbacks for CNNs:
+1. Deeply layering convolutions causes the number of training parameters to increase drastically.
+2. Linear convolution filters cannot learn size-invariant features without using a separate filter for each size regime.
+
+ResNet models use gate and skip logic to address issue 1 and lower the number of parameters, similar to a recurrent neural network (RNN). The InceptionV3 model utilizes “network in network” mini perceptrons to convert linear convolutions into non-linear convolutions in a compact step, addressing issue 2. InceptionV3 also includes optimizations that factor and vectorize the convolutions, further increasing the speed of the network.
+
+## Recommended Settings
+
+In addition to TensorFlow optimizations that use the [Intel® oneAPI Deep Neural Network Library (Intel® oneDNN)](https://github.com/oneapi-src/oneDNN) to utilize instruction sets appropriately, runtime settings also significantly contribute to improved performance.
+Tuning these options for CPU workloads is vital to optimize performance of TensorFlow on Intel® processors.
+Below is the set of run-time options recommended by Intel for ResNet50, ResNet101, and InceptionV3, based on empirical testing.
+
+| Run-time options | ResNet50 | InceptionV3 | ResNet101 |
+| ------------- | ------------- | ------------- | ------------- |
+| Batch Size | 128, regardless of the hardware | 128, regardless of the hardware | 128, regardless of the hardware |
+| Hyperthreading | Enabled. Turn on in BIOS. Requires a restart. | Enabled. Turn on in BIOS. Requires a restart. | Enabled. Turn on in BIOS. Requires a restart. |
+| intra_op_parallelism_threads | # physical cores per socket | # physical cores per socket | # all physical cores |
+| inter_op_parallelism_threads | 1 | 1 | 2 |
+| Data Layout | NCHW | NCHW | NCHW |
+| NUMA Controls | numactl --cpunodebind=0 --membind=0 | numactl --cpunodebind=0 --membind=0 | |
+| KMP_AFFINITY | granularity=fine,verbose,compact,1,0 | granularity=fine,verbose,compact,1,0 | granularity=fine,verbose,compact,1,0 |
+| KMP_BLOCKTIME | 1 | 1 | 1 |
+| OMP_NUM_THREADS | # intra_op_parallelism_threads | # intra_op_parallelism_threads | # physical cores per socket |
+
+*Note: Refer to this [article](https://software.intel.com/en-us/articles/maximize-tensorflow-performance-on-cpu-considerations-and-recommendations-for-inference) to learn more about the run-time options.*
+
+Run the following commands to get your processor information:
+
+a. #physical cores per socket:
+`lscpu | grep "Core(s) per socket" | cut -d':' -f2 | xargs`
+
+b. #all physical cores:
+`lscpu -b -p=Core,Socket | grep -v '^#' | sort -u | wc -l`
+
+Below is a code snippet you can incorporate into your existing ResNet50, ResNet101, or InceptionV3 TensorFlow application to apply the recommended settings.
+You can set them either in the CLI or in the Python script. Note that the inter_op_parallelism_threads and intra_op_parallelism_threads settings can only be set
+in the Python script.
+
+```bash
+export OMP_NUM_THREADS=<# physical cores>
+export KMP_AFFINITY="granularity=fine,verbose,compact,1,0"
+export KMP_BLOCKTIME=1
+export KMP_SETTINGS=1
+```
+(or)
+```python
+import os
+import tensorflow as tf
+
+os.environ["KMP_BLOCKTIME"] = "1"
+os.environ["KMP_SETTINGS"] = "1"
+os.environ["KMP_AFFINITY"] = "granularity=fine,verbose,compact,1,0"
+# FLAGS.num_intra_threads comes from your application's own argument parsing.
+if FLAGS.num_intra_threads > 0:
+    os.environ["OMP_NUM_THREADS"] = str(FLAGS.num_intra_threads)
+tf.config.threading.set_inter_op_parallelism_threads(1)  # use 2 for ResNet101
+tf.config.threading.set_intra_op_parallelism_threads(<# physical cores>)
+```
+
+## Hands-on Tutorial
+This section shows how to measure inference performance on an Intel Model Zoo pretrained model (or your own pretrained model) by setting the run-time flags discussed above.
+### FP32 inference
+
+### Initial Setup
+1. Clone the IntelAI models repository into your home directory:
+
+```bash
+git clone https://github.com/IntelAI/models.git
+```
+
+2. (Skip to the next step if you already have a pretrained model) Download the pretrained models ```resnet50_fp32_pretrained_model.pb```, ```resnet101_fp32_pretrained_model.pb``` and
+```inceptionv3_fp32_pretrained_model.pb``` into your home directory or
+any other directory of your choice.
+
+```
+wget https://storage.googleapis.com/intel-optimized-tensorflow/models/v1_8/resnet50_fp32_pretrained_model.pb
+
+wget https://storage.googleapis.com/intel-optimized-tensorflow/models/v1_8/resnet101_fp32_pretrained_model.pb
+
+wget https://storage.googleapis.com/intel-optimized-tensorflow/models/v1_8/inceptionv3_fp32_pretrained_model.pb
+```
+Refer to the following README files for the latest locations of the pretrained models:
+a. [ResNet50](/benchmarks/image_recognition/tensorflow/resnet50)
+b. [ResNet101](/benchmarks/image_recognition/tensorflow/resnet101)
+c. [InceptionV3](/benchmarks/image_recognition/tensorflow/inceptionv3)
+
+3. (Optional) Download and set up a data directory that contains image files in TFRecord format if you are running inference on a real dataset.
+You can refer to [ImageNet](/datasets/imagenet) or the [Coco Dataset](http://cocodataset.org/#home), which have images converted to TFRecords, or you can run the [build_image_data.py](https://github.com/tensorflow/models/blob/f87a58cd96d45de73c9a8330a06b2ab56749a7fa/research/inception/inception/data/build_image_data.py) script to convert raw images into TFRecords (a sketch of a typical invocation follows).
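+
+The snippet below is only an illustrative sketch: the directory paths are placeholders, and the exact flag names may vary between versions of the script, so check its `--help` output before running it.
+
+```bash
+# Hypothetical example: convert folders of raw JPEGs plus a labels file into TFRecords.
+python build_image_data.py \
+  --train_directory=/path/to/raw/train \
+  --validation_directory=/path/to/raw/validation \
+  --labels_file=/path/to/labels.txt \
+  --output_directory=/path/to/tfrecords \
+  --train_shards=128 \
+  --validation_shards=16 \
+  --num_threads=8
+```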
+
+4. Install [Docker](https://docs.docker.com/install/) since the tutorial runs on a Docker container.
+
+### Run inference
+
+1. Pull the relevant Intel-optimized TensorFlow Docker image. We'll be running the pretrained model inside a Docker container.
+[Click here](https://software.intel.com/en-us/articles/intel-optimization-for-tensorflow-installation-guide) to find all the available Docker images.
+```bash
+docker pull intel/intel-optimized-tensorflow:latest
+```
+2. cd to the inference script directory
+```bash
+cd ~/models/benchmarks
+```
+3. Run the Python script ```launch_benchmark.py``` with the pretrained model.
+The ```launch_benchmark.py``` script can be treated as an entry point to conveniently perform out-of-the-box, high-performance
+inference on pretrained models of popular topologies.
+The script will automatically set the recommended run-time options for supported topologies,
+but if you choose to set your own options, refer to the full list of available flags and a detailed
+explanation of the ```launch_benchmark.py``` script [here](/docs/general/tensorflow/LaunchBenchmark.md).
+This step launches a new container on every run and terminates it when the run completes. Go to [Step 4](#step_4) to run the script interactively inside the container.
+
+3.1. *Online inference* (or real-time inference, batch_size=1)
+
+3.1.1 ResNet50
+
+Note: As per the recommended settings, `socket-id` is set to 0 for ResNet50. The workload will run on a single socket with `numactl` enabled. Remove the flag or set it to -1 to disable it.
+
+
+ *Synthetic data*
+
+ python launch_benchmark.py \
+ --in-graph /home//resnet50_fp32_pretrained_model.pb \
+ --model-name resnet50 \
+ --framework tensorflow \
+ --precision fp32 \
+ --mode inference \
+ --batch-size 1 \
+ --benchmark-only \
+ --socket-id 0 \
+ --docker-image intel/intel-optimized-tensorflow:latest
+
+*Real data*
+
+ python launch_benchmark.py \
+ --data-location /home// \
+ --in-graph /home//resnet50_fp32_pretrained_model.pb \
+ --model-name resnet50 \
+ --framework tensorflow \
+ --precision fp32 \
+ --mode inference \
+ --batch-size 1 \
+ --benchmark-only \
+ --socket-id 0 \
+ --docker-image intel/intel-optimized-tensorflow:latest
+
+3.1.2 ResNet101
+
+
+*Synthetic data*
+
+ python launch_benchmark.py \
+ --in-graph /home//resnet101_fp32_pretrained_model.pb \
+ --model-name resnet101 \
+ --framework tensorflow \
+ --precision fp32 \
+ --mode inference \
+ --batch-size 1 \
+ --benchmark-only \
+ --docker-image intel/intel-optimized-tensorflow:latest
+
+*Real data*
+
+ python launch_benchmark.py \
+ --data-location /home// \
+ --in-graph /home//resnet101_fp32_pretrained_model.pb \
+ --model-name resnet101 \
+ --framework tensorflow \
+ --precision fp32 \
+ --mode inference \
+ --batch-size 1 \
+ --benchmark-only \
+ --docker-image intel/intel-optimized-tensorflow:latest
+
+3.1.3 InceptionV3
+
+Note: As per the recommended settings, `socket-id` is set to 0 for InceptionV3. The workload will run on a single socket with `numactl` enabled. Remove the flag or set it to -1 to disable it.
+
+*Synthetic data*
+
+ python launch_benchmark.py \
+ --in-graph /home//inceptionv3_fp32_pretrained_model.pb \
+ --model-name inceptionv3 \
+ --framework tensorflow \
+ --precision fp32 \
+ --mode inference \
+ --batch-size 1 \
+ --benchmark-only \
+ --socket-id 0 \
+ --docker-image intel/intel-optimized-tensorflow:latest
+
+*Real data*
+
+ python launch_benchmark.py \
+ --data-location /home// \
+ --in-graph /home//inceptionv3_fp32_pretrained_model.pb \
+ --model-name inceptionv3 \
+ --framework tensorflow \
+ --precision fp32 \
+ --mode inference \
+ --batch-size 1 \
+ --benchmark-only \
+ --socket-id 0 \
+ --docker-image intel/intel-optimized-tensorflow:latest
+
+3.2. *Best batch inference* (batch_size=128)
+
+3.2.1 ResNet50
+
+Note: As per the recommended settings, `socket-id` is set to 0 for ResNet50. The workload will run on a single socket with `numactl` enabled. Remove the flag or set it to -1 to disable it.
+
+
+ *Synthetic data*
+
+ python launch_benchmark.py \
+ --in-graph /home//resnet50_fp32_pretrained_model.pb \
+ --model-name resnet50 \
+ --framework tensorflow \
+ --precision fp32 \
+ --mode inference \
+ --batch-size 128 \
+ --benchmark-only \
+ --socket-id 0 \
+ --docker-image intel/intel-optimized-tensorflow:latest
+
+*Real data*
+
+ python launch_benchmark.py \
+ --data-location /home// \
+ --in-graph /home//resnet50_fp32_pretrained_model.pb \
+ --model-name resnet50 \
+ --framework tensorflow \
+ --precision fp32 \
+ --mode inference \
+ --batch-size 128 \
+ --benchmark-only \
+ --socket-id 0 \
+ --docker-image intel/intel-optimized-tensorflow:latest
+
+3.2.2 ResNet101
+
+
+*Synthetic data*
+
+ python launch_benchmark.py \
+ --in-graph /home//resnet101_fp32_pretrained_model.pb \
+ --model-name resnet101 \
+ --framework tensorflow \
+ --precision fp32 \
+ --mode inference \
+ --batch-size 128 \
+ --benchmark-only \
+ --docker-image intel/intel-optimized-tensorflow:latest
+
+*Real data*
+
+ python launch_benchmark.py \
+ --data-location /home// \
+ --in-graph /home//resnet101_fp32_pretrained_model.pb \
+ --model-name resnet101 \
+ --framework tensorflow \
+ --precision fp32 \
+ --mode inference \
+ --batch-size 128 \
+ --benchmark-only \
+ --docker-image intel/intel-optimized-tensorflow:latest
+
+3.2.3 InceptionV3
+
+Note: As per the recommended settings, `socket-id` is set to 0 for InceptionV3. The workload will run on a single socket with `numactl` enabled. Remove the flag or set it to -1 to disable it.
+
+*Synthetic data*
+
+ python launch_benchmark.py \
+ --in-graph /home//inceptionv3_fp32_pretrained_model.pb \
+ --model-name inceptionv3 \
+ --framework tensorflow \
+ --precision fp32 \
+ --mode inference \
+ --batch-size 128 \
+ --benchmark-only \
+ --socket-id 0 \
+ --docker-image intel/intel-optimized-tensorflow:latest
+
+*Real data*
+
+ python launch_benchmark.py \
+ --data-location /home// \
+ --in-graph /home//inceptionv3_fp32_pretrained_model.pb \
+ --model-name inceptionv3 \
+ --framework tensorflow \
+ --precision fp32 \
+ --mode inference \
+ --batch-size 128 \
+ --benchmark-only \
+ --socket-id 0 \
+ --docker-image intel/intel-optimized-tensorflow:latest
+
+
+Example Output
+
+ [Running warmup steps...]
+ steps = 10, ... images/sec
+ [Running benchmark steps...]
+ steps = 10, ... images/sec
+ steps = 20, ... images/sec
+ steps = 30, ... images/sec
+ steps = 40, ... images/sec
+ steps = 50, ... images/sec
+ Ran inference with batch size 128
+ Log location outside container: {--output-dir value}/benchmark_resnet50
+
+
+The logs are captured in a directory outside of the container.
+
+
+4. If you want to run the model script interactively within the Docker container, run ```launch_benchmark.py``` with the ```--debug``` flag. This will launch a Docker container based on ```--docker-image```,
+perform the necessary installs, run the ```launch_benchmark.py``` script, and leave the container process running. As an example, this step demonstrates the ResNet50 real-time inference on synthetic data use case;
+you can apply the same strategy to the other use cases shown in Step 3.
+
+ python launch_benchmark.py \
+ --in-graph /home//resnet50_fp32_pretrained_model.pb \
+ --model-name resnet50 \
+ --framework tensorflow \
+ --precision fp32 \
+ --mode inference \
+ --batch-size 1 \
+ --benchmark-only \
+ --docker-image intel/intel-optimized-tensorflow:latest \
+ --debug
+
+Example Output
+
+ root@a78677f56d69:/workspace/benchmarks/common/tensorflow#
+
+To rerun the benchmarking script, execute the ```start.sh``` bash script from your existing directory with additional or modified flags. For example, to rerun with the best batch inference settings (batch size=128), set ```BATCH_SIZE```,
+and to skip reinstalling packages, pass ```True``` to ```NOINSTALL```.
+
+ chmod +x ./start.sh
+
+
+ NOINSTALL=True BATCH_SIZE=128 ./start.sh
+
+All other flags will be defaulted to values passed in the first ```launch_benchmark.py``` that starts the container. [See here](/docs/general/tensorflow/LaunchBenchmark.md) to get the full list of flags.
+
+Example Output
+
+ USE_CASE: image_recognition
+ FRAMEWORK: tensorflow
+ WORKSPACE: /workspace/benchmarks/common/tensorflow
+ DATASET_LOCATION: /dataset
+ CHECKPOINT_DIRECTORY: /checkpoints
+ IN_GRAPH: /in_graph/freezed_resnet50.pb
+ Mounted volumes:
+ /localdisk//models/benchmarks mounted on: /workspace/benchmarks
+ None mounted on: /workspace/models
+ /localdisk//models/benchmarks/../models/image_recognition/tensorflow/resnet50 mounted on: /workspace/intelai_models
+ None mounted on: /dataset
+ None mounted on: /checkpoints
+ SOCKET_ID: -1
+ MODEL_NAME: resnet50
+ MODE: inference
+ PRECISION: fp32
+ BATCH_SIZE: 128
+ NUM_CORES: -1
+ BENCHMARK_ONLY: True
+ ACCURACY_ONLY: False
+ NOINSTALL: True
+ .
+ .
+ .
+ .
+ .
+ Batch size = 128
+ Throughput: ... images/sec
+ Ran inference with batch size 128
+ Log location outside container: {--output-dir value}/benchmark_resnet50_inference_fp32_20190205_201632.log
diff --git a/docs/language_modeling/tensorflow/InferenceTutorial.md b/docs/language_modeling/tensorflow/InferenceTutorial.md
index 659bf4b1e..1ac3be165 100644
--- a/docs/language_modeling/tensorflow/InferenceTutorial.md
+++ b/docs/language_modeling/tensorflow/InferenceTutorial.md
@@ -1,260 +1,258 @@
-# Language Modeling Inference with BERT Large
-
-## Goal
-This tutorial will introduce CPU performance considerations for the deep learning model BERT Large for language modeling and demonstrate how to use Intel® Optimizations for TensorFlow to improve inference time on CPUs.
-This tutorial will also provide code examples to use Intel Model Zoo's pre-trained BERT model for a quick off-the-ground implementation.
-
-## Background
-With BFloat16 (BF16) instructions and optimizations now in the Intel® Xeon® Scalable processor and Intel® Optimizations for TensorFlow, deep learning workload performance can benefit from a smaller data representation (16-bit instead of the traditional 32-bit floating point) often with little or no loss of accuracy.
-This is because the BF16 standard halves the data size in a way that retains most of the precision near zero while sacrificing more precision at the extremes of the numerical range. For many machine and deep learning tasks, this is a favorable trade-off.
-For more technical details, see this article on [lowering numerical precision to increase deep learning performance](https://www.intel.com/content/www/us/en/artificial-intelligence/posts/lowering-numerical-precision-increase-deep-learning-performance.html).
-
-BERT (Bidirectional Encoder Representations from Transformers) is a popular language modeling topology.
-Since its [publication](https://arxiv.org/pdf/1810.04805.pdf) in May 2019, BERT has quickly become state-of-the-art for many Natural Language Processing (NLP) tasks, including question answering and next sentence prediction.
-The BERT Large variant has 340 million parameters and uses an innovative masked language model (MLM) pre-training approach that allows a second training stage called fine-tuning to achieve a wide variety of NLP tasks.
-To demonstrate Bert Large inference performance with BF16 precision, this tutorial uses the Intel Model Zoo's BERT Large pre-trained model which has been fine-tuned for question answering with the [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/) dataset.
-The tutorial concludes with FP32 inference for comparison of performance and accuracy.
-
-## Recommended Settings
-In addition to TensorFlow optimizations that use the [Intel® oneAPI Deep Neural Network Library (Intel® oneDNN)](https://github.com/oneapi-src/oneDNN), the run-time settings also significantly contribute to improved performance.
-Tuning these options to optimize CPU workloads is vital to optimize performance of TensorFlow on Intel® processors.
-Below are the set of run-time options tested empirically on BERT Large and recommended by Intel:
-
-| Run-time options | Recommendations |
-| ------------- | ------------- |
-| Batch Size | 32. Regardless of the hardware |
-| Hyperthreading | Enabled. Turn on in BIOS. Requires a restart. |
-|intra_op_parallelism_threads |# physical cores |
-|inter_op_parallelism_threads | 1 or 2|
-|NUMA Controls| --cpunodebind=0 --membind=0 |
-|KMP_AFFINITY| KMP_AFFINITY=granularity=fine,verbose,compact,1,0|
-|KMP_BLOCKTIME| 1 |
-|KMP_SETTINGS| 1 |
-|OMP_NUM_THREADS |# physical cores - 1 or # physical cores - 2|
-
-Note 1: Refer to this [link](https://software.intel.com/en-us/articles/maximize-tensorflow-performance-on-cpu-considerations-and-recommendations-for-inference) to learn more about the run-time options.
-
-Note 2: You can remove `verbose` from `KMP_AFFINITY` setting to avoid verbose output at runtime.
-
-Run the following commands to get your processor information:
-
-a. # physical cores per socket : `lscpu | grep "Core(s) per socket" | cut -d':' -f2 | xargs`
-
-b. # all physical cores: `lscpu -b -p=Core,Socket | grep -v '^#' | sort -u | wc -l`
-
-Below is a code snippet you can incorporate into your existing TensorFlow application to set the best settings.
-You can either set them in the CLI or in the Python script. Note that inter and intra_op_parallelism_threads settings can only be set
-in the Python script.
-
-```bash
-export OMP_NUM_THREADS=<# physical cores - 2>
-export KMP_AFFINITY="granularity=fine,verbose,compact,1,0"
-export KMP_BLOCKTIME=1
-export KMP_SETTINGS=1
-```
-(or)
-```
-import os
-os.environ["KMP_BLOCKTIME"] = "1"
-os.environ["KMP_SETTINGS"] = "1"
-os.environ["KMP_AFFINITY"]= "granularity=fine,verbose,compact,1,0"
-os.environ["OMP_NUM_THREADS"]= <# physical cores - 2>
-tf.config.threading.set_inter_op_parallelism_threads(1)
-tf.config.threading.set_intra_op_parallelism_threads(<# physical cores>)
-```
-
-## Hands-on Tutorial
-This section shows how to measure and compare BF16 and FP32 inference performance on Intel's Model Zoo pre-trained model (or your pre-trained model) by setting the above-discussed run-time flags.
-
-### Initial Setup
-
-Note: These steps are adapted from the BERT Large Inference [README](/benchmarks/language_modeling/tensorflow/bert_large/README.md#inference-instructions).
-Please check there for the most up-to-date information and links.
-
-1. Clone IntelAI models and download into your home directory, skip this step if you already have Intel AI models installed.
-
-```bash
-cd ~
-git clone https://github.com/IntelAI/models.git
-```
-
-2. Download and unzip the BERT large uncased (whole word masking) model from the [google bert repo](https://github.com/google-research/bert#pre-trained-models).
-Then, download the `dev-v1.1.json` file from the [google bert repo](https://github.com/google-research/bert#squad-11) into the `wwm_uncased_L-24_H-1024_A-16` directory that was just unzipped.
-
-```
-wget https://storage.googleapis.com/bert_models/2019_05_30/wwm_uncased_L-24_H-1024_A-16.zip
-unzip wwm_uncased_L-24_H-1024_A-16.zip
-
-wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json -P wwm_uncased_L-24_H-1024_A-16
-```
-The `wwm_uncased_L-24_H-1024_A-16` directory is what will be passed as the `--data-location` when running inference.
-
-3. Download and unzip the pre-trained model. The file is 3.4GB so it will take some time.
-
-```
-wget https://storage.googleapis.com/intel-optimized-tensorflow/models/v1_8/bert_large_checkpoints.zip
-unzip bert_large_checkpoints.zip
-```
-This directory will be passed as the `--checkpoint` location when running inference.
-
-4. Install [Docker](https://docs.docker.com/v17.09/engine/installation/) since the tutorial runs in a Docker container.
-
-5. Pull the relevant Intel-optimized TensorFlow Docker image.
-[Click here](https://software.intel.com/en-us/articles/intel-optimization-for-tensorflow-installation-guide) to find all the available Docker images.
-```bash
-docker pull intel/intel-optimized-tensorflow:latest
-```
-
-6. Navigate to the inference script directory in local IntelAI repository.
-```bash
-cd ~/models/benchmarks
-```
-
-### BF16 Inference
-
-Run the Python script `launch_benchmark.py` with the pre-trained model.
-The `launch_benchmark.py` script can be treated as an entry point to conveniently perform out-of-box high performance inference on pre-trained models from the Intel Model Zoo.
-The script will automatically set the recommended run-time options for supported topologies, but if you choose to set your own options, refer to the full list of available flags and a detailed explanation of `launch_benchmark.py` [here](/docs/general/tensorflow/LaunchBenchmark.md).
-This step will automatically launch a new container on every run and terminate. Go to [this optional step](#optional) to interactively run the script on the container.
-
-1. *BF16 Batch Inference*
-
-Console in:
-```bash
-python launch_benchmark.py \
- --model-name=bert_large \
- --precision=bfloat16 \
- --mode=inference \
- --framework=tensorflow \
- --batch-size=32 \
- --data-location ~/wwm_uncased_L-24_H-1024_A-16 \
- --checkpoint ~/bert_large_checkpoints \
- --output-dir ~/output \
- --benchmark-only \
- --docker-image intel/intel-optimized-tensorflow:latest \
- -- infer_option=SQuAD
-```
-Console out:
-```
-...
-I0424 21:14:28.002666 140184442087232 run_squad.py:1365] Processed #examples: 960
-INFO:tensorflow:prediction_loop marked as finished
-Elapsed time: ...
-throughput((num_processed_examples-threshod_examples)/Elapsedtime): ...
-Ran inference with batch size 32
-Log location outside container: /~/output/benchmark_bert_large_inference_bfloat16_20200424_210607.log
-```
-
-2. *BF16 Accuracy*
-
-Console in:
-```bash
-python launch_benchmark.py \
- --model-name=bert_large \
- --precision=bfloat16 \
- --mode=inference \
- --framework=tensorflow \
- --batch-size=32 \
- --data-location ~/wwm_uncased_L-24_H-1024_A-16 \
- --checkpoint ~/bert_large_checkpoints \
- --output-dir ~/output \
- --accuracy-only \
- --docker-image intel/intel-optimized-tensorflow:latest \
- -- infer_option=SQuAD
-```
-
-Console out:
-```bash
-INFO:tensorflow:Processing example: 10830
-I0428 00:26:11.595798 140332503766848 run_squad.py:1370] Processing example: 10830
-INFO:tensorflow:prediction_loop marked as finished
-INFO:tensorflow:Writing predictions to: /~/output/predictions.json
-I0428 00:26:11.794145 140332503766848 run_squad.py:804] Writing predictions to: /~/output/predictions.json
-INFO:tensorflow:Writing nbest to: /~/output/nbest_predictions.json
-I0428 00:26:11.794228 140332503766848 run_squad.py:805] Writing nbest to: /~/output/nbest_predictions.json
-{"exact_match": ..., "f1": ...}
-Ran inference with batch size 32
-Log location outside container: /~/output/benchmark_bert_large_inference_bfloat16_20200427_224428.log
-```
-
-Output files and logs are saved to the `--output-dir` or to the default location models/benchmarks/common/tensorflow/logs, if no `--output-dir` is set.
-
-### FP32 Inference
-
-1. *FP32 Batch Inference*
-
-To see the FP32 batch inference performance, run the same command from above but change `--precision=bfloat16` to `--precision=fp32`.
-
-```bash
-python launch_benchmark.py \
- --model-name=bert_large \
- --precision=fp32 \
- --mode=inference \
- --framework=tensorflow \
- --batch-size=32 \
- --data-location ~/wwm_uncased_L-24_H-1024_A-16 \
- --checkpoint ~/bert_large_checkpoints \
- --output-dir ~/output \
- --benchmark-only \
- --docker-image intel/intel-optimized-tensorflow:latest \
- -- infer_option=SQuAD
-```
-
-2. *FP32 Accuracy*
-
-Similarly, to see the FP32 accuracy, run the above command but change `--precision=bfloat16` to `--precision=fp32`.
-
-```bash
-python launch_benchmark.py \
- --model-name=bert_large \
- --precision=fp32 \
- --mode=inference \
- --framework=tensorflow \
- --batch-size=32 \
- --data-location ~/wwm_uncased_L-24_H-1024_A-16 \
- --checkpoint ~/bert_large_checkpoints \
- --output-dir ~/output \
- --accuracy-only \
- --docker-image intel/intel-optimized-tensorflow:latest \
- -- infer_option=SQuAD
-```
-
-### Interactive Option
-
-If you want to run `launch_benchmark.py` interactively from within the docker container, add flag `--debug`. This will launch a docker container based on the `--docker_image`,
-perform necessary installs, and run the `launch_benchmark.py` script, but does not terminate the container process. As an example, this is how you would launch interactive BF16 batch inference for benchmarking:
-
-Console in:
-```bash
-python launch_benchmark.py \
- --model-name=bert_large \
- --precision=bfloat16 \
- --mode=inference \
- --framework=tensorflow \
- --batch-size=32 \
- --data-location ~/wwm_uncased_L-24_H-1024_A-16 \
- --checkpoint ~/bert_large_checkpoints \
- --output-dir ~/output \
- --benchmark-only \
- --docker-image intel/intel-optimized-tensorflow:latest \
- --debug \
- -- infer_option=SQuAD
-```
-
-Console out:
-```bash
-root@c49f3442efb1:/workspace/benchmarks/common/tensorflow#
-```
-
-To rerun the benchmarking script, execute the `start.sh` bash script from your existing directory with the available flags, which in turn will run `launch_benchmark.py`.
-For example, to run with different batch size settings (e.g. batch size=64) run with `BATCH_SIZE`
-and to skip the run from reinstalling packages pass `True` to `NOINSTALL`.
-
-```bash
-chmod +x ./start.sh
-NOINSTALL=True BATCH_SIZE=64 ./start.sh
-```
-
-All other flags will be defaulted to values passed in the first `launch_benchmark.py` that starts the container. [See here](/docs/general/tensorflow/LaunchBenchmark.md) to get the full list of flags.
-
-
+# Language Modeling Inference with BERT Large
+
+## Goal
+This tutorial will introduce CPU performance considerations for the deep learning model BERT Large for language modeling and demonstrate how to use Intel® Optimizations for TensorFlow to improve inference time on CPUs.
+This tutorial will also provide code examples to use Intel Model Zoo's pre-trained BERT model for a quick off-the-ground implementation.
+
+## Background
+With BFloat16 (BF16) instructions and optimizations now in the Intel® Xeon® Scalable processor and Intel® Optimizations for TensorFlow, deep learning workload performance can benefit from a smaller data representation (16-bit instead of the traditional 32-bit floating point) often with little or no loss of accuracy.
+This is because the BF16 format halves the data size while keeping the same 8-bit exponent (and therefore the same dynamic range) as FP32, giving up mantissa bits instead; fine-grained precision is reduced, but for many machine and deep learning tasks this is a favorable trade-off.
+For more technical details, see this article on [lowering numerical precision to increase deep learning performance](https://www.intel.com/content/www/us/en/artificial-intelligence/posts/lowering-numerical-precision-increase-deep-learning-performance.html).
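+
+As a small illustration (not part of the original tutorial), you can see the coarser BF16 rounding directly in TensorFlow by casting a few FP32 values down to bfloat16:
+
+```python
+import tensorflow as tf
+
+x = tf.constant([1.0009765625, 3.14159265, 65504.0], dtype=tf.float32)
+# bfloat16 keeps the full FP32 exponent range but only 7 mantissa bits,
+# so small differences between nearby values are rounded away.
+print(tf.cast(x, tf.bfloat16))
+```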
+
+BERT (Bidirectional Encoder Representations from Transformers) is a popular language modeling topology.
+Since its [publication](https://arxiv.org/pdf/1810.04805.pdf) in May 2019, BERT has quickly become state-of-the-art for many Natural Language Processing (NLP) tasks, including question answering and next sentence prediction.
+The BERT Large variant has 340 million parameters and uses an innovative masked language model (MLM) pre-training approach that allows a second training stage, called fine-tuning, to adapt the model to a wide variety of NLP tasks.
+To demonstrate BERT Large inference performance with BF16 precision, this tutorial uses the Intel Model Zoo's BERT Large pre-trained model, which has been fine-tuned for question answering with the [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/) dataset.
+The tutorial concludes with FP32 inference for comparison of performance and accuracy.
+
+## Recommended Settings
+In addition to TensorFlow optimizations that use the [Intel® oneAPI Deep Neural Network Library (Intel® oneDNN)](https://github.com/oneapi-src/oneDNN), the run-time settings also significantly contribute to improved performance.
+Tuning these options to optimize CPU workloads is vital to optimize performance of TensorFlow on Intel® processors.
+Below are the set of run-time options tested empirically on BERT Large and recommended by Intel:
+
+| Run-time options | Recommendations |
+| ------------- | ------------- |
+| Batch Size | 32. Regardless of the hardware |
+| Hyperthreading | Enabled. Turn on in BIOS. Requires a restart. |
+|intra_op_parallelism_threads |# physical cores |
+|inter_op_parallelism_threads | 1 or 2|
+|NUMA Controls| --cpunodebind=0 --membind=0 |
+|KMP_AFFINITY| KMP_AFFINITY=granularity=fine,verbose,compact,1,0|
+|KMP_BLOCKTIME| 1 |
+|KMP_SETTINGS| 1 |
+|OMP_NUM_THREADS |# physical cores - 1 or # physical cores - 2|
+
+Note 1: Refer to this [link](https://software.intel.com/en-us/articles/maximize-tensorflow-performance-on-cpu-considerations-and-recommendations-for-inference) to learn more about the run-time options.
+
+Note 2: You can remove `verbose` from `KMP_AFFINITY` setting to avoid verbose output at runtime.
+
+Run the following commands to get your processor information:
+
+a. # physical cores per socket: `lscpu | grep "Core(s) per socket" | cut -d':' -f2 | xargs`
+
+b. # all physical cores: `lscpu -b -p=Core,Socket | grep -v '^#' | sort -u | wc -l`
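+
+If you prefer to query these values from Python, a minimal sketch (assuming the third-party `psutil` package is installed) is:
+
+```python
+import psutil  # third-party package: pip install psutil
+
+physical_cores = psutil.cpu_count(logical=False)  # physical cores only
+logical_cores = psutil.cpu_count(logical=True)    # includes hyperthreads
+print(f"physical cores: {physical_cores}, logical cores: {logical_cores}")
+```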
+
+Below is a code snippet you can incorporate into your existing TensorFlow application to set the best settings.
+You can either set them in the CLI or in the Python script. Note that inter and intra_op_parallelism_threads settings can only be set
+in the Python script.
+
+```bash
+export OMP_NUM_THREADS=<# physical cores - 2>
+export KMP_AFFINITY="granularity=fine,verbose,compact,1,0"
+export KMP_BLOCKTIME=1
+export KMP_SETTINGS=1
+```
+(or)
+```python
+import os
+
+# Set the OpenMP/KMP environment variables before TensorFlow initializes.
+os.environ["KMP_BLOCKTIME"] = "1"
+os.environ["KMP_SETTINGS"] = "1"
+os.environ["KMP_AFFINITY"] = "granularity=fine,verbose,compact,1,0"
+os.environ["OMP_NUM_THREADS"] = "<# physical cores - 2>"  # environment values must be strings
+
+import tensorflow as tf
+
+tf.config.threading.set_inter_op_parallelism_threads(1)
+tf.config.threading.set_intra_op_parallelism_threads(<# physical cores>)
+```
+
+## Hands-on Tutorial
+This section shows how to measure and compare BF16 and FP32 inference performance on Intel's Model Zoo pre-trained model (or your pre-trained model) by setting the above-discussed run-time flags.
+
+### Initial Setup
+
+Note: These steps are adapted from the BERT Large Inference [README](/benchmarks/language_modeling/tensorflow/bert_large/README.md#inference-instructions).
+Please check there for the most up-to-date information and links.
+
+1. Clone the IntelAI models repository into your home directory; skip this step if you already have it installed.
+
+```bash
+cd ~
+git clone https://github.com/IntelAI/models.git
+```
+
+2. Download and unzip the BERT large uncased (whole word masking) model from the [google bert repo](https://github.com/google-research/bert#pre-trained-models).
+Then, download the `dev-v1.1.json` file from the [google bert repo](https://github.com/google-research/bert#squad-11) into the `wwm_uncased_L-24_H-1024_A-16` directory that was just unzipped.
+
+```
+wget https://storage.googleapis.com/bert_models/2019_05_30/wwm_uncased_L-24_H-1024_A-16.zip
+unzip wwm_uncased_L-24_H-1024_A-16.zip
+
+wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json -P wwm_uncased_L-24_H-1024_A-16
+```
+The `wwm_uncased_L-24_H-1024_A-16` directory is what will be passed as the `--data-location` when running inference.
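+
+As an optional sanity check (a minimal sketch, assuming the files landed in `~/wwm_uncased_L-24_H-1024_A-16`), you can confirm that the SQuAD dev set parses and count its questions:
+
+```python
+import json
+import os
+
+# Quick sanity check on the downloaded SQuAD dev set.
+path = os.path.expanduser("~/wwm_uncased_L-24_H-1024_A-16/dev-v1.1.json")
+with open(path) as f:
+    squad = json.load(f)
+
+num_questions = sum(
+    len(paragraph["qas"])
+    for article in squad["data"]
+    for paragraph in article["paragraphs"]
+)
+print(f"SQuAD v1.1 dev set: {num_questions} questions across {len(squad['data'])} articles")
+```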
+
+3. Download and unzip the pre-trained model. The file is 3.4GB so it will take some time.
+
+```
+wget https://storage.googleapis.com/intel-optimized-tensorflow/models/v1_8/bert_large_checkpoints.zip
+unzip bert_large_checkpoints.zip
+```
+This directory will be passed as the `--checkpoint` location when running inference.
+
+4. Install [Docker](https://docs.docker.com/v17.09/engine/installation/) since the tutorial runs in a Docker container.
+
+5. Pull the relevant Intel-optimized TensorFlow Docker image.
+[Click here](https://software.intel.com/en-us/articles/intel-optimization-for-tensorflow-installation-guide) to find all the available Docker images.
+```bash
+docker pull intel/intel-optimized-tensorflow:latest
+```
+
+6. Navigate to the inference script directory in the local IntelAI repository.
+```bash
+cd ~/models/benchmarks
+```
+
+### BF16 Inference
+
+Run the Python script `launch_benchmark.py` with the pre-trained model.
+The `launch_benchmark.py` script can be treated as an entry point to conveniently perform out-of-box high performance inference on pre-trained models from the Intel Model Zoo.
+The script will automatically set the recommended run-time options for supported topologies, but if you choose to set your own options, refer to the full list of available flags and a detailed explanation of `launch_benchmark.py` [here](/docs/general/tensorflow/LaunchBenchmark.md).
+This step automatically launches a new container on every run and terminates it when the run completes. See the [Interactive Option](#interactive-option) section below to run the script interactively inside the container.
+
+1. *BF16 Batch Inference*
+
+Console in:
+```bash
+python launch_benchmark.py \
+ --model-name=bert_large \
+ --precision=bfloat16 \
+ --mode=inference \
+ --framework=tensorflow \
+ --batch-size=32 \
+ --data-location ~/wwm_uncased_L-24_H-1024_A-16 \
+ --checkpoint ~/bert_large_checkpoints \
+ --output-dir ~/output \
+ --benchmark-only \
+ --docker-image intel/intel-optimized-tensorflow:latest \
+ -- infer_option=SQuAD
+```
+Console out:
+```
+...
+I0424 21:14:28.002666 140184442087232 run_squad.py:1365] Processed #examples: 960
+INFO:tensorflow:prediction_loop marked as finished
+Elapsed time: ...
+throughput((num_processed_examples-threshod_examples)/Elapsedtime): ...
+Ran inference with batch size 32
+Log location outside container: /~/output/benchmark_bert_large_inference_bfloat16_20200424_210607.log
+```
+
+2. *BF16 Accuracy*
+
+Console in:
+```bash
+python launch_benchmark.py \
+ --model-name=bert_large \
+ --precision=bfloat16 \
+ --mode=inference \
+ --framework=tensorflow \
+ --batch-size=32 \
+ --data-location ~/wwm_uncased_L-24_H-1024_A-16 \
+ --checkpoint ~/bert_large_checkpoints \
+ --output-dir ~/output \
+ --accuracy-only \
+ --docker-image intel/intel-optimized-tensorflow:latest \
+ -- infer_option=SQuAD
+```
+
+Console out:
+```bash
+INFO:tensorflow:Processing example: 10830
+I0428 00:26:11.595798 140332503766848 run_squad.py:1370] Processing example: 10830
+INFO:tensorflow:prediction_loop marked as finished
+INFO:tensorflow:Writing predictions to: /~/output/predictions.json
+I0428 00:26:11.794145 140332503766848 run_squad.py:804] Writing predictions to: /~/output/predictions.json
+INFO:tensorflow:Writing nbest to: /~/output/nbest_predictions.json
+I0428 00:26:11.794228 140332503766848 run_squad.py:805] Writing nbest to: /~/output/nbest_predictions.json
+{"exact_match": ..., "f1": ...}
+Ran inference with batch size 32
+Log location outside container: /~/output/benchmark_bert_large_inference_bfloat16_20200427_224428.log
+```
+
+Output files and logs are saved to the `--output-dir`, or to the default location `models/benchmarks/common/tensorflow/logs` if no `--output-dir` is set.
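+
+The accuracy run above already reports the official metrics. If you want to inspect the predictions yourself, the sketch below computes a simplified exact-match score from `predictions.json` (a rough illustration only, assuming the dataset and output paths used earlier; it is not the official SQuAD evaluation script):
+
+```python
+import json
+import os
+import re
+import string
+
+def normalize(text):
+    """Simplified SQuAD-style normalization: lowercase, drop punctuation and articles."""
+    text = "".join(ch for ch in text.lower() if ch not in string.punctuation)
+    text = re.sub(r"\b(a|an|the)\b", " ", text)
+    return " ".join(text.split())
+
+with open(os.path.expanduser("~/wwm_uncased_L-24_H-1024_A-16/dev-v1.1.json")) as f:
+    dataset = json.load(f)
+with open(os.path.expanduser("~/output/predictions.json")) as f:
+    predictions = json.load(f)
+
+total = exact = 0
+for article in dataset["data"]:
+    for paragraph in article["paragraphs"]:
+        for qa in paragraph["qas"]:
+            if qa["id"] not in predictions:
+                continue
+            total += 1
+            gold = {normalize(answer["text"]) for answer in qa["answers"]}
+            exact += normalize(predictions[qa["id"]]) in gold
+print(f"Simplified exact match: {100.0 * exact / total:.2f}% over {total} questions")
+```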
+
+### FP32 Inference
+
+1. *FP32 Batch Inference*
+
+To see the FP32 batch inference performance, run the same command from above but change `--precision=bfloat16` to `--precision=fp32`.
+
+```bash
+python launch_benchmark.py \
+ --model-name=bert_large \
+ --precision=fp32 \
+ --mode=inference \
+ --framework=tensorflow \
+ --batch-size=32 \
+ --data-location ~/wwm_uncased_L-24_H-1024_A-16 \
+ --checkpoint ~/bert_large_checkpoints \
+ --output-dir ~/output \
+ --benchmark-only \
+ --docker-image intel/intel-optimized-tensorflow:latest \
+ -- infer_option=SQuAD
+```
+
+2. *FP32 Accuracy*
+
+Similarly, to see the FP32 accuracy, run the above command but change `--precision=bfloat16` to `--precision=fp32`.
+
+```bash
+python launch_benchmark.py \
+ --model-name=bert_large \
+ --precision=fp32 \
+ --mode=inference \
+ --framework=tensorflow \
+ --batch-size=32 \
+ --data-location ~/wwm_uncased_L-24_H-1024_A-16 \
+ --checkpoint ~/bert_large_checkpoints \
+ --output-dir ~/output \
+ --accuracy-only \
+ --docker-image intel/intel-optimized-tensorflow:latest \
+ -- infer_option=SQuAD
+```
+
+### Interactive Option
+
+If you want to run `launch_benchmark.py` interactively from within the Docker container, add the `--debug` flag. This launches a Docker container based on the `--docker-image`,
+performs the necessary installs, and runs the `launch_benchmark.py` script, but does not terminate the container process. As an example, this is how you would launch interactive BF16 batch inference for benchmarking:
+
+Console in:
+```bash
+python launch_benchmark.py \
+ --model-name=bert_large \
+ --precision=bfloat16 \
+ --mode=inference \
+ --framework=tensorflow \
+ --batch-size=32 \
+ --data-location ~/wwm_uncased_L-24_H-1024_A-16 \
+ --checkpoint ~/bert_large_checkpoints \
+ --output-dir ~/output \
+ --benchmark-only \
+ --docker-image intel/intel-optimized-tensorflow:latest \
+ --debug \
+ -- infer_option=SQuAD
+```
+
+Console out:
+```bash
+root@c49f3442efb1:/workspace/benchmarks/common/tensorflow#
+```
+
+To rerun the benchmarking script, execute the `start.sh` bash script from your existing directory with the available flags, which in turn will run `launch_benchmark.py`.
+For example, to run with a different batch size (e.g. batch size 64), set `BATCH_SIZE`,
+and to skip reinstalling packages, set `NOINSTALL` to `True`.
+
+```bash
+chmod +x ./start.sh
+NOINSTALL=True BATCH_SIZE=64 ./start.sh
+```
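+
+If you sweep several batch sizes this way, a minimal sketch to pull the throughput lines out of the benchmark logs (assuming the `~/output` log directory used earlier; the exact log format may vary by model and release):
+
+```python
+import glob
+import os
+
+# Scan benchmark logs for throughput lines.
+log_dir = os.path.expanduser("~/output")
+for log_path in sorted(glob.glob(os.path.join(log_dir, "benchmark_bert_large_*.log"))):
+    with open(log_path, errors="ignore") as f:
+        hits = [line.strip() for line in f if "throughput" in line.lower()]
+    print(os.path.basename(log_path))
+    for line in hits:
+        print("   ", line)
+```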
+
+All other flags will default to the values passed to the first `launch_benchmark.py` command that started the container. [See here](/docs/general/tensorflow/LaunchBenchmark.md) to get the full list of flags.
diff --git a/docs/language_translation/tensorflow/Tutorial.md b/docs/language_translation/tensorflow/Tutorial.md
index 0304f3af0..d9c420b47 100644
--- a/docs/language_translation/tensorflow/Tutorial.md
+++ b/docs/language_translation/tensorflow/Tutorial.md
@@ -1,244 +1,242 @@
-# Language Translation with Transformer-LT
-
-
-## Goal
-This tutorial will introduce CPU performance considerations of the deep learning Transformer-LT model for language translation and how to use Intel® Optimizations for TensorFlow to improve inference time on CPUs.
-This tutorial will also provide code examples to use Intel Model Zoo's pretrained English to German model that can be copy/pasted for quick off-the-ground implementation on real data.
-
-## Background
-Language Translation with deep learning is a computationally expensive endeavor. This tutorial will show you how to reduce the inference runtime of your Transformer-LT network, a popular topology solution to translation.
-It is based on an encoder-decoder architecture with an added attention mechanism. The encoder is used to encode the original sentence to a meaningful fixed-length vector, and the decoder is responsible for extracting the context data from the vector.
-The encoder and decoder process the inputs and outputs, which are in the form of a time sequence.
-
-In a traditional encoder/decoder model, each element in the context vector is treated equally. This is typically not the ideal solution.
-For instance, when you translate the phrase “I travel by train” from English into Chinese, the word “I” has a greater influence than other words when producing its counterpart in Chinese.
-Thus, the attention mechanism was introduced to differentiate contributions of each element in the source sequence to their counterpart in the destination sequence, through the use of a hidden matrix.
-This matrix contains weights of each element in the source sequence when producing elements in the destination sequence.
-
-
-## Recommended Settings
-In addition to TensorFlow optimizations that use the [Intel® oneAPI Deep Neural Network Library (Intel® oneDNN)](https://github.com/oneapi-src/oneDNN) to utilize instruction sets appropriately, the runtime settings also significantly contribute to improved performance.
-Tuning these options to optimize CPU workloads is vital to optimize performance of TensorFlow on Intel® processors.
-Below are the set of run-time options tested empirically on Transformer-LT and recommended by Intel:
-
-
-| Run-time options | Recommendations |
-| ------------- | ------------- |
-| Batch Size | 64. Regardless of the hardware |
-| Hyperthreading | Enabled. Turn on in BIOS. Requires a restart. |
-|intra_op_parallelism_threads |# physical cores |
-|inter_op_parallelism_threads | 1 |
-|NUMA Controls| --cpunodebind=0 --membind=0 |
-|KMP_AFFINITY| KMP_AFFINITY=granularity=fine,verbose,compact,1,0|
-|KMP_BLOCKTIME| 1 |
-|OMP_NUM_THREADS |physical cores|
-
-Note 1: Refer to this [link](https://software.intel.com/en-us/articles/maximize-tensorflow-performance-on-cpu-considerations-and-recommendations-for-inference) to learn more about the run time options.
-
-Note 2: You can remove `verbose` from `KMP_AFFINITY` setting to avoid verbose output at runtime.
-
-Run the following commands to get your processor information:
-
-a. #physical cores per socket : `lscpu | grep "Core(s) per socket" | cut -d':' -f2 | xargs`
-
-b. #all physical cores: `lscpu -b -p=Core,Socket | grep -v '^#' | sort -u | wc -l`
-
-Below is a code snippet you can incorporate into your existing TensorFlow application to set the best settings.
-You can either set them in the CLI or in the Python script. Note that inter and intra_op_parallelism_threads settings can only be set
-in the Python script.
-
-```bash
-export OMP_NUM_THREADS=physical cores
-export KMP_AFFINITY="granularity=fine,verbose,compact,1,0"
-export KMP_BLOCKTIME=1
-export KMP_SETTINGS=1
-```
-(or)
-```
-import os
-os.environ["KMP_BLOCKTIME"] = "1"
-os.environ["KMP_SETTINGS"] = "1"
-os.environ["KMP_AFFINITY"]= "granularity=fine,verbose,compact,1,0"
-os.environ["OMP_NUM_THREADS"]= <# physical cores>
-tf.config.threading.set_inter_op_parallelism_threads(1)
-tf.config.threading.set_intra_op_parallelism_threads(<# physical cores>)
-```
-
-## Hands-on Tutorial
-This section shows how to measure inference performance on Intel's Model Zoo pretrained model (or your pretrained model) by setting the above-discussed run time flags.
-### FP32 inference
-
-### Initial Setup
-
-1. Clone IntelAI models and download into your home directory, skip this step if you already have Intel AI models installed.
-
-```bash
-cd ~
-git clone https://github.com/IntelAI/models.git
-```
-
-2. Skip to step 3 if you already have a pretrained model or download the file `transformer_lt_official_fp32_pretrained_model.tar.gz` into your ~/transformer_LT_german location.
-```
-mkdir ~/transformer_LT_german
-cd ~/transformer_LT_german
-wget https://storage.googleapis.com/intel-optimized-tensorflow/models/v1_8/transformer_lt_official_fp32_pretrained_model.tar.gz
-tar -xzvf transformer_lt_official_fp32_pretrained_model.tar.gz
-```
-Refer to the Transformer LT Official [README](/benchmarks/language_translation/tensorflow/transformer_lt_official) to get the latest location of the pretrained model.
-
-3. After extraction, you should see the following folders and files in the `transformer_lt_official_fp32_pretrained_model` directory:
-```
-$ ls -l transformer_lt_official_fp32_pretrained_model/*
-
-transformer_lt_official_fp32_pretrained_model/data:
-total 1064
--rw-r--r--. 1 359898 Feb 20 16:05 newstest2014.en
--rw-r--r--. 1 399406 Feb 20 16:05 newstest2014.de
--rw-r--r--. 1 324025 Mar 15 17:31 vocab.txt
-
-transformer_lt_official_fp32_pretrained_model/graph:
-total 241540
--rwx------. 1 247333269 Mar 15 17:29 fp32_graphdef.pb
-
-```
-`newstest2014.en`: Input file with English text
-`newstest2014.de`: German translation of the input file for measuring accuracy
-`vocab.txt`: A dictionary of vocabulary
-`fp32_graphdef.pb`: Pretrained model
-
-Or, if you have your own model/data, ensure the folder structure following the structure depicted below to run the pretrained model in Intel Model Zoo.
-
-```
-├─ transformer_LT_german
-│ ├── transformer_pretrained_model
-│ ├── data
-│ │ ├── newstest2014.en (Input file)
-│ │ ├── newstest2014.de (Reference file, this is optional)
-│ │ └── vocab.txt
-│ └── graph
-│ └── pretrained_model.pb
-```
-4. Install [Docker](https://docs.docker.com/install/) since the tutorial runs in a Docker container.
-
-### Run inference
-
-1. Pull the relevant Intel-optimized TensorFlow Docker image.
- [Click here](https://software.intel.com/en-us/articles/intel-optimization-for-tensorflow-installation-guide) to find all the available Docker images.
-```bash
-docker pull docker.io/intel/intel-optimized-tensorflow:latest
-```
-2. cd to the inference script directory in local IntelAI repo
-```bash
-cd ~/models/benchmarks
-```
-3. Run the Python script ``` launch_benchmark.py``` with the pretrained model.
-```launch_benchmark.py``` script can be treated as an entry point to conveniently perform out-of-box high performance
-inference on pretrained models trained of popular topologies.
-The script will automatically set the recommended run-time options for supported topologies,
-but if you choose to set your own options, refer to full of available flags and a detailed
-explanation on ```launch_benchmarking.py``` script [here](/docs/general/tensorflow/LaunchBenchmark.md).
- This step will automatically launch a new container on every run and terminate. Go to [Step 4](#step_4) to interactively run the script on the container.
-
-3.1. *Online inference* (using `--socket-id 0` and `--batch-size 1`)
-
-If you wish to calculate the [BLEU](https://en.wikipedia.org/wiki/BLEU) metric to find out the machine-translation quality, pass the file as `reference` flag.
-`newstest2014.en` file must have only one sentence per line
-
-
-console in:
-```bash
-python launch_benchmark.py \
- --model-name transformer_lt_official \
- --precision fp32 \
- --mode inference \
- --framework tensorflow \
- --batch-size 1 \
- --socket-id 0 \
- --docker-image intel/intel-optimized-tensorflow:latest \
- --in-graph ~/transformer_LT_german/transformer_lt_official_fp32_pretrained_model/graph/fp32_graphdef.pb \
- --data-location ~/transformer_LT_german/transformer_lt_official_fp32_pretrained_model/data \
- -- file=newstest2014.en \
- vocab_file=vocab.txt \
- file_out=translate.txt \
- reference=newstest2014.de
-```
-
-The translated German text will be in the file `translation.txt` located at `~/models/benchmarks/common/tensorflow/logs`
-
-3.2. *Batch inference* (using `--socket-id 0` and `--batch-size 64`)
-
-```bash
-python launch_benchmark.py \
- --model-name transformer_lt_official \
- --precision fp32 \
- --mode inference \
- --framework tensorflow \
- --batch-size 64 \
- --socket-id 0 \
- --docker-image intel/intel-optimized-tensorflow:latest \
- --in-graph ~/transformer_LT_german/transformer_lt_official_fp32_pretrained_model/graph/fp32_graphdef.pb \
- --data-location ~/transformer_LT_german/transformer_lt_official_fp32_pretrained_model/data \
- -- file=newstest2014.en \
- vocab_file=vocab.txt \
- file_out=translate.txt \
- reference=newstest2014.de
-```
-console out:
-```
-Graph parsed in ..... s
-import_graph_def took .....s
-tokenizer took ..... s
-Translating 3003 sentences from English to German.
-Total inferencing time:....
-Throughput:.... sentences/second
-Total number of sentences translated:3003
-I0419 22:50:49.856748 140013257643776 compute_bleu.py:106] Case-insensitive results: 27.510020
-I0419 22:50:51.203501 140013257643776 compute_bleu.py:110] Case-sensitive results: 26.964748
-Ran inference with batch size 64
-Log location outside container: /~/models/benchmarks/common/tensorflow/logs/benchmark_transformer_lt_official_inference_fp32_20190419_224047.log
-```
-
-The logs are captured in a directory outside of the container.
-
-4. If you want to run the ```launch_benchmark.py``` interactively from within the docker container, add flag ```--debug```. This will launch a docker container based on the ```--docker_image```,
-performs necessary installs, runs the ```launch_benchmark.py``` script and does not terminate the container process. As an example, this step will demonstrate online inference (--batch-size 1), but you can implement the same strategy for batch inference (--batch-size 64)."
-
-console in:
-```bash
-python launch_benchmark.py \
- --model-name transformer_lt_official \
- --precision fp32 \
- --mode inference \
- --framework tensorflow \
- --batch-size 64 \
- --socket-id 0 \
- --docker-image intel/intel-optimized-tensorflow:latest \
- --in-graph ~/transformer_LT_german/transformer_lt_official_fp32_pretrained_model/graph/fp32_graphdef.pb \
- --data-location ~/transformer_LT_german/transformer_lt_official_fp32_pretrained_model/data \
- --debug \
- -- file=newstest2014.en \
- vocab_file=vocab.txt \
- file_out=translate.txt \
- reference=newstest2014.de
-
-```
-console out:
-```bash
- lscpu_path_cmd = command -v lscpu
- lscpu located here: b'/usr/bin/lscpu'
- root@a78677f56d69:/workspace/benchmarks/common/tensorflow#
-```
-
-To rerun the benchmarking script, execute the ```start.sh``` bash script from your existing directory with the available flags, which in turn will run ```launch_benchmark.py```. For e.g to rerun with the different batch size (batch size=64) settings run with ```BATCH_SIZE```
-and to skip the run from reinstalling packages pass ```True``` to ```NOINSTALL```.
-
-```bash
- chmod +x ./start.sh
-```
-```bash
- NOINSTALL=True BATCH_SIZE=64 ./start.sh
-```
-
-All other flags will be defaulted to values passed in the first ```launch_benchmark.py``` that starts the container. [See here](/docs/general/tensorflow/LaunchBenchmark.md) to get the full list of flags.
-
-
+# Language Translation with Transformer-LT
+
+
+## Goal
+This tutorial will introduce CPU performance considerations for the deep learning Transformer-LT model for language translation and show how to use Intel® Optimizations for TensorFlow to improve inference time on CPUs.
+This tutorial will also provide code examples for Intel Model Zoo's pretrained English-to-German model that can be copied and pasted for a quick off-the-ground implementation on real data.
+
+## Background
+Language translation with deep learning is a computationally expensive endeavor. This tutorial will show you how to reduce the inference runtime of your Transformer-LT network, a popular topology for translation.
+It is based on an encoder-decoder architecture with an added attention mechanism. The encoder is used to encode the original sentence to a meaningful fixed-length vector, and the decoder is responsible for extracting the context data from the vector.
+The encoder and decoder process the inputs and outputs, which are in the form of a time sequence.
+
+In a traditional encoder/decoder model, each element in the context vector is treated equally. This is typically not the ideal solution.
+For instance, when you translate the phrase “I travel by train” from English into Chinese, the word “I” has a greater influence than other words when producing its counterpart in Chinese.
+Thus, the attention mechanism was introduced to differentiate contributions of each element in the source sequence to their counterpart in the destination sequence, through the use of a hidden matrix.
+This matrix contains weights of each element in the source sequence when producing elements in the destination sequence.
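+
+To make the idea concrete, here is a small, self-contained sketch of scaled dot-product attention (illustrative only; it is not the Transformer-LT implementation used in this tutorial). The `weights` matrix plays the role of the hidden matrix described above: row *i* holds the contribution of every source position to output position *i*.
+
+```python
+import numpy as np
+
+def scaled_dot_product_attention(queries, keys, values):
+    """Return attention outputs and the weight matrix over source positions."""
+    d_k = queries.shape[-1]
+    scores = queries @ keys.T / np.sqrt(d_k)        # similarity of each query to each key
+    scores -= scores.max(axis=-1, keepdims=True)    # numerical stability for softmax
+    weights = np.exp(scores)
+    weights /= weights.sum(axis=-1, keepdims=True)  # each row sums to 1
+    return weights @ values, weights
+
+rng = np.random.default_rng(0)
+src = rng.normal(size=(4, 8))   # 4 source positions, model dimension 8
+tgt = rng.normal(size=(3, 8))   # 3 target positions
+outputs, weights = scaled_dot_product_attention(tgt, src, src)
+print(weights.round(2))          # 3x4 matrix of attention weights
+```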
+
+
+## Recommended Settings
+In addition to TensorFlow optimizations that use the [Intel® oneAPI Deep Neural Network Library (Intel® oneDNN)](https://github.com/oneapi-src/oneDNN) to utilize instruction sets appropriately, the runtime settings also significantly contribute to improved performance.
+Tuning these options to optimize CPU workloads is vital to optimize performance of TensorFlow on Intel® processors.
+Below are the set of run-time options tested empirically on Transformer-LT and recommended by Intel:
+
+
+| Run-time options | Recommendations |
+| ------------- | ------------- |
+| Batch Size | 64. Regardless of the hardware |
+| Hyperthreading | Enabled. Turn on in BIOS. Requires a restart. |
+|intra_op_parallelism_threads |# physical cores |
+|inter_op_parallelism_threads | 1 |
+|NUMA Controls| --cpunodebind=0 --membind=0 |
+|KMP_AFFINITY| KMP_AFFINITY=granularity=fine,verbose,compact,1,0|
+|KMP_BLOCKTIME| 1 |
+|OMP_NUM_THREADS |physical cores|
+
+Note 1: Refer to this [link](https://software.intel.com/en-us/articles/maximize-tensorflow-performance-on-cpu-considerations-and-recommendations-for-inference) to learn more about the run time options.
+
+Note 2: You can remove `verbose` from `KMP_AFFINITY` setting to avoid verbose output at runtime.
+
+Run the following commands to get your processor information:
+
+a. #physical cores per socket: `lscpu | grep "Core(s) per socket" | cut -d':' -f2 | xargs`
+
+b. #all physical cores: `lscpu -b -p=Core,Socket | grep -v '^#' | sort -u | wc -l`
+
+Below is a code snippet you can incorporate into your existing TensorFlow application to set the best settings.
+You can either set them in the CLI or in the Python script. Note that inter and intra_op_parallelism_threads settings can only be set
+in the Python script.
+
+```bash
+export OMP_NUM_THREADS=<# physical cores>
+export KMP_AFFINITY="granularity=fine,verbose,compact,1,0"
+export KMP_BLOCKTIME=1
+export KMP_SETTINGS=1
+```
+(or)
+```python
+import os
+
+# Set the OpenMP/KMP environment variables before TensorFlow initializes.
+os.environ["KMP_BLOCKTIME"] = "1"
+os.environ["KMP_SETTINGS"] = "1"
+os.environ["KMP_AFFINITY"] = "granularity=fine,verbose,compact,1,0"
+os.environ["OMP_NUM_THREADS"] = "<# physical cores>"  # environment values must be strings
+
+import tensorflow as tf
+
+tf.config.threading.set_inter_op_parallelism_threads(1)
+tf.config.threading.set_intra_op_parallelism_threads(<# physical cores>)
+```
+
+## Hands-on Tutorial
+This section shows how to measure inference performance on Intel's Model Zoo pretrained model (or your pretrained model) by setting the above-discussed run time flags.
+### FP32 inference
+
+### Initial Setup
+
+1. Clone the IntelAI models repository into your home directory; skip this step if you already have it installed.
+
+```bash
+cd ~
+git clone https://github.com/IntelAI/models.git
+```
+
+2. Skip to step 3 if you already have a pretrained model; otherwise, download the file `transformer_lt_official_fp32_pretrained_model.tar.gz` into your `~/transformer_LT_german` location.
+```
+mkdir ~/transformer_LT_german
+cd ~/transformer_LT_german
+wget https://storage.googleapis.com/intel-optimized-tensorflow/models/v1_8/transformer_lt_official_fp32_pretrained_model.tar.gz
+tar -xzvf transformer_lt_official_fp32_pretrained_model.tar.gz
+```
+Refer to the Transformer LT Official [README](/benchmarks/language_translation/tensorflow/transformer_lt_official) to get the latest location of the pretrained model.
+
+3. After extraction, you should see the following folders and files in the `transformer_lt_official_fp32_pretrained_model` directory:
+```
+$ ls -l transformer_lt_official_fp32_pretrained_model/*
+
+transformer_lt_official_fp32_pretrained_model/data:
+total 1064
+-rw-r--r--. 1 359898 Feb 20 16:05 newstest2014.en
+-rw-r--r--. 1 399406 Feb 20 16:05 newstest2014.de
+-rw-r--r--. 1 324025 Mar 15 17:31 vocab.txt
+
+transformer_lt_official_fp32_pretrained_model/graph:
+total 241540
+-rwx------. 1 247333269 Mar 15 17:29 fp32_graphdef.pb
+
+```
+`newstest2014.en`: Input file with English text
+`newstest2014.de`: German translation of the input file for measuring accuracy
+`vocab.txt`: A dictionary of vocabulary
+`fp32_graphdef.pb`: Pretrained model
+
+Or, if you have your own model and data, make sure the folder structure matches the layout depicted below to run the pretrained model with the Intel Model Zoo.
+
+```
+├─ transformer_LT_german
+│ ├── transformer_pretrained_model
+│ ├── data
+│ │ ├── newstest2014.en (Input file)
+│ │ ├── newstest2014.de (Reference file, this is optional)
+│ │ └── vocab.txt
+│ └── graph
+│ └── pretrained_model.pb
+```
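+
+A minimal sketch to verify the expected layout before launching the benchmark (assuming the official pretrained-model paths from step 3; adjust the paths if you use your own model):
+
+```python
+import os
+
+# Check that the model and data files are where the benchmark expects them.
+base = os.path.expanduser("~/transformer_LT_german/transformer_lt_official_fp32_pretrained_model")
+expected = [
+    "data/newstest2014.en",
+    "data/newstest2014.de",   # optional reference file
+    "data/vocab.txt",
+    "graph/fp32_graphdef.pb",
+]
+for rel_path in expected:
+    status = "ok" if os.path.isfile(os.path.join(base, rel_path)) else "MISSING"
+    print(f"{status:7s} {rel_path}")
+```
+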
+4. Install [Docker](https://docs.docker.com/install/) since the tutorial runs in a Docker container.
+
+### Run inference
+
+1. Pull the relevant Intel-optimized TensorFlow Docker image.
+ [Click here](https://software.intel.com/en-us/articles/intel-optimization-for-tensorflow-installation-guide) to find all the available Docker images.
+```bash
+docker pull docker.io/intel/intel-optimized-tensorflow:latest
+```
+2. cd to the inference script directory in the local IntelAI repository.
+```bash
+cd ~/models/benchmarks
+```
+3. Run the Python script ```launch_benchmark.py``` with the pretrained model.
+The ```launch_benchmark.py``` script can be treated as an entry point to conveniently perform out-of-box high performance
+inference on pretrained models of popular topologies.
+The script will automatically set the recommended run-time options for supported topologies,
+but if you choose to set your own options, refer to the full list of available flags and a detailed
+explanation of the ```launch_benchmark.py``` script [here](/docs/general/tensorflow/LaunchBenchmark.md).
+This step will automatically launch a new container on every run and terminate it when done. Go to [Step 4](#step_4) to interactively run the script in the container.
+
+3.1. *Online inference* (using `--socket-id 0` and `--batch-size 1`)
+
+If you wish to calculate the [BLEU](https://en.wikipedia.org/wiki/BLEU) metric to measure machine-translation quality, pass the reference file via the `reference` flag.
+The `newstest2014.en` file must have only one sentence per line.
+
+
+console in:
+```bash
+python launch_benchmark.py \
+ --model-name transformer_lt_official \
+ --precision fp32 \
+ --mode inference \
+ --framework tensorflow \
+ --batch-size 1 \
+ --socket-id 0 \
+ --docker-image intel/intel-optimized-tensorflow:latest \
+ --in-graph ~/transformer_LT_german/transformer_lt_official_fp32_pretrained_model/graph/fp32_graphdef.pb \
+ --data-location ~/transformer_LT_german/transformer_lt_official_fp32_pretrained_model/data \
+ -- file=newstest2014.en \
+ vocab_file=vocab.txt \
+ file_out=translate.txt \
+ reference=newstest2014.de
+```
+
+The translated German text will be in the file `translate.txt` (the `file_out` argument above) located at `~/models/benchmarks/common/tensorflow/logs`.
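+
+If you want to score the translation yourself, a minimal sketch (assuming the third-party `sacrebleu` package is installed and the output/reference paths used above):
+
+```python
+import os
+import sacrebleu  # third-party package: pip install sacrebleu
+
+logs_dir = os.path.expanduser("~/models/benchmarks/common/tensorflow/logs")
+data_dir = os.path.expanduser(
+    "~/transformer_LT_german/transformer_lt_official_fp32_pretrained_model/data")
+
+with open(os.path.join(logs_dir, "translate.txt")) as f:
+    hypotheses = [line.strip() for line in f]
+with open(os.path.join(data_dir, "newstest2014.de")) as f:
+    references = [line.strip() for line in f]
+
+bleu = sacrebleu.corpus_bleu(hypotheses, [references])
+print(f"BLEU: {bleu.score:.2f}")
+```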
+
+3.2. *Batch inference* (using `--socket-id 0` and `--batch-size 64`)
+
+```bash
+python launch_benchmark.py \
+ --model-name transformer_lt_official \
+ --precision fp32 \
+ --mode inference \
+ --framework tensorflow \
+ --batch-size 64 \
+ --socket-id 0 \
+ --docker-image intel/intel-optimized-tensorflow:latest \
+ --in-graph ~/transformer_LT_german/transformer_lt_official_fp32_pretrained_model/graph/fp32_graphdef.pb \
+ --data-location ~/transformer_LT_german/transformer_lt_official_fp32_pretrained_model/data \
+ -- file=newstest2014.en \
+ vocab_file=vocab.txt \
+ file_out=translate.txt \
+ reference=newstest2014.de
+```
+console out:
+```
+Graph parsed in ..... s
+import_graph_def took .....s
+tokenizer took ..... s
+Translating 3003 sentences from English to German.
+Total inferencing time:....
+Throughput:.... sentences/second
+Total number of sentences translated:3003
+I0419 22:50:49.856748 140013257643776 compute_bleu.py:106] Case-insensitive results: 27.510020
+I0419 22:50:51.203501 140013257643776 compute_bleu.py:110] Case-sensitive results: 26.964748
+Ran inference with batch size 64
+Log location outside container: /~/models/benchmarks/common/tensorflow/logs/benchmark_transformer_lt_official_inference_fp32_20190419_224047.log
+```
+
+The logs are captured in a directory outside of the container.
+
+4. If you want to run the ```launch_benchmark.py``` script interactively from within the docker container, add the ```--debug``` flag. This will launch a docker container based on the ```--docker-image```,
+perform the necessary installs, and run the ```launch_benchmark.py``` script without terminating the container process. As an example, this step demonstrates batch inference (```--batch-size 64```), but you can apply the same strategy for online inference (```--batch-size 1```).
+
+console in:
+```bash
+python launch_benchmark.py \
+ --model-name transformer_lt_official \
+ --precision fp32 \
+ --mode inference \
+ --framework tensorflow \
+ --batch-size 64 \
+ --socket-id 0 \
+ --docker-image intel/intel-optimized-tensorflow:latest \
+ --in-graph ~/transformer_LT_german/transformer_lt_official_fp32_pretrained_model/graph/fp32_graphdef.pb \
+ --data-location ~/transformer_LT_german/transformer_lt_official_fp32_pretrained_model/data \
+ --debug \
+ -- file=newstest2014.en \
+ vocab_file=vocab.txt \
+ file_out=translate.txt \
+ reference=newstest2014.de
+
+```
+console out:
+```bash
+ lscpu_path_cmd = command -v lscpu
+ lscpu located here: b'/usr/bin/lscpu'
+ root@a78677f56d69:/workspace/benchmarks/common/tensorflow#
+```
+
+To rerun the benchmarking script, execute the ```start.sh``` bash script from your existing directory with the available flags, which in turn will run ```launch_benchmark.py```. For example, to rerun with a different batch size (e.g. batch size 64), set ```BATCH_SIZE```,
+and to skip reinstalling packages, set ```NOINSTALL``` to ```True```.
+
+```bash
+ chmod +x ./start.sh
+```
+```bash
+ NOINSTALL=True BATCH_SIZE=64 ./start.sh
+```
+
+All other flags will default to the values passed to the first ```launch_benchmark.py``` command that started the container. [See here](/docs/general/tensorflow/LaunchBenchmark.md) to get the full list of flags.
diff --git a/docs/recommendation/tensorflow/Tutorial.md b/docs/recommendation/tensorflow/Tutorial.md
index 790b12f42..c95880e5e 100644
--- a/docs/recommendation/tensorflow/Tutorial.md
+++ b/docs/recommendation/tensorflow/Tutorial.md
@@ -1,343 +1,343 @@
-# Recommendation System with Wide and Deep Model
-
-
-## Goal
-This tutorial will introduce CPU performance considerations for the popular [Wide and Deep](https://arxiv.org/abs/1606.07792) model to solve recommendation system problems
-and how to tune run-time parameters to maximize performance using Intel® Optimizations for TensorFlow.
-This tutorial also includes a hands-on demo on Intel Model Zoo's Wide and Deep pretrained model built using a dataset from [Kaggle's Display Advertising Challenge](http://labs.criteo.com/2014/02/kaggle-display-advertising-challenge-dataset/)
-to run online (real-time) and batch inference.
-
-## Background
-Google's latest innovation to solve some of the shortcomings in traditional recommendation systems is the
-Wide and Deep model, which combines the best aspects of linear modeling and deep neural networks to
-outperform either approach individually by a significant percentage. In practice, linear models follow the simple mechanism of capturing feature relationships, resulting in
-a lot of derived features. This piece of the topology is called “wide” learning, while the complexities
-to generalize these relationships are solved by the "deep" piece of this topology. This wide and deep combination has
-proven to be a robust approach in handling the underfitting and overfitting problems caused by unique feature combinations, however
-at the cost of significant compute power.
-Google has published a blog on [Wide and Deep learning with TensorFlow](https://ai.googleblog.com/2016/06/wide-deep-learning-better-together-with.html),
-and Datatonic has published [performance gains of CPU over GPU](https://datatonic.com/insights/accelerate-machine-learning-on-google-cloud-with-intel-xeon-processors/) for these types of models.
-
-## Recommended Settings
-Although there is no one-size-fits-all solution to maximize Wide and Deep model performance on CPUs, understanding the bottlenecks and tuning the run-time
-parameters based on your dataset and TensorFlow graph can be extremely beneficial.
-
-A recommendation system with Wide and Deep model topology comes with two main caveats:
-Unlike image recognition models such as ResNet50 or ResNet101, the "wide" component of this model performs more “independent” operations and
-does not provide opportunities to exploit parallelism within each node, while, the "deep" component of this topology demands more parallelism within each node.
-
-The wide or linear component of this topology depends on the data features, i.e. on the dataset width.
-The deep component depends on the number of hidden units in the graph where threading can be enabled within the operations,
-hence exhibiting a direct relation to compute power. This setback can be eliminated by setting the right number of intra_op_threads and OMP_NUM_THREADS.
-Note that while tuning these important run-time parameters, do not over/under use the threadpool.
-
-| Run-time options | Recommendations|
-| ------------- | ------------- |
-| Batch Size | 512 |
-| Hyperthreading | Enabled. Turn on in BIOS. Requires a restart. |
-|intra_op_parallelism_threads| 1 to physical cores |
-|inter_op_parallelism_threads | 1 |
-|Data Layout| NC|
-|Sockets | all |
-|KMP_AFFINITY| granularity=fine,noverbose,compact,1,0|
-|KMP_BLOCKTIME| 1 |
-|OMP_NUM_THREADS | 1 to physical cores |
-
-*Note: Refer to [this article](https://software.intel.com/en-us/articles/maximize-tensorflow-performance-on-cpu-considerations-and-recommendations-for-inference) to learn more about the run time options.*
-
-Intel's data science team trained and published a Wide and Deep model on Kaggle's Display Advertising Challenge dataset, and has empirically tested and identified the best run-time settings
-to run inference, which is illustrated below in the hands-on-tutorial section.
-
-Run the following commands to get your processor information:
-
-a. #physical cores per socket :
-`lscpu | grep "Core(s) per socket" | cut -d':' -f2 | xargs`
-
-b. #all physical cores:
-`lscpu -b -p=Core,Socket | grep -v '^#' | sort -u | wc -l`
-
-Below is a code snippet you can incorporate into your Wide and Deep TensorFlow application to start tuning towards the best settings.
-You can either set them in the CLI or in the Python script. Note that inter and intra_op_parallelism_threads settings can only be set
-in the Python script.
-
-```bash
-export OMP_NUM_THREADS=no of physical cores
-export KMP_AFFINITY="granularity=fine,noverbose,compact,1,0"
-export KMP_BLOCKTIME=1
-export KMP_SETTINGS=1
-```
-(or)
-```
-import os
-os.environ["KMP_BLOCKTIME"] = "1"
-os.environ["KMP_SETTINGS"] = "1"
-os.environ["KMP_AFFINITY"]= "granularity=fine,verbose,compact,1,0"
-os.environ["OMP_NUM_THREADS"]= no of physical cores
-tf.config.threading.set_inter_op_parallelism_threads(1)
-tf.config.threading.set_intra_op_parallelism_threads(<# physical cores>)
-```
-
-
-To control the execution to one NUMA node or socket id, run the python script with the command:
-
-```
-numactl --cpunodebind=0 --membind=0 python
-```
-
-## Hands-on Tutorial
-
-
-This section shows how to measure inference performance on Intel's Model Zoo Wide and Deep pretrained model trained
-on Kaggle's Display Advertising Challenge dataset by setting the above-discussed run time flags.
-
-### FP32 inference
-
-### Initial Setup
-1. Clone IntelAI models and download into your home directory.
-
-```bash
-git clone https://github.com/IntelAI/models.git
-```
-
-2. Download the pretrained model ```wide_deep_fp32_pretrained_model.pb``` into your `~/wide_deep_files` location.
-
-```
-mkdir ~/wide_deep_files
-cd ~/wide_deep_files
-wget https://storage.googleapis.com/intel-optimized-tensorflow/models/v1_8/wide_deep_fp32_pretrained_model.pb
-
-```
-Refer to the Wide and Deep [README](/benchmarks/recommendation/tensorflow/wide_deep_large_ds) to get the latest location of the pretrained model.
-
-3. Install [Docker](https://docs.docker.com/install/) since the tutorial runs on a Docker container.
-
-
-4. Data Preparation: You will need approximately 20GB of available disk space to complete this step.
-Follow the instructions below to download and prepare the dataset.
- - Prepare the data directory:
- ```
- mkdir ~/wide_deep_files/real_dataset
- cd ~/wide_deep_files/real_dataset
- ```
-
- - Download the eval set:
-
- ```wget https://storage.googleapis.com/dataset-uploader/criteo-kaggle/large_version/eval.csv```
-
- - Move the downloaded dataset to `~/models/models` and start a Docker container for preprocessing:
- ```
- mv eval.csv ~/models/models
- cd ~/models/models
- docker run -it --privileged -u root:root \
- -w /models \
- --volume $PWD:/models \
- intel/intel-optimized-tensorflow:1.15.2 \
- /bin/bash
- ```
- - Preprocess and convert eval dataset to TFRecord format. We will use a script in the Intel Model Zoo repository.
- This step may take a while to complete.
- ```
- python recommendation/tensorflow/wide_deep_large_ds/dataset/preprocess_csv_tfrecords.py \
- --inputcsv-datafile eval.csv \
- --calibrationcsv-datafile train.csv \
- --outputfile-name preprocessed
- ```
- - Exit the docker container and find the processed dataset `eval_preprocessed.tfrecords` in the location `~/models/models`.
-
-### Run inference
-
-1. Pull the relevant Intel Optimizations for TensorFlow Docker image. We'll be running the pretrained model to infer in a Docker container.
- [Click here](https://software.intel.com/en-us/articles/intel-optimization-for-tensorflow-installation-guide) to find all the available Docker images.
-```bash
-docker pull intel/intel-optimized-tensorflow:latest
-```
-2. cd to the inference script directory:
-```bash
-cd ~/models/benchmarks
-```
-3. Run the Python script ``` launch_benchmark.py``` with the pretrained model.
-The ```launch_benchmark.py``` script can be treated as an entry point to conveniently perform out-of-box high performance
-inference on pretrained models of popular topologies.
-The script will automatically set the recommended run-time options for supported topologies,
-but if you choose to set your own options, refer to the full list of available flags and a detailed
-explanation of the ```launch_benchmark.py``` script [here](/docs/general/tensorflow/LaunchBenchmark.md).
-This step will automatically launch a new container on every run and terminate. Go to [Step 4](#step_4) to interactively run the script in the container.
-
- 3.1. *Online Inference* (also called real-time inference, batch_size=1)
-
-Note: As per the recommended settings `socket-id` is set to -1 to run on all sockets.
-Set this parameter to a socket id to run the workload on a single socket.
-
-
- python launch_benchmark.py \
- --batch-size 1 \
- --model-name wide_deep_large_ds \
- --precision fp32 \
- --mode inference \
- --framework tensorflow \
- --benchmark-only \
- --docker-image intel/intel-optimized-tensorflow:latest \
- --in-graph ~/wide_deep_files/wide_deep_fp32_pretrained_model.pb \
- --data-location ~/models/models/eval_preprocessed.tfrecords \
- --verbose
-
- 3.2. *Batch Inference* (batch_size=512)
-
-Note: As per the recommended settings `socket-id` is set to -1 to run on all sockets.
-Set this parameter to a socket id to run the workload on a single socket.
-
- python launch_benchmark.py \
- --batch-size 512 \
- --model-name wide_deep_large_ds \
- --precision fp32 \
- --mode inference \
- --framework tensorflow \
- --benchmark-only \
- --docker-image intel/intel-optimized-tensorflow:latest \
- --in-graph ~/wide_deep_files/wide_deep_fp32_pretrained_model.pb \
- --data-location ~/models/models/eval_preprocessed.tfrecords \
- --verbose
-
- Example Output:
-
- --------------------------------------------------
- Total test records : 2000000
- Batch size is : 512
- Number of batches : 3907
- Inference duration (seconds) : ...
- Average Latency (ms/batch) : ...
- Throughput is (records/sec) : ...
- --------------------------------------------------
- num_inter_threads: 28
- num_intra_threads: 1
- Received these standard args: Namespace(accuracy_only=False, batch_size=512, benchmark_dir='/workspace/benchmarks', benchmark_only=True, checkpoint=None, data_location='/dataset', data_num_inter_threads=None, data_num_intra_threads=None, framework='tensorflow', input_graph='/in_graph/wide_deep_fp32_pretrained_model.pb', intelai_models='/workspace/intelai_models', mode='inference', model_args=[], model_name='wide_deep_large_ds', model_source_dir='/workspace/models', num_cores=-1, num_inter_threads=28, num_intra_threads=1, output_dir='/workspace/benchmarks/common/tensorflow/logs', output_results=False, precision='fp32', socket_id=-1, use_case='recommendation', verbose=True)
- Received these custom args: []
- Current directory: /workspace/benchmarks
- Running: python /workspace/intelai_models/inference/inference.py --num_intra_threads=1 --num_inter_threads=28 --batch_size=512 --input_graph=/in_graph/wide_deep_fp32_pretrained_model.pb --data_location=/dataset
- PYTHONPATH: :/workspace/intelai_models:/workspace/benchmarks/common/tensorflow:/workspace/benchmarks
- RUNCMD: python common/tensorflow/run_tf_benchmark.py --framework=tensorflow --use-case=recommendation --model-name=wide_deep_large_ds --precision=fp32 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=512 --socket-id=-1 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --in-graph=/in_graph/wide_deep_fp32_pretrained_model.pb --data-location=/dataset
- Batch Size: 512
- Ran inference with batch size 512
- Log location outside container: /home//models/benchmarks/common/tensorflow/logs/benchmark_wide_deep_large_ds_inference_fp32_20190316_164924.log
-
-The logs are captured in a directory outside of the container.
-
- 3.3. *Compute accuracy on eval dataset*
-
- python launch_benchmark.py \
- --batch-size 512 \
- --model-name wide_deep_large_ds \
- --precision fp32 \
- --mode inference \
- --framework tensorflow \
- --accuracy-only \
- --docker-image intel/intel-optimized-tensorflow:latest \
- --in-graph ~/wide_deep_files/wide_deep_fp32_pretrained_model.pb \
- --data-location ~/models/models/eval_preprocessed.tfrecords \
- --verbose
-
- Example Output:
-
-With `accuracy-only` flag, you can find an additional metric on accuracy as shown in the output below
-
- --------------------------------------------------
- Total test records : 2000000
- Batch size is : 512
- Number of batches : 3907
- Classification accuracy (%) : 77.6693
- No of correct predictions : ...
- Inference duration (seconds) : ...
- Average Latency (ms/batch) : ...
- Throughput is (records/sec) : ...
- --------------------------------------------------
-
-4. If you want to run the benchmarking script interactively within the docker container, run ```launch_benchmark.py``` with ```--debug``` flag. This will launch a docker container based on the ```--docker_image```,
-perform necessary installs, run the ```launch_benchmark.py``` script, and does not terminate the container process.
-
- python launch_benchmark.py \
- --batch-size 1 \
- --model-name wide_deep_large_ds \
- --precision fp32 \
- --mode inference \
- --framework tensorflow \
- --benchmark-only \
- --docker-image intel/intel-optimized-tensorflow:latest \
- --in-graph ~/wide_deep_files/wide_deep_fp32_pretrained_model.pb \
- --data-location ~/models/models/eval_preprocessed.tfrecords \
- --debug
-
- Example Output:
-
- root@a78677f56d69:/workspace/benchmarks/common/tensorflow#
-
-To rerun the model script, execute the ```start.sh``` bash script from your existing directory with additional or modified flags. For example, to rerun with the best batch inference (batch size=512) settings, run with ```BATCH_SIZE```
-and to skip the run from reinstalling packages pass ```True``` to ```NOINSTALL```.
-
- chmod +x ./start.sh
-
- NOINSTALL=True BATCH_SIZE=512 SOCKET_ID=0 VERBOSE=True ./start.sh
-
-All other flags will be defaulted to values passed in the first ```launch_benchmark.py``` that starts the container. [See here](/docs/general/tensorflow/LaunchBenchmark.md) to get the full list of flags.
-
-5. Inference on a large dataset (optional)
-
-To run inference on a large dataset, download the test dataset in `~/wide_deep_files/real_dataset`. Note that this dataset supports only `benchmark-only` flag.
-
-```
-cd ~/wide_deep_files/real_dataset
-```
-
-- Go to this [page](http://labs.criteo.com/2014/02/kaggle-display-advertising-challenge-dataset/) on the Criteo website.
-Agree to the terms of use, enter your name, and submit the form. Then copy the download link for the 4.3GB tar file called `dac.tar.gz` and use it in the `wget` command in the code block below.
-Untar the file to create three files:
- 1. readme.txt
- 2. train.txt (11GB) - you will not be using this, so delete it to save space
- 3. test.txt (1.4GB) - transform this into .csv
-
- ```
- wget # replace with link you got from Criteo website
- tar -xvf dac.tar.gz
- rm train.txt
- tr '\t' ',' < test.txt > test.csv
- ```
-
-- Move the downloaded dataset to `~/models/models` and start a Docker container for preprocessing. This step is similar to `eval` dataset preprocessing:
-
- ```
- mv test.csv ~/models/models
- cd ~/models/models
- docker run -it --privileged -u root:root \
- -w /models \
- --volume $PWD:/models \
- intel/intel-optimized-tensorflow:latest \
- /bin/bash
- ```
-
-- Preprocess and convert test dataset to TFRecord format. We will use a script in the Intel Model Zoo repository.
- This step may take a while to complete
-
- ```
- python recommendation/tensorflow/wide_deep_large_ds/dataset/preprocess_csv_tfrecords.py \
- --inputcsv-datafile test.csv \
- --outputfile-name preprocessed
- ```
-
-- Exit the docker container and find the processed dataset `test_preprocessed.tfrecords` in the location `~/models/models`.
-
- 5.1. *Batch or Online Inference*
-
- cd ~/models/benchmarks
-
- python launch_benchmark.py \
- --batch-size 512 \
- --model-name wide_deep_large_ds \
- --precision fp32 \
- --mode inference \
- --framework tensorflow \
- --benchmark-only \
- --docker-image intel/intel-optimized-tensorflow:latest \
- --in-graph ~/wide_deep_files/wide_deep_fp32_pretrained_model.pb \
- --data-location ~/models/models/test_preprocessed.tfrecords \
- --verbose
-
-Set batch_size to 1 to run for online (real-time) inference
+# Recommendation System with Wide and Deep Model
+
+
+## Goal
+This tutorial will introduce CPU performance considerations for the popular [Wide and Deep](https://arxiv.org/abs/1606.07792) model to solve recommendation system problems
+and how to tune run-time parameters to maximize performance using Intel® Optimizations for TensorFlow.
+This tutorial also includes a hands-on demo on Intel Model Zoo's Wide and Deep pretrained model built using a dataset from [Kaggle's Display Advertising Challenge](http://labs.criteo.com/2014/02/kaggle-display-advertising-challenge-dataset/)
+to run online (real-time) and batch inference.
+
+## Background
+Google's latest innovation to solve some of the shortcomings in traditional recommendation systems is the
+Wide and Deep model, which combines the best aspects of linear modeling and deep neural networks to
+outperform either approach individually by a significant percentage. In practice, linear models follow the simple mechanism of capturing feature relationships, resulting in
+a lot of derived features. This piece of the topology is called “wide” learning, while the complexities
+to generalize these relationships are solved by the "deep" piece of this topology. This wide and deep combination has
+proven to be a robust approach for handling the underfitting and overfitting problems caused by unique feature combinations, though
+at the cost of significant compute power.
+Google has published a blog on [Wide and Deep learning with TensorFlow](https://ai.googleblog.com/2016/06/wide-deep-learning-better-together-with.html),
+and Datatonic has published [performance gains of CPU over GPU](https://datatonic.com/insights/accelerate-machine-learning-on-google-cloud-with-intel-xeon-processors/) for these types of models.
+
+## Recommended Settings
+Although there is no one-size-fits-all solution to maximize Wide and Deep model performance on CPUs, understanding the bottlenecks and tuning the run-time
+parameters based on your dataset and TensorFlow graph can be extremely beneficial.
+
+A recommendation system with Wide and Deep model topology comes with two main caveats:
+Unlike image recognition models such as ResNet50 or ResNet101, the "wide" component of this model performs more “independent” operations and
+does not provide opportunities to exploit parallelism within each node, while the "deep" component of this topology demands more parallelism within each node.
+
+The wide or linear component of this topology depends on the data features, i.e. on the dataset width.
+The deep component depends on the number of hidden units in the graph where threading can be enabled within the operations,
+hence exhibiting a direct relation to compute power. This bottleneck can be mitigated by setting the right values for intra_op_parallelism_threads and OMP_NUM_THREADS.
+Note that while tuning these important run-time parameters, take care not to over- or under-subscribe the thread pool.
+
+| Run-time options | Recommendations|
+| ------------- | ------------- |
+| Batch Size | 512 |
+| Hyperthreading | Enabled. Turn on in BIOS. Requires a restart. |
+|intra_op_parallelism_threads| 1 to physical cores |
+|inter_op_parallelism_threads | 1 |
+|Data Layout| NC|
+|Sockets | all |
+|KMP_AFFINITY| granularity=fine,noverbose,compact,1,0|
+|KMP_BLOCKTIME| 1 |
+|OMP_NUM_THREADS | 1 to physical cores |
+
+*Note: Refer to [this article](https://software.intel.com/en-us/articles/maximize-tensorflow-performance-on-cpu-considerations-and-recommendations-for-inference) to learn more about the run time options.*
+
+Intel's data science team trained and published a Wide and Deep model on Kaggle's Display Advertising Challenge dataset, and has empirically tested and identified the best run-time settings
+to run inference, which is illustrated below in the hands-on-tutorial section.
+
+Run the following commands to get your processor information:
+
+a. #physical cores per socket:
+`lscpu | grep "Core(s) per socket" | cut -d':' -f2 | xargs`
+
+b. #all physical cores:
+`lscpu -b -p=Core,Socket | grep -v '^#' | sort -u | wc -l`
+
+Below is a code snippet you can incorporate into your Wide and Deep TensorFlow application to start tuning towards the best settings.
+You can either set them in the CLI or in the Python script. Note that inter and intra_op_parallelism_threads settings can only be set
+in the Python script.
+
+```bash
+export OMP_NUM_THREADS=<# physical cores>
+export KMP_AFFINITY="granularity=fine,noverbose,compact,1,0"
+export KMP_BLOCKTIME=1
+export KMP_SETTINGS=1
+```
+(or)
+```python
+import os
+
+# Set the OpenMP/KMP environment variables before TensorFlow initializes.
+os.environ["KMP_BLOCKTIME"] = "1"
+os.environ["KMP_SETTINGS"] = "1"
+os.environ["KMP_AFFINITY"] = "granularity=fine,noverbose,compact,1,0"
+os.environ["OMP_NUM_THREADS"] = "<# physical cores>"  # environment values must be strings
+
+import tensorflow as tf
+
+tf.config.threading.set_inter_op_parallelism_threads(1)
+tf.config.threading.set_intra_op_parallelism_threads(<# physical cores>)
+```
+
+
+To restrict execution to one NUMA node (socket), launch the Python script with `numactl`:
+
+```bash
+numactl --cpunodebind=0 --membind=0 python <script_name>.py
+```
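+
+To confirm that the binding took effect, a minimal sketch (Linux only) that prints which CPUs the process may run on:
+
+```python
+import os
+
+# sched_getaffinity reports the CPUs this process is allowed to use; under
+# numactl --cpunodebind=0 it should list only the CPUs of socket 0.
+allowed_cpus = os.sched_getaffinity(0)
+print(f"Process may run on {len(allowed_cpus)} CPUs: {sorted(allowed_cpus)}")
+```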
+
+## Hands-on Tutorial
+
+
+This section shows how to measure inference performance on Intel's Model Zoo Wide and Deep pretrained model trained
+on Kaggle's Display Advertising Challenge dataset by setting the above-discussed run time flags.
+
+### FP32 inference
+
+### Initial Setup
+1. Clone the IntelAI models repository into your home directory.
+
+```bash
+git clone https://github.com/IntelAI/models.git
+```
+
+2. Download the pretrained model ```wide_deep_fp32_pretrained_model.pb``` into your `~/wide_deep_files` location.
+
+```
+mkdir ~/wide_deep_files
+cd ~/wide_deep_files
+wget https://storage.googleapis.com/intel-optimized-tensorflow/models/v1_8/wide_deep_fp32_pretrained_model.pb
+
+```
+Refer to the Wide and Deep [README](/benchmarks/recommendation/tensorflow/wide_deep_large_ds) to get the latest location of the pretrained model.
+
+3. Install [Docker](https://docs.docker.com/install/) since the tutorial runs on a Docker container.
+
+
+4. Data Preparation: You will need approximately 20GB of available disk space to complete this step.
+Follow the instructions below to download and prepare the dataset.
+ - Prepare the data directory:
+ ```
+ mkdir ~/wide_deep_files/real_dataset
+ cd ~/wide_deep_files/real_dataset
+ ```
+
+ - Download the eval set:
+
+ ```wget https://storage.googleapis.com/dataset-uploader/criteo-kaggle/large_version/eval.csv```
+
+ - Move the downloaded dataset to `~/models/models` and start a Docker container for preprocessing:
+ ```
+ mv eval.csv ~/models/models
+ cd ~/models/models
+ docker run -it --privileged -u root:root \
+ -w /models \
+ --volume $PWD:/models \
+ intel/intel-optimized-tensorflow:1.15.2 \
+ /bin/bash
+ ```
+  - Preprocess and convert the eval dataset to TFRecord format using a script from the Intel Model Zoo repository.
+    This step may take a while to complete.
+ ```
+ python recommendation/tensorflow/wide_deep_large_ds/dataset/preprocess_csv_tfrecords.py \
+ --inputcsv-datafile eval.csv \
+ --calibrationcsv-datafile train.csv \
+ --outputfile-name preprocessed
+ ```
+  - Exit the Docker container and find the processed dataset `eval_preprocessed.tfrecords` in `~/models/models`. You can optionally sanity-check the file as shown below.
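+
+    To confirm the TFRecord file was written correctly, you can count its records. This is a minimal sketch, assuming a TensorFlow 1.x environment (for example, the `intel/intel-optimized-tensorflow:1.15.2` container used above):
+
+    ```python
+    import tensorflow as tf
+
+    # Count the serialized examples in the preprocessed eval file.
+    path = "eval_preprocessed.tfrecords"
+    num_records = sum(1 for _ in tf.python_io.tf_record_iterator(path))
+    print("Records in", path, ":", num_records)
+    ```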
+
+### Run inference
+
+1. Pull the relevant Intel Optimization for TensorFlow Docker image. We'll run inference with the pretrained model inside a Docker container.
+   [Click here](https://software.intel.com/en-us/articles/intel-optimization-for-tensorflow-installation-guide) to find all the available Docker images.
+```bash
+docker pull intel/intel-optimized-tensorflow:latest
+```
+2. cd to the inference script directory:
+```bash
+cd ~/models/benchmarks
+```
+3. Run the Python script ```launch_benchmark.py``` with the pretrained model.
+The ```launch_benchmark.py``` script can be treated as an entry point for conveniently performing out-of-the-box, high-performance
+inference on pretrained models of popular topologies.
+The script automatically sets the recommended run-time options for supported topologies,
+but if you choose to set your own options, refer to the full list of available flags and a detailed
+explanation of the ```launch_benchmark.py``` script [here](/docs/general/tensorflow/LaunchBenchmark.md).
+This step launches a new container on every run and terminates it afterwards. Go to [Step 4](#step_4) to run the script interactively in the container.
+
+ 3.1. *Online Inference* (also called real-time inference, batch_size=1)
+
+Note: As per the recommended settings, `socket-id` is set to -1 to run on all sockets.
+Set this parameter to a specific socket id to run the workload on a single socket.
+
+
+ python launch_benchmark.py \
+ --batch-size 1 \
+ --model-name wide_deep_large_ds \
+ --precision fp32 \
+ --mode inference \
+ --framework tensorflow \
+ --benchmark-only \
+ --docker-image intel/intel-optimized-tensorflow:latest \
+ --in-graph ~/wide_deep_files/wide_deep_fp32_pretrained_model.pb \
+ --data-location ~/models/models/eval_preprocessed.tfrecords \
+ --verbose
+
+ 3.2. *Batch Inference* (batch_size=512)
+
+Note: As per the recommended settings, `socket-id` is set to -1 to run on all sockets.
+Set this parameter to a specific socket id to run the workload on a single socket.
+
+ python launch_benchmark.py \
+ --batch-size 512 \
+ --model-name wide_deep_large_ds \
+ --precision fp32 \
+ --mode inference \
+ --framework tensorflow \
+ --benchmark-only \
+ --docker-image intel/intel-optimized-tensorflow:latest \
+ --in-graph ~/wide_deep_files/wide_deep_fp32_pretrained_model.pb \
+ --data-location ~/models/models/eval_preprocessed.tfrecords \
+ --verbose
+
+ Example Output:
+
+ --------------------------------------------------
+ Total test records : 2000000
+ Batch size is : 512
+ Number of batches : 3907
+ Inference duration (seconds) : ...
+ Average Latency (ms/batch) : ...
+ Throughput is (records/sec) : ...
+ --------------------------------------------------
+ num_inter_threads: 28
+ num_intra_threads: 1
+ Received these standard args: Namespace(accuracy_only=False, batch_size=512, benchmark_dir='/workspace/benchmarks', benchmark_only=True, checkpoint=None, data_location='/dataset', data_num_inter_threads=None, data_num_intra_threads=None, framework='tensorflow', input_graph='/in_graph/wide_deep_fp32_pretrained_model.pb', intelai_models='/workspace/intelai_models', mode='inference', model_args=[], model_name='wide_deep_large_ds', model_source_dir='/workspace/models', num_cores=-1, num_inter_threads=28, num_intra_threads=1, output_dir='/workspace/benchmarks/common/tensorflow/logs', output_results=False, precision='fp32', socket_id=-1, use_case='recommendation', verbose=True)
+ Received these custom args: []
+ Current directory: /workspace/benchmarks
+ Running: python /workspace/intelai_models/inference/inference.py --num_intra_threads=1 --num_inter_threads=28 --batch_size=512 --input_graph=/in_graph/wide_deep_fp32_pretrained_model.pb --data_location=/dataset
+ PYTHONPATH: :/workspace/intelai_models:/workspace/benchmarks/common/tensorflow:/workspace/benchmarks
+ RUNCMD: python common/tensorflow/run_tf_benchmark.py --framework=tensorflow --use-case=recommendation --model-name=wide_deep_large_ds --precision=fp32 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=512 --socket-id=-1 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --in-graph=/in_graph/wide_deep_fp32_pretrained_model.pb --data-location=/dataset
+ Batch Size: 512
+ Ran inference with batch size 512
+ Log location outside container: /home//models/benchmarks/common/tensorflow/logs/benchmark_wide_deep_large_ds_inference_fp32_20190316_164924.log
+
+The logs are captured in a directory outside of the container.
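+
+If you want to pull the headline metrics out of a benchmark log programmatically, here is a minimal sketch (the log filename below is illustrative; substitute the path printed at the end of your run):
+
+```python
+import re
+
+# Hypothetical filename; use the "Log location outside container" path from your run.
+log_path = "benchmark_wide_deep_large_ds_inference_fp32.log"
+
+with open(log_path) as f:
+    for line in f:
+        if re.search(r"Throughput|Average Latency|Classification accuracy", line):
+            print(line.strip())
+```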
+
+ 3.3. *Compute accuracy on eval dataset*
+
+ python launch_benchmark.py \
+ --batch-size 512 \
+ --model-name wide_deep_large_ds \
+ --precision fp32 \
+ --mode inference \
+ --framework tensorflow \
+ --accuracy-only \
+ --docker-image intel/intel-optimized-tensorflow:latest \
+ --in-graph ~/wide_deep_files/wide_deep_fp32_pretrained_model.pb \
+ --data-location ~/models/models/eval_preprocessed.tfrecords \
+ --verbose
+
+ Example Output:
+
+With the `--accuracy-only` flag, the output includes an additional accuracy metric, as shown below:
+
+ --------------------------------------------------
+ Total test records : 2000000
+ Batch size is : 512
+ Number of batches : 3907
+ Classification accuracy (%) : 77.6693
+ No of correct predictions : ...
+ Inference duration (seconds) : ...
+ Average Latency (ms/batch) : ...
+ Throughput is (records/sec) : ...
+ --------------------------------------------------
+
+4. If you want to run the benchmarking script interactively within the Docker container, run ```launch_benchmark.py``` with the ```--debug``` flag. This launches a Docker container based on the ```--docker-image```,
+performs the necessary installs, runs the ```launch_benchmark.py``` script, and does not terminate the container process.
+
+ python launch_benchmark.py \
+ --batch-size 1 \
+ --model-name wide_deep_large_ds \
+ --precision fp32 \
+ --mode inference \
+ --framework tensorflow \
+ --benchmark-only \
+ --docker-image intel/intel-optimized-tensorflow:latest \
+ --in-graph ~/wide_deep_files/wide_deep_fp32_pretrained_model.pb \
+ --data-location ~/models/models/eval_preprocessed.tfrecords \
+ --debug
+
+ Example Output:
+
+ root@a78677f56d69:/workspace/benchmarks/common/tensorflow#
+
+To rerun the model script, execute the ```start.sh``` bash script from your existing directory with additional or modified flags. For example, to rerun with the best batch inference settings, set ```BATCH_SIZE``` to 512,
+and to skip reinstalling packages on subsequent runs, set ```NOINSTALL``` to ```True```.
+
+ chmod +x ./start.sh
+
+ NOINSTALL=True BATCH_SIZE=512 SOCKET_ID=0 VERBOSE=True ./start.sh
+
+All other flags default to the values passed to the first ```launch_benchmark.py``` run that started the container. [See here](/docs/general/tensorflow/LaunchBenchmark.md) for the full list of flags.
+
+5. Inference on a large dataset (optional)
+
+To run inference on a large dataset, download the test dataset into `~/wide_deep_files/real_dataset`. Note that this dataset supports only the `--benchmark-only` flag.
+
+```
+cd ~/wide_deep_files/real_dataset
+```
+
+- Go to this [page](http://labs.criteo.com/2014/02/kaggle-display-advertising-challenge-dataset/) on the Criteo website.
+Agree to the terms of use, enter your name, and submit the form. Then copy the download link for the 4.3GB tar file called `dac.tar.gz` and use it in the `wget` command in the code block below.
+Untar the file to create three files:
+ 1. readme.txt
+ 2. train.txt (11GB) - you will not be using this, so delete it to save space
+ 3. test.txt (1.4GB) - transform this into .csv
+
+ ```
+ wget # replace with link you got from Criteo website
+ tar -xvf dac.tar.gz
+ rm train.txt
+ tr '\t' ',' < test.txt > test.csv
+ ```
+
+- Move the downloaded dataset to `~/models/models` and start a Docker container for preprocessing. This step is similar to `eval` dataset preprocessing:
+
+ ```
+ mv test.csv ~/models/models
+ cd ~/models/models
+ docker run -it --privileged -u root:root \
+ -w /models \
+ --volume $PWD:/models \
+ intel/intel-optimized-tensorflow:latest \
+ /bin/bash
+ ```
+
+- Preprocess and convert the test dataset to TFRecord format using a script from the Intel Model Zoo repository.
+  This step may take a while to complete.
+
+ ```
+ python recommendation/tensorflow/wide_deep_large_ds/dataset/preprocess_csv_tfrecords.py \
+ --inputcsv-datafile test.csv \
+ --outputfile-name preprocessed
+ ```
+
+- Exit the Docker container and find the processed dataset `test_preprocessed.tfrecords` in `~/models/models`.
+
+ 5.1. *Batch or Online Inference*
+
+ cd ~/models/benchmarks
+
+ python launch_benchmark.py \
+ --batch-size 512 \
+ --model-name wide_deep_large_ds \
+ --precision fp32 \
+ --mode inference \
+ --framework tensorflow \
+ --benchmark-only \
+ --docker-image intel/intel-optimized-tensorflow:latest \
+ --in-graph ~/wide_deep_files/wide_deep_fp32_pretrained_model.pb \
+ --data-location ~/models/models/test_preprocessed.tfrecords \
+ --verbose
+
+Set `--batch-size` to 1 to run online (real-time) inference.
diff --git a/models/recommendation/pytorch/dlrm/training/bfloat16/README.md b/models/recommendation/pytorch/dlrm/training/bfloat16/README.md
index 83ea0ddcb..5142db99e 100755
--- a/models/recommendation/pytorch/dlrm/training/bfloat16/README.md
+++ b/models/recommendation/pytorch/dlrm/training/bfloat16/README.md
@@ -1,4 +1,4 @@
-# DLRM MLPerf BF16 Training v0.7 Intel Submission
+# DLRM MLPerf BF16 Training v0.7 Intel Submission
For License, Contribution and Code of conduct, please see here: https://github.com/facebookresearch/dlrm/tree/mlperf
## HW and SW requirements
diff --git a/models/recommendation/pytorch/dlrm/training/bfloat16/tools/visualize.py b/models/recommendation/pytorch/dlrm/training/bfloat16/tools/visualize.py
index 7a219ee09..2990b3bd0 100644
--- a/models/recommendation/pytorch/dlrm/training/bfloat16/tools/visualize.py
+++ b/models/recommendation/pytorch/dlrm/training/bfloat16/tools/visualize.py
@@ -1,619 +1,619 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
-#
-#
-# This script performs the visualization of the embedding tables created in
-# DLRM during the training procedure. We use two popular techniques for
-# visualization: umap (https://umap-learn.readthedocs.io/en/latest/) and
-# tsne (https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html).
-# These links also provide instructions on how to install these packages
-# in different environments.
-#
-# Warning: the size of the data to be visualized depends on the RAM on your machine.
-#
-#
-# A sample run of the code, with a kaggle model is shown below
-# $python ./tools/visualize.py --dataset=kaggle --load-model=./input/dlrm_kaggle.pytorch
-#
-#
-# The following command line arguments are available to the user:
-#
-# --load-model - DLRM model file
-# --data-set - one of ["kaggle", "terabyte"]
-# --max-ind-range - max index range used during the traning
-# --output-dir - output directory where output plots will be written, default will be on of these: ["kaggle_vis", "terabyte_vis"]
-# --max-umap-size - max number of points to visualize using UMAP, default=50000
-# --use-tsne - use T-SNE
-# --max-tsne-size - max number of points to visualize using T-SNE, default=1000)
-#
-
-import os
-import argparse
-import numpy as np
-import umap
-import json
-import torch
-import matplotlib
-import matplotlib.pyplot as plt
-
-from sklearn.metrics import accuracy_score
-from sklearn.metrics import f1_score
-from sklearn.metrics import precision_score
-from sklearn.metrics import recall_score
-
-from sklearn import manifold
-
-import dlrm_data_pytorch as dp
-from dlrm_s_pytorch import DLRM_Net
-
-
-def visualize_embeddings_umap(emb_l,
- output_dir = "",
- max_size = 500000):
-
- for k in range(0, len(emb_l)):
-
- E = emb_l[k].weight.detach().cpu()
- print("umap", E.shape)
-
- if E.shape[0] < 20:
- print("Skipping small embedding")
- continue
-
- n_vis = min(max_size, E.shape[0])
-
- # reducer = umap.UMAP(random_state=42, n_neighbors=25, min_dist=0.1)
- reducer = umap.UMAP(random_state=42)
- Y = reducer.fit_transform(E[:n_vis,:])
-
- plt.figure(figsize=(8,8))
-
- linewidth = 0
- size = 1
-
- if Y.shape[0] < 2500:
- linewidth = 1
- size = 5
-
- plt.scatter(-Y[:,0], -Y[:,1], s=size, marker='.', linewidth=linewidth)
-
- plt.title("UMAP: categorical var. "+str(k)+" ("+str(n_vis)+" of "+str(E.shape[0])+")")
- plt.savefig(output_dir+"/cat-"+str(k)+"-"+str(n_vis)+"-of-"+str(E.shape[0])+"-umap.png")
- plt.close()
-
-
-def visualize_embeddings_tsne(emb_l,
- output_dir = "",
- max_size = 10000):
-
- for k in range(0, len(emb_l)):
-
- E = emb_l[k].weight.detach().cpu()
- print("tsne", E.shape)
-
- if E.shape[0] < 20:
- print("Skipping small embedding")
- continue
-
- n_vis = min(max_size, E.shape[0])
-
- tsne = manifold.TSNE(init='pca', random_state=0, method='exact')
-
- Y = tsne.fit_transform(E[:n_vis,:])
-
- plt.figure(figsize=(8,8))
-
- linewidth = 0
- if Y.shape[0] < 5000:
- linewidth = 1
-
- plt.scatter(-Y[:,0], -Y[:,1], s=1, marker='.', linewidth=linewidth)
-
- plt.title("TSNE: categorical var. "+str(k)+" ("+str(n_vis)+" of "+str(E.shape[0])+")")
- plt.savefig(output_dir+"/cat-"+str(k)+"-"+str(n_vis)+"-of-"+str(E.shape[0])+"-tsne.png")
- plt.close()
-
-
-def create_vis_data(dlrm, data_ld, max_size=50000, info=''):
-
- all_features = []
- all_X = []
- all_cat = []
- all_T = []
- all_c = []
- all_z = []
- all_pred = []
-
- z_size = len(dlrm.top_l)
- print('z_size', z_size)
- for i in range(0, z_size):
- all_z.append([])
-
- for j, (X, lS_o, lS_i, T) in enumerate(data_ld):
-
- if j >= max_size:
- break
-
- all_feat_vec = []
- all_cat_vec = []
-
- x = dlrm.apply_mlp(X, dlrm.bot_l)
- # debug prints
- #print("intermediate")
- #print(x[0].detach().cpu().numpy())
- all_feat_vec.append(x[0].detach().cpu().numpy())
- all_X.append(x[0].detach().cpu().numpy())
-
- # process sparse features(using embeddings), resulting in a list of row vectors
- ly = dlrm.apply_emb(lS_o, lS_i, dlrm.emb_l)
-
- for e in ly:
- #print(e.detach().cpu().numpy())
- all_feat_vec.append(e[0].detach().cpu().numpy())
- all_cat_vec.append(e[0].detach().cpu().numpy())
-
- all_feat_vec= np.concatenate(all_feat_vec, axis=0)
- all_cat_vec= np.concatenate(all_cat_vec, axis=0)
-
- all_features.append(all_feat_vec)
- all_cat.append(all_cat_vec)
- all_T.append(int(T.detach().cpu().numpy()[0,0]))
-
- z = dlrm.interact_features(x, ly)
- # print(z.detach().cpu().numpy())
- all_z[0].append(z.detach().cpu().numpy().flatten())
-
- # obtain probability of a click (using top mlp)
- # print(dlrm.top_l)
- # p = dlrm.apply_mlp(z, dlrm.top_l)
-
- for i in range(0, z_size):
- z = dlrm.top_l[i](z)
-
- if i < z_size-1:
- curr_z = z.detach().cpu().numpy().flatten()
- all_z[i+1].append(curr_z)
-
- # print('z',i, z.detach().cpu().numpy().flatten().shape)
-
- p = z
-
- # clamp output if needed
- if 0.0 < dlrm.loss_threshold and dlrm.loss_threshold < 1.0:
- z = torch.clamp(p, min=dlrm.loss_threshold, max=(1.0 - dlrm.loss_threshold))
- else:
- z = p
-
- all_pred.append(int(z.detach().cpu().numpy()[0,0]+0.5))
-
- #print(int(z.detach().cpu().numpy()[0,0]+0.5))
- if int(z.detach().cpu().numpy()[0,0]+0.5) == int(T.detach().cpu().numpy()[0,0]):
- all_c.append(0)
- else:
- all_c.append(1)
-
- # calculate classifier metrics
- ac = accuracy_score(all_T, all_pred)
- f1 = f1_score(all_T, all_pred)
- ps = precision_score(all_T, all_pred)
- rc = recall_score(all_T, all_pred)
-
- print(info, 'accuracy', ac, 'f1', f1, 'precision', ps, 'recall', rc)
-
- return all_features, all_X, all_cat, all_T, all_z, all_c
-
-def plot_all_data(Y_train_data,
- train_labels,
- Y_test_data,
- test_labels,
- total_train_size = '',
- total_test_size = '',
- info = '',
- output_dir = ''):
-
- size = 1
- colors = ['red','green']
-
- fig, (ax1, ax2) = plt.subplots(1, 2)
- fig.suptitle('UMAP: ' + info)
-
- ax1.scatter(-Y_train_data[:,0], -Y_train_data[:,1], s=size, c=train_labels, cmap=matplotlib.colors.ListedColormap(colors), marker='.', linewidth=0)
- ax1.title.set_text('Train ('+str(len(train_labels))+' of '+ total_train_size+')')
- if test_data is not None and test_labels is not None:
- ax2.scatter(-Y_test_data[:,0], -Y_test_data[:,1], s=size, c=test_labels, cmap=matplotlib.colors.ListedColormap(colors), marker='.', linewidth=0)
- ax2.title.set_text('Test ('+str(len(test_labels))+' of '+ total_test_size+')')
-
- plt.savefig(output_dir+"/"+info+'-umap.png')
- plt.close()
-
-
-def plot_one_class(Y_train_data,
- train_labels,
- Y_test_data,
- test_labels,
- label = 0,
- col = 'red',
- total_train_size = '',
- total_test_size = '',
- info = '',
- output_dir = ''):
-
- size = 1
-
- fig, (ax1, ax2) = plt.subplots(1, 2)
- fig.suptitle('UMAP: '+ info )
-
- ind_l_train = [i for i,x in enumerate(train_labels) if x == label]
- Y_train_l = np.array([Y_train_data[i,:] for i in ind_l_train])
-
- ax1.scatter(-Y_train_l[:,0], -Y_train_l[:,1], s=size, c=col, marker='.', linewidth=0)
- ax1.title.set_text('Train, ('+str(len(train_labels))+' of '+ total_train_size+')')
- if Y_test_data is not None and test_labels is not None:
- ind_l_test = [i for i,x in enumerate(test_labels) if x == label]
- Y_test_l = np.array([Y_test_data[i,:] for i in ind_l_test])
-
- ax2.scatter(-Y_test_l[:,0], -Y_test_l[:,1], s=size, c=col, marker='.', linewidth=0)
- ax2.title.set_text('Test, ('+str(len(test_labels))+' of '+ total_test_size+')')
-
- plt.savefig(output_dir+"/"+info+'-umap.png')
- plt.close()
-
-
-def visualize_umap(train_data,
- train_c,
- train_targets,
- test_data = None,
- test_c = None,
- test_targets = None,
- total_train_size = '',
- total_test_size = '',
- info = '',
- output_dir = ''):
-
-# reducer = umap.UMAP(random_state=42, n_neighbors=25, min_dist=0.1)
- reducer = umap.UMAP(random_state=42)
- train_Y = reducer.fit_transform(train_data)
-
- if test_data is not None and test_targets is not None:
- test_Y = reducer.transform(test_data)
-
- # all classes
- plot_all_data(Y_train_data = train_Y,
- train_labels = train_targets,
- Y_test_data = test_Y,
- test_labels = test_targets,
- total_train_size = total_train_size,
- total_test_size = total_test_size,
- info = info,
- output_dir = output_dir)
-
- # class 0
- plot_one_class(Y_train_data = train_Y,
- train_labels = train_targets,
- Y_test_data = test_Y,
- test_labels = test_targets,
- label = 0,
- col = 'red',
- total_train_size = total_train_size,
- total_test_size = total_test_size,
- info = info+' class ' + str(0),
- output_dir = output_dir)
-
- # class 1
- plot_one_class(Y_train_data = train_Y,
- train_labels = train_targets,
- Y_test_data = test_Y,
- test_labels = test_targets,
- label = 1,
- col = 'green',
- total_train_size = total_train_size,
- total_test_size = total_test_size,
- info = info + ' class ' + str(1),
- output_dir = output_dir)
-
- # correct classification
- plot_one_class(Y_train_data = train_Y,
- train_labels = train_c,
- Y_test_data = test_Y,
- test_labels = test_c,
- label = 0,
- col = 'green',
- total_train_size = total_train_size,
- total_test_size = total_test_size,
- info = info + ' correct ',
- output_dir = output_dir)
-
- # errors
- plot_one_class(Y_train_data = train_Y,
- train_labels = train_c,
- Y_test_data = test_Y,
- test_labels = test_c,
- label = 1,
- col = 'red',
- total_train_size = total_train_size,
- total_test_size = total_test_size,
- info = info + ' errors ',
- output_dir = output_dir)
-
-
-
-def visualize_data_umap(dlrm,
- train_data_ld,
- test_data_ld = None,
- max_umap_size = 50000,
- output_dir = ''):
-
- train_feat, train_X, train_cat, train_T, train_z, train_c = create_vis_data(dlrm=dlrm, data_ld=train_data_ld, max_size=max_umap_size, info='train')
-
- test_feat = None
- test_X = None
- test_cat = None
- test_T = None
-
- if test_data_ld is not None:
- test_feat, test_X, test_cat, test_T, test_z, test_c = create_vis_data(dlrm=dlrm, data_ld=test_data_ld, max_size=max_umap_size, info='test')
-
- visualize_umap(train_data = train_feat,
- train_targets = train_T,
- train_c = train_c,
- test_data = test_feat,
- test_c = test_c,
- test_targets = test_T,
- total_train_size = str(len(train_data_ld)),
- total_test_size = str(len(test_data_ld)),
- info = 'all-features',
- output_dir = output_dir)
-
- visualize_umap(train_data = train_X,
- train_c = train_c,
- train_targets = train_T,
- test_data = test_X,
- test_c = test_c,
- test_targets = test_T,
- total_train_size = str(len(train_data_ld)),
- total_test_size = str(len(test_data_ld)),
- info = 'cont-features',
- output_dir = output_dir)
-
- visualize_umap(train_data = train_cat,
- train_c = train_c,
- train_targets = train_T,
- test_data = test_cat,
- test_c = test_c,
- test_targets = test_T,
- total_train_size = str(len(train_data_ld)),
- total_test_size = str(len(test_data_ld)),
- info = 'cat-features',
- output_dir = output_dir)
-
- # UMAP for z data
- for i in range(0,len(test_z)):
- visualize_umap(train_data = train_z[i],
- train_targets = train_T,
- train_c = train_c,
- test_data = test_z[i],
- test_c = test_c,
- test_targets = test_T,
- total_train_size = str(len(train_data_ld)),
- total_test_size = str(len(test_data_ld)),
- info = 'z-data-'+str(i),
- output_dir = output_dir)
-
-
-
-def analyse_categorical_data(X_cat, n_days=10, output_dir=""):
-
- # analyse categorical variables
- n_vec = len(X_cat)
- n_cat = len(X_cat[0])
- n_days = n_days
-
- print('n_vec', n_vec, 'n_cat', n_cat)
-# for c in train_data.X_cat:
-# print(n_cat, c)
-
- all_cat = np.array(X_cat)
- print('all_cat.shape', all_cat.shape)
- day_size = all_cat.shape[0]/n_days
-
- for i in range(0,n_cat):
- l_d = []
- l_s1 = []
- l_s2 = []
- l_int = []
- l_rem = []
-
- cat = all_cat[:,i]
- print('cat', i, cat.shape)
- for d in range(1,n_days):
- offset = int(d*day_size)
- #print(offset)
- cat1 = cat[:offset]
- cat2 = cat[offset:]
-
- s1 = set(cat1)
- s2 = set(cat2)
-
- intersect = list(s1 & s2)
- #print(intersect)
- l_d.append(d)
- l_s1.append(len(s1))
- l_s2.append(len(s2))
- l_int.append(len(intersect))
- l_rem.append((len(s1)-len(intersect)))
-
- print(d, ',', len(s1), ',', len(s2), ',', len(intersect), ',', (len(s1)-len(intersect)))
-
- print("spit", l_d)
- print("before", l_s1)
- print("after", l_s2)
- print("inters.", l_int)
- print("removed", l_rem)
-
- plt.figure(figsize=(8,8))
- plt.plot(l_d, l_s1, 'g', label='before')
- plt.plot(l_d, l_s2, 'r', label='after')
- plt.plot(l_d, l_int, 'b', label='intersect')
- plt.plot(l_d, l_rem, 'y', label='removed')
- plt.title("categorical var. "+str(i))
- plt.legend()
- plt.savefig(output_dir+"/cat-"+str(i).zfill(3)+".png")
- plt.close()
-
-
-def analyze_model_data(output_dir,
- dlrm,
- train_ld,
- test_ld,
- skip_embedding = False,
- use_tsne = False,
- max_umap_size = 50000,
- max_tsne_size = 10000,
- skip_categorical_analysis = False,
- skip_data_plots = False):
-
- if not os.path.exists(output_dir):
- os.makedirs(output_dir)
-
- if skip_embedding == False:
- visualize_embeddings_umap(emb_l = dlrm.emb_l,
- output_dir = output_dir,
- max_size = max_umap_size)
-
- if use_tsne == True:
- visualize_embeddings_tsne(emb_l = dlrm.emb_l,
- output_dir = output_dir,
- max_size = max_tsne_size)
-
- # data visualization and analysis
- if skip_data_plots == False:
- visualize_data_umap(dlrm=dlrm, train_data_ld=train_ld, test_data_ld=test_ld, max_umap_size=max_umap_size, output_dir=output_dir)
-
- # analyse categorical variables
- if skip_categorical_analysis == False:
- analyse_categorical_data(X_cat=train_data.X_cat, n_days=10, output_dir=output_dir)
-
-
-if __name__ == "__main__":
-
- output_dir = ""
-
- ### parse arguments ###
- parser = argparse.ArgumentParser(
- description="Exploratory DLRM analysis"
- )
-
- parser.add_argument("--load-model", type=str, default="")
- parser.add_argument("--data-set", choices=["kaggle", "terabyte"], help="dataset")
- # parser.add_argument("--dataset-path", required=True, help="path to the dataset")
- parser.add_argument("--max-ind-range", type=int, default=-1)
- # parser.add_argument("--mlperf-bin-loader", action='store_true', default=False)
- parser.add_argument("--output-dir", type=str, default="")
- parser.add_argument("--skip-embedding", action='store_true', default=False)
- parser.add_argument("--skip-data-plots", action='store_true', default=False)
- parser.add_argument("--skip-categorical-analysis", action='store_true', default=False)
-
- # umap relatet
- parser.add_argument("--max-umap-size", type=int, default=50000)
- # tsne related
- parser.add_argument("--use-tsne", action='store_true', default=False)
- parser.add_argument("--max-tsne-size", type=int, default=1000)
- # data file related
- parser.add_argument("--raw-data-file", type=str, default="")
- parser.add_argument("--processed-data-file", type=str, default="")
- parser.add_argument("--data-sub-sample-rate", type=float, default=0.0) # in [0, 1]
- parser.add_argument("--data-randomize", type=str, default="none") # total or day or none
- parser.add_argument("--memory-map", action="store_true", default=False)
- parser.add_argument("--mini-batch-size", type=int, default=1)
- parser.add_argument("--num-workers", type=int, default=0)
- parser.add_argument("--test-mini-batch-size", type=int, default=1)
- parser.add_argument("--test-num-workers", type=int, default=0)
- parser.add_argument("--num-batches", type=int, default=0)
- # mlperf logging (disables other output and stops early)
- parser.add_argument("--mlperf-logging", action="store_true", default=False)
-
- args = parser.parse_args()
-
- print('command line args: ', json.dumps(vars(args)))
-
- if output_dir == "":
- output_dir = args.data_set+"_vis_all"
- print('output_dir:', output_dir)
-
- if args.data_set == "kaggle":
- # 1. Criteo Kaggle Display Advertisement Challenge Dataset (see ./bench/dlrm_s_criteo_kaggle.sh)
- m_spa=16
- ln_emb=np.array([1460,583,10131227,2202608,305,24,12517,633,3,93145,5683,8351593,3194,27,14992,5461306,10,5652,2173,4,7046547,18,15,286181,105,142572])
- ln_bot=np.array([13,512,256,64,16])
- ln_top=np.array([367,512,256,1])
-
- elif args.dataset == "terabyte":
-
- if args.max_ind_range == 10000000:
- # 2. Criteo Terabyte (see ./bench/dlrm_s_criteo_terabyte.sh [--sub-sample=0.875] --max-in-range=10000000)
- m_spa=64
- ln_emb=np.array([9980333,36084,17217,7378,20134,3,7112,1442,61,9758201,1333352,313829,10,2208,11156,122,4,970,14, 9994222, 7267859, 9946608,415421,12420,101, 36])
- ln_bot=np.array([13,512,256,64])
- ln_top=np.array([415,512,512,256,1])
- elif args.max_ind_range == 40000000:
- # 3. Criteo Terabyte MLPerf training (see ./bench/run_and_time.sh --max-in-range=40000000)
- m_spa=128
- ln_emb=np.array([39884406,39043,17289,7420,20263,3,7120,1543,63,38532951,2953546,403346,10,2208,11938,155,4,976,14,39979771,25641295,39664984,585935,12972,108,36])
- ln_bot=np.array([13,512,256,128])
- ln_top=np.array([479,1024,1024,512,256,1])
- else:
- raise ValueError("only --max-in-range 10M or 40M is supported")
- else:
- raise ValueError("only kaggle|terabyte dataset options are supported")
-
- dlrm = DLRM_Net(
- m_spa,
- ln_emb,
- ln_bot,
- ln_top,
- arch_interaction_op="dot",
- arch_interaction_itself=False,
- sigmoid_bot=-1,
- sigmoid_top=ln_top.size - 2,
- sync_dense_params=True,
- loss_threshold=0.0,
- ndevices=-1,
- qr_flag=False,
- qr_operation=None,
- qr_collisions=None,
- qr_threshold=None,
- md_flag=False,
- md_threshold=None,
- )
-
- # Load model is specified
- if not (args.load_model == ""):
- print("Loading saved model {}".format(args.load_model))
-
- ld_model = torch.load(args.load_model, map_location=torch.device('cpu'))
- dlrm.load_state_dict(ld_model["state_dict"])
-
- print("Model loaded", args.load_model)
- #print(dlrm)
-
- # load data
- train_data = None
- train_ld = None
- test_data = None
- test_ld = None
-
- if args.raw_data_file is not "" or args.processed_data_file is not "":
- train_data, train_ld, test_data, test_ld = dp.make_criteo_data_and_loaders(args)
-
- analyze_model_data(output_dir = output_dir,
- dlrm = dlrm,
- train_ld = train_ld,
- test_ld = test_ld,
- skip_embedding = args.skip_embedding,
- use_tsne = args.use_tsne,
- max_umap_size = args.max_umap_size,
- max_tsne_size = args.max_tsne_size,
- skip_categorical_analysis = args.skip_categorical_analysis,
- skip_data_plots = args.skip_data_plots)
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+#
+#
+# This script performs the visualization of the embedding tables created in
+# DLRM during the training procedure. We use two popular techniques for
+# visualization: umap (https://umap-learn.readthedocs.io/en/latest/) and
+# tsne (https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html).
+# These links also provide instructions on how to install these packages
+# in different environments.
+#
+# Warning: the size of the data to be visualized depends on the RAM on your machine.
+#
+#
+# A sample run of the code, with a kaggle model is shown below
+# $python ./tools/visualize.py --data-set=kaggle --load-model=./input/dlrm_kaggle.pytorch
+#
+#
+# The following command line arguments are available to the user:
+#
+# --load-model - DLRM model file
+# --data-set - one of ["kaggle", "terabyte"]
+# --max-ind-range - max index range used during the training
+# --output-dir - output directory where output plots will be written, default will be one of these: ["kaggle_vis", "terabyte_vis"]
+# --max-umap-size - max number of points to visualize using UMAP, default=50000
+# --use-tsne - use T-SNE
+# --max-tsne-size - max number of points to visualize using T-SNE, default=1000)
+#
+
+import os
+import argparse
+import numpy as np
+import umap
+import json
+import torch
+import matplotlib
+import matplotlib.pyplot as plt
+
+from sklearn.metrics import accuracy_score
+from sklearn.metrics import f1_score
+from sklearn.metrics import precision_score
+from sklearn.metrics import recall_score
+
+from sklearn import manifold
+
+import dlrm_data_pytorch as dp
+from dlrm_s_pytorch import DLRM_Net
+
+
+def visualize_embeddings_umap(emb_l,
+ output_dir = "",
+ max_size = 500000):
+
+ for k in range(0, len(emb_l)):
+
+ E = emb_l[k].weight.detach().cpu()
+ print("umap", E.shape)
+
+ if E.shape[0] < 20:
+ print("Skipping small embedding")
+ continue
+
+ n_vis = min(max_size, E.shape[0])
+
+ # reducer = umap.UMAP(random_state=42, n_neighbors=25, min_dist=0.1)
+ reducer = umap.UMAP(random_state=42)
+ Y = reducer.fit_transform(E[:n_vis,:])
+
+ plt.figure(figsize=(8,8))
+
+ linewidth = 0
+ size = 1
+
+ if Y.shape[0] < 2500:
+ linewidth = 1
+ size = 5
+
+ plt.scatter(-Y[:,0], -Y[:,1], s=size, marker='.', linewidth=linewidth)
+
+ plt.title("UMAP: categorical var. "+str(k)+" ("+str(n_vis)+" of "+str(E.shape[0])+")")
+ plt.savefig(output_dir+"/cat-"+str(k)+"-"+str(n_vis)+"-of-"+str(E.shape[0])+"-umap.png")
+ plt.close()
+
+
+def visualize_embeddings_tsne(emb_l,
+ output_dir = "",
+ max_size = 10000):
+
+ for k in range(0, len(emb_l)):
+
+ E = emb_l[k].weight.detach().cpu()
+ print("tsne", E.shape)
+
+ if E.shape[0] < 20:
+ print("Skipping small embedding")
+ continue
+
+ n_vis = min(max_size, E.shape[0])
+
+ tsne = manifold.TSNE(init='pca', random_state=0, method='exact')
+
+ Y = tsne.fit_transform(E[:n_vis,:])
+
+ plt.figure(figsize=(8,8))
+
+ linewidth = 0
+ if Y.shape[0] < 5000:
+ linewidth = 1
+
+ plt.scatter(-Y[:,0], -Y[:,1], s=1, marker='.', linewidth=linewidth)
+
+ plt.title("TSNE: categorical var. "+str(k)+" ("+str(n_vis)+" of "+str(E.shape[0])+")")
+ plt.savefig(output_dir+"/cat-"+str(k)+"-"+str(n_vis)+"-of-"+str(E.shape[0])+"-tsne.png")
+ plt.close()
+
+
+def create_vis_data(dlrm, data_ld, max_size=50000, info=''):
+
+ all_features = []
+ all_X = []
+ all_cat = []
+ all_T = []
+ all_c = []
+ all_z = []
+ all_pred = []
+
+ z_size = len(dlrm.top_l)
+ print('z_size', z_size)
+ for i in range(0, z_size):
+ all_z.append([])
+
+ for j, (X, lS_o, lS_i, T) in enumerate(data_ld):
+
+ if j >= max_size:
+ break
+
+ all_feat_vec = []
+ all_cat_vec = []
+
+ x = dlrm.apply_mlp(X, dlrm.bot_l)
+ # debug prints
+ #print("intermediate")
+ #print(x[0].detach().cpu().numpy())
+ all_feat_vec.append(x[0].detach().cpu().numpy())
+ all_X.append(x[0].detach().cpu().numpy())
+
+ # process sparse features(using embeddings), resulting in a list of row vectors
+ ly = dlrm.apply_emb(lS_o, lS_i, dlrm.emb_l)
+
+ for e in ly:
+ #print(e.detach().cpu().numpy())
+ all_feat_vec.append(e[0].detach().cpu().numpy())
+ all_cat_vec.append(e[0].detach().cpu().numpy())
+
+ all_feat_vec= np.concatenate(all_feat_vec, axis=0)
+ all_cat_vec= np.concatenate(all_cat_vec, axis=0)
+
+ all_features.append(all_feat_vec)
+ all_cat.append(all_cat_vec)
+ all_T.append(int(T.detach().cpu().numpy()[0,0]))
+
+ z = dlrm.interact_features(x, ly)
+ # print(z.detach().cpu().numpy())
+ all_z[0].append(z.detach().cpu().numpy().flatten())
+
+ # obtain probability of a click (using top mlp)
+ # print(dlrm.top_l)
+ # p = dlrm.apply_mlp(z, dlrm.top_l)
+
+ for i in range(0, z_size):
+ z = dlrm.top_l[i](z)
+
+ if i < z_size-1:
+ curr_z = z.detach().cpu().numpy().flatten()
+ all_z[i+1].append(curr_z)
+
+ # print('z',i, z.detach().cpu().numpy().flatten().shape)
+
+ p = z
+
+ # clamp output if needed
+ if 0.0 < dlrm.loss_threshold and dlrm.loss_threshold < 1.0:
+ z = torch.clamp(p, min=dlrm.loss_threshold, max=(1.0 - dlrm.loss_threshold))
+ else:
+ z = p
+
+ all_pred.append(int(z.detach().cpu().numpy()[0,0]+0.5))
+
+ #print(int(z.detach().cpu().numpy()[0,0]+0.5))
+ if int(z.detach().cpu().numpy()[0,0]+0.5) == int(T.detach().cpu().numpy()[0,0]):
+ all_c.append(0)
+ else:
+ all_c.append(1)
+
+ # calculate classifier metrics
+ ac = accuracy_score(all_T, all_pred)
+ f1 = f1_score(all_T, all_pred)
+ ps = precision_score(all_T, all_pred)
+ rc = recall_score(all_T, all_pred)
+
+ print(info, 'accuracy', ac, 'f1', f1, 'precision', ps, 'recall', rc)
+
+ return all_features, all_X, all_cat, all_T, all_z, all_c
+
+def plot_all_data(Y_train_data,
+ train_labels,
+ Y_test_data,
+ test_labels,
+ total_train_size = '',
+ total_test_size = '',
+ info = '',
+ output_dir = ''):
+
+ size = 1
+ colors = ['red','green']
+
+ fig, (ax1, ax2) = plt.subplots(1, 2)
+ fig.suptitle('UMAP: ' + info)
+
+ ax1.scatter(-Y_train_data[:,0], -Y_train_data[:,1], s=size, c=train_labels, cmap=matplotlib.colors.ListedColormap(colors), marker='.', linewidth=0)
+ ax1.title.set_text('Train ('+str(len(train_labels))+' of '+ total_train_size+')')
+    if Y_test_data is not None and test_labels is not None:
+ ax2.scatter(-Y_test_data[:,0], -Y_test_data[:,1], s=size, c=test_labels, cmap=matplotlib.colors.ListedColormap(colors), marker='.', linewidth=0)
+ ax2.title.set_text('Test ('+str(len(test_labels))+' of '+ total_test_size+')')
+
+ plt.savefig(output_dir+"/"+info+'-umap.png')
+ plt.close()
+
+
+def plot_one_class(Y_train_data,
+ train_labels,
+ Y_test_data,
+ test_labels,
+ label = 0,
+ col = 'red',
+ total_train_size = '',
+ total_test_size = '',
+ info = '',
+ output_dir = ''):
+
+ size = 1
+
+ fig, (ax1, ax2) = plt.subplots(1, 2)
+ fig.suptitle('UMAP: '+ info )
+
+ ind_l_train = [i for i,x in enumerate(train_labels) if x == label]
+ Y_train_l = np.array([Y_train_data[i,:] for i in ind_l_train])
+
+ ax1.scatter(-Y_train_l[:,0], -Y_train_l[:,1], s=size, c=col, marker='.', linewidth=0)
+ ax1.title.set_text('Train, ('+str(len(train_labels))+' of '+ total_train_size+')')
+ if Y_test_data is not None and test_labels is not None:
+ ind_l_test = [i for i,x in enumerate(test_labels) if x == label]
+ Y_test_l = np.array([Y_test_data[i,:] for i in ind_l_test])
+
+ ax2.scatter(-Y_test_l[:,0], -Y_test_l[:,1], s=size, c=col, marker='.', linewidth=0)
+ ax2.title.set_text('Test, ('+str(len(test_labels))+' of '+ total_test_size+')')
+
+ plt.savefig(output_dir+"/"+info+'-umap.png')
+ plt.close()
+
+
+def visualize_umap(train_data,
+ train_c,
+ train_targets,
+ test_data = None,
+ test_c = None,
+ test_targets = None,
+ total_train_size = '',
+ total_test_size = '',
+ info = '',
+ output_dir = ''):
+
+# reducer = umap.UMAP(random_state=42, n_neighbors=25, min_dist=0.1)
+ reducer = umap.UMAP(random_state=42)
+ train_Y = reducer.fit_transform(train_data)
+
+ if test_data is not None and test_targets is not None:
+ test_Y = reducer.transform(test_data)
+
+ # all classes
+ plot_all_data(Y_train_data = train_Y,
+ train_labels = train_targets,
+ Y_test_data = test_Y,
+ test_labels = test_targets,
+ total_train_size = total_train_size,
+ total_test_size = total_test_size,
+ info = info,
+ output_dir = output_dir)
+
+ # class 0
+ plot_one_class(Y_train_data = train_Y,
+ train_labels = train_targets,
+ Y_test_data = test_Y,
+ test_labels = test_targets,
+ label = 0,
+ col = 'red',
+ total_train_size = total_train_size,
+ total_test_size = total_test_size,
+ info = info+' class ' + str(0),
+ output_dir = output_dir)
+
+ # class 1
+ plot_one_class(Y_train_data = train_Y,
+ train_labels = train_targets,
+ Y_test_data = test_Y,
+ test_labels = test_targets,
+ label = 1,
+ col = 'green',
+ total_train_size = total_train_size,
+ total_test_size = total_test_size,
+ info = info + ' class ' + str(1),
+ output_dir = output_dir)
+
+ # correct classification
+ plot_one_class(Y_train_data = train_Y,
+ train_labels = train_c,
+ Y_test_data = test_Y,
+ test_labels = test_c,
+ label = 0,
+ col = 'green',
+ total_train_size = total_train_size,
+ total_test_size = total_test_size,
+ info = info + ' correct ',
+ output_dir = output_dir)
+
+ # errors
+ plot_one_class(Y_train_data = train_Y,
+ train_labels = train_c,
+ Y_test_data = test_Y,
+ test_labels = test_c,
+ label = 1,
+ col = 'red',
+ total_train_size = total_train_size,
+ total_test_size = total_test_size,
+ info = info + ' errors ',
+ output_dir = output_dir)
+
+
+
+def visualize_data_umap(dlrm,
+ train_data_ld,
+ test_data_ld = None,
+ max_umap_size = 50000,
+ output_dir = ''):
+
+ train_feat, train_X, train_cat, train_T, train_z, train_c = create_vis_data(dlrm=dlrm, data_ld=train_data_ld, max_size=max_umap_size, info='train')
+
+ test_feat = None
+ test_X = None
+ test_cat = None
+ test_T = None
+
+ if test_data_ld is not None:
+ test_feat, test_X, test_cat, test_T, test_z, test_c = create_vis_data(dlrm=dlrm, data_ld=test_data_ld, max_size=max_umap_size, info='test')
+
+ visualize_umap(train_data = train_feat,
+ train_targets = train_T,
+ train_c = train_c,
+ test_data = test_feat,
+ test_c = test_c,
+ test_targets = test_T,
+ total_train_size = str(len(train_data_ld)),
+ total_test_size = str(len(test_data_ld)),
+ info = 'all-features',
+ output_dir = output_dir)
+
+ visualize_umap(train_data = train_X,
+ train_c = train_c,
+ train_targets = train_T,
+ test_data = test_X,
+ test_c = test_c,
+ test_targets = test_T,
+ total_train_size = str(len(train_data_ld)),
+ total_test_size = str(len(test_data_ld)),
+ info = 'cont-features',
+ output_dir = output_dir)
+
+ visualize_umap(train_data = train_cat,
+ train_c = train_c,
+ train_targets = train_T,
+ test_data = test_cat,
+ test_c = test_c,
+ test_targets = test_T,
+ total_train_size = str(len(train_data_ld)),
+ total_test_size = str(len(test_data_ld)),
+ info = 'cat-features',
+ output_dir = output_dir)
+
+ # UMAP for z data
+ for i in range(0,len(test_z)):
+ visualize_umap(train_data = train_z[i],
+ train_targets = train_T,
+ train_c = train_c,
+ test_data = test_z[i],
+ test_c = test_c,
+ test_targets = test_T,
+ total_train_size = str(len(train_data_ld)),
+ total_test_size = str(len(test_data_ld)),
+ info = 'z-data-'+str(i),
+ output_dir = output_dir)
+
+
+
+def analyse_categorical_data(X_cat, n_days=10, output_dir=""):
+
+ # analyse categorical variables
+ n_vec = len(X_cat)
+ n_cat = len(X_cat[0])
+ n_days = n_days
+
+ print('n_vec', n_vec, 'n_cat', n_cat)
+# for c in train_data.X_cat:
+# print(n_cat, c)
+
+ all_cat = np.array(X_cat)
+ print('all_cat.shape', all_cat.shape)
+ day_size = all_cat.shape[0]/n_days
+
+ for i in range(0,n_cat):
+ l_d = []
+ l_s1 = []
+ l_s2 = []
+ l_int = []
+ l_rem = []
+
+ cat = all_cat[:,i]
+ print('cat', i, cat.shape)
+ for d in range(1,n_days):
+ offset = int(d*day_size)
+ #print(offset)
+ cat1 = cat[:offset]
+ cat2 = cat[offset:]
+
+ s1 = set(cat1)
+ s2 = set(cat2)
+
+ intersect = list(s1 & s2)
+ #print(intersect)
+ l_d.append(d)
+ l_s1.append(len(s1))
+ l_s2.append(len(s2))
+ l_int.append(len(intersect))
+ l_rem.append((len(s1)-len(intersect)))
+
+ print(d, ',', len(s1), ',', len(s2), ',', len(intersect), ',', (len(s1)-len(intersect)))
+
+        print("split", l_d)
+ print("before", l_s1)
+ print("after", l_s2)
+ print("inters.", l_int)
+ print("removed", l_rem)
+
+ plt.figure(figsize=(8,8))
+ plt.plot(l_d, l_s1, 'g', label='before')
+ plt.plot(l_d, l_s2, 'r', label='after')
+ plt.plot(l_d, l_int, 'b', label='intersect')
+ plt.plot(l_d, l_rem, 'y', label='removed')
+ plt.title("categorical var. "+str(i))
+ plt.legend()
+ plt.savefig(output_dir+"/cat-"+str(i).zfill(3)+".png")
+ plt.close()
+
+
+def analyze_model_data(output_dir,
+ dlrm,
+ train_ld,
+ test_ld,
+ skip_embedding = False,
+ use_tsne = False,
+ max_umap_size = 50000,
+ max_tsne_size = 10000,
+ skip_categorical_analysis = False,
+ skip_data_plots = False):
+
+ if not os.path.exists(output_dir):
+ os.makedirs(output_dir)
+
+ if skip_embedding == False:
+ visualize_embeddings_umap(emb_l = dlrm.emb_l,
+ output_dir = output_dir,
+ max_size = max_umap_size)
+
+ if use_tsne == True:
+ visualize_embeddings_tsne(emb_l = dlrm.emb_l,
+ output_dir = output_dir,
+ max_size = max_tsne_size)
+
+ # data visualization and analysis
+ if skip_data_plots == False:
+ visualize_data_umap(dlrm=dlrm, train_data_ld=train_ld, test_data_ld=test_ld, max_umap_size=max_umap_size, output_dir=output_dir)
+
+ # analyse categorical variables
+ if skip_categorical_analysis == False:
+ analyse_categorical_data(X_cat=train_data.X_cat, n_days=10, output_dir=output_dir)
+
+
+if __name__ == "__main__":
+
+ output_dir = ""
+
+ ### parse arguments ###
+ parser = argparse.ArgumentParser(
+ description="Exploratory DLRM analysis"
+ )
+
+ parser.add_argument("--load-model", type=str, default="")
+ parser.add_argument("--data-set", choices=["kaggle", "terabyte"], help="dataset")
+ # parser.add_argument("--dataset-path", required=True, help="path to the dataset")
+ parser.add_argument("--max-ind-range", type=int, default=-1)
+ # parser.add_argument("--mlperf-bin-loader", action='store_true', default=False)
+ parser.add_argument("--output-dir", type=str, default="")
+ parser.add_argument("--skip-embedding", action='store_true', default=False)
+ parser.add_argument("--skip-data-plots", action='store_true', default=False)
+ parser.add_argument("--skip-categorical-analysis", action='store_true', default=False)
+
+    # umap related
+ parser.add_argument("--max-umap-size", type=int, default=50000)
+ # tsne related
+ parser.add_argument("--use-tsne", action='store_true', default=False)
+ parser.add_argument("--max-tsne-size", type=int, default=1000)
+ # data file related
+ parser.add_argument("--raw-data-file", type=str, default="")
+ parser.add_argument("--processed-data-file", type=str, default="")
+ parser.add_argument("--data-sub-sample-rate", type=float, default=0.0) # in [0, 1]
+ parser.add_argument("--data-randomize", type=str, default="none") # total or day or none
+ parser.add_argument("--memory-map", action="store_true", default=False)
+ parser.add_argument("--mini-batch-size", type=int, default=1)
+ parser.add_argument("--num-workers", type=int, default=0)
+ parser.add_argument("--test-mini-batch-size", type=int, default=1)
+ parser.add_argument("--test-num-workers", type=int, default=0)
+ parser.add_argument("--num-batches", type=int, default=0)
+ # mlperf logging (disables other output and stops early)
+ parser.add_argument("--mlperf-logging", action="store_true", default=False)
+
+ args = parser.parse_args()
+
+ print('command line args: ', json.dumps(vars(args)))
+
+ if output_dir == "":
+ output_dir = args.data_set+"_vis_all"
+ print('output_dir:', output_dir)
+
+ if args.data_set == "kaggle":
+ # 1. Criteo Kaggle Display Advertisement Challenge Dataset (see ./bench/dlrm_s_criteo_kaggle.sh)
+ m_spa=16
+ ln_emb=np.array([1460,583,10131227,2202608,305,24,12517,633,3,93145,5683,8351593,3194,27,14992,5461306,10,5652,2173,4,7046547,18,15,286181,105,142572])
+ ln_bot=np.array([13,512,256,64,16])
+ ln_top=np.array([367,512,256,1])
+
+    elif args.data_set == "terabyte":
+
+ if args.max_ind_range == 10000000:
+ # 2. Criteo Terabyte (see ./bench/dlrm_s_criteo_terabyte.sh [--sub-sample=0.875] --max-in-range=10000000)
+ m_spa=64
+ ln_emb=np.array([9980333,36084,17217,7378,20134,3,7112,1442,61,9758201,1333352,313829,10,2208,11156,122,4,970,14, 9994222, 7267859, 9946608,415421,12420,101, 36])
+ ln_bot=np.array([13,512,256,64])
+ ln_top=np.array([415,512,512,256,1])
+ elif args.max_ind_range == 40000000:
+ # 3. Criteo Terabyte MLPerf training (see ./bench/run_and_time.sh --max-in-range=40000000)
+ m_spa=128
+ ln_emb=np.array([39884406,39043,17289,7420,20263,3,7120,1543,63,38532951,2953546,403346,10,2208,11938,155,4,976,14,39979771,25641295,39664984,585935,12972,108,36])
+ ln_bot=np.array([13,512,256,128])
+ ln_top=np.array([479,1024,1024,512,256,1])
+ else:
+ raise ValueError("only --max-in-range 10M or 40M is supported")
+ else:
+ raise ValueError("only kaggle|terabyte dataset options are supported")
+
+ dlrm = DLRM_Net(
+ m_spa,
+ ln_emb,
+ ln_bot,
+ ln_top,
+ arch_interaction_op="dot",
+ arch_interaction_itself=False,
+ sigmoid_bot=-1,
+ sigmoid_top=ln_top.size - 2,
+ sync_dense_params=True,
+ loss_threshold=0.0,
+ ndevices=-1,
+ qr_flag=False,
+ qr_operation=None,
+ qr_collisions=None,
+ qr_threshold=None,
+ md_flag=False,
+ md_threshold=None,
+ )
+
+    # Load model if specified
+ if not (args.load_model == ""):
+ print("Loading saved model {}".format(args.load_model))
+
+ ld_model = torch.load(args.load_model, map_location=torch.device('cpu'))
+ dlrm.load_state_dict(ld_model["state_dict"])
+
+ print("Model loaded", args.load_model)
+ #print(dlrm)
+
+ # load data
+ train_data = None
+ train_ld = None
+ test_data = None
+ test_ld = None
+
+    if args.raw_data_file != "" or args.processed_data_file != "":
+ train_data, train_ld, test_data, test_ld = dp.make_criteo_data_and_loaders(args)
+
+ analyze_model_data(output_dir = output_dir,
+ dlrm = dlrm,
+ train_ld = train_ld,
+ test_ld = test_ld,
+ skip_embedding = args.skip_embedding,
+ use_tsne = args.use_tsne,
+ max_umap_size = args.max_umap_size,
+ max_tsne_size = args.max_tsne_size,
+ skip_categorical_analysis = args.skip_categorical_analysis,
+ skip_data_plots = args.skip_data_plots)
diff --git a/models/reinforcement/tensorflow/minigo/training/fp32/minigo_mlperf.patch b/models/reinforcement/tensorflow/minigo/training/fp32/minigo_mlperf.patch
index a7385a211..23cd2568c 100644
--- a/models/reinforcement/tensorflow/minigo/training/fp32/minigo_mlperf.patch
+++ b/models/reinforcement/tensorflow/minigo/training/fp32/minigo_mlperf.patch
@@ -1995,100 +1995,100 @@ index 0000000..493ce38
--- /dev/null
+++ b/produce_min_max_log.py
@@ -0,0 +1,94 @@
-+# Copyright 2019 Google LLC
-+#
-+# Licensed under the Apache License, Version 2.0 (the "License");
-+# you may not use this file except in compliance with the License.
-+# You may obtain a copy of the License at
-+#
-+# http://www.apache.org/licenses/LICENSE-2.0
-+#
-+# Unless required by applicable law or agreed to in writing, software
-+# distributed under the License is distributed on an "AS IS" BASIS,
-+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-+# See the License for the specific language governing permissions and
-+# limitations under the License.
-+
-+#!/usr/bin/env python
-+# encoding: utf-8
-+
-+import time
-+import os
-+
-+import tensorflow as tf
-+from tensorflow.core.framework import graph_pb2
-+from tensorflow.python.platform import gfile
-+
-+from absl import app, flags
-+
-+import preprocessing
-+import dual_net
-+
-+
-+flags.DEFINE_string('input_graph', None, 'The path of input graph.')
-+flags.DEFINE_string('data_location', None, 'The path of input data.')
-+flags.DEFINE_integer('num_steps', 20, 'Number of eval steps.')
-+flags.DEFINE_integer('batch_size', 20, 'eval batch size.')
-+flags.DEFINE_boolean('random_rotation', True, 'Do random rotation if true.')
-+
-+
-+FLAGS = flags.FLAGS
-+
-+def run_graph(graph, tf_records):
-+
-+ data_graph = tf.Graph()
-+ with data_graph.as_default():
-+ features, labels = preprocessing.get_input_tensors(
-+ FLAGS.batch_size,
-+ tf_records,
-+ shuffle_buffer_size=100000000,
-+ random_rotation=FLAGS.random_rotation, seed=2,
-+ dist_train=False, make_one_shot=True)
-+
-+ infer_graph = tf.Graph()
-+ with infer_graph.as_default():
-+ tf.import_graph_def(graph, name='')
-+
-+ input_tensor = dual_net.get_input_tensor(infer_graph)
-+ output_tensor = dual_net.get_output_tensor(infer_graph)
-+
-+ config = tf.ConfigProto(
-+ intra_op_parallelism_threads=FLAGS.num_intra_threads,
-+ inter_op_parallelism_threads=FLAGS.num_inter_threads)
-+ data_sess = tf.Session(graph=data_graph, config=config)
-+ infer_sess = tf.Session(graph=infer_graph, config=config)
-+
-+ elapsed = 0
-+ #with tf.contrib.tfprof.ProfileContext('/home/letiank/skx-8180/train_dir/minigo', trace_steps=range(70, 80), dump_steps=[110]):
-+ for it in range(FLAGS.num_steps):
-+ features_np = data_sess.run(features)
-+ start_time = time.time()
-+ infer_sess.run(output_tensor, feed_dict={input_tensor: features_np})
-+ elapsed += time.time() - start_time
-+
-+def read_graph(input_graph):
-+ if not gfile.Exists(input_graph):
-+ print("Input graph file '" + input_graph + "' does not exist!")
-+ exit(-1)
-+
-+ input_graph_def = graph_pb2.GraphDef()
-+ with gfile.Open(input_graph, "rb") as f:
-+ data = f.read()
-+ input_graph_def.ParseFromString(data)
-+
-+ return input_graph_def
-+
-+
-+def main(unused_argv):
-+ """Run the reinforcement learning loop."""
-+
-+ graph = read_graph(FLAGS.input_graph)
-+ tf_records = sorted(tf.gfile.Glob(FLAGS.data_location), reverse=True)[:1]
-+ print(tf_records)
-+ run_graph(graph, tf_records)
-+
-+if __name__ == "__main__":
-+ app.run(main)
++# Copyright 2019 Google LLC
++#
++# Licensed under the Apache License, Version 2.0 (the "License");
++# you may not use this file except in compliance with the License.
++# You may obtain a copy of the License at
++#
++# http://www.apache.org/licenses/LICENSE-2.0
++#
++# Unless required by applicable law or agreed to in writing, software
++# distributed under the License is distributed on an "AS IS" BASIS,
++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++# See the License for the specific language governing permissions and
++# limitations under the License.
++
++#!/usr/bin/env python
++# encoding: utf-8
++
++import time
++import os
++
++import tensorflow as tf
++from tensorflow.core.framework import graph_pb2
++from tensorflow.python.platform import gfile
++
++from absl import app, flags
++
++import preprocessing
++import dual_net
++
++
++flags.DEFINE_string('input_graph', None, 'The path of input graph.')
++flags.DEFINE_string('data_location', None, 'The path of input data.')
++flags.DEFINE_integer('num_steps', 20, 'Number of eval steps.')
++flags.DEFINE_integer('batch_size', 20, 'eval batch size.')
++flags.DEFINE_boolean('random_rotation', True, 'Do random rotation if true.')
++
++
++FLAGS = flags.FLAGS
++
++def run_graph(graph, tf_records):
++
++ data_graph = tf.Graph()
++ with data_graph.as_default():
++ features, labels = preprocessing.get_input_tensors(
++ FLAGS.batch_size,
++ tf_records,
++ shuffle_buffer_size=100000000,
++ random_rotation=FLAGS.random_rotation, seed=2,
++ dist_train=False, make_one_shot=True)
++
++ infer_graph = tf.Graph()
++ with infer_graph.as_default():
++ tf.import_graph_def(graph, name='')
++
++ input_tensor = dual_net.get_input_tensor(infer_graph)
++ output_tensor = dual_net.get_output_tensor(infer_graph)
++
++ config = tf.ConfigProto(
++ intra_op_parallelism_threads=FLAGS.num_intra_threads,
++ inter_op_parallelism_threads=FLAGS.num_inter_threads)
++ data_sess = tf.Session(graph=data_graph, config=config)
++ infer_sess = tf.Session(graph=infer_graph, config=config)
++
++ elapsed = 0
++ #with tf.contrib.tfprof.ProfileContext('/home/letiank/skx-8180/train_dir/minigo', trace_steps=range(70, 80), dump_steps=[110]):
++ for it in range(FLAGS.num_steps):
++ features_np = data_sess.run(features)
++ start_time = time.time()
++ infer_sess.run(output_tensor, feed_dict={input_tensor: features_np})
++ elapsed += time.time() - start_time
++
++def read_graph(input_graph):
++ if not gfile.Exists(input_graph):
++ print("Input graph file '" + input_graph + "' does not exist!")
++ exit(-1)
++
++ input_graph_def = graph_pb2.GraphDef()
++ with gfile.Open(input_graph, "rb") as f:
++ data = f.read()
++ input_graph_def.ParseFromString(data)
++
++ return input_graph_def
++
++
++def main(unused_argv):
++ """Run the reinforcement learning loop."""
++
++ graph = read_graph(FLAGS.input_graph)
++ tf_records = sorted(tf.gfile.Glob(FLAGS.data_location), reverse=True)[:1]
++ print(tf_records)
++ run_graph(graph, tf_records)
++
++if __name__ == "__main__":
++ app.run(main)
diff --git a/requirements-colab.txt b/requirements-colab.txt
index febb463..f24b44d 100644
--- a/requirements-colab.txt
diff --git a/models/reinforcement/tensorflow/minigo/training/fp32/minigo_mlperf_large_scale.patch b/models/reinforcement/tensorflow/minigo/training/fp32/minigo_mlperf_large_scale.patch
index 062ed2acd..ebbd989ba 100644
--- a/models/reinforcement/tensorflow/minigo/training/fp32/minigo_mlperf_large_scale.patch
+++ b/models/reinforcement/tensorflow/minigo/training/fp32/minigo_mlperf_large_scale.patch
@@ -3028,100 +3028,100 @@ index 0000000..493ce38
--- /dev/null
+++ b/produce_min_max_log.py
@@ -0,0 +1,94 @@
-+# Copyright 2019 Google LLC
-+#
-+# Licensed under the Apache License, Version 2.0 (the "License");
-+# you may not use this file except in compliance with the License.
-+# You may obtain a copy of the License at
-+#
-+# http://www.apache.org/licenses/LICENSE-2.0
-+#
-+# Unless required by applicable law or agreed to in writing, software
-+# distributed under the License is distributed on an "AS IS" BASIS,
-+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-+# See the License for the specific language governing permissions and
-+# limitations under the License.
-+
-+#!/usr/bin/env python
-+# encoding: utf-8
-+
-+import time
-+import os
-+
-+import tensorflow as tf
-+from tensorflow.core.framework import graph_pb2
-+from tensorflow.python.platform import gfile
-+
-+from absl import app, flags
-+
-+import preprocessing
-+import dual_net
-+
-+
-+flags.DEFINE_string('input_graph', None, 'The path of input graph.')
-+flags.DEFINE_string('data_location', None, 'The path of input data.')
-+flags.DEFINE_integer('num_steps', 20, 'Number of eval steps.')
-+flags.DEFINE_integer('batch_size', 20, 'eval batch size.')
-+flags.DEFINE_boolean('random_rotation', True, 'Do random rotation if true.')
-+
-+
-+FLAGS = flags.FLAGS
-+
-+def run_graph(graph, tf_records):
-+
-+ data_graph = tf.Graph()
-+ with data_graph.as_default():
-+ features, labels = preprocessing.get_input_tensors(
-+ FLAGS.batch_size,
-+ tf_records,
-+ shuffle_buffer_size=100000000,
-+ random_rotation=FLAGS.random_rotation, seed=2,
-+ dist_train=False, make_one_shot=True)
-+
-+ infer_graph = tf.Graph()
-+ with infer_graph.as_default():
-+ tf.import_graph_def(graph, name='')
-+
-+ input_tensor = dual_net.get_input_tensor(infer_graph)
-+ output_tensor = dual_net.get_output_tensor(infer_graph)
-+
-+ config = tf.ConfigProto(
-+ intra_op_parallelism_threads=FLAGS.num_intra_threads,
-+ inter_op_parallelism_threads=FLAGS.num_inter_threads)
-+ data_sess = tf.Session(graph=data_graph, config=config)
-+ infer_sess = tf.Session(graph=infer_graph, config=config)
-+
-+ elapsed = 0
-+ #with tf.contrib.tfprof.ProfileContext('/home/letiank/skx-8180/train_dir/minigo', trace_steps=range(70, 80), dump_steps=[110]):
-+ for it in range(FLAGS.num_steps):
-+ features_np = data_sess.run(features)
-+ start_time = time.time()
-+ infer_sess.run(output_tensor, feed_dict={input_tensor: features_np})
-+ elapsed += time.time() - start_time
-+
-+def read_graph(input_graph):
-+ if not gfile.Exists(input_graph):
-+ print("Input graph file '" + input_graph + "' does not exist!")
-+ exit(-1)
-+
-+ input_graph_def = graph_pb2.GraphDef()
-+ with gfile.Open(input_graph, "rb") as f:
-+ data = f.read()
-+ input_graph_def.ParseFromString(data)
-+
-+ return input_graph_def
-+
-+
-+def main(unused_argv):
-+ """Run the reinforcement learning loop."""
-+
-+ graph = read_graph(FLAGS.input_graph)
-+ tf_records = sorted(tf.gfile.Glob(FLAGS.data_location), reverse=True)[:1]
-+ print(tf_records)
-+ run_graph(graph, tf_records)
-+
-+if __name__ == "__main__":
-+ app.run(main)
++# Copyright 2019 Google LLC
++#
++# Licensed under the Apache License, Version 2.0 (the "License");
++# you may not use this file except in compliance with the License.
++# You may obtain a copy of the License at
++#
++# http://www.apache.org/licenses/LICENSE-2.0
++#
++# Unless required by applicable law or agreed to in writing, software
++# distributed under the License is distributed on an "AS IS" BASIS,
++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++# See the License for the specific language governing permissions and
++# limitations under the License.
++
++#!/usr/bin/env python
++# encoding: utf-8
++
++import time
++import os
++
++import tensorflow as tf
++from tensorflow.core.framework import graph_pb2
++from tensorflow.python.platform import gfile
++
++from absl import app, flags
++
++import preprocessing
++import dual_net
++
++
++flags.DEFINE_string('input_graph', None, 'The path of input graph.')
++flags.DEFINE_string('data_location', None, 'The path of input data.')
++flags.DEFINE_integer('num_steps', 20, 'Number of eval steps.')
++flags.DEFINE_integer('batch_size', 20, 'eval batch size.')
++flags.DEFINE_boolean('random_rotation', True, 'Do random rotation if true.')
++
++
++FLAGS = flags.FLAGS
++
++def run_graph(graph, tf_records):
++
++ data_graph = tf.Graph()
++ with data_graph.as_default():
++ features, labels = preprocessing.get_input_tensors(
++ FLAGS.batch_size,
++ tf_records,
++ shuffle_buffer_size=100000000,
++ random_rotation=FLAGS.random_rotation, seed=2,
++ dist_train=False, make_one_shot=True)
++
++ infer_graph = tf.Graph()
++ with infer_graph.as_default():
++ tf.import_graph_def(graph, name='')
++
++ input_tensor = dual_net.get_input_tensor(infer_graph)
++ output_tensor = dual_net.get_output_tensor(infer_graph)
++
++ config = tf.ConfigProto(
++ intra_op_parallelism_threads=FLAGS.num_intra_threads,
++ inter_op_parallelism_threads=FLAGS.num_inter_threads)
++ data_sess = tf.Session(graph=data_graph, config=config)
++ infer_sess = tf.Session(graph=infer_graph, config=config)
++
++ elapsed = 0
++ #with tf.contrib.tfprof.ProfileContext('/home/letiank/skx-8180/train_dir/minigo', trace_steps=range(70, 80), dump_steps=[110]):
++ for it in range(FLAGS.num_steps):
++ features_np = data_sess.run(features)
++ start_time = time.time()
++ infer_sess.run(output_tensor, feed_dict={input_tensor: features_np})
++ elapsed += time.time() - start_time
++
++def read_graph(input_graph):
++ if not gfile.Exists(input_graph):
++ print("Input graph file '" + input_graph + "' does not exist!")
++ exit(-1)
++
++ input_graph_def = graph_pb2.GraphDef()
++ with gfile.Open(input_graph, "rb") as f:
++ data = f.read()
++ input_graph_def.ParseFromString(data)
++
++ return input_graph_def
++
++
++def main(unused_argv):
++ """Run the reinforcement learning loop."""
++
++ graph = read_graph(FLAGS.input_graph)
++ tf_records = sorted(tf.gfile.Glob(FLAGS.data_location), reverse=True)[:1]
++ print(tf_records)
++ run_graph(graph, tf_records)
++
++if __name__ == "__main__":
++ app.run(main)
diff --git a/quantize_graph.py b/quantize_graph.py
new file mode 100644
index 0000000..4789825
diff --git a/third_party/licenses.txt b/third_party/licenses.txt
index 06e472798..689311f7f 100644
--- a/third_party/licenses.txt
+++ b/third_party/licenses.txt
@@ -1,237 +1,237 @@
-Intel-model-zoo v 1.6
-
-This file contains the list of third party software (“third party programs”) contained in the Intel software and their required notices and/or license terms. This third party software, even if included with the distribution of the Intel software, may be governed by separate license terms, including without limitation, third party license terms, other Intel software license terms, and open source software license terms. These separate license terms govern your use of the third party programs as set forth in the “third-party-programs.txt” or other similarly-named text file.
-
-Third party programs and their corresponding required notices and/or license terms are listed below.
---------------------------------------------------------------------------------------
-1. AITTSMD MTCNN-Tensorflow
-
- Matterport, Inc. MaskRCNN:
- Copyright (c) 2017 Matterport, Inc.
-
-
-The MIT License (MIT)
-
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
-
- The above copyright notice and this permission notice shall be included in
- all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- THE SOFTWARE.
-
---------------------------------------------------------------------------------------
-
-2. ericjang draw
-
- IntelAI Tool
- Copyright (c) 2019 Intel Corporation
-
- LevinJ SSD-tensorflow-VOC
- Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
- Tensorflow benchmarks
- Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
- TensorFlow InceptionV4
- Copyright 2016 The TensorFlow Authors. All rights reserved.
-
- Tensorflow NCF
- Copyright 2016 The TensorFlow Authors. All rights reserved.
-
- TensorFlow NMT
- Copyright 2017 Google Inc. All Rights Reserved.
-
- Apache License 2.0
- Version 2.0, January 2004
- http://www.apache.org/licenses/
-
- TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
- 1. Definitions.
-
- "License" shall mean the terms and conditions for use, reproduction,
- and distribution as defined by Sections 1 through 9 of this document.
-
- "Licensor" shall mean the copyright owner or entity authorized by
- the copyright owner that is granting the License.
-
- "Legal Entity" shall mean the union of the acting entity and all
- other entities that control, are controlled by, or are under common
- control with that entity. For the purposes of this definition,
- "control" means (i) the power, direct or indirect, to cause the
- direction or management of such entity, whether by contract or
- otherwise, or (ii) ownership of fifty percent (50%) or more of the
- outstanding shares, or (iii) beneficial ownership of such entity.
-
- "You" (or "Your") shall mean an individual or Legal Entity
- exercising permissions granted by this License.
-
- "Source" form shall mean the preferred form for making modifications,
- including but not limited to software source code, documentation
- source, and configuration files.
-
- "Object" form shall mean any form resulting from mechanical
- transformation or translation of a Source form, including but
- not limited to compiled object code, generated documentation,
- and conversions to other media types.
-
- "Work" shall mean the work of authorship, whether in Source or
- Object form, made available under the License, as indicated by a
- copyright notice that is included in or attached to the work
- (an example is provided in the Appendix below).
-
- "Derivative Works" shall mean any work, whether in Source or Object
- form, that is based on (or derived from) the Work and for which the
- editorial revisions, annotations, elaborations, or other modifications
- represent, as a whole, an original work of authorship. For the purposes
- of this License, Derivative Works shall not include works that remain
- separable from, or merely link (or bind by name) to the interfaces of,
- the Work and Derivative Works thereof.
-
- "Contribution" shall mean any work of authorship, including
- the original version of the Work and any modifications or additions
- to that Work or Derivative Works thereof, that is intentionally
- submitted to Licensor for inclusion in the Work by the copyright owner
- or by an individual or Legal Entity authorized to submit on behalf of
- the copyright owner. For the purposes of this definition, "submitted"
- means any form of electronic, verbal, or written communication sent
- to the Licensor or its representatives, including but not limited to
- communication on electronic mailing lists, source code control systems,
- and issue tracking systems that are managed by, or on behalf of, the
- Licensor for the purpose of discussing and improving the Work, but
- excluding communication that is conspicuously marked or otherwise
- designated in writing by the copyright owner as "Not a Contribution."
-
- "Contributor" shall mean Licensor and any individual or Legal Entity
- on behalf of whom a Contribution has been received by Licensor and
- subsequently incorporated within the Work.
-
- 2. Grant of Copyright License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- copyright license to reproduce, prepare Derivative Works of,
- publicly display, publicly perform, sublicense, and distribute the
- Work and such Derivative Works in Source or Object form.
-
- 3. Grant of Patent License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- (except as stated in this section) patent license to make, have made,
- use, offer to sell, sell, import, and otherwise transfer the Work,
- where such license applies only to those patent claims licensable
- by such Contributor that are necessarily infringed by their
- Contribution(s) alone or by combination of their Contribution(s)
- with the Work to which such Contribution(s) was submitted. If You
- institute patent litigation against any entity (including a
- cross-claim or counterclaim in a lawsuit) alleging that the Work
- or a Contribution incorporated within the Work constitutes direct
- or contributory patent infringement, then any patent licenses
- granted to You under this License for that Work shall terminate
- as of the date such litigation is filed.
-
- 4. Redistribution. You may reproduce and distribute copies of the
- Work or Derivative Works thereof in any medium, with or without
- modifications, and in Source or Object form, provided that You
- meet the following conditions:
-
- (a) You must give any other recipients of the Work or
- Derivative Works a copy of this License; and
-
- (b) You must cause any modified files to carry prominent notices
- stating that You changed the files; and
-
- (c) You must retain, in the Source form of any Derivative Works
- that You distribute, all copyright, patent, trademark, and
- attribution notices from the Source form of the Work,
- excluding those notices that do not pertain to any part of
- the Derivative Works; and
-
- (d) If the Work includes a "NOTICE" text file as part of its
- distribution, then any Derivative Works that You distribute must
- include a readable copy of the attribution notices contained
- within such NOTICE file, excluding those notices that do not
- pertain to any part of the Derivative Works, in at least one
- of the following places: within a NOTICE text file distributed
- as part of the Derivative Works; within the Source form or
- documentation, if provided along with the Derivative Works; or,
- within a display generated by the Derivative Works, if and
- wherever such third-party notices normally appear. The contents
- of the NOTICE file are for informational purposes only and
- do not modify the License. You may add Your own attribution
- notices within Derivative Works that You distribute, alongside
- or as an addendum to the NOTICE text from the Work, provided
- that such additional attribution notices cannot be construed
- as modifying the License.
-
- You may add Your own copyright statement to Your modifications and
- may provide additional or different license terms and conditions
- for use, reproduction, or distribution of Your modifications, or
- for any such Derivative Works as a whole, provided Your use,
- reproduction, and distribution of the Work otherwise complies with
- the conditions stated in this License.
-
- 5. Submission of Contributions. Unless You explicitly state otherwise,
- any Contribution intentionally submitted for inclusion in the Work
- by You to the Licensor shall be under the terms and conditions of
- this License, without any additional terms or conditions.
- Notwithstanding the above, nothing herein shall supersede or modify
- the terms of any separate license agreement you may have executed
- with Licensor regarding such Contributions.
-
- 6. Trademarks. This License does not grant permission to use the trade
- names, trademarks, service marks, or product names of the Licensor,
- except as required for reasonable and customary use in describing the
- origin of the Work and reproducing the content of the NOTICE file.
-
- 7. Disclaimer of Warranty. Unless required by applicable law or
- agreed to in writing, Licensor provides the Work (and each
- Contributor provides its Contributions) on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
- implied, including, without limitation, any warranties or conditions
- of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
- PARTICULAR PURPOSE. You are solely responsible for determining the
- appropriateness of using or redistributing the Work and assume any
- risks associated with Your exercise of permissions under this License.
-
- 8. Limitation of Liability. In no event and under no legal theory,
- whether in tort (including negligence), contract, or otherwise,
- unless required by applicable law (such as deliberate and grossly
- negligent acts) or agreed to in writing, shall any Contributor be
- liable to You for damages, including any direct, indirect, special,
- incidental, or consequential damages of any character arising as a
- result of this License or out of the use or inability to use the
- Work (including but not limited to damages for loss of goodwill,
- work stoppage, computer failure or malfunction, or any and all
- other commercial damages or losses), even if such Contributor
- has been advised of the possibility of such damages.
-
- 9. Accepting Warranty or Additional Liability. While redistributing
- the Work or Derivative Works thereof, You may choose to offer,
- and charge a fee for, acceptance of support, warranty, indemnity,
- or other liability obligations and/or rights consistent with this
- License. However, in accepting such obligations, You may act only
- on Your own behalf and on Your sole responsibility, not on behalf
- of any other Contributor, and only if You agree to indemnify,
- defend, and hold each Contributor harmless for any liability
- incurred by, or claims asserted against, such Contributor by reason
- of your accepting any such warranty or additional liability.
-
- END OF TERMS AND CONDITIONS
---------------------------------------------------------------------------------------
-The following third party programs have their own third party program files. These additional third party program files are as follows:
-
-1. N/A:
-
---------------------------------------------------------------------------------------
-Other names and brands may be claimed as the property of others.
+Intel-model-zoo v 1.6
+
+This file contains the list of third party software (“third party programs”) contained in the Intel software and their required notices and/or license terms. This third party software, even if included with the distribution of the Intel software, may be governed by separate license terms, including without limitation, third party license terms, other Intel software license terms, and open source software license terms. These separate license terms govern your use of the third party programs as set forth in the “third-party-programs.txt” or other similarly-named text file.
+
+Third party programs and their corresponding required notices and/or license terms are listed below.
+--------------------------------------------------------------------------------------
+1. AITTSMD MTCNN-Tensorflow
+
+ Matterport, Inc. MaskRCNN:
+ Copyright (c) 2017 Matterport, Inc.
+
+
+The MIT License (MIT)
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+
+--------------------------------------------------------------------------------------
+
+2. ericjang draw
+
+ IntelAI Tool
+ Copyright (c) 2019 Intel Corporation
+
+ LevinJ SSD-tensorflow-VOC
+ Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+ Tensorflow benchmarks
+ Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+ TensorFlow InceptionV4
+ Copyright 2016 The TensorFlow Authors. All rights reserved.
+
+ Tensorflow NCF
+ Copyright 2016 The TensorFlow Authors. All rights reserved.
+
+ TensorFlow NMT
+ Copyright 2017 Google Inc. All Rights Reserved.
+
+ Apache License 2.0
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+--------------------------------------------------------------------------------------
+The following third party programs have their own third party program files. These additional third party program files are as follows:
+
+1. N/A:
+
+--------------------------------------------------------------------------------------
+Other names and brands may be claimed as the property of others.