From cc1a83e1651fc9ce75cac7f69a594c763f67f967 Mon Sep 17 00:00:00 2001 From: Jing Xu Date: Sat, 30 Mar 2024 08:15:13 +0800 Subject: [PATCH] update dependency version (#3895) * add torch-ccl into compile bundle * fix dead link in doc * update footer link * update deepspeed dependency version, remove cpu related md files from build_doc.sh * add xpu perf * version to 2.1.20 * fix example import * update torch ccl version * add mpi path in the scripts * update dependency version * move known issue to tutorial repo * update known issue link * add note for not contain cpu features * update log version * update feature and example doc * update model zoo version * add paper to publications * remove cheetsheet --------- Co-authored-by: Zheng, Zhaoqiong Co-authored-by: Ye Ting --- dependency_version.yml | 14 +- docs/_static/custom.css | 3 + docs/_templates/footer.html | 2 +- docs/index.rst | 9 +- docs/tutorials/api_doc.rst | 5 +- docs/tutorials/blogs_publications.md | 1 + docs/tutorials/cheat_sheet.md | 23 --- docs/tutorials/examples.md | 36 ++--- docs/tutorials/features.rst | 25 ++-- docs/tutorials/features/amp_cpu.md | 102 -------------- docs/tutorials/features/int4.md | 42 ------ docs/tutorials/installation.rst | 2 +- .../{performance_tuning => }/known_issues.md | 0 docs/tutorials/llm.rst | 16 +-- .../llm/llm_optimize_transformers.md | 18 --- docs/tutorials/performance_tuning.rst | 11 -- docs/tutorials/releases.md | 10 +- examples/gpu/inference/python/llm/Dockerfile | 131 +++++++++--------- examples/gpu/inference/python/llm/README.md | 9 +- .../python/llm/tools/env_activate.sh | 2 - .../inference/python/llm/tools/env_setup.sh | 24 +++- scripts/build_doc.sh | 6 - scripts/compile_bundle.sh | 27 ++-- tools/basekit_driver_install_helper.sh | 123 ++++++++-------- 24 files changed, 222 insertions(+), 419 deletions(-) delete mode 100644 docs/tutorials/cheat_sheet.md delete mode 100644 docs/tutorials/features/amp_cpu.md delete mode 100644 docs/tutorials/features/int4.md rename docs/tutorials/{performance_tuning => }/known_issues.md (100%) delete mode 100644 docs/tutorials/performance_tuning.rst diff --git a/dependency_version.yml b/dependency_version.yml index b3435783b..b9df37a0f 100644 --- a/dependency_version.yml +++ b/dependency_version.yml @@ -4,21 +4,21 @@ gcc: llvm: version: 16.0.6 pytorch: - version: 2.1.0a0 + version: 2.1.0.post0+cxx11.abi commit: v2.1.0 torchaudio: - version: 2.1.0a0 + version: 2.1.0.post0+cxx11.abi commit: v2.1.0 torchvision: - version: 0.16.0a0 + version: 0.16.0.post0+cxx11.abi commit: v0.16.0 torch-ccl: repo: https://github.com/intel/torch-ccl.git - commit: 5f20135ccf8f828738cb3bc5a5ae7816df8100ae - version: 2.1.100+xpu + commit: 5ee65b42c42a0d91c4cf459d9be40020274003b6 + version: 2.1.200+xpu deepspeed: repo: https://github.com/microsoft/DeepSpeed.git - version: + version: v0.11.2 commit: 4fc181b01077521ba42379013ce91a1c294e5d8e intel-extension-for-deepspeed: repo: https://github.com/intel/intel-extension-for-deepspeed.git @@ -28,7 +28,7 @@ transformers: commit: v4.31.0 protobuf: version: 3.20.3 -llm_eval: +lm_eval: version: 0.3.0 basekit: dpcpp-cpp-rt: diff --git a/docs/_static/custom.css b/docs/_static/custom.css index ec82c8204..a8a04605f 100644 --- a/docs/_static/custom.css +++ b/docs/_static/custom.css @@ -15,6 +15,9 @@ a#wap_dns { display: none; } +a#wap_nac { + display: none; +} /* replace the copyright to eliminate the copyright symbol enforced by the ReadTheDocs theme */ diff --git a/docs/_templates/footer.html b/docs/_templates/footer.html index 
2ba1962f1..a48d262fc 100644 --- a/docs/_templates/footer.html +++ b/docs/_templates/footer.html @@ -1,3 +1,3 @@ {% extends '!footer.html' %} {% block extrafooter %} {{super}} -

Cookies | Privacy | Do Not Share My Personal Information

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
+

Cookies | Privacy Your Privacy Choices Notice at Collection

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
{% endblock %} diff --git a/docs/index.rst b/docs/index.rst index d1a1f04d0..d82e70052 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -15,7 +15,7 @@ Large Language Models (LLMs) are introduced in the Intel® Extension for PyTorch The extension can be loaded as a Python module for Python programs or linked as a C++ library for C++ programs. In Python scripts, users can enable it dynamically by importing ``intel_extension_for_pytorch``. .. note:: - + - CPU features are not included in GPU-only packages. - GPU features are not included in CPU-only packages. - Optimizations for CPU-only may have a newer code base due to different development schedules. @@ -26,8 +26,8 @@ Intel® Extension for PyTorch* has been released as an open–source project at You can find more information about the product at: -- `Features `_ -- `Performance <./tutorials/performance.html>`_ +- `Features `_ +- `Performance `_ Architecture ------------ @@ -62,7 +62,7 @@ The team tracks bugs and enhancement requests using `GitHub issues -f https://developer.intel.com/ipex-whl-stable-xpu`
`pip install intel_extension_for_pytorch== -f https://developer.intel.com/ipex-whl-stable-xpu`| -| Import Intel® Extension for PyTorch\* | `import intel_extension_for_pytorch as ipex`| -| Capture a Verbose Log (Command Prompt) | `export ONEDNN_VERBOSE=1` | -| Optimization During Training | `model = ...`
`optimizer = ...`
`model.train()`
`model, optimizer = ipex.optimize(model, optimizer=optimizer)`| -| Optimization During Inference | `model = ...`
`model.eval()`
`model = ipex.optimize(model)` | -| Optimization Using the Low-Precision Data Type bfloat16
During Training (Default FP32) | `model = ...`
`optimizer = ...`
`model.train()`

`model, optimizer = ipex.optimize(model, optimizer=optimizer, dtype=torch.bfloat16)`

`with torch.no_grad():`
` with torch.cpu.amp.autocast():`
` model(data)` | -| Optimization Using the Low-Precision Data Type bfloat16
During Inference (Default FP32) | `model = ...`
`model.eval()`

`model = ipex.optimize(model, dtype=torch.bfloat16)`

`with torch.cpu.amp.autocast():`
` model(data)` -| [Prototype] Fast BERT Optimization | `from transformers import BertModel`
`model = BertModel.from_pretrained("bert-base-uncased")`
`model.eval()`

`model = ipex.fast_bert(model, dtype=torch.bfloat16)`| -| Run CPU Launch Script (Command Prompt):
Automate Configuration Settings for Performance | `ipexrun [knobs] [args]`| -| [Prototype] Run HyperTune to perform hyperparameter/execution configuration search | `python -m intel_extension_for_pytorch.cpu.hypertune --conf-file [args]`| -| [Prototype] Enable Graph capture | `model = …`
`model.eval()`
`model = ipex.optimize(model, graph_mode=True)`| -| Post-Training INT8 Quantization (Static) | `model = …`
`model.eval()`
`data = …`

`qconfig = ipex.quantization.default_static_qconfig`

`prepared_model = ipex.quantization.prepare(model, qconfig, example_inputs=data, anyplace=False)`

`for d in calibration_data_loader():`
` prepared_model(d)`

`converted_model = ipex.quantization.convert(prepared_model)`| -| Post-Training INT8 Quantization (Dynamic) | `model = …`
`model.eval()`
`data = …`

`qconfig = ipex.quantization.default_dynamic_qconfig`

`prepared_model = ipex.quantization.prepare(model, qconfig, example_inputs=data)`

`converted_model = ipex.quantization.convert(prepared_model)` | -| [Prototype] Post-Training INT8 Quantization (Tuning Recipe): | `model = …`
`model.eval()`
`data = …`

`qconfig = ipex.quantization.default_static_qconfig`

`prepared_model = ipex.quantization.prepare(model, qconfig, example_inputs=data, inplace=False)`

`tuned_model = ipex.quantization.autotune(prepared_model, calibration_data_loader, eval_function, sampling_sizes=[100],`
` accuracy_criterion={'relative': .01}, tuning_time=0)`

`convert_model = ipex.quantization.convert(tuned_model)`| - diff --git a/docs/tutorials/examples.md b/docs/tutorials/examples.md index b929766e1..221251125 100644 --- a/docs/tutorials/examples.md +++ b/docs/tutorials/examples.md @@ -4,8 +4,6 @@ Examples These examples will help you get started using Intel® Extension for PyTorch\* with Intel GPUs. -For examples on Intel CPUs, check the [CPU examples](../../../cpu/latest/tutorials/examples.html). - **Prerequisites**: Before running these examples, install the `torchvision` and `transformers` Python packages. @@ -27,7 +25,7 @@ Before running these examples, install the `torchvision` and `transformers` Pyth To use Intel® Extension for PyTorch\* on training, you need to make the following changes in your code: 1. Import `intel_extension_for_pytorch` as `ipex`. -2. Use the `ipex.optimize` function, which applies optimizations against the model object, as well as an optimizer object. +2. Use the `ipex.optimize` function for additional performance boost, which applies optimizations against the model object, as well as an optimizer object. 3. Use Auto Mixed Precision (AMP) with BFloat16 data type. 4. Convert input tensors, loss criterion and model to XPU, as shown below: @@ -219,18 +217,20 @@ The is the absolute path of libtorch we install at the first s If *Found IPEX* is shown as dynamic library paths, the extension was linked into the binary. This can be verified with the Linux command *ldd*. +The value of x, y, z in the following log will change depending on the version you choose. + ```bash $ CC=icx CXX=icpx cmake -DCMAKE_PREFIX_PATH=/workspace/libtorch .. --- The C compiler identification is IntelLLVM 2024.0.0 --- The CXX compiler identification is IntelLLVM 2024.0.0 +-- The C compiler identification is IntelLLVM 202x.y.z +-- The CXX compiler identification is IntelLLVM 202x.y.z -- Detecting C compiler ABI info -- Detecting C compiler ABI info - done --- Check for working C compiler: /workspace/intel/oneapi/compiler/2024.0.0/linux/bin/icx - skipped +-- Check for working C compiler: /workspace/intel/oneapi/compiler/202x.y.z/linux/bin/icx - skipped -- Detecting C compile features -- Detecting C compile features - done -- Detecting CXX compiler ABI info -- Detecting CXX compiler ABI info - done --- Check for working CXX compiler: /workspace/intel/oneapi/compiler/2024.0.0/linux/bin/icpx - skipped +-- Check for working CXX compiler: /workspace/intel/oneapi/compiler/202x.y.z/linux/bin/icpx - skipped -- Detecting CXX compile features -- Detecting CXX compile features - done -- Looking for pthread.h @@ -252,16 +252,16 @@ $ ldd example-app libintel-ext-pt-cpu.so => /workspace/libtorch/lib/libintel-ext-pt-cpu.so (0x00007fd5a1a1b000) libintel-ext-pt-gpu.so => /workspace/libtorch/lib/libintel-ext-pt-gpu.so (0x00007fd5862b0000) ... 
- libmkl_intel_lp64.so.2 => /workspace/intel/oneapi/mkl/2024.0.0/lib/intel64/libmkl_intel_lp64.so.2 (0x00007fd584ab0000) - libmkl_core.so.2 => /workspace/intel/oneapi/mkl/2024.0.0/lib/intel64/libmkl_core.so.2 (0x00007fd5806cc000) - libmkl_gnu_thread.so.2 => /workspace/intel/oneapi/mkl/2024.0.0/lib/intel64/libmkl_gnu_thread.so.2 (0x00007fd57eb1d000) - libmkl_sycl.so.3 => /workspace/intel/oneapi/mkl/2024.0.0/lib/intel64/libmkl_sycl.so.3 (0x00007fd55512c000) - libOpenCL.so.1 => /workspace/intel/oneapi/compiler/2024.0.0/linux/lib/libOpenCL.so.1 (0x00007fd55511d000) - libsvml.so => /workspace/intel/oneapi/compiler/2024.0.0/linux/compiler/lib/intel64_lin/libsvml.so (0x00007fd553b11000) - libirng.so => /workspace/intel/oneapi/compiler/2024.0.0/linux/compiler/lib/intel64_lin/libirng.so (0x00007fd553600000) - libimf.so => /workspace/intel/oneapi/compiler/2024.0.0/linux/compiler/lib/intel64_lin/libimf.so (0x00007fd55321b000) - libintlc.so.5 => /workspace/intel/oneapi/compiler/2024.0.0/linux/compiler/lib/intel64_lin/libintlc.so.5 (0x00007fd553a9c000) - libsycl.so.6 => /workspace/intel/oneapi/compiler/2024.0.0/linux/lib/libsycl.so.6 (0x00007fd552f36000) + libmkl_intel_lp64.so.2 => /workspace/intel/oneapi/mkl/202x.y.z/lib/intel64/libmkl_intel_lp64.so.2 (0x00007fd584ab0000) + libmkl_core.so.2 => /workspace/intel/oneapi/mkl/202x.y.z/lib/intel64/libmkl_core.so.2 (0x00007fd5806cc000) + libmkl_gnu_thread.so.2 => /workspace/intel/oneapi/mkl/202x.y.z/lib/intel64/libmkl_gnu_thread.so.2 (0x00007fd57eb1d000) + libmkl_sycl.so.3 => /workspace/intel/oneapi/mkl/202x.y.z/lib/intel64/libmkl_sycl.so.3 (0x00007fd55512c000) + libOpenCL.so.1 => /workspace/intel/oneapi/compiler/202x.y.z/linux/lib/libOpenCL.so.1 (0x00007fd55511d000) + libsvml.so => /workspace/intel/oneapi/compiler/202x.y.z/linux/compiler/lib/intel64_lin/libsvml.so (0x00007fd553b11000) + libirng.so => /workspace/intel/oneapi/compiler/202x.y.z/linux/compiler/lib/intel64_lin/libirng.so (0x00007fd553600000) + libimf.so => /workspace/intel/oneapi/compiler/202x.y.z/linux/compiler/lib/intel64_lin/libimf.so (0x00007fd55321b000) + libintlc.so.5 => /workspace/intel/oneapi/compiler/202x.y.z/linux/compiler/lib/intel64_lin/libintlc.so.5 (0x00007fd553a9c000) + libsycl.so.6 => /workspace/intel/oneapi/compiler/202x.y.z/linux/lib/libsycl.so.6 (0x00007fd552f36000) ... ``` @@ -286,4 +286,4 @@ Intel® Extension for PyTorch\* provides its C++ dynamic library to allow users ## Intel® AI Reference Models -Use cases that have already been optimized by Intel engineers are available at [Intel® AI Reference Models](https://github.com/IntelAI/models/tree/v2.12.0) (former Model Zoo). A number of PyTorch use cases for benchmarking are also available in the [Use Cases](https://github.com/IntelAI/models/tree/v2.12.0#use-cases) section. Models verified on Intel GPUs are marked in the `Model Documentation` column. You can get performance benefits out-of-the-box by simply running scripts in the Intel® AI Reference Models. +Use cases that have already been optimized by Intel engineers are available at [Intel® AI Reference Models](https://github.com/IntelAI/models/tree/v3.1.1) (former Model Zoo). A number of PyTorch use cases for benchmarking are also available in the [Use Cases](https://github.com/IntelAI/models/tree/v3.1.1?tab=readme-ov-file#use-cases) section. Models verified on Intel GPUs are marked in the `Model Documentation` column. You can get performance benefits out-of-the-box by simply running scripts in the Intel® AI Reference Models. 
diff --git a/docs/tutorials/features.rst b/docs/tutorials/features.rst index b907d5443..fb22239c2 100644 --- a/docs/tutorials/features.rst +++ b/docs/tutorials/features.rst @@ -1,8 +1,8 @@ Features ======== -Device-Agnostic -*************** +GPU-Specific +************ Easy-to-use Python API ---------------------- @@ -46,16 +46,15 @@ Quantization Intel® Extension for PyTorch* currently supports imperative mode and TorchScript mode for post-training static quantization on GPU. This section illustrates the quantization workflow on Intel GPUs. -Check more detailed information for `INT8 Quantization [XPU] `_. +Check more detailed information for `INT8 Quantization `_. -On Intel® GPUs, Intel® Extension for PyTorch* also provides INT4 and FP8 Quantization. Check more detailed information for `FP8 Quantization <./features/float8.md>`_ and `INT4 Quantization <./features/int4.md>`_ +On Intel® GPUs, Intel® Extension for PyTorch* also provides FP8 Quantization. Check more detailed information for `FP8 Quantization <./features/float8.md>`_. .. toctree:: :hidden: :maxdepth: 1 features/int8_overview_xpu - features/int4 features/float8 @@ -74,9 +73,6 @@ For more detailed information, check `DDP `_ and `Horovod (Prot features/horovod -GPU-Specific -************ - DLPack Solution --------------- @@ -131,11 +127,12 @@ For more detailed information, check `FSDP `_. features/FSDP -Inductor --------- +torch.compile for GPU (Beta) +---------------------------- + Intel® Extension for PyTorch\* now empowers users to seamlessly harness graph compilation capabilities for optimal PyTorch model performance on Intel GPU via the flagship `torch.compile `_ API through the default "inductor" backend (`TorchInductor `_ ). -For more detailed information, check `Inductor `_. +For more detailed information, check `torch.compile for GPU `_. .. toctree:: :hidden: @@ -144,7 +141,7 @@ For more detailed information, check `Inductor `_ features/torch_compile_gpu Legacy Profiler Tool (Prototype) ------------------------------------ +-------------------------------- The legacy profiler tool is an extension of PyTorch* legacy profiler for profiling operators' overhead on XPU devices. With this tool, you can get the information in many fields of the run models or code scripts. Build Intel® Extension for PyTorch* with profiler support as default and enable this tool by adding a `with` statement before the code segment. @@ -157,7 +154,7 @@ For more detailed information, check `Legacy Profiler Tool `_. +Select your preferences and follow the installation instructions provided on the `Installation page <../../../index.html#installation?platform=gpu&version=v2.1.20%2Bxpu>`_. After successful installation, refer to the `Quick Start `_ and `Examples `_ sections to start using the extension in your code. diff --git a/docs/tutorials/performance_tuning/known_issues.md b/docs/tutorials/known_issues.md similarity index 100% rename from docs/tutorials/performance_tuning/known_issues.md rename to docs/tutorials/known_issues.md diff --git a/docs/tutorials/llm.rst b/docs/tutorials/llm.rst index 10b470224..e33397ccf 100644 --- a/docs/tutorials/llm.rst +++ b/docs/tutorials/llm.rst @@ -3,7 +3,7 @@ Large Language Models (LLM) Optimizations Overview In the current technological landscape, Generative AI (GenAI) workloads and models have gained widespread attention and popularity. LLMs have emerged as the dominant models driving these GenAI applications. Most of LLMs are GPT-like architectures that consist of multiple Decoder layers. 
The MultiHeadAttention and FeedForward layer are two key components of every Decoder layer. The generation task is memory bound because iterative decode and kv_cache require special management to reduce memory overheads. Intel® Extension for PyTorch* provides a lot of specific optimizations for these LLMs. -On the operator level, the extension provides highly efficient GEMM kernel to speed up Linear layer and customized operators to reduce the memory footprint. To better trade-off the performance and accuracy, different low-precision solutions e.g., smoothQuant and weight-only-quantization are also enabled. Besides, tensor parallel can also adopt to get lower latency for LLMs. +On the operator level, the extension provides highly efficient GEMM kernel to speed up Linear layer and customized operators to reduce the memory footprint. To better trade-off the performance and accuracy, different low-precision solutions e.g., smoothQuant is enabled. Besides, tensor parallel can also adopt to get lower latency for LLMs. These LLM-specific optimizations can be automatically applied with a single frontend API function in Python interface, `ipex.optimize_transformers()`. Check `optimize_transformers <./llm/llm_optimize_transformers.md>`_ for more details. @@ -35,16 +35,10 @@ Optimized Models - ✅ - ✅ - ✅ - * - Weight only quantzation INT4 - - ❎ - - ✅ - - ❎ - - ❎ - *Note*: The above verified models (including other models in the same model family, like "codellama/CodeLlama-7b-hf" from LLAMA family) are well supported with all optimizations like indirect access KV cache, fused ROPE, and prepacked TPP Linear (fp16). For other LLMs families, we are working in progress to cover those optimizations, which will expand the model list above. -Check `LLM best known practice `_ for instructions to install/setup environment and example scripts.. +Check `LLM best known practice `_ for instructions to install/setup environment and example scripts.. Optimization Methodologies -------------------------- @@ -111,9 +105,3 @@ heavier computations and places higher requirements to the underlying hardware. Given that, quantization becomes a more important methodology for inference workloads. -Intel® Extension for PyTorch\* also delivers INT4 optimizations via -4-bit weight-only quantization (WOQ). As the name indicates, WOQ -quantizes only weights to 4-bit integers to further improve the -computation efficiency via saved memory bandwidth utilization. This -technique reduces text generation latency especially from the second -token. diff --git a/docs/tutorials/llm/llm_optimize_transformers.md b/docs/tutorials/llm/llm_optimize_transformers.md index 4682b8ac6..82c8a2cfd 100644 --- a/docs/tutorials/llm/llm_optimize_transformers.md +++ b/docs/tutorials/llm/llm_optimize_transformers.md @@ -113,24 +113,6 @@ print(modelJit.graph_for(inference_dta)) ``` -### Weight Only Quantization (WOQ) - -Supports INT4. - -``` python -from transformers import GPTJForCausalLM - -model_path = ... -dataset = ... -model = GPTJForCausalLM.from_pretrained(model_path) -model.eval() - -ipex.quantization._gptq(model, dataset, 'quantized_weight.pt', wbits=4) - -# inference with model.generate() -... -``` - ### Distributed Inference with DeepSpeed Distributed inference can be performed with `DeepSpeed`. Based on original Intel® Extension for PyTorch\* scripts, the following code changes are required. 
diff --git a/docs/tutorials/performance_tuning.rst b/docs/tutorials/performance_tuning.rst deleted file mode 100644 index 2e4ebd747..000000000 --- a/docs/tutorials/performance_tuning.rst +++ /dev/null @@ -1,11 +0,0 @@ -Performance Tuning Guide -======================== - -Intel® Extension for PyTorch\* should yield a satisfying performance with its default configuration for general use cases. This page shows solutions for some known issues. - -- `Known Issues `_ - -.. toctree:: - :hidden: - - performance_tuning/known_issues diff --git a/docs/tutorials/releases.md b/docs/tutorials/releases.md index bfb3f10c0..b2a9c0f02 100644 --- a/docs/tutorials/releases.md +++ b/docs/tutorials/releases.md @@ -24,7 +24,7 @@ Intel® Extension for PyTorch\* v2.1.20+xpu is a minor release which supports In ### Known Issues -Please refer to [Known Issues webpage](./performance_tuning/known_issues.md). +Please refer to [Known Issues webpage](./known_issues.md). ## 2.1.10+xpu @@ -59,7 +59,7 @@ This release provides the following features: ### Known Issues -Please refer to [Known Issues webpage](./performance_tuning/known_issues.md). +Please refer to [Known Issues webpage](./known_issues.md). ## 2.0.110+xpu @@ -84,7 +84,7 @@ This release adds the following fusion patterns in PyTorch\* JIT mode for Intel ### Known Issues -Please refer to [Known Issues webpage](./performance_tuning/known_issues.md). +Please refer to [Known Issues webpage](./known_issues.md). ## 1.13.120+xpu @@ -112,7 +112,7 @@ This release adds the following fusion patterns in PyTorch\* JIT mode for Intel ### Known Issues -Please refer to [Known Issues webpage](./performance_tuning/known_issues.md). +Please refer to [Known Issues webpage](./known_issues.md). ## 1.13.10+xpu @@ -137,7 +137,7 @@ This release adds the following fusion patterns in PyTorch\* JIT mode for Intel ### Known Issues -Please refer to [Known Issues webpage](./performance_tuning/known_issues.md). +Please refer to [Known Issues webpage](./known_issues.md). ## 1.10.200+gpu diff --git a/examples/gpu/inference/python/llm/Dockerfile b/examples/gpu/inference/python/llm/Dockerfile index 1cc4d7720..59dac10c0 100644 --- a/examples/gpu/inference/python/llm/Dockerfile +++ b/examples/gpu/inference/python/llm/Dockerfile @@ -1,66 +1,65 @@ - -ARG BASE_IMAGE=ubuntu:22.04 -FROM ${BASE_IMAGE} AS base -SHELL ["/bin/bash", "-c"] -RUN if [ -f /etc/apt/apt.conf.d/proxy.conf ]; then rm /etc/apt/apt.conf.d/proxy.conf; fi && \ - if [ ! -z ${HTTP_PROXY} ]; then echo "Acquire::http::Proxy \"${HTTP_PROXY}\";" >> /etc/apt/apt.conf.d/proxy.conf; fi && \ - if [ ! -z ${HTTPS_PROXY} ]; then echo "Acquire::https::Proxy \"${HTTPS_PROXY}\";" >> /etc/apt/apt.conf.d/proxy.conf; fi -RUN apt update && \ - apt full-upgrade -y && \ - DEBIAN_FRONTEND=noninteractive apt install --no-install-recommends -y \ - sudo \ - git \ - wget \ - curl \ - vim \ - patch \ - gcc \ - g++ \ - make \ - pkg-config \ - software-properties-common \ - gnupg \ - gpg-agent -COPY ./tools/basekit_driver_install_helper.sh . 
-RUN bash ./basekit_driver_install_helper.sh driver - -ARG GID_RENDER=109 -RUN useradd -m -s /bin/bash ubuntu && \ - echo 'ubuntu ALL=(ALL) NOPASSWD: ALL' >> /etc/sudoers && \ - groupadd -g $GID_RENDER render && \ - usermod -a -G video,render ubuntu -USER ubuntu -WORKDIR /home/ubuntu - -RUN curl -fsSL -v -o miniconda.sh -O https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ - bash miniconda.sh -b -p ./miniconda3 && \ - rm miniconda.sh && \ - echo "source ~/miniconda3/bin/activate" >> ./.bashrc - -FROM base AS dev -# --build-arg COMPILE=ON to compile from source -ARG COMPILE -RUN bash /basekit_driver_install_helper.sh dev -COPY --chown=ubuntu:ubuntu . ./intel-extension-for-pytorch/ -RUN . ./miniconda3/bin/activate && \ - conda create -y -n compile_py310 python=3.10 && conda activate compile_py310 && \ - cd intel-extension-for-pytorch/examples/gpu/inference/python/llm && \ - if [ -z ${COMPILE} ]; then MODE=6; else MODE=2; fi && \ - bash tools/env_setup.sh ${MODE} /opt/intel/oneapi/compiler/latest /opt/intel/oneapi/mkl/latest /opt/intel/oneapi/ccl/latest pvc,ats-m150,acm-g11 - -FROM base AS deploy -RUN bash /basekit_driver_install_helper.sh runtime && \ - sudo apt clean && \ - sudo rm -rf /var/lib/apt/lists/* && \ - if [ -f /etc/apt/apt.conf.d/proxy.conf ]; then sudo rm /etc/apt/apt.conf.d/proxy.conf; fi && \ - sudo rm /basekit_driver_install_helper.sh -COPY --from=dev --chown=ubuntu:ubuntu /home/ubuntu/intel-extension-for-pytorch/examples/gpu/inference/python/llm ./llm -COPY --from=dev --chown=ubuntu:ubuntu /home/ubuntu/intel-extension-for-pytorch/tools/get_libstdcpp_lib.sh . -RUN . ./miniconda3/bin/activate && \ - conda create -y -n py310 python=3.10 && conda activate py310 && \ - echo "conda activate py310" >> ./.bashrc && \ - ldpreload=$(bash get_libstdcpp_lib.sh) && echo "export LD_PRELOAD=${ldpreload}" >> ./.bashrc && rm get_libstdcpp_lib.sh && \ - cd ./llm && \ - bash tools/env_setup.sh 1 && \ - python -m pip cache purge && \ - conda clean -a -y +ARG BASE_IMAGE=ubuntu:22.04 +FROM ${BASE_IMAGE} AS base +SHELL ["/bin/bash", "-c"] +RUN if [ -f /etc/apt/apt.conf.d/proxy.conf ]; then rm /etc/apt/apt.conf.d/proxy.conf; fi && \ + if [ ! -z ${HTTP_PROXY} ]; then echo "Acquire::http::Proxy \"${HTTP_PROXY}\";" >> /etc/apt/apt.conf.d/proxy.conf; fi && \ + if [ ! -z ${HTTPS_PROXY} ]; then echo "Acquire::https::Proxy \"${HTTPS_PROXY}\";" >> /etc/apt/apt.conf.d/proxy.conf; fi +RUN apt update && \ + apt full-upgrade -y && \ + DEBIAN_FRONTEND=noninteractive apt install --no-install-recommends -y \ + sudo \ + git \ + wget \ + curl \ + vim \ + patch \ + gcc \ + g++ \ + make \ + pkg-config \ + software-properties-common \ + gnupg \ + gpg-agent +COPY ./tools/basekit_driver_install_helper.sh . +RUN bash ./basekit_driver_install_helper.sh driver + +ARG GID_RENDER=109 +RUN useradd -m -s /bin/bash ubuntu && \ + echo 'ubuntu ALL=(ALL) NOPASSWD: ALL' >> /etc/sudoers && \ + groupadd -g $GID_RENDER render && \ + usermod -a -G video,render ubuntu +USER ubuntu +WORKDIR /home/ubuntu + +RUN curl -fsSL -v -o miniconda.sh -O https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ + bash miniconda.sh -b -p ./miniconda3 && \ + rm miniconda.sh && \ + echo "source ~/miniconda3/bin/activate" >> ./.bashrc + +FROM base AS dev +# --build-arg COMPILE=ON to compile from source +ARG COMPILE +RUN bash /basekit_driver_install_helper.sh dev +COPY --chown=ubuntu:ubuntu . ./intel-extension-for-pytorch/ +RUN . 
./miniconda3/bin/activate && \ + conda create -y -n compile_py310 python=3.10 && conda activate compile_py310 && \ + cd intel-extension-for-pytorch/examples/gpu/inference/python/llm && \ + if [ -z ${COMPILE} ]; then MODE=6; else MODE=2; fi && \ + bash tools/env_setup.sh ${MODE} /opt/intel/oneapi/compiler/latest /opt/intel/oneapi/mkl/latest /opt/intel/oneapi/ccl/latest /opt/intel/oneapi/mpi/latest pvc,ats-m150,acm-g11 + +FROM base AS deploy +RUN bash /basekit_driver_install_helper.sh runtime && \ + sudo apt clean && \ + sudo rm -rf /var/lib/apt/lists/* && \ + if [ -f /etc/apt/apt.conf.d/proxy.conf ]; then sudo rm /etc/apt/apt.conf.d/proxy.conf; fi && \ + sudo rm /basekit_driver_install_helper.sh +COPY --from=dev --chown=ubuntu:ubuntu /home/ubuntu/intel-extension-for-pytorch/examples/gpu/inference/python/llm ./llm +COPY --from=dev --chown=ubuntu:ubuntu /home/ubuntu/intel-extension-for-pytorch/tools/get_libstdcpp_lib.sh . +RUN . ./miniconda3/bin/activate && \ + conda create -y -n py310 python=3.10 && conda activate py310 && \ + echo "conda activate py310" >> ./.bashrc && \ + ldpreload=$(bash get_libstdcpp_lib.sh) && echo "export LD_PRELOAD=${ldpreload}" >> ./.bashrc && rm get_libstdcpp_lib.sh && \ + cd ./llm && \ + bash tools/env_setup.sh 1 && \ + python -m pip cache purge && \ + conda clean -a -y \ No newline at end of file diff --git a/examples/gpu/inference/python/llm/README.md b/examples/gpu/inference/python/llm/README.md index 6ca138c5c..eebf4a892 100644 --- a/examples/gpu/inference/python/llm/README.md +++ b/examples/gpu/inference/python/llm/README.md @@ -49,10 +49,7 @@ docker build -f examples/gpu/inference/python/llm/Dockerfile --build-arg GID_REN # Run the container with command below -docker run --privileged -it --rm --device /dev/dri:/dev/dri -v /dev/dri/by-path:/dev/dri/by-path \ ---ipc=host --net=host --cap-add=ALL -v /lib/modules:/lib/modules --workdir /workspace \ ---volume `pwd`/examples/gpu/inference/python/llm/:/workspace/llm ipex-llm:2.1.20 /bin/bash - +docker run --privileged -it --rm --device /dev/dri:/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --ipc=host --net=host --cap-add=ALL -v /lib/modules:/lib/modules --workdir /workspace --volume `pwd`/examples/gpu/inference/python/llm/:/workspace/llm ipex-llm:2.1.20 /bin/bash # When the command prompt shows inside the docker container, enter llm examples directory cd llm @@ -83,9 +80,9 @@ conda activate llm conda install pkg-config # Setup the environment with the provided script cd examples/gpu/inference/python/llm -# If you want to install Intel® Extension for PyTorch\* from prebuilt wheel files, use the command below: +# If you want to install Intel® Extension for PyTorch* from prebuilt wheel files, use the command below: bash ./tools/env_setup.sh 7 -# If you want to install Intel® Extension for PyTorch\* from source, use the commands below: +# If you want to install Intel® Extension for PyTorch* from source, use the commands below: bash ./tools/env_setup.sh 3 export LD_PRELOAD=$(bash ../../../../../tools/get_libstdcpp_lib.sh) export LD_LIBRARY_PATH=${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH} diff --git a/examples/gpu/inference/python/llm/tools/env_activate.sh b/examples/gpu/inference/python/llm/tools/env_activate.sh index aa6100114..39ad46712 100644 --- a/examples/gpu/inference/python/llm/tools/env_activate.sh +++ b/examples/gpu/inference/python/llm/tools/env_activate.sh @@ -13,5 +13,3 @@ fi export TORCH_LLM_ALLREDUCE=1 - - diff --git a/examples/gpu/inference/python/llm/tools/env_setup.sh 
b/examples/gpu/inference/python/llm/tools/env_setup.sh index 7c90f4bcd..e6275d1bd 100644 --- a/examples/gpu/inference/python/llm/tools/env_setup.sh +++ b/examples/gpu/inference/python/llm/tools/env_setup.sh @@ -15,9 +15,10 @@ MODE=0x03 DPCPP_ROOT= ONEMKL_ROOT= ONECCL_ROOT= +MPI_ROOT= AOT= if [[ $# -eq 0 ]]; then - echo "Usage: bash $0 [DPCPPROOT] [MKLROOT] [CCLROOT] [AOT]" + echo "Usage: bash $0 [DPCPPROOT] [MKLROOT] [CCLROOT] [MPIROOT] [AOT]" echo "Set MODE to 7 to install from wheel files. Set it to 3 to compile from source. When compiling from source, you need to set arguments below." echo "DPCPPROOT, MKLROOT and CCLROOT should be absolute or relative path to the root directory of DPC++ compiler, oneMKL and oneCCL in oneAPI Base Toolkit respectively." echo "AOT should be set to the text string for environment variable USE_AOT_DEVLIST. Setting it to \"none\" to disable AOT." @@ -42,6 +43,10 @@ if [[ $# -gt 0 ]]; then ONECCL_ROOT=$1 shift fi +if [[ $# -gt 0 ]]; then + MPI_ROOT=$1 + shift +fi if [[ $# -gt 0 ]]; then AOT=$1 shift @@ -64,9 +69,10 @@ if [ $((${MODE} & 0x06)) -eq 2 ] && ([ -z ${DPCPP_ROOT} ] || [ -z ${ONEMKL_ROOT} ] || [ -z ${ONECCL_ROOT} ] || + [ -z ${MPI_ROOT} ] || [ -z ${AOT} ]); then - echo "Source code compilation is needed. Please set arguments DPCPP_ROOT, ONEMKL_ROOT, ONECCL_ROOT and AOT." - echo "DPCPPROOT, MKLROOT and CCLROOT should be absolute or relative path to the root directory of DPC++ compiler, oneMKL and oneCCL in oneAPI Base Toolkit respectively." + echo "Source code compilation is needed. Please set arguments DPCPP_ROOT, ONEMKL_ROOT, ONECCL_ROOT, MPI_ROOT and AOT." + echo "DPCPPROOT, MKLROOT, CCLROOT and MPIROOT should be absolute or relative path to the root directory of DPC++ compiler, oneMKL, oneCCL and MPI in oneAPI Base Toolkit respectively." echo "AOT should be set to the text string for environment variable USE_AOT_DEVLIST. Setting it to \"none\" to disable AOT." exit 2 fi @@ -98,7 +104,7 @@ if [ $((${MODE} & 0x02)) -ne 0 ]; then VER_TORCH=$(python tools/yaml_utils.py -f dependency_version.yml -d pytorch -k version) TRANSFORMERS_COMMIT=$(python tools/yaml_utils.py -f dependency_version.yml -d transformers -k commit) VER_PROTOBUF=$(python tools/yaml_utils.py -f dependency_version.yml -d protobuf -k version) - VER_LLM_EVAL=$(python tools/yaml_utils.py -f dependency_version.yml -d llm_eval -k version) + VER_LM_EVAL=$(python tools/yaml_utils.py -f dependency_version.yml -d lm_eval -k version) VER_IPEX_MAJOR=$(grep "VERSION_MAJOR" version.txt | cut -d " " -f 2) VER_IPEX_MINOR=$(grep "VERSION_MINOR" version.txt | cut -d " " -f 2) VER_IPEX_PATCH=$(grep "VERSION_PATCH" version.txt | cut -d " " -f 2) @@ -131,10 +137,15 @@ if [ $((${MODE} & 0x02)) -ne 0 ]; then exit 6 fi + if [ ! -f ${MPI_ROOT}/env/vars.sh ]; then + echo "MPI environment ${MPI_ROOT} doesn't seem to exist." + exit 6 + fi + # Install PyTorch and Intel® Extension for PyTorch* cp intel-extension-for-pytorch/scripts/compile_bundle.sh . 
sed -i "s/VER_IPEX=.*/VER_IPEX=/" compile_bundle.sh - bash compile_bundle.sh ${DPCPP_ROOT} ${ONEMKL_ROOT} ${ONECCL_ROOT} ${AOT} 1 + bash compile_bundle.sh ${DPCPP_ROOT} ${ONEMKL_ROOT} ${ONECCL_ROOT} ${MPI_ROOT} ${AOT} 1 cp pytorch/dist/*.whl ${WHEELFOLDER} cp intel-extension-for-pytorch/dist/*.whl ${WHEELFOLDER} cp torch-ccl/dist/*.whl ${WHEELFOLDER} @@ -144,8 +155,7 @@ if [ $((${MODE} & 0x02)) -ne 0 ]; then echo "python -m pip install impi-devel" >> ${AUX_INSTALL_SCRIPT} echo "python -m pip install cpuid accelerate datasets sentencepiece diffusers protobuf==${VER_PROTOBUF} huggingface_hub mpi4py mkl" >> ${AUX_INSTALL_SCRIPT} - echo "python -m pip install lm_eval==${VER_LLM_EVAL}" >> ${AUX_INSTALL_SCRIPT} - + echo "python -m pip install lm_eval==${VER_LM_EVAL}" >> ${AUX_INSTALL_SCRIPT} # Install Transformers if [ -d transformers ]; then diff --git a/scripts/build_doc.sh b/scripts/build_doc.sh index f644cd762..4d0cf5f46 100644 --- a/scripts/build_doc.sh +++ b/scripts/build_doc.sh @@ -240,16 +240,10 @@ elif [[ ${DEVICE} == "gpu" ]]; then rm -rf ../csrc/include/xpu mv ../csrc/include/xpu_bk ../csrc/include/xpu fi -cp tutorials/features/graph_capture.md tutorials/features/graph_capture.md.bk -parse_example "../examples/cpu/features/graph_capture.py" tutorials/features/graph_capture.md "(marker_feature_graph_capture)" "python" -cp tutorials/features/int8_recipe_tuning_api.md tutorials/features/int8_recipe_tuning_api.md.bk -parse_example "../examples/cpu/features/int8_recipe_tuning/int8_autotune.py" tutorials/features/int8_recipe_tuning_api.md "(marker_feature_int8_autotune)" "python" make clean make html -mv tutorials/features/graph_capture.md.bk tutorials/features/graph_capture.md -mv tutorials/features/int8_recipe_tuning_api.md.bk tutorials/features/int8_recipe_tuning_api.md mv tutorials/examples.md.bk tutorials/examples.md if [[ ${DEVICE} == "cpu" ]]; then mv tutorials/features/fast_bert.md.bk tutorials/features/fast_bert.md diff --git a/scripts/compile_bundle.sh b/scripts/compile_bundle.sh index 55ae404d0..803beb54b 100644 --- a/scripts/compile_bundle.sh +++ b/scripts/compile_bundle.sh @@ -8,7 +8,7 @@ set -eo pipefail VER_IPEX=v2.1.20+xpu if [[ $# -lt 3 ]]; then - echo "Usage: bash $0 " + echo "Usage: bash $0 " echo "DPCPPROOT, MKLROOT and CCLROOT are mandatory, should be absolute or relative path to the root directory of DPC++ compiler, oneMKL and oneCCL respectively." echo "AOT should be set to the text string for environment variable USE_AOT_DEVLIST. Setting it to \"none\" to disable AOT." exit 1 @@ -16,7 +16,8 @@ fi DPCPP_ROOT=$1 ONEMKL_ROOT=$2 ONECCL_ROOT=$3 -AOT=$4 +MPI_ROOT=$4 +AOT=$5 if [[ ${AOT} == "none" ]]; then AOT="" fi @@ -33,10 +34,10 @@ fi # └--------------- Undefined MODE=0x07 if [ $# -gt 4 ]; then - if [[ ! $5 =~ ^[0-9]+$ ]] && [[ ! $5 =~ ^0x[0-9a-fA-F]+$ ]]; then + if [[ ! $6 =~ ^[0-9]+$ ]] && [[ ! $6 =~ ^0x[0-9a-fA-F]+$ ]]; then echo "Warning: Unexpected argument. Using default value." else - MODE=$5 + MODE=$6 fi fi @@ -53,10 +54,17 @@ if [ ! -f ${ONEMKL_ENV} ]; then exit 3 fi +CCL_ENV=${ONECCL_ROOT}/env/vars.sh if [ ! -f ${ONECCL_ROOT}/env/vars.sh ]; then echo "oneCCL environment ${ONECCL_ROOT} doesn't seem to exist." exit 6 fi + +MPI_ENV=${MPI_ROOT}/env/vars.sh +if [ ! -f ${MPI_ROOT}/env/vars.sh ]; then + echo "oneCCL environment ${MPI_ROOT} doesn't seem to exist." + exit 6 +fi ONEAPIROOT=${ONEMKL_ROOT}/../.. 
# Check existance of required Linux commands @@ -264,8 +272,8 @@ cd pytorch git apply ../intel-extension-for-pytorch/torch_patches/*.patch python -m pip install -r requirements.txt conda install --force-reinstall intel::mkl-static intel::mkl-include -y -mv version.txt version.txt.bk -echo "${COMMIT_TORCH:1}a0" > version.txt +export PYTORCH_BUILD_VERSION="${COMMIT_TORCH:1}.post0+cxx11.abi" +export PYTORCH_BUILD_NUMBER=0 # Ensure cmake can find python packages when using conda or virtualenv if [ -n "${CONDA_PREFIX-}" ]; then export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(command -v conda))/../"} @@ -286,7 +294,8 @@ unset USE_NUMA unset _GLIBCXX_USE_CXX11_ABI unset USE_STATIC_MKL unset CMAKE_PREFIX_PATH -mv version.txt.bk version.txt +unset PYTORCH_BUILD_NUMBER +unset PYTORCH_BUILD_VERSION conda remove mkl-static mkl-include -y python -m pip install dist/*.whl cd .. @@ -304,6 +313,8 @@ fi # don't fail on external scripts source ${DPCPP_ENV} source ${ONEMKL_ENV} +source ${CCL_ENV} +source ${MPI_ENV} # TorchAudio if [ $((${MODE} & 0x02)) -ne 0 ]; then cd audio @@ -413,4 +424,4 @@ CMD="${CMD} import intel_extension_for_pytorch as ipex; print(f'ipex_version: if [ $((${MODE} & 0x01)) -ne 0 ]; then CMD="${CMD} import oneccl_bindings_for_pytorch as torch_ccl; print(f'torchccl_version: {torch_ccl.__version__}');" fi -python -c "${CMD}" +python -c "${CMD}" \ No newline at end of file diff --git a/tools/basekit_driver_install_helper.sh b/tools/basekit_driver_install_helper.sh index d9b94f96e..13e164b59 100644 --- a/tools/basekit_driver_install_helper.sh +++ b/tools/basekit_driver_install_helper.sh @@ -1,4 +1,5 @@ #!/bin/bash +set -e if [ $# -eq 0 ]; then echo "Usage: bash $0 " @@ -23,19 +24,31 @@ if [ $UID -ne 0 ]; then SUDO="sudo" fi -OS_ID="" +source /etc/os-release +OS_ID=${ID} OS_VERSION="" -while read line -do - KEY=$(echo ${line} | cut -d '=' -f 1) - VAL=$(echo ${line} | cut -d '=' -f 2) - if [ "${KEY}" = "ID" ]; then - OS_ID=${VAL} - fi - if [ "${KEY}" = "VERSION_ID" ]; then - OS_VERSION=${VAL} - fi -done < <(cat /etc/os-release) +if [ "${OS_ID}" = "ubuntu" ]; then + OS_VERSION=${VERSION_CODENAME} + if [[ ! " jammy " =~ " ${OS_VERSION} " ]]; then + echo "Ubuntu version ${OS_VERSION} not supported" + exit 3 + fi +elif [ "${OS_ID}" = "rhel" ] || [ "${OS_ID}" = "centos" ]; then + OS_VERSION=${VERSION_ID} + if [ "${OS_VERSION}" = "8" ]; then + OS_VERSION="8.6" + fi + if [ "${OS_VERSION}" = "9" ]; then + OS_VERSION="9.0" + fi + if [[ ! " 8.6 8.8 8.9 9.0 9.2 9.3 " =~ " ${OS_VERSION} " ]]; then + echo "RHEL version ${OS_VERSION} not supported" + exit 3 + fi +else + echo "${OS_ID} not supported." 
+ exit 3 +fi function add-repo-driver() { SUDO=$1 @@ -47,26 +60,12 @@ function add-repo-driver() { if [ "${OS_ID}" = "ubuntu" ]; then wget -qO - https://repositories.intel.com/gpu/intel-graphics.key | ${SUDO} gpg --dearmor --output /usr/share/keyrings/intel-graphics.gpg - echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/gpu/ubuntu jammy unified" | ${SUDO} tee /etc/apt/sources.list.d/intel-gpu-jammy.list + echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/gpu/ubuntu ${OS_VERSION}/lts/2350 unified" | ${SUDO} tee /etc/apt/sources.list.d/intel-gpu-${OS_VERSION}.list ${SUDO} apt update fi - if [ "${OS_ID}" = "\"rhel\"" ] || [ "${OS_ID}" = "\"centos\"" ]; then - if [ "${OS_VERSION}" = "\"8\"" ] || [ "${OS_VERSION}" = "\"8.6\"" ]; then - ${SUDO} dnf install -y 'dnf-command(config-manager)' - ${SUDO} dnf config-manager --add-repo https://repositories.intel.com/gpu/rhel/8.6/unified/intel-gpu-8.6.repo - fi - if [ "${OS_VERSION}" = "\"8.8\"" ]; then - ${SUDO} dnf install -y 'dnf-command(config-manager)' - ${SUDO} dnf config-manager --add-repo https://repositories.intel.com/gpu/rhel/8.8/unified/intel-gpu-8.8.repo - fi - if [ "${OS_VERSION}" = "\"9\"" ] || [ "${OS_VERSION}" = "\"9.0\"" ]; then - ${SUDO} dnf install -y 'dnf-command(config-manager)' - ${SUDO} dnf config-manager --add-repo https://repositories.intel.com/gpu/rhel/9.0/unified/intel-gpu-9.0.repo - fi - if [ "${OS_VERSION}" = "\"9.2\"" ]; then - ${SUDO} dnf install -y 'dnf-command(config-manager)' - ${SUDO} dnf config-manager --add-repo https://repositories.intel.com/gpu/rhel/9.2/unified/intel-gpu-9.2.repo - fi + if [ "${OS_ID}" = "rhel" ] || [ "${OS_ID}" = "centos" ]; then + ${SUDO} dnf install -y 'dnf-command(config-manager)' + ${SUDO} dnf config-manager --add-repo https://repositories.intel.com/gpu/rhel/${OS_VERSION}/lts/2350/unified/intel-gpu-${OS_VERSION}.repo fi } @@ -83,7 +82,7 @@ function add-repo-basekit() { echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | ${SUDO} tee /etc/apt/sources.list.d/oneAPI.list ${SUDO} apt update fi - if [ "${OS_ID}" = "\"rhel\"" ] || [ "${OS_ID}" = "\"centos\"" ]; then + if [ "${OS_ID}" = "rhel" ] || [ "${OS_ID}" = "centos" ]; then tee > /tmp/oneAPI.repo << EOF [oneAPI] name=Intel® oneAPI repository @@ -108,19 +107,19 @@ function install-driver() { if [ "${OS_ID}" = "ubuntu" ]; then ${SUDO} apt update - ${SUDO} apt install -y intel-opencl-icd=23.30.26918.50-736~22.04 \ - level-zero=1.13.1-719~22.04 \ - level-zero-dev=1.13.1-719~22.04 \ - intel-level-zero-gpu=1.3.26918.50-736~22.04 \ - xpu-smi=1.2.22-31~22.04 - fi - if [ "${OS_ID}" = "\"rhel\"" ] || [ "${OS_ID}" = "\"centos\"" ]; then - ${SUDO} dnf install -y intel-opencl-23.30.26918.50 \ - level-zero-1.13.1 \ - level-zero-devel-1.13.1 \ - intel-level-zero-gpu-1.3.26918.50 \ - intel-ocloc-23.30.26918.50 \ - xpu-smi-1.2.22 + ${SUDO} apt install -y intel-opencl-icd=23.43.27642.40-803~22.04 \ + level-zero=1.14.0-744~22.04 \ + level-zero-dev=1.14.0-744~22.04 \ + intel-level-zero-gpu=1.3.27642.40-803~22.04 \ + xpu-smi=1.2.26-37~22.04 + fi + if [ "${OS_ID}" = "rhel" ] || [ "${OS_ID}" = "centos" ]; then + ${SUDO} dnf install -y intel-opencl-23.43.27642.40 \ + level-zero-1.14.0 \ + level-zero-devel-1.14.0 \ + intel-level-zero-gpu-1.3.27642.40 \ + intel-ocloc-23.43.27642.40 \ + xpu-smi-1.2.26 fi } @@ -135,16 +134,16 @@ function install-dev() { if [ "${OS_ID}" = "ubuntu" ]; then ${SUDO} apt update 
- ${SUDO} apt install -y intel-level-zero-gpu-dev=1.3.26918.50-736~22.04 \ - intel-oneapi-dpcpp-cpp-2024.0 \ - intel-oneapi-mkl-devel=2024.0.0-49656 \ - intel-oneapi-ccl-devel=2021.11.1-6 + ${SUDO} apt install -y intel-level-zero-gpu-dev=1.3.27642.40-803~22.04 \ + intel-oneapi-dpcpp-cpp-2024.1=2024.1.0-963 \ + intel-oneapi-mkl-devel=2024.1.0-691 \ + intel-oneapi-ccl-devel=2021.12.0-309 fi - if [ "${OS_ID}" = "\"rhel\"" ] || [ "${OS_ID}" = "\"centos\"" ]; then - ${SUDO} dnf install -y intel-level-zero-gpu-devel-1.3.26918.50 \ - intel-oneapi-dpcpp-cpp-2024.0 \ - intel-oneapi-mkl-devel-2024.0.0-49656 \ - intel-oneapi-ccl-devel-2021.11.1-6 + if [ "${OS_ID}" = "rhel" ] || [ "${OS_ID}" = "centos" ]; then + ${SUDO} dnf install -y intel-level-zero-gpu-devel-1.3.27642.40 \ + intel-oneapi-dpcpp-cpp-2024.1-2024.1.0-963 \ + intel-oneapi-mkl-devel-2024.1.0-691 \ + intel-oneapi-ccl-devel-2021.12.0-309 fi } @@ -159,17 +158,21 @@ function install-runtime() { if [ "${OS_ID}" = "ubuntu" ]; then ${SUDO} apt update - ${SUDO} apt install -y intel-oneapi-runtime-dpcpp-cpp=2024.0.0-49819 \ - intel-oneapi-runtime-mkl=2024.0.0-49656 \ - intel-oneapi-runtime-ccl=2021.11.1-6 + ${SUDO} apt install -y intel-oneapi-runtime-dpcpp-cpp=2024.1.0-963 \ + intel-oneapi-runtime-mkl=2024.1.0-691 \ + intel-oneapi-runtime-ccl=2021.12.0-309 fi - if [ "${OS_ID}" = "\"rhel\"" ] || [ "${OS_ID}" = "\"centos\"" ]; then - ${SUDO} dnf install -y intel-oneapi-runtime-dpcpp-cpp-2024.0.0-49819 \ - intel-oneapi-runtime-mkl-2024.0.0-49656 \ - intel-oneapi-runtime-ccl-2021.11.1-6 + if [ "${OS_ID}" = "rhel" ] || [ "${OS_ID}" = "centos" ]; then + ${SUDO} dnf install -y intel-oneapi-runtime-dpcpp-cpp-2024.1.0-963 \ + intel-oneapi-runtime-mkl-2024.1.0-691 \ + intel-oneapi-runtime-ccl-2021.12.0-309 fi } +for CMD in wget gpg; do + command -v ${CMD} > /dev/null || (echo "Error: Command \"${CMD}\" not found." ; exit 1) +done + if [ "${MODE}" = "driver" ]; then install-driver ${SUDO} ${OS_ID} ${OS_VERSION} fi
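After applying this patch and installing the rebuilt wheels, the bumped versions can be sanity-checked with the same probe that scripts/compile_bundle.sh runs at the end of its build. A minimal sketch, assuming the wheels were installed into the active Python environment; the expected values follow dependency_version.yml and the VER_IPEX tag set in this patch:

```python
# Post-install version probe, mirroring the final check in scripts/compile_bundle.sh.
# Expected values are taken from dependency_version.yml in this patch; adjust them if
# you build a different tag.
import torch
import intel_extension_for_pytorch as ipex
import oneccl_bindings_for_pytorch as torch_ccl

print(f"torch_version: {torch.__version__}")         # expected: 2.1.0.post0+cxx11.abi
print(f"ipex_version: {ipex.__version__}")           # expected: 2.1.20+xpu
print(f"torchccl_version: {torch_ccl.__version__}")  # expected: 2.1.200+xpu
```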