From 8122dcb02474964d252e33db30f500878e238144 Mon Sep 17 00:00:00 2001 From: Jing Xu Date: Wed, 22 May 2024 06:57:04 +0900 Subject: [PATCH] update llm model list in 2.1.30 doc (#2916) --- xpu/2.1.30+xpu/_sources/tutorials/llm.rst.txt | 32 ++++++++------ xpu/2.1.30+xpu/genindex.html | 2 +- xpu/2.1.30+xpu/index.html | 2 +- xpu/2.1.30+xpu/search.html | 2 +- xpu/2.1.30+xpu/searchindex.js | 2 +- xpu/2.1.30+xpu/tutorials/api_doc.html | 2 +- .../tutorials/blogs_publications.html | 2 +- xpu/2.1.30+xpu/tutorials/contribution.html | 2 +- xpu/2.1.30+xpu/tutorials/examples.html | 2 +- xpu/2.1.30+xpu/tutorials/features.html | 2 +- xpu/2.1.30+xpu/tutorials/features/DDP.html | 2 +- xpu/2.1.30+xpu/tutorials/features/DLPack.html | 2 +- .../tutorials/features/DPC++_Extension.html | 2 +- xpu/2.1.30+xpu/tutorials/features/FSDP.html | 2 +- .../features/advanced_configuration.html | 2 +- .../tutorials/features/amp_gpu.html | 2 +- .../features/auto_channels_last.html | 2 +- .../tutorials/features/compute_engine.html | 2 +- .../tutorials/features/deepspeed_kernels.html | 2 +- xpu/2.1.30+xpu/tutorials/features/float8.html | 2 +- .../tutorials/features/horovod.html | 2 +- .../tutorials/features/int8_overview_xpu.html | 2 +- .../tutorials/features/ipex_log.html | 2 +- xpu/2.1.30+xpu/tutorials/features/nhwc.html | 2 +- .../tutorials/features/profiler_kineto.html | 2 +- .../tutorials/features/profiler_legacy.html | 2 +- .../tutorials/features/simple_trace.html | 2 +- .../tutorials/features/torch_compile_gpu.html | 2 +- xpu/2.1.30+xpu/tutorials/getting_started.html | 2 +- xpu/2.1.30+xpu/tutorials/installation.html | 2 +- xpu/2.1.30+xpu/tutorials/introduction.html | 2 +- xpu/2.1.30+xpu/tutorials/known_issues.html | 2 +- xpu/2.1.30+xpu/tutorials/license.html | 2 +- xpu/2.1.30+xpu/tutorials/llm.html | 42 ++++++++++++------- .../llm/int4_weight_only_quantization.html | 2 +- .../llm/llm_optimize_transformers.html | 2 +- xpu/2.1.30+xpu/tutorials/performance.html | 2 +- 
xpu/2.1.30+xpu/tutorials/releases.html | 2 +- .../tutorials/technical_details.html | 2 +- .../tutorials/technical_details/AOT.html | 2 +- .../technical_details/ipex_optimize.html | 2 +- .../technical_details/memory_management.html | 2 +- .../optimizer_fusion_gpu.html | 2 +- 43 files changed, 88 insertions(+), 68 deletions(-) diff --git a/xpu/2.1.30+xpu/_sources/tutorials/llm.rst.txt b/xpu/2.1.30+xpu/_sources/tutorials/llm.rst.txt index b21ce5314..2f72444f9 100644 --- a/xpu/2.1.30+xpu/_sources/tutorials/llm.rst.txt +++ b/xpu/2.1.30+xpu/_sources/tutorials/llm.rst.txt @@ -21,28 +21,36 @@ Optimized Models :header-rows: 1 * - Model Family - - LLAMA2 - - GPT-J - - Qwen - - OPT - - BLOOM - * - Verified < MODEL ID > (Huggingface hub) + - Verified < MODEL ID > (Huggingface hub) + - FP16 + - INT4 WOQ + * - Llama2 - "meta-llama/Llama-2-7b-hf", "meta-llama/Llama-2-13b-hf", "meta-llama/Llama-2-70b-hf" - - "EleutherAI/gpt-j-6b" - - "Qwen/Qwen-7B" - - "facebook/opt-30b", "facebook/opt-1.3b" - - "bigscience/bloom-7b1", "bigscience/bloom" - * - FP16 - ✅ - ✅ + * - GPT-J + - "EleutherAI/gpt-j-6b" + - ✅ - ✅ + * - Qwen + - "Qwen/Qwen-7B" - ✅ - ✅ - * - INT4 WOQ + * - OPT + - "facebook/opt-30b", "facebook/opt-1.3b" - ✅ + - ❎ + * - Bloom + - "bigscience/bloom-7b1", "bigscience/bloom" - ✅ + - ❎ + * - ChatGLM3-6B + - "THUDM/chatglm3-6b" - ✅ - ❎ + * - Baichuan2-13B + - "baichuan-inc/Baichuan2-13B-Chat" + - ✅ - ❎ diff --git a/xpu/2.1.30+xpu/genindex.html b/xpu/2.1.30+xpu/genindex.html index d263ce6cf..2a67f1c98 100644 --- a/xpu/2.1.30+xpu/genindex.html +++ b/xpu/2.1.30+xpu/genindex.html @@ -350,7 +350,7 @@

X

Built with Sphinx using a theme provided by Read the Docs. - +

Cookies | Privacy | Your Privacy Choices | Notice at Collection

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/xpu/2.1.30+xpu/index.html b/xpu/2.1.30+xpu/index.html index b56ee66bd..0141a22e7 100644 --- a/xpu/2.1.30+xpu/index.html +++ b/xpu/2.1.30+xpu/index.html @@ -175,7 +175,7 @@

SupportSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/xpu/2.1.30+xpu/search.html b/xpu/2.1.30+xpu/search.html index 0d5fbf3b7..8a5b63321 100644 --- a/xpu/2.1.30+xpu/search.html +++ b/xpu/2.1.30+xpu/search.html @@ -127,7 +127,7 @@ Built with Sphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/xpu/2.1.30+xpu/searchindex.js b/xpu/2.1.30+xpu/searchindex.js index a17b2f1b1..3f81efe32 100644 --- a/xpu/2.1.30+xpu/searchindex.js +++ b/xpu/2.1.30+xpu/searchindex.js @@ -1 +1 @@ -Search.setIndex({"docnames": ["index", "tutorials/api_doc", "tutorials/blogs_publications", "tutorials/contribution", "tutorials/examples", "tutorials/features", "tutorials/features/DDP", "tutorials/features/DLPack", "tutorials/features/DPC++_Extension", "tutorials/features/FSDP", "tutorials/features/advanced_configuration", "tutorials/features/amp_gpu", "tutorials/features/auto_channels_last", "tutorials/features/compute_engine", "tutorials/features/deepspeed_kernels", "tutorials/features/float8", "tutorials/features/horovod", "tutorials/features/int8_overview_xpu", "tutorials/features/ipex_log", "tutorials/features/nhwc", "tutorials/features/profiler_kineto", "tutorials/features/profiler_legacy", "tutorials/features/simple_trace", "tutorials/features/torch_compile_gpu", "tutorials/getting_started", "tutorials/installation", "tutorials/introduction", "tutorials/known_issues", "tutorials/license", "tutorials/llm", "tutorials/llm/int4_weight_only_quantization", "tutorials/llm/llm_optimize_transformers", "tutorials/performance", "tutorials/releases", "tutorials/technical_details", "tutorials/technical_details/AOT", "tutorials/technical_details/ipex_optimize", "tutorials/technical_details/memory_management", "tutorials/technical_details/optimizer_fusion_gpu"], "filenames": ["index.rst", "tutorials/api_doc.rst", "tutorials/blogs_publications.md", "tutorials/contribution.md", "tutorials/examples.md", "tutorials/features.rst", "tutorials/features/DDP.md", "tutorials/features/DLPack.md", "tutorials/features/DPC++_Extension.md", "tutorials/features/FSDP.md", "tutorials/features/advanced_configuration.md", "tutorials/features/amp_gpu.md", "tutorials/features/auto_channels_last.md", "tutorials/features/compute_engine.md", "tutorials/features/deepspeed_kernels.md", 
"tutorials/features/float8.md", "tutorials/features/horovod.md", "tutorials/features/int8_overview_xpu.md", "tutorials/features/ipex_log.md", "tutorials/features/nhwc.md", "tutorials/features/profiler_kineto.md", "tutorials/features/profiler_legacy.md", "tutorials/features/simple_trace.md", "tutorials/features/torch_compile_gpu.md", "tutorials/getting_started.md", "tutorials/installation.rst", "tutorials/introduction.rst", "tutorials/known_issues.md", "tutorials/license.md", "tutorials/llm.rst", "tutorials/llm/int4_weight_only_quantization.md", "tutorials/llm/llm_optimize_transformers.md", "tutorials/performance.md", "tutorials/releases.md", "tutorials/technical_details.rst", "tutorials/technical_details/AOT.md", "tutorials/technical_details/ipex_optimize.md", "tutorials/technical_details/memory_management.rst", "tutorials/technical_details/optimizer_fusion_gpu.md"], "titles": ["Intel\u00ae Extension for PyTorch*", "API Documentation", "Blogs & Publications", "Contribution", "Examples", "Features", "DistributedDataParallel (DDP)", "DLPack Solution", "DPC++ Extension", "Fully Sharded Data Parallel (FSDP)", "Advanced Configuration", "Auto Mixed Precision (AMP) on GPU", "Auto Channels Last", "Compute Engine (Experimental feature for debug)", "Intel\u00ae Extension for PyTorch* - DeepSpeed* Kernels", "Float8 Data Type Support (Prototype)", "Horovod with PyTorch (Prototype)", "Intel\u00ae Extension for PyTorch* Optimizations for Quantization [GPU]", "IPEX_LOGGING (Prototype)", "Channels Last", "Kineto Supported Profiler Tool (Prototype)", "Legacy Profiler Tool (Deprecated)", "Simple Trace Tool (Prototype)", "torch.compile for GPU (Beta)", "Quick Start", "Installation", "Introduction", "Troubleshooting", "License", "Large Language Models (LLM) Optimizations Overview", "Weight-Only Quantization (Prototype)", "Transformers Optimization Frontend API", "Performance", "Releases", "Technical Details", "Ahead of Time (AOT) Compilation", "ipex.optimize Frontend API", "Memory 
Management", "Optimizer Fusion on GPU"], "terms": {"intel optim": 0, "intel\u00ae extension for pytorch*": 0, "gpu": [0, 1, 2, 3, 4, 8, 10, 13, 14, 15, 16, 20, 21, 24, 26, 27, 29, 32, 35, 36, 37], "discrete gpu": 0, "intel discrete gpu": 0, "extend": [0, 5, 7, 23, 26, 29, 30, 33], "latest": [0, 6, 7, 16, 24, 26, 27, 29, 32], "perform": [0, 1, 2, 4, 5, 8, 11, 12, 13, 14, 17, 19, 23, 26, 29, 30, 31, 33, 36, 38], "optim": [0, 1, 2, 5, 6, 9, 11, 12, 13, 14, 16, 19, 23, 24, 26, 27, 30, 33], "hardwar": [0, 2, 5, 26, 29, 33], "take": [0, 1, 8, 11, 19, 26, 30, 33], "advantag": [0, 1, 12, 19, 26, 33], "advanc": [0, 8, 24, 30, 33, 37], "vector": [0, 1, 4, 8, 14, 19, 33], "512": [0, 4, 33], "avx": [0, 33], "neural": [0, 2, 5, 13, 15, 33], "network": [0, 2, 5, 11, 13, 15, 33], "instruct": [0, 3, 4, 24, 25, 26, 27, 29, 30, 33], "vnni": [0, 33], "matrix": [0, 23, 26, 33], "amx": [0, 2, 33], "cpu": [0, 1, 2, 4, 5, 6, 20, 24, 27, 32, 33, 34, 36], "well": [0, 1, 3, 4, 29, 30, 33], "x": [0, 4, 8, 9, 11, 19, 26, 30, 35], "e": [0, 1, 4, 8, 11, 17, 19, 26, 27, 29, 33, 34, 35], "xmx": [0, 26, 33], "ai": [0, 2, 10, 26, 27, 29, 33], "engin": [0, 4, 19, 26, 30, 33], "discret": [0, 26, 33], "moreov": [0, 29, 33], "provid": [0, 1, 3, 4, 5, 6, 8, 9, 11, 13, 14, 18, 25, 26, 27, 29, 30, 31, 33, 35, 36, 38], "easi": [0, 2, 16, 26, 33], "acceler": [0, 1, 2, 5, 15, 23, 26, 30, 33], "through": [0, 1, 4, 5, 8, 11, 23, 26, 30, 33], "xpu": [0, 1, 2, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 22, 23, 24, 26, 27, 30, 31, 32, 36], "devic": [0, 1, 4, 5, 6, 7, 8, 9, 10, 13, 14, 16, 18, 19, 21, 23, 26, 27, 29, 30, 31, 33, 34, 35, 36], "In": [0, 1, 4, 5, 8, 11, 13, 19, 20, 22, 23, 29, 30, 33], "current": [0, 1, 3, 5, 7, 9, 13, 17, 18, 20, 22, 27, 29, 30, 31, 35, 36, 38], "technolog": [0, 29], "landscap": [0, 29], "gener": [0, 3, 4, 6, 7, 8, 13, 17, 18, 19, 29, 30, 31, 33, 35, 36], "genai": [0, 29], "workload": [0, 4, 5, 11, 17, 27, 29, 31, 33, 34], "model": [0, 1, 2, 5, 6, 9, 10, 11, 12, 13, 15, 16, 17, 
23, 24, 27, 31, 32, 33, 36], "have": [0, 1, 3, 4, 6, 7, 8, 12, 13, 17, 19, 20, 22, 24, 27, 28, 29, 30, 35], "gain": [0, 29], "widespread": [0, 29], "attent": [0, 29, 30], "popular": [0, 7, 29, 30, 32], "larg": [0, 1, 5, 9, 18, 27, 30, 31, 33, 38], "languag": [0, 8, 30, 31, 33], "llm": [0, 2, 27, 31, 33], "emerg": [0, 29], "domin": [0, 29], "drive": [0, 29], "applic": [0, 1, 4, 29, 30, 34, 35, 36, 37], "start": [0, 1, 2, 3, 4, 6, 16, 18, 20, 22, 25, 27], "from": [0, 1, 2, 3, 4, 5, 7, 8, 9, 11, 15, 16, 17, 18, 19, 20, 21, 22, 23, 27, 29, 30, 31, 33, 34, 35, 36, 37], "2": [0, 2, 4, 6, 7, 8, 9, 11, 18, 19, 20, 22, 23, 27, 28, 29, 30, 32, 34, 35, 36], "1": [0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 13, 18, 19, 20, 22, 23, 27, 29, 30, 36, 38], "0": [0, 1, 3, 4, 6, 8, 9, 10, 11, 16, 18, 20, 22, 23, 27, 28, 30, 32, 38], "specif": [0, 1, 4, 5, 7, 10, 13, 14, 16, 18, 19, 29, 33, 34, 36], "certain": [0, 1, 30, 31], "ar": [0, 1, 2, 3, 4, 5, 7, 8, 10, 11, 14, 15, 16, 17, 18, 19, 20, 22, 24, 27, 29, 30, 31, 33, 34, 35, 36, 38], "introduc": [0, 2, 8, 19, 33], "For": [0, 1, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 18, 19, 20, 24, 26, 29, 33, 34, 35, 36, 37, 38], "more": [0, 1, 3, 5, 6, 8, 9, 10, 11, 17, 20, 22, 23, 24, 27, 29, 30, 33, 34, 35, 37, 38], "inform": [0, 1, 3, 5, 6, 7, 8, 9, 17, 18, 19, 20, 23, 24, 29, 30, 33, 34], "refer": [0, 1, 6, 8, 9, 10, 12, 13, 19, 20, 24, 25, 26, 33, 35, 36], "section": [0, 4, 5, 11, 17, 23, 25, 26, 30, 31, 36], "The": [0, 1, 3, 4, 5, 6, 7, 8, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 26, 27, 29, 30, 31, 32, 33, 34, 35, 36, 38], "can": [0, 1, 3, 4, 5, 6, 7, 8, 10, 13, 16, 17, 18, 19, 20, 22, 24, 27, 29, 30, 31, 33, 34, 35, 36, 37, 38], "load": [0, 1, 4, 8, 20, 27, 33, 36], "python": [0, 1, 3, 6, 7, 8, 9, 10, 16, 20, 23, 24, 29, 30, 31, 33, 34, 36], "modul": [0, 1, 4, 5, 6, 8, 9, 11, 14, 17, 19, 30, 31, 33, 34, 36], "program": [0, 1], "link": [0, 4], "c": [0, 5, 6, 8, 11, 22], "librari": [0, 4, 5, 6, 7, 8, 9, 10, 13, 14, 18, 20, 22, 33], 
"script": [0, 1, 2, 3, 4, 5, 6, 8, 9, 11, 16, 17, 22, 24, 27, 29, 31, 36], "user": [0, 1, 4, 5, 6, 10, 12, 13, 19, 20, 23, 27, 33, 34, 35, 36, 37], "enabl": [0, 1, 2, 4, 5, 6, 10, 11, 15, 17, 19, 20, 23, 24, 29, 30, 33, 34, 35, 36], "dynam": [0, 4, 15, 27], "import": [0, 1, 3, 4, 6, 8, 9, 15, 16, 17, 19, 20, 22, 23, 24, 27, 29, 30, 31, 33], "intel_extension_for_pytorch": [0, 1, 3, 4, 6, 7, 8, 9, 15, 16, 17, 19, 20, 22, 23, 24, 27, 30, 31, 33], "featur": [0, 2, 3, 4, 11, 14, 19, 23, 26, 27, 33, 34, 35, 36], "includ": [0, 1, 4, 8, 10, 14, 20, 24, 27, 28, 29, 30, 32, 33, 34, 36], "onli": [0, 1, 3, 4, 7, 8, 10, 11, 13, 14, 16, 17, 18, 19, 20, 24, 33, 36], "packag": [0, 4, 6, 8, 9, 23, 24, 33], "mai": [0, 1, 2, 3, 7, 8, 11, 12, 13, 17, 19, 20, 23, 27, 30, 33], "newer": 0, "code": [0, 1, 3, 5, 8, 9, 10, 13, 16, 19, 20, 22, 24, 25, 27, 28, 31, 33, 34, 35, 37, 38], "base": [0, 1, 2, 3, 4, 6, 8, 9, 10, 13, 16, 20, 24, 29, 30, 31, 32, 33], "due": [0, 11, 17, 20, 27, 29, 30, 33], "differ": [0, 1, 4, 6, 7, 19, 20, 29, 30], "develop": [0, 2, 4, 8, 27, 34, 35], "schedul": [0, 9, 20], "ha": [0, 1, 4, 5, 7, 8, 13, 19, 23, 27, 30, 33, 35], "been": [0, 1, 4, 5, 8, 19, 23, 27, 33], "releas": [0, 1, 6, 12, 14, 19, 23, 27, 35, 37], "an": [0, 1, 3, 4, 5, 6, 7, 8, 9, 11, 16, 17, 18, 19, 20, 24, 27, 29, 30, 33, 38], "open": [0, 27, 33], "sourc": [0, 3, 8, 10, 16, 20, 22, 23, 24, 27, 28, 35], "project": [0, 4, 8], "github": [0, 3, 6, 7, 9, 11], "you": [0, 1, 3, 4, 5, 6, 8, 9, 10, 11, 16, 18, 19, 20, 22, 23, 24, 27, 29, 30, 31, 33, 34, 35, 37], "find": [0, 4, 7, 8, 20, 27, 32, 33, 34], "how": [0, 4, 5, 6, 7, 8, 18, 19, 24], "get": [0, 1, 2, 4, 5, 6, 7, 8, 9, 18, 20, 27, 29, 33], "main": [0, 3, 4, 9, 23, 29, 30], "branch": 0, "quick": [0, 25, 26], "about": [0, 1, 3, 6, 8, 9, 23], "product": [0, 29, 33, 34, 35], "i": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 22, 23, 24, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 38], "structur": [0, 5, 7], "shown": [0, 1, 4, 6, 19, 
20, 22, 29, 30], "follow": [0, 1, 3, 4, 6, 7, 8, 9, 10, 11, 13, 15, 16, 17, 18, 19, 20, 22, 23, 24, 25, 27, 28, 29, 31, 33, 36], "figur": [0, 7, 29], "eager": [0, 17, 33], "mode": [0, 1, 3, 5, 10, 19, 24, 33], "frontend": [0, 1, 5, 29, 30, 33], "custom": [0, 1, 5, 8, 10, 13, 14, 20, 29, 33], "fusion": [0, 1, 4, 17, 23, 31, 33, 34, 36], "int8": [0, 2, 5, 17, 30, 31, 33], "quantiz": [0, 1, 2, 4, 14, 31, 33], "api": [0, 2, 4, 8, 9, 10, 17, 20, 23, 29, 30, 33, 34], "further": [0, 1, 4, 5, 18, 19, 29, 30], "improv": [0, 2, 11, 15, 29, 30, 33, 36], "achiev": [0, 1, 4, 33], "convert": [0, 1, 4, 5, 7, 11, 12, 15, 17, 19, 30, 31, 33, 36], "graph": [0, 1, 5, 11, 17, 23, 33, 36], "us": [0, 1, 2, 3, 6, 9, 10, 14, 15, 16, 17, 18, 19, 21, 23, 24, 25, 27, 28, 29, 30, 33, 34, 36, 37], "pass": [0, 3, 4, 8, 20, 27, 30], "reduc": [0, 1, 5, 9, 15, 20, 29, 30, 33, 38], "oper": [0, 1, 4, 5, 7, 8, 10, 11, 17, 18, 20, 21, 22, 23, 30, 33, 34], "kernel": [0, 1, 5, 8, 10, 13, 18, 20, 23, 27, 29, 30, 32, 33, 36], "invoc": [0, 27, 33], "overhead": [0, 1, 8, 29, 33, 34, 38], "result": [0, 1, 8, 19], "compar": [0, 1, 5, 19, 30, 33, 38], "normal": [0, 4, 9, 16, 20, 29, 30, 34, 36], "yield": 0, "better": [0, 1, 13, 17, 19, 29, 30, 33, 36, 38], "techniqu": [0, 1, 8], "like": [0, 1, 2, 3, 8, 10, 17, 18, 20, 27, 29, 30, 33], "amplifi": 0, "them": [0, 3, 10, 16, 19, 20, 27, 29, 30, 33, 36, 38], "comprehens": [0, 37], "both": [0, 1, 4, 5, 7, 15, 17, 19, 29, 30, 31, 33, 35, 36, 38], "torchscript": [0, 1, 5, 24, 38], "torchdynamo": 0, "With": [0, 1, 5, 6, 8, 16, 17, 20, 22], "we": [0, 1, 3, 4, 6, 7, 8, 9, 11, 12, 13, 17, 19, 23, 27, 29, 30, 32, 33, 37, 38], "recommend": [0, 4, 12, 13, 17, 23, 24, 27, 30, 33], "torch": [0, 1, 6, 7, 8, 9, 11, 13, 16, 17, 18, 19, 20, 22, 24, 27, 30, 31, 32, 33, 36, 38], "jit": [0, 1, 4, 11, 17, 19, 24, 31, 33, 34, 35], "trace": [0, 4, 10, 11, 17, 18, 24, 31], "your": [0, 3, 4, 6, 8, 9, 11, 16, 20, 22, 23, 24, 25, 27, 28, 30, 34, 35, 37], "prefer": [0, 13, 25], "option": [0, 
1, 10, 20, 23, 34, 35, 36], "wider": [0, 4], "rang": [0, 6, 8, 9, 15, 16, 17, 20, 31], "ipex": [0, 1, 2, 4, 5, 8, 12, 18, 22, 24, 27, 29, 30, 31, 33], "backend": [0, 1, 2, 5, 6, 7, 8, 9, 13, 16, 23, 27, 29, 30, 33, 35], "avail": [0, 1, 4, 5, 6, 8, 10, 13, 23, 24, 30, 31, 33, 37], "good": [0, 1, 3, 19, 29, 38], "On": [0, 5, 15, 19, 29, 30], "automat": [0, 1, 4, 5, 12, 15, 17, 19, 20, 22, 29, 30, 33, 34, 35], "dispatch": [0, 8, 33], "underli": [0, 5, 29, 37], "detect": [0, 4], "set": [0, 1, 3, 4, 5, 6, 8, 9, 10, 13, 16, 22, 23, 24, 27, 30, 32, 33, 35, 36], "isa": 0, "leverag": [0, 23], "unit": [0, 8], "runtim": [0, 1, 5, 7, 8, 9, 11, 18, 20, 24, 27, 33, 35], "offer": [0, 3, 20, 30, 37], "finer": 0, "grain": [0, 2], "thread": [0, 1, 4, 8, 22, 32], "control": [0, 5, 20, 22], "weight": [0, 1, 4, 6, 8, 15, 16, 17, 19, 31, 33, 34, 36], "share": [0, 3, 5, 6, 7, 8, 27, 33], "increas": [0, 1, 2, 16, 20, 27, 29, 30, 33, 34, 35, 37], "effici": [0, 6, 8, 9, 15, 23, 29, 30, 33, 38], "implement": [0, 3, 4, 5, 6, 7, 8, 9, 19, 29, 30, 33, 38], "regist": [0, 10, 30, 33], "mechan": [0, 8, 33], "These": [0, 4, 11, 15, 29, 30, 33], "nativ": [0, 5, 11, 27, 33, 38], "calcul": [0, 1, 8, 11, 18, 20, 30, 33], "util": [0, 4, 5, 6, 7, 8, 9, 13, 15, 16, 18, 19, 27, 30, 35, 36], "dpc": [0, 7, 10, 27, 33], "compil": [0, 3, 4, 10, 20, 24, 27, 33], "sycl": [0, 1, 5, 7, 10, 13, 18, 33, 34, 36], "standard": [0, 8, 29], "also": [0, 1, 4, 5, 7, 8, 10, 17, 18, 19, 20, 27, 29, 30, 31, 33, 34, 35, 37, 38], "number": [0, 3, 4, 6, 8, 9, 16, 18, 20, 22, 27, 32, 33, 38], "which": [0, 1, 4, 5, 6, 7, 8, 10, 11, 15, 18, 19, 20, 22, 27, 29, 30, 33, 35, 37], "found": [0, 4, 17, 19, 31, 33], "doc": [0, 3, 17, 31, 36], "directori": [0, 3, 8, 10, 24, 27, 31, 33], "team": [0, 3], "track": [0, 1], "bug": [0, 3, 33, 34, 35], "enhanc": [0, 2, 23, 29, 30, 33], "request": [0, 1, 3, 34], "issu": [0, 3, 11, 27, 29, 30], "befor": [0, 1, 3, 4, 5, 10, 17, 19, 20, 22, 27, 30, 34, 35, 36], "submit": [0, 1, 3, 8], "suggest": [0, 
19, 20], "report": [0, 18, 27], "search": [0, 3, 29, 33], "exist": [0, 3, 17, 20, 27, 30, 33], "see": [0, 1, 3, 8, 11, 15, 19, 20, 22, 27, 33, 35], "alreadi": [0, 3, 4, 16, 19, 29, 34], "dtype": [1, 4, 11, 13, 17, 20, 23, 24, 30, 31, 33, 36], "none": [1, 6, 9, 38], "level": [1, 5, 8, 10, 19, 20, 29, 30, 33, 35], "o1": 1, "inplac": [1, 17, 19, 30, 31], "fals": [1, 4, 9, 11, 17, 19, 20, 22, 24, 30, 31], "conv_bn_fold": 1, "linear_bn_fold": 1, "weights_prepack": 1, "replace_dropout_with_ident": 1, "optimize_lstm": 1, "split_master_weight_for_bf16": 1, "fuse_update_step": 1, "auto_kernel_select": 1, "sample_input": [1, 12], "graph_mod": 1, "concat_linear": 1, "appli": [1, 4, 5, 11, 16, 19, 24, 29, 30, 31, 33, 38], "given": [1, 29], "nn": [1, 4, 6, 9, 11, 13, 15, 19, 34, 36], "If": [1, 3, 4, 6, 7, 8, 10, 11, 12, 13, 17, 18, 19, 20, 27, 30, 33, 34, 35, 36], "train": [1, 2, 6, 9, 15, 16, 17, 19, 24, 27, 30, 31, 33, 34, 36], "otherwis": [1, 9, 10, 30], "infer": [1, 2, 5, 14, 15, 17, 19, 23, 24, 27, 33, 36], "conv": [1, 11, 18, 36], "bn": 1, "fold": 1, "prepack": [1, 19, 29], "so": [1, 4, 5, 7, 11, 16, 19, 22, 23, 27, 33, 37, 38], "onednn": [1, 2, 5, 10, 13, 18, 23, 29, 33], "order": [1, 7, 13, 15, 19, 22, 27], "cach": [1, 3, 8, 10, 14, 18, 33, 34, 37, 38], "reus": 1, "layout": [1, 5], "call": [1, 4, 5, 6, 8, 11, 18, 19, 20, 22, 30, 36, 37], "block": [1, 3, 30, 33, 34, 36], "although": 1, "itself": [1, 19, 20], "fast": [1, 8, 16, 30, 34], "enough": [1, 27], "usag": [1, 5, 7, 11, 13, 17, 19, 20, 24, 26, 33], "perspect": [1, 19], "drawback": 1, "run": [1, 3, 4, 5, 6, 8, 9, 10, 11, 20, 22, 24, 27, 33, 34, 35, 36], "split": [1, 8, 10, 18, 34, 36], "one": [1, 3, 6, 7, 8, 10, 13, 16, 17, 18, 19, 20, 27, 31, 33, 38], "sever": [1, 10, 18, 20, 27, 32, 38], "dimens": [1, 8, 13, 19, 36], "data": [1, 4, 6, 8, 11, 12, 14, 16, 17, 19, 23, 24, 27, 30, 31, 33, 35, 38], "fix": [1, 3, 27, 33], "size": [1, 4, 6, 7, 8, 9, 10, 16, 18, 19, 27, 29, 30, 33, 34, 35], "each": [1, 5, 6, 8, 9, 10, 11, 
13, 16, 18, 20, 22, 36], "time": [1, 3, 5, 6, 8, 9, 18, 19, 20, 21, 27, 29, 30, 38], "execut": [1, 4, 5, 7, 8, 10, 11, 13, 18, 20, 21, 27, 33, 34, 35, 36, 38], "detail": [1, 3, 4, 5, 6, 8, 10, 11, 12, 13, 17, 19, 23, 24, 26, 29, 30, 32, 33, 36], "mermori": 1, "format": [1, 3, 5, 6, 7, 9, 12, 13, 15, 16, 18, 20, 22, 30, 33, 36], "manual": [1, 5, 13, 19, 23, 36], "To": [1, 3, 4, 5, 6, 9, 16, 19, 20, 22, 24, 29, 30, 33], "thi": [1, 3, 4, 5, 6, 7, 8, 9, 12, 13, 14, 16, 17, 18, 19, 20, 22, 23, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 38], "predefin": 1, "shape": [1, 7, 13, 20, 29], "prior": [1, 24], "match": [1, 11, 36], "requir": [1, 3, 4, 6, 7, 8, 11, 13, 15, 17, 19, 24, 27, 29, 30, 31, 33, 36], "won": [1, 11, 20, 22], "t": [1, 3, 7, 8, 11, 19, 20, 22, 23, 33], "convers": [1, 4, 11, 17, 30, 33], "directli": [1, 8, 30], "go": [1, 3, 4, 8, 11, 34, 35], "methodologi": [1, 4, 5, 8, 38], "possibl": [1, 4, 7, 13], "avoid": [1, 3, 8, 27], "thu": [1, 8, 11, 13, 19, 33], "paramet": [1, 4, 5, 6, 9, 11, 16, 18, 20, 23, 29, 30, 33, 36, 38], "work": [1, 3, 4, 6, 8, 9, 10, 19, 20, 24, 27, 29, 31], "bfloat16": [1, 2, 5, 13, 23, 24, 33, 36], "half": 1, "k": [1, 30], "float16": [1, 5, 23, 24, 30, 31, 33], "cast": [1, 11], "accord": [1, 29, 30], "default": [1, 4, 5, 6, 7, 9, 10, 18, 20, 22, 23, 27, 33, 36], "valu": [1, 4, 10, 15, 27, 29, 30], "mean": [1, 10, 19, 22, 27, 29, 30, 33], "do": [1, 3, 5, 8, 11, 19, 20, 27, 29, 30], "noth": 1, "note": [1, 2, 3, 6, 7, 8, 9, 12, 14, 16, 19, 20, 23, 27, 29, 30, 33, 35], "type": [1, 3, 4, 5, 7, 8, 9, 17, 19, 24, 27, 30, 33, 34, 35, 36], "conv2d": [1, 9, 11, 19, 33], "linear": [1, 6, 9, 11, 13, 15, 19, 33, 34, 36], "convtranspose2d": 1, "case": [1, 4, 5, 8, 9, 10, 12, 19, 27, 30], "addit": [1, 4, 23, 30, 33, 34, 35, 36], "embed": [1, 29], "lstm": [1, 13], "sgd": [1, 4, 6, 11, 16, 23, 33, 34, 36, 38], "string": [1, 9, 18], "o0": 1, "No": [1, 19, 22, 27, 33], "function": [1, 3, 4, 5, 8, 9, 11, 17, 18, 20, 22, 23, 24, 29, 30, 31, 33, 38], "just": [1, 
8, 30, 31, 33, 34, 35], "return": [1, 4, 6, 8, 9, 11, 19, 20], "origin": [1, 7, 15, 16, 30, 31, 36, 38], "dropout": [1, 9, 33, 34, 36], "remov": [1, 3, 10, 20, 33], "inferenc": 1, "master": [1, 6, 34, 36], "fuse": [1, 29, 30, 33, 34, 36, 38], "updat": [1, 3, 6, 9, 33, 34, 36, 38], "step": [1, 3, 4, 6, 8, 9, 11, 13, 16, 18, 20, 22, 23, 30, 34, 36], "overridden": 1, "explicitli": [1, 4, 8, 10, 11, 20], "bool": 1, "whether": [1, 11, 19, 20, 36], "conv_bn": 1, "It": [1, 5, 6, 7, 8, 9, 14, 18, 19, 22, 24, 27, 30, 31, 33, 34, 35], "knob": 1, "overwrit": 1, "configur": [1, 4, 8, 17, 18, 20, 24, 30, 33, 35], "linear_bn": 1, "convolut": [1, 5, 11, 23, 34, 36], "reorder": [1, 19, 29], "replac": [1, 3, 6, 9, 17, 30, 34, 36], "ident": [1, 4, 19, 34, 36], "aten": [1, 7, 8, 10], "opportunit": 1, "bf16": [1, 2, 24, 33, 38], "save": [1, 3, 4, 9, 15, 16, 19, 33, 36], "solut": [1, 2, 9, 27, 29, 33], "doesn": [1, 19, 20], "support": [1, 3, 4, 6, 7, 8, 10, 13, 17, 18, 21, 23, 26, 27, 29, 31, 33, 35, 36, 38], "all": [1, 3, 4, 6, 8, 9, 10, 11, 13, 16, 18, 20, 22, 29, 30, 31, 32, 33, 36, 37, 38], "param": [1, 38], "tupl": [1, 6], "tensor": [1, 4, 5, 7, 8, 11, 13, 17, 20, 29, 33, 37], "feed": [1, 12, 19], "sampl": [1, 6, 12], "input": [1, 4, 6, 8, 9, 12, 13, 17, 18, 19, 20, 23, 30, 32], "impact": 1, "pack": [1, 7], "intel": [1, 2, 5, 7, 8, 9, 10, 12, 13, 16, 18, 19, 20, 21, 22, 23, 24, 26, 27, 28, 29, 31, 33, 34, 35, 36, 38], "extens": [1, 2, 4, 7, 9, 10, 12, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 31, 32, 33, 34, 35, 36, 38], "pytorch": [1, 2, 4, 5, 7, 8, 9, 10, 11, 12, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 31, 32, 33, 34, 35, 36, 37, 38], "per": [1, 7, 8, 10, 16, 17, 32], "some": [1, 3, 5, 8, 9, 11, 13, 19, 20, 24, 27], "heurist": 1, "real": [1, 4, 6], "best": [1, 11, 17, 29], "try": [1, 3, 4, 6, 18, 20, 27], "select": [1, 4, 5, 25, 30, 33], "true": [1, 4, 6, 8, 9, 13, 15, 17, 23, 24, 30, 31, 36], "might": [1, 19, 27, 38], "cost": [1, 5, 8, 20, 21], "extra": [1, 6, 33], 
"auto": [1, 4, 8, 19, 29, 33], "prototyp": [1, 33, 35], "combin": [1, 17], "method": [1, 7, 8, 11, 20, 22, 29, 30], "multipl": [1, 3, 5, 6, 11, 19, 23, 27, 29, 33, 35], "subgraph": 1, "modifi": [1, 3, 16], "other": [1, 5, 7, 9, 11, 13, 14, 15, 16, 17, 18, 19, 20, 24, 27, 29, 30, 33, 37, 38], "place": [1, 6, 11, 18, 29], "scenario": [1, 7, 14, 17, 30], "convolutuon": 1, "counterpart": [1, 33], "pleas": [1, 3, 5, 6, 9, 18, 20, 21, 23, 24, 30, 33, 35, 36], "invok": [1, 4, 8, 11, 24, 27, 31, 33], "ddp": [1, 5, 9, 33], "distribut": [1, 2, 6, 9, 10, 16, 27, 33, 34, 35], "deepcopi": 1, "rather": [1, 18, 19], "than": [1, 10, 13, 14, 17, 18, 19, 20, 23, 27, 30, 33], "allreduc": [1, 6, 16, 33], "caus": [1, 27, 29, 33, 35], "unpredict": 1, "accuraci": [1, 2, 9, 11, 29, 30, 33], "loss": [1, 4, 6, 9, 11, 16, 19, 23, 29, 30], "exampl": [1, 3, 5, 10, 11, 13, 16, 18, 19, 20, 22, 24, 25, 26, 29, 30, 31, 38], "load_state_dict": 1, "path": [1, 4, 8, 9, 10, 18, 19, 27, 36], "eval": [1, 4, 9, 11, 17, 24, 31], "optimized_model": 1, "evalu": [1, 24, 33], "optimized_optim": 1, "altern": [1, 4, 5, 6, 19], "motiv": [1, 4], "ad": [1, 4, 6, 20, 23, 33], "alia": [1, 4, 8], "unifi": [1, 4], "style": [1, 3, 4, 8], "modular": [1, 4], "optimize_transform": [1, 29, 30, 31, 33], "float32": [1, 20, 24], "quantization_config": [1, 30], "qconfig_summary_fil": 1, "low_precision_checkpoint": 1, "deployment_mod": 1, "transform": [1, 2, 4, 9, 14, 19, 32, 33], "focu": [1, 19, 29, 30, 31, 33], "especi": [1, 3, 8], "task": [1, 29, 30], "famili": [1, 29], "llama": [1, 2, 29, 33], "gpt": [1, 29, 30, 32, 33], "j": [1, 29, 30, 32, 33], "neox": 1, "opt": [1, 6, 24, 29, 32, 33], "falcon": 1, "now": [1, 5, 8, 19, 23], "float": [1, 4, 5, 8, 9, 11, 15, 36], "when": [1, 3, 5, 6, 7, 8, 10, 11, 12, 13, 14, 16, 18, 19, 20, 22, 27, 29, 30, 33, 34, 35, 36, 38], "mix": [1, 4, 8, 33], "str": [1, 6, 20], "specifi": [1, 8, 9, 10, 18, 35, 36], "either": [1, 5, 6], "object": [1, 4, 27, 33], "defin": [1, 5, 7, 8, 9, 11, 15, 16, 
17, 18, 19, 20, 31, 33, 36], "recip": [1, 15], "quant": 1, "static": [1, 5, 17, 29, 30, 36], "onc": [1, 3, 5, 13, 18, 19, 20, 30, 35, 36], "quantizat": 1, "config": [1, 4, 17], "json": [1, 20], "file": [1, 3, 4, 8, 10, 11, 18, 19, 20, 27, 33, 35], "under": [1, 7, 11, 14, 19, 27, 28, 33], "need": [1, 3, 4, 5, 6, 8, 9, 16, 17, 19, 20, 22, 23, 24, 31, 33, 34, 35, 36, 38], "calibr": [1, 4, 17, 31], "dict": 1, "int4": [1, 30, 33], "": [1, 2, 6, 8, 9, 11, 18, 19, 20, 27, 30, 38], "should": [1, 3, 8, 9, 11, 18, 20, 22, 27, 29, 30], "state_dict": [1, 4, 9, 16], "checkpoint": [1, 4, 16, 27], "pt": [1, 4, 9, 30], "gptq": [1, 30], "etc": [1, 3, 5, 14, 18], "where": [1, 3, 5, 6, 7, 9], "kei": [1, 5, 6, 29, 33, 34], "group": [1, 6, 8, 9], "chang": [1, 3, 4, 6, 8, 9, 11, 16, 19, 24, 27, 30, 31, 33], "make": [1, 3, 4, 5, 6, 8, 9, 16, 20, 23, 24, 29, 30, 33, 35], "n": [1, 4, 6, 8, 9, 19], "thei": [1, 8, 11, 19, 29, 30, 33], "uint4": 1, "compress": [1, 15, 30], "along": [1, 3, 22], "store": [1, 14, 19, 29, 38], "int32": [1, 33], "zero": [1, 5, 9, 10, 20, 27, 30, 35], "point": [1, 5, 7, 8, 11, 15, 30], "scale": [1, 2, 5, 9, 15, 16, 29, 30, 33], "bia": [1, 8, 11, 14, 19, 30, 33], "state": [1, 5, 8, 9, 16, 29, 37], "channel": [1, 2, 10, 17, 33, 34], "automaticlli": 1, "deploy": [1, 4, 30], "torchscirpt": 1, "workabl": 1, "forward": [1, 4, 6, 8, 9, 11, 13, 19, 36], "after": [1, 3, 4, 8, 17, 20, 22, 24, 25, 27, 30, 38], "deepspe": [1, 29, 32, 33], "parallel": [1, 6, 27, 29, 33], "get_fp32_math_mod": 1, "fpmath_mod": 1, "fpmath": 1, "fp32mathmod": 1, "fp32": [1, 10, 13, 14, 17, 24, 33, 38], "bf32": [1, 10], "tf32": [1, 10], "disabl": [1, 9, 10, 18, 36], "implicit": 1, "set_fp32_math_mod": 1, "current_devic": 1, "int": [1, 4, 6, 8, 9], "index": [1, 3, 6, 7, 8, 9, 19, 22, 29, 33], "current_stream": [1, 8], "class": [1, 6, 9, 11, 19], "ani": [1, 3, 11, 18, 19, 20, 27, 33, 34], "context": [1, 3, 5, 7, 8, 10, 11, 22, 29], "wrapper": [1, 6, 8, 22], "encapsul": [1, 8], "op": [1, 3, 9, 10, 13, 
18, 22, 23, 29, 30], "argument": [1, 6, 9, 13, 20, 30], "neg": 1, "integ": [1, 30], "device_count": [1, 9, 20], "device_of": 1, "obj": 1, "storag": [1, 38], "alloc": [1, 7, 16, 18, 29, 34, 37], "get_device_nam": 1, "name": [1, 8, 13, 20, 22, 30], "get_device_properti": 1, "properti": [1, 4, 8], "_deviceproperti": 1, "init": [1, 3, 16], "initi": [1, 6, 9, 16], "lazi": 1, "until": [1, 3, 22], "first": [1, 2, 3, 4, 6, 8, 12, 16, 17, 20, 30, 33], "access": [1, 7, 8, 19, 27, 29, 33, 38], "veri": [1, 3, 8, 19, 20, 21, 27, 29], "rare": 1, "sinc": [1, 7, 8, 19, 30, 34, 35], "could": [1, 6, 10, 17, 18, 19, 20, 30, 31, 36], "demand": [1, 5], "doe": [1, 7, 8, 18, 19, 23, 27, 33, 36], "repeatedli": [1, 3], "is_avail": 1, "indic": [1, 19], "is_initi": 1, "set_devic": [1, 6, 9, 16], "discourag": 1, "favor": 1, "most": [1, 5, 10, 13, 17, 27, 29, 33], "ze_affinity_mask": 1, "environment": 1, "variabl": [1, 3, 5, 10, 16, 18, 24, 27], "restrict": 1, "visibl": [1, 33], "streamcontext": 1, "around": [1, 8], "synchron": [1, 7, 8, 10, 16, 20], "wait": [1, 7, 20, 30], "complet": [1, 3, 4, 18, 19, 20, 31, 37], "fp8": [1, 5, 33], "fp8_autocast": [1, 15], "fp8_recip": [1, 15], "delayedsc": [1, 15], "get_rng_stat": 1, "bytetensor": 1, "rng": 1, "eagerli": 1, "get_rng_state_al": 1, "list": [1, 3, 4, 10, 11, 19, 20, 26, 29, 31, 33, 35], "repres": [1, 3, 6, 18, 33], "set_rng_stat": 1, "new_stat": 1, "desir": [1, 4, 17], "set_rng_state_al": 1, "iter": [1, 6, 20, 29], "manual_se": [1, 6, 9], "seed": [1, 6, 9], "safe": [1, 7], "silent": 1, "ignor": [1, 36], "multi": [1, 6, 35], "insuffici": 1, "determin": [1, 30], "manual_seed_al": 1, "seed_al": 1, "initial_se": 1, "prioriti": [1, 13], "kwarg": [1, 20], "record_ev": 1, "record": [1, 9, 18, 20], "new": [1, 2, 3, 15, 19, 30, 31, 33, 34], "sycl_queu": [1, 4, 8], "pycapsul": [1, 8], "queue": [1, 4, 5, 10, 18], "correspond": [1, 6, 27, 30, 33], "void": [1, 8], "pointer": [1, 8, 27], "address": [1, 6, 19], "Its": [1, 34, 36], "capsul": 1, "self": [1, 6, 
9, 11, 19, 20], "wait_ev": 1, "futur": [1, 3, 12], "wait_stream": 1, "anoth": [1, 30], "without": [1, 5, 7, 11, 20, 22, 27, 30, 33, 34], "enqueu": 1, "affect": 1, "elapsed_tim": [1, 9], "end_ev": 1, "elaps": [1, 9], "millisecond": 1, "wa": [1, 4, 7, 8, 22, 27, 33], "queri": [1, 19], "check": [1, 4, 5, 6, 8, 13, 19, 20, 24, 29, 31, 34, 36], "captur": [1, 37], "A": [1, 3, 4, 5, 17, 19, 27, 30, 33, 35], "boolean": 1, "prevent": [1, 16, 38], "proceed": 1, "empty_cach": [1, 37], "unoccupi": 1, "held": 1, "those": [1, 8, 16, 20, 29, 37], "sysman": 1, "toolkit": [1, 6, 24, 32, 33], "amount": [1, 37], "howev": [1, 3, 5, 8, 10, 11, 12, 19, 29, 30, 33, 37], "help": [1, 3, 4, 8, 9, 13, 29, 34, 35, 37], "fragment": 1, "memory_stat": [1, 37], "dictionari": 1, "statist": [1, 4, 17, 31], "non": [1, 3, 11, 19, 30], "core": [1, 16, 23, 27, 32, 33], "large_pool": 1, "small_pool": 1, "peak": 1, "freed": [1, 37], "receiv": [1, 27, 33], "allocated_byt": 1, "segment": [1, 5, 20, 33], "reserv": [1, 18, 34], "xpumalloc": 1, "reserved_byt": 1, "activ": [1, 4, 15, 16, 17, 20, 24, 27, 29, 30, 31], "active_byt": 1, "inactive_split": 1, "inact": 1, "inactive_split_byt": 1, "broken": 1, "down": [1, 27], "pool": [1, 34], "across": [1, 5, 6, 9], "octob": 1, "2019": [1, 2], "1mb": 1, "small": [1, 17], "metric": 1, "maximum": 1, "histor": 1, "total": [1, 8, 20, 32, 33, 37], "decreas": [1, 27], "simpl": [1, 8, 10, 11, 19, 23, 24, 30, 33], "counter": 1, "num_alloc_retri": 1, "fail": [1, 27, 33, 36], "flush": 1, "retri": 1, "num_oom": 1, "out": [1, 4, 5, 7, 8, 11, 20, 22, 27, 30, 38], "error": [1, 3, 4, 8, 9, 18, 19, 27, 30, 33], "thrown": [1, 27], "memory_summari": 1, "abbrevi": 1, "human": 1, "readabl": [1, 8], "printout": 1, "displai": 1, "period": 1, "dure": [1, 3, 4, 27, 29, 30, 35, 36], "handl": [1, 4, 7, 19, 33], "except": [1, 6, 29], "summari": 1, "memory_snapshot": [1, 37], "snapshot": [1, 37], "interpret": [1, 8], "output": [1, 4, 9, 10, 11, 13, 15, 16, 18, 19, 20, 22, 23, 30], "familiar": 
[1, 8], "intern": [1, 10, 19], "memory_alloc": [1, 37], "occupi": [1, 37], "byte": 1, "less": [1, 5, 10, 11, 27, 33], "unus": [1, 37], "creat": [1, 3, 4, 7, 8, 10, 14, 17, 20, 23, 31, 33, 36], "max_memory_alloc": [1, 37], "By": [1, 5, 36], "begin": [1, 3, 8, 22], "reset_peak_stat": 1, "reset": [1, 14], "two": [1, 5, 7, 8, 15, 18, 29, 34], "measur": 1, "loop": [1, 3, 10, 20], "memory_reserv": [1, 37], "max_memory_reserv": [1, 37], "reset_peak_memory_stat": 1, "stat": 1, "individu": [1, 3], "memory_stats_as_nested_dict": 1, "nest": [1, 22], "reset_accumulated_memory_stat": 1, "accumul": 1, "enum": 1, "fp32_math_mod": 1, "dpccp": 1, "packet": 1, "enumer": [1, 4, 7, 9, 16], "math": [1, 5, 8, 10, 13], "fp32_math_mode_min": 1, "fp32_math_mode_max": 1, "comput": [1, 4, 15, 16, 19, 23, 29, 30, 33, 34, 35, 36], "primit": [1, 10], "attribut": [1, 19], "descript": [1, 5, 9, 10, 18, 19, 26, 36], "definit": [1, 8], "numer": [1, 11], "behavior": [1, 13, 22], "get_queue_from_stream": [1, 4, 8], "c10": [1, 4], "dpcpp": [1, 4, 8, 27, 33], "dec": 2, "2023": [2, 30, 32, 33], "softwar": [2, 28], "jul": 2, "deep": [2, 6, 7, 9, 11, 13, 15, 16], "learn": [2, 6, 7, 9, 11, 15, 16], "boost": [2, 4, 12, 32, 33], "dl": 2, "hug": 2, "face": 2, "bert": [2, 15], "googl": [2, 3], "cloud": 2, "platform": [2, 4, 7, 19, 23, 27, 30, 33, 35, 36], "gcp": 2, "technologi": 2, "guid": [2, 6], "apr": 2, "mar": 2, "x86": 2, "sapphir": 2, "rapid": 2, "part": [2, 4, 8, 11, 18, 19, 20, 34, 36], "jan": 2, "secur": 2, "torchserv": 2, "confer": 2, "2022": [2, 30, 33], "what": [2, 11, 20, 22, 33, 34, 35], "pyg": 2, "stabl": [2, 5, 6, 7, 11, 27], "diffus": 2, "arc": [2, 27, 30, 33, 35], "nov": [2, 33], "13": [2, 22], "potenti": [2, 23, 36], "fine": [2, 8, 30], "fx": 2, "sep": 2, "empow": [2, 5, 23], "xeon": [2, 32], "scalabl": 2, "processor": [2, 38], "aug": 2, "vision": [2, 4], "last": [2, 8, 10, 33, 34], "One": [2, 7, 15, 19, 38], "click": 2, "compressor": 2, "throughput": [2, 29, 33], "4x": [2, 32], "jun": 2, 
"grokk": 2, "principl": [2, 16, 19], "kt": 2, "person": 2, "text": [2, 35], "speech": 2, "2021": [2, 6, 33], "tune": [2, 11, 30], "up": [2, 5, 7, 8, 9, 29, 33, 34], "modern": 2, "naver": 2, "low": [2, 5, 6, 8, 24, 33, 38], "latenc": [2, 29, 33], "machin": [2, 3, 6, 30], "feb": [2, 33], "dlrm": 2, "oneccl": [2, 5, 9, 33], "mention": [2, 6, 8, 18], "deprec": [2, 12], "facebook": [2, 29], "3rd": 2, "gen": 2, "capabl": [2, 5, 17, 18, 23, 30, 33, 37], "2020": [2, 7], "collabor": 2, "caff": 2, "2017": 2, "thank": 3, "interest": 3, "intent": 3, "want": [3, 8, 10, 19, 20, 24], "propos": [3, 19, 29, 30], "post": [3, 5, 17, 23, 29, 30], "intend": 3, "shall": [3, 19], "discuss": [3, 8, 19], "design": [3, 9, 11, 13, 14, 19, 30, 31, 33, 34, 36], "agre": 3, "plan": 3, "look": [3, 4, 8, 19, 30], "ahead": [3, 8, 20, 22], "outstand": 3, "pick": 3, "comment": 3, "d": [3, 4, 8, 11, 36], "particular": [3, 11, 27, 30, 31, 33], "ask": 3, "pull": 3, "http": [3, 6, 7, 8, 9, 27], "com": [3, 6, 9, 27], "full": [3, 8, 23], "instal": [3, 4, 9, 10, 20, 23, 24, 26, 27, 29, 33, 35], "here": [3, 6, 7, 8, 9, 11, 18, 19, 20, 22], "uninstal": 3, "pip": [3, 6, 16, 23, 30], "ll": [3, 8, 20, 22], "know": [3, 7, 8, 34, 35], "fulli": [3, 17, 31, 33], "warn": [3, 18], "skip": [3, 4, 19, 22, 30, 34, 35], "few": [3, 12, 19], "alwai": [3, 11, 13, 19, 27, 33], "timeout": 3, "ye": 3, "clone": [3, 22, 38], "copi": [3, 5, 7, 19], "git": 3, "b": [3, 11, 22], "cd": [3, 4], "rebas": 3, "submodul": 3, "sync": 3, "recurs": 3, "job": [3, 27], "setup": [3, 8, 9, 16, 20, 29], "py": [3, 6, 8, 9, 10, 20, 27], "symlink": 3, "tree": 3, "reinstal": [3, 23], "again": [3, 23, 38], "__init__": [3, 6, 9, 11, 19], "would": [3, 4, 8, 10, 13, 17, 18, 19, 27, 33], "interfac": [3, 4, 8, 19, 29, 30], "pyi": 3, "cpp": [3, 4, 8], "h": [3, 4, 8, 18, 19, 30], "sure": [3, 6, 9, 16, 20], "Then": [3, 6, 17, 23, 27, 30, 33], "clean": [3, 27, 33], "our": [3, 4, 8, 13, 17, 29, 30], "3": [3, 4, 6, 8, 9, 11, 13, 18, 19, 20, 22, 23, 27, 30, 32, 
33, 36], "6": [3, 4, 22, 32], "binari": [3, 4, 11, 19, 34, 35], "folder": 3, "mani": [3, 5, 8, 20], "wai": [3, 8, 19, 29, 30, 38], "next": [3, 8, 13], "re": [3, 6, 8, 11], "rm": [3, 14], "rf": 3, "toplevel": 3, "over": [3, 5, 8, 11, 12, 19, 33], "made": [3, 7, 33], "edit": 3, "repo": [3, 6, 23], "commit": [3, 23, 32], "keep": [3, 19, 20], "command": [3, 4, 6, 8, 16, 23, 27, 33], "realli": [3, 8], "untrack": 3, "deinit": 3, "f": [3, 4, 9, 16], "xdf": 3, "within": [3, 18, 30, 31, 33, 34, 36], "experi": [3, 13, 19, 29, 30], "environ": [3, 5, 6, 9, 10, 16, 18, 24, 27, 29], "env_key1": 3, "env_val1": 3, "env_key2": 3, "env_val2": 3, "suit": 3, "locat": 3, "test_": 3, "sub_fold": 3, "filenam": 3, "contain": [3, 6, 7, 10, 13, 30], "wish": [3, 8, 19, 20], "experiment": 3, "port": [3, 6], "stock": [3, 7, 19, 33], "10": [3, 7, 9, 18, 19, 20, 22, 23], "regress": [3, 12], "don": [3, 8, 11, 20, 23], "offici": [3, 17, 20, 33], "via": [3, 5, 7, 8, 10, 17, 20, 23, 30, 35, 37], "read": [3, 8, 30, 38], "readm": 3, "md": [3, 19], "docstr": 3, "length": [3, 32], "line": [3, 8, 19, 20, 22], "insid": [3, 8, 18, 23, 34, 36], "must": [3, 8, 20, 35, 36, 38], "limit": [3, 11, 19, 29, 30, 33], "80": 3, "charact": 3, "fit": [3, 30, 34], "jupyt": 3, "popup": 3, "abov": [3, 6, 7, 9, 10, 18, 19, 20, 22, 29, 38], "prerequisit": [3, 4], "r": [3, 32], "txt": [3, 4, 8], "html": [3, 7, 8], "_build": 3, "rst": 3, "live": 3, "tutori": [3, 4, 6, 8, 9, 20], "autofunct": 3, "autoclass": 3, "direct": 3, "shorten": 3, "sphinx": 3, "produc": [3, 7, 8, 11, 37], "miss": 3, "still": [3, 5, 8, 11, 19, 20, 33, 35], "torchvis": [4, 9], "demonstr": [4, 7, 13, 19, 30], "box": 4, "benefit": [4, 5, 11, 17], "against": 4, "precis": [4, 6, 15, 24, 30, 32, 33], "amp": [4, 23, 24, 33], "criterion": [4, 11], "below": [4, 6, 8, 11, 13, 14, 18, 19, 20, 22, 24, 30, 33, 35, 38], "move": [4, 17, 19, 24], "dataload": [4, 6, 9, 16, 20], "target": [4, 8, 9, 16, 33, 34, 35], "zero_grad": [4, 9, 16, 23], "autocast": [4, 23, 24], 
"backward": [4, 6, 8, 9, 11, 16, 23, 36], "lr": [4, 6, 9, 11, 23, 38], "001": [4, 6, 11], "download": [4, 9, 23, 27], "dataset": [4, 6, 9, 16, 30], "cifar10": 4, "compos": [4, 9], "resiz": 4, "224": [4, 11, 23], "totensor": [4, 9], "5": [4, 6, 9, 13, 17, 18, 19, 22, 30, 31, 32], "train_dataset": [4, 6, 16], "root": [4, 6, 10, 27, 29, 33], "train_load": [4, 6, 9, 11, 16], "batch_siz": [4, 6, 8, 9, 16, 19], "128": [4, 9, 11], "crossentropyloss": 4, "momentum": [4, 23, 38], "9": [4, 22, 33], "batch_idx": [4, 9, 16], "print": [4, 5, 6, 9, 16, 17, 19, 20, 22, 31], "model_state_dict": 4, "optimizer_state_dict": 4, "pth": 4, "finish": [4, 13, 18, 27], "nlp": 4, "resnet50_weight": 4, "rand": [4, 11, 19, 23], "no_grad": [4, 9, 23, 24], "bertmodel": 4, "from_pretrain": [4, 30], "uncas": 4, "vocab_s": 4, "seq_length": 4, "randint": 4, "freez": [4, 11, 23, 24], "strict": 4, "becaus": [4, 11, 19, 29, 35], "insert": [4, 17, 31], "observ": [4, 12, 17, 31], "prepare_jit": [4, 17, 31], "convert_jit": [4, 17, 31], "separ": [4, 8, 10, 18, 27, 28], "process": [4, 5, 6, 8, 9, 13, 15, 16, 22, 27, 29, 30, 36], "collect": [4, 5, 6, 9, 17, 20, 33], "o": [4, 6, 9, 27, 32, 33], "_recurs": 4, "wrap_cpp_modul": 4, "quantize_jit": [4, 17, 31], "modeljit": [4, 17, 31], "qconfig": [4, 17, 31], "minmaxobserv": [4, 17, 31], "with_arg": [4, 17, 31], "qscheme": [4, 17, 31], "per_tensor_symmetr": [4, 17, 31], "reduce_rang": [4, 17, 31], "quint8": 4, "default_weight_observ": [4, 17, 31], "calibration_data_load": 4, "batch": [4, 8, 9, 16, 19, 33, 34, 36], "len": [4, 9, 16, 20], "memory_format": [4, 5, 19], "channels_last": [4, 5, 19], "libtorch": [4, 33], "its": [4, 6, 7, 10, 11, 13, 14, 20, 24, 30], "own": [4, 8], "servic": 4, "regular": 4, "unlik": [4, 5, 9, 29, 30], "cmake": [4, 5, 33], "cppsdk": 4, "ensur": [4, 16], "page": [4, 6, 9, 20, 23, 25, 27, 31, 32, 36], "version": [4, 7, 8, 23, 27, 28, 33, 36, 38], "app": 4, "iostream": 4, "memori": [4, 5, 7, 8, 9, 11, 12, 15, 18, 20, 27, 29, 30, 32, 33, 
36, 38], "argc": 4, "const": [4, 8], "char": 4, "argv": 4, "catch": [4, 17, 18], "std": [4, 8], "cerr": 4, "kxpu": 4, "ivalu": 4, "push_back": 4, "cout": 4, "slice": [4, 8, 19], "dim": [4, 7, 8, 9, 13, 19], "end": [4, 18, 20, 22, 27, 30, 34, 35, 36], "endl": 4, "cmakelist": [4, 8], "cmake_minimum_requir": [4, 8], "fatal_error": [4, 8], "find_packag": [4, 8], "add_execut": 4, "target_link_librari": [4, 8], "torch_ipex_librari": [4, 8], "set_properti": [4, 8], "cxx_standard": [4, 8], "17": [4, 8, 22, 32], "mkdir": 4, "build": [4, 5, 16, 22, 23, 27, 33, 35], "cc": 4, "icx": [4, 8], "cxx": 4, "icpx": [4, 8], "dcmake_prefix_path": [4, 8], "libpytorch_path": 4, "libpytorch": 4, "_": [4, 6, 7, 8, 14, 17, 18, 19, 22, 27, 30, 33, 36], "absolut": 4, "verifi": [4, 23, 27, 29], "linux": [4, 8, 27, 33], "ldd": 4, "y": [4, 11, 30], "z": [4, 8], "log": [4, 5, 9, 10, 22, 33], "depend": [4, 19, 33], "choos": [4, 5, 11, 13, 20, 33], "workspac": [4, 14], "identif": 4, "intelllvm": 4, "202x": 4, "abi": [4, 33], "info": [4, 17, 18], "done": [4, 9], "oneapi": [4, 5, 6, 7, 9, 13, 16, 24, 27, 32, 33, 35], "bin": [4, 27, 32, 33], "pthread": 4, "test": [4, 9, 22, 23, 32, 33, 34, 35], "cmake_have_libc_pthread": 4, "success": [4, 25], "lib": [4, 10, 18, 27, 33], "libintel": 4, "ext": 4, "written": [4, 33, 36], "0x00007fd5bb927000": 4, "libc10": 4, "0x00007fd5bb895000": 4, "libtorch_cpu": 4, "0x00007fd5a44d8000": 4, "0x00007fd5a1a1b000": 4, "0x00007fd5862b0000": 4, "libmkl_intel_lp64": [4, 27, 33], "mkl": [4, 24, 27, 33], "intel64": [4, 27, 33], "0x00007fd584ab0000": 4, "libmkl_cor": [4, 27, 33], "0x00007fd5806cc000": 4, "libmkl_gnu_thread": [4, 27], "0x00007fd57eb1d000": 4, "libmkl_sycl": [4, 27, 33], "0x00007fd55512c000": 4, "libopencl": 4, "0x00007fd55511d000": 4, "libsvml": 4, "intel64_lin": 4, "0x00007fd553b11000": 4, "libirng": 4, "0x00007fd553600000": 4, "libimf": 4, "0x00007fd55321b000": 4, "libintlc": 4, "0x00007fd553a9c000": 4, "libsycl": 4, "0x00007fd552f36000": 4, "show": [4, 6, 7, 
8, 11, 20, 22, 29, 30, 31, 32, 33], "fsycl": [4, 8, 35], "cmake_cxx_flag": 4, "usm": [4, 7], "cl": 4, "hpp": 4, "namespac": [4, 11], "fetch": 4, "stream": [4, 10], "device_typ": [4, 8], "devicetyp": [4, 8], "impl": [4, 8, 33], "virtualguardimpl": [4, 8], "xpu_stream": 4, "getstream": [4, 8], "input_ptr": 4, "malloc_devic": 4, "fromusm": 4, "scalartyp": 4, "nullopt": 4, "output_tensor": 4, "append": 4, "allow": [4, 11, 27, 30, 33, 34, 35], "former": [4, 8], "zoo": 4, "benchmark": [4, 32, 37], "mark": [4, 18, 20, 30], "document": [4, 5, 8, 10, 31, 33], "column": [4, 8, 20], "simpli": [4, 8], "guidanc": 5, "nchw": 5, "nhwc": [5, 33], "anymor": 5, "center": [5, 14, 23, 27, 30, 33, 35], "flex": [5, 27, 33, 35], "seri": [5, 14, 23, 27, 30, 33, 35], "typic": [5, 7, 16, 20, 30, 33], "speed": [5, 8, 29, 33, 34, 38], "side": [5, 7], "imper": 5, "illustr": [5, 6, 9, 17, 19], "workflow": [5, 17], "meet": [5, 15, 20, 36], "commun": [5, 6, 7, 9, 33], "bind": [5, 8, 9, 33], "formerli": [5, 6, 9], "known": [5, 6, 9, 29], "torch_ccl": [5, 6], "horovod": [5, 27, 33], "among": [5, 7, 16], "framework": [5, 7, 10, 16], "interopar": 5, "particularli": [5, 7], "describ": [5, 6, 11, 19, 27, 30], "write": [5, 20], "practic": [5, 8, 29], "setuptool": 5, "suffici": [5, 10, 30], "driver": [5, 27, 32, 35], "ze_flat_device_hierarchi": [5, 10], "hierarchi": 5, "expos": [5, 11], "tile": [5, 6, 10, 29, 32], "industri": [5, 9, 33], "grade": [5, 9, 33], "worker": [5, 6, 9, 16], "maintain": [5, 6, 8, 9, 11], "replica": [5, 6, 9], "gradient": [5, 9, 15, 16, 30], "rank": [5, 6, 9, 16], "footprint": [5, 9, 15, 29, 33], "feasibl": [5, 9], "seamlessli": [5, 23], "har": [5, 23], "flagship": [5, 23], "inductor": [5, 23, 33], "torchinductor": [5, 23], "built": [5, 8, 22, 27, 33, 35], "let": [5, 8, 19, 22, 38], "stack": [5, 11, 18, 22], "piec": [5, 22], "verbos": [5, 8, 10, 18, 22], "messag": [5, 8, 10, 18, 19, 22, 27, 33], "indent": [5, 20, 22], "distinguish": [5, 22], "field": [5, 20, 22], "statement": [5, 
20], "capac": [5, 13, 32], "continu": [5, 13, 18, 22, 27, 33], "macro": [5, 18], "torch_check": [5, 8, 18], "torch_error": [5, 18], "replic": 6, "everi": [6, 22, 29], "fed": 6, "c10d": [6, 9], "ccl": [6, 9, 16, 32], "processgroup": [6, 9], "hold": [6, 9, 19], "allgath": [6, 9, 16, 33], "alltoal": [6, 16], "successfulli": [6, 20], "apt": 6, "yum": 6, "dnf": 6, "sudo": 6, "devel": 6, "12": [6, 22, 33], "309": 6, "oneccl_bindings_for_pytorch": [6, 9], "repo_url": 6, "whl": [6, 23, 27], "u": [6, 8], "repositori": 6, "holder": [6, 18], "url": 6, "m": [6, 8, 9, 16, 23], "oneccl_bind_pt": 6, "basekit": [6, 16], "oneapi_root": 6, "env": [6, 16, 24], "var": [6, 16, 24], "sh": [6, 16, 24, 30], "manag": [6, 8, 11, 29], "modif": [6, 9, 16, 17], "necessari": [6, 9, 16, 19, 20, 22], "dist": [6, 9, 11, 27], "init_process_group": [6, 9], "exclus": [6, 9, 10], "id": [6, 7, 9, 22, 29], "local": [6, 9, 16], "arg": [6, 9, 16, 18, 20, 38], "local_rank": [6, 9, 16], "wrap": [6, 9, 16, 36], "device_id": [6, 7, 9, 20], "exactli": [6, 8], "resid": 6, "seed_numb": 6, "same": [6, 8, 9, 18, 19, 27, 29, 33], "launcher": 6, "cwd": 6, "setvar": 6, "Or": 6, "example_ddp": 6, "def": [6, 8, 9, 11, 19, 20, 27, 33], "super": [6, 9, 11, 19], "4": [6, 8, 18, 19, 22, 30, 33], "__name__": [6, 9], "__main__": [6, 9], "123": 6, "mpi_world_s": 6, "pmi_siz": 6, "mpi_rank": 6, "pmi_rank": 6, "world_siz": [6, 9], "els": [6, 19, 20, 38], "world": 6, "master_addr": [6, 9], "127": 6, "master_port": [6, 9], "29500": 6, "global": [6, 20], "get_rank": 6, "get_world_s": 6, "loss_fn": 6, "mseloss": 6, "rune": 6, "randn": [6, 13, 19, 20, 22], "label": [6, 11, 15], "l": 6, "mpirun": 6, "card": [6, 19, 27, 29, 33], "regard": [6, 19], "explicit": [6, 22], "minor": [6, 33], "single_card": 6, "single_card_dist": 6, "importerror": [6, 27, 33], "rais": 6, "spawn": [6, 9], "multiprocess": [6, 9], "multi_process_spawn": 6, "main_work": 6, "put": [6, 7, 9, 20], "train_sampl": [6, 16], "epoch": [6, 9, 16], "set_epoch": [6, 9], 
"adjust": [6, 30], "warp": 6, "sampler": [6, 9, 16], "loader": 6, "shuffl": [6, 9], "num_work": [6, 9], "pin_memori": [6, 9], "wide": [7, 30], "adopt": [7, 29, 30, 33], "numpi": 7, "domain": [7, 15], "interoper": 7, "v0": [7, 33], "7": [7, 9, 22], "relat": [7, 9, 17, 18, 20], "extern": 7, "from_dlpack": 7, "t2": 7, "empti": [7, 18, 19, 22], "capsule2": 7, "to_dlpack": 7, "dlmanagedtensor": 7, "stride": [7, 11], "pars": [7, 9], "extract": 7, "data_ptr": 7, "respons": [7, 17, 22, 29], "atendlmtensor": 7, "ndim": 7, "dmlc": 7, "io": 7, "spec": 7, "dldevicetyp": 7, "kdloneapi": 7, "between": [7, 11, 22, 29, 30], "kdlsycl": 7, "reli": [7, 19], "filter": 7, "selector": 7, "actual": [7, 8, 19, 27, 33], "parent": 7, "get_devic": 7, "consum": [7, 20], "valid": [7, 9, 10, 14], "three": [7, 29], "host": [7, 20, 30, 32], "far": [7, 23], "recogn": 7, "situat": 7, "probabl": [7, 9, 27], "hard": [7, 19], "variou": [7, 20, 23, 29, 30, 33], "monitor": [7, 37], "flow": 7, "readi": 7, "highli": [8, 13, 17, 24, 29, 30, 33], "org": 8, "walk": 8, "come": 8, "flavor": 8, "aot": [8, 10], "cpp_extens": 8, "approach": [8, 27, 29], "latter": 8, "afterward": 8, "besid": [8, 20, 29, 30, 33], "long": [8, 19, 29], "term": [8, 28], "lltm": 8, "dpcppextens": 8, "dpcppbuildextens": 8, "ext_modul": 8, "lltm_xpu": 8, "lltm_xpu_kernel": 8, "cmdclass": 8, "build_ext": 8, "conveni": [8, 11], "correct": [8, 9, 19], "equival": [8, 30, 33, 38], "vanilla": 8, "include_dir": 8, "include_path": 8, "And": [8, 33], "goe": 8, "plug": 8, "previous": 8, "were": 8, "elabor": 8, "fly": 8, "background": 8, "temporari": 8, "tmp": [8, 20], "torch_extens": 8, "ver": 8, "_xpu": 8, "emit": 8, "ninja": 8, "fact": [8, 19], "home": [8, 16, 27], "user_nam": 8, "ones": [8, 17, 30], "complic": [8, 22], "power": [8, 15], "system": [8, 27, 33], "increment": 8, "reload": 8, "second": [8, 16, 18, 20, 27], "18": [8, 22, 33], "compon": [8, 10, 28, 29], "set_source_files_properti": 8, "compile_flag": 8, "add_librari": 8, 
"torch_librari": 8, "target_include_directori": 8, "public": [8, 33], "python_include_dir": 8, "torch_ipex_include_dir": 8, "prefix": 8, "cmake_prefix_path": 8, "dcmake_c_compil": 8, "dcmake_cxx_compil": 8, "aval": 8, "c10_stream": 8, "associ": [8, 34], "subsequ": [8, 19], "yourself": 8, "strategi": 8, "pybind11": 8, "ultim": 8, "care": [8, 22], "consid": 8, "cuda": [8, 9, 20, 33], "declar": 8, "lltm_xpu_forward": 8, "old_h": 8, "old_cel": 8, "lltm_xpu_backward": 8, "grad_h": 8, "grad_cel": 8, "new_cel": 8, "input_g": 8, "output_g": 8, "candidate_cel": 8, "gate_weight": 8, "check_xpu": 8, "is_xpu": 8, "check_contigu": 8, "is_contigu": [8, 19], "contigu": [8, 19, 29, 33], "check_input": 8, "lltm_forward": 8, "lltm_backward": 8, "pybind11_modul": 8, "torch_extension_nam": 8, "bridg": 8, "natur": [8, 19, 29, 30], "templat": [8, 13, 18, 29], "typenam": 8, "scalar_t": 8, "sigmoid": [8, 33], "0f": 8, "exp": [8, 33], "At": [8, 29], "header": 8, "essenti": 8, "helper": 8, "d_sigmoid": 8, "d_tanh": 8, "tanh": [8, 33], "elu": [8, 33], "alpha": [8, 38], "fmax": 8, "fmin": 8, "d_elu": 8, "d_relu": 8, "hand": 8, "cat": [8, 11, 13], "gate": 8, "addmm": [8, 11, 33], "transpos": [8, 33], "state_s": 8, "new_h": 8, "zeros_lik": 8, "at_dispatch_floating_typ": 8, "lltm_forward_xpu": 8, "lltm_xpu_forward_kernel": 8, "purpos": 8, "lambda": 8, "As": [8, 17, 27, 30, 38], "instanti": 8, "retriev": 8, "doubl": 8, "at_dispatch_all_typ": 8, "size_t": 8, "1024": [8, 20, 32], "work_group": 8, "cgf": 8, "handler": [8, 15, 20], "cgh": 8, "kfn": 8, "nd_item": 8, "item": [8, 9, 16], "get_group": 8, "get_group_rang": 8, "get_local_id": 8, "gates_row": 8, "parallel_for": 8, "nd_rang": 8, "entir": [8, 29], "grid": 8, "fill": 8, "matric": [8, 30], "2048": 8, "launch": [8, 10, 18, 29, 33], "8": [8, 15, 22], "introductori": 8, "underlai": 8, "right": [8, 24, 29], "inde": [8, 27], "high": [8, 29, 30, 33, 38], "agnost": 8, "ineffici": 8, "eas": [8, 19], "dimension": 8, "abstract": 8, "much": [8, 19, 38], 
"pattern": [8, 17, 19, 31, 33, 37], "packedtensoraccessor32": 8, "lltm_xpu_backward_kernel": 8, "d_old_cel": 8, "d_gate": 8, "d_gates_": 8, "d_old_cell_": 8, "d_output_g": 8, "d_tanh_new_cel": 8, "d_new_cel": 8, "d_candidate_cel": 8, "d_input_g": 8, "lltm_backward_xpu": 8, "packed_accessor32": 8, "d_gate_weight": 8, "reshap": 8, "d_weight": 8, "mm": [8, 11], "d_bia": 8, "sum": [8, 9, 19, 33], "keepdim": [8, 9], "d_x": 8, "d_old_h": 8, "d_input": 8, "similar": [9, 20, 27, 33], "reducescatt": [9, 33], "align": [9, 18, 20, 33], "convent": 9, "fullyshardeddataparallel": 9, "singl": [9, 16, 18, 29, 32, 38], "trigger": [9, 17, 27, 31, 33, 35], "throw": 9, "switch": [9, 20], "argpars": 9, "functool": 9, "lr_schedul": 9, "steplr": 9, "mp": 9, "distributeddataparallel": [9, 33], "distributedsampl": [9, 16], "fully_sharded_data_parallel": 9, "cpuoffload": 9, "backwardprefetch": 9, "size_based_auto_wrap_polici": 9, "enable_wrap": 9, "localhost": 9, "12355": 9, "cleanup": [9, 27], "destroy_process_group": [9, 27], "toi": 9, "handwritten": 9, "digit": 9, "classif": 9, "net": 9, "conv1": 9, "32": [9, 19], "conv2": 9, "64": [9, 11, 23, 30], "dropout1": 9, "25": [9, 32], "dropout2": 9, "fc1": 9, "9216": 9, "fc2": 9, "relu": [9, 19, 33], "max_pool2d": 9, "flatten": 9, "log_softmax": [9, 11], "logic": [9, 19, 22, 27], "ddp_loss": 9, "nll_loss": [9, 11, 16], "reduct": 9, "all_reduc": 9, "reduceop": 9, "tloss": [9, 16], "6f": 9, "test_load": 9, "pred": 9, "argmax": 9, "max": [9, 14, 23, 27, 30, 32, 33, 35], "eq": [9, 33], "view_a": 9, "test_loss": 9, "averag": [9, 16, 20], "4f": 9, "2f": 9, "100": [9, 16, 20, 22, 32, 33], "fsdp_main": 9, "1307": 9, "3081": 9, "dataset1": 9, "mnist": 9, "dataset2": 9, "sampler1": 9, "num_replica": [9, 16], "sampler2": 9, "train_kwarg": 9, "test_kwarg": 9, "test_batch_s": 9, "xpu_kwarg": 9, "my_auto_wrap_polici": 9, "partial": 9, "min_num_param": 9, "init_start_ev": 9, "event": 9, "enable_tim": 9, "init_end_ev": 9, "adadelta": 9, "step_siz": 9, "gamma": 
9, "1000": 9, "sec": 9, "save_model": 9, "barrier": 9, "mnist_cnn": 9, "final": [9, 34, 35], "parser": 9, "argumentpars": 9, "add_argu": 9, "metavar": 9, "14": [9, 22], "rate": [9, 16], "action": [9, 18], "store_tru": 9, "random": [9, 16, 27], "parse_arg": 9, "nproc": 9, "join": 9, "snippet": [9, 13, 20, 31], "fsdp_mnist_xpu": 9, "who": [10, 33, 35], "overrid": 10, "ON": [10, 20, 22, 32], "off": [10, 11, 20, 22, 27, 29, 30, 33], "defaultvalu": 10, "use_onemkl": [10, 27, 33], "onemkl": [10, 13, 18, 27, 33], "bla": 10, "use_channels_last_1d": 10, "1d": 10, "use_persist_stream": 10, "persist": 10, "use_scratchpad_mod": 10, "scratchpad": 10, "use_primitive_cach": 10, "use_queue_barri": 10, "submit_barri": 10, "dummi": 10, "use_multi_context": 10, "use_profil": 10, "legaci": 10, "profil": [10, 33], "use_kineto": [10, 20], "kineto": [10, 21, 33], "use_sycl_assert": 10, "assert": [10, 20], "use_itt_annot": 10, "itt": 10, "annot": 10, "use_split_fp64_loop": 10, "fp64": [10, 27, 33], "element": [10, 19, 38], "wise": [10, 30, 31, 33, 38], "use_xetla": 10, "xetla": [10, 13, 33], "build_by_per_kernel": 10, "per_kernel": 10, "use_aot_devlist": [10, 35], "build_internal_debug": 10, "debug": [10, 17, 18, 22, 31], "build_separate_op": 10, "build_simple_trac": 10, "use_onednn_dir": 10, "use_xetla_src": 10, "ipex_gpu_root_dir": 10, "dir": 10, "build_opt_level": 10, "add": [10, 11, 14, 16, 18, 19, 22, 27, 33, 38], "ox": 10, "accept": [10, 18], "while": [10, 11, 17, 19, 20, 29, 30, 33], "equal": [10, 27], "optioncpu": 10, "ipex_fp32_math_mod": 10, "optiongpu": 10, "ipex_verbos": 10, "ipex_xpu_sync_mod": 10, "enforc": 10, "ipex_tile_as_devic": 10, "partit": [10, 16], "map": [10, 19], "composit": 10, "ipex_log_level": 10, "ipex_log_compon": [10, 18], "pl": 10, "sepreat": 10, "sub_compon": 10, "ipex_log_rotate_s": [10, 18], "rotat": [10, 18], "ipex_log": 10, "ipex_log_split_s": [10, 18], "ipex_log_output": [10, 18], "null": [10, 18], "optionexperiment": 10, "ipex_simple_trac": [10, 22], 
"ipex_ze_trac": [10, 20], "export": [10, 18, 22, 27, 33], "resnet50": [10, 20], "lower": [11, 17, 29, 30, 33], "lighter": 11, "smaller": [11, 33], "sacrif": 11, "trade": [11, 29, 30, 33], "slower": 11, "accur": [11, 29, 30], "faster": 11, "autom": 11, "speedup": [11, 13, 29, 33], "simplenet": [11, 23], "pad": [11, 19, 33], "scope": 11, "chosen": [11, 13], "categori": [11, 14], "circumst": 11, "imag": [11, 19], "float64": 11, "variant": 11, "suppli": [11, 19], "region": 11, "addmm_": 11, "cannot": [11, 19, 20, 27, 33], "stabil": 11, "regardless": 11, "unlist": 11, "downstream": 11, "assum": [11, 24], "believ": [11, 19], "unstabl": 11, "conv1d": [11, 19], "conv3d": [11, 33], "_convolut": 11, "conv_tbc": 11, "conv_transpose1d": 11, "conv_transpose3d": 11, "prelu": 11, "addmv": 11, "addr": 11, "matmul": [11, 14, 30, 33], "mv": 11, "bmm": 11, "baddbmm": 11, "addbmm": 11, "chain_matmul": 11, "linalg_multi_dot": 11, "_thnn_fused_gru_cel": 11, "gru_cel": 11, "scaled_dot_product_attent": 11, "binary_cross_entropi": 11, "binary_cross_entropy_with_logit": 11, "nll_loss2d": 11, "nll_loss_nd": 11, "cross_entropy_loss": 11, "fft_fft": 11, "fft_ifft": 11, "fft_fft2": 11, "fft_ifft2": 11, "fft_fftn": 11, "fft_ifftn": 11, "fft_rfft": 11, "fft_irfft": 11, "fft_rfft2": 11, "fft_irfft2": 11, "fft_rfftn": 11, "fft_irfftn": 11, "fft_hfft": 11, "fft_ihfft": 11, "reciproc": 11, "pow": [11, 33], "frobenius_norm": 11, "nuclear_norm": 11, "cosine_similar": 11, "poisson_nll_loss": 11, "cosine_embedding_loss": 11, "hinge_embedding_loss": 11, "kl_div": 11, "l1_loss": 11, "smooth_l1_loss": 11, "huber_loss": 11, "mse_loss": 11, "margin_ranking_loss": 11, "multilabel_margin_loss": 11, "soft_margin_loss": 11, "triplet_margin_loss": 11, "multi_margin_loss": 11, "pdist": 11, "cdist": 11, "renorm": 11, "addcdiv": 11, "addcmul": 11, "atan2": 11, "bilinear": 11, "cross": [11, 33], "dot": [11, 19, 29], "grid_sampl": 11, "index_put": 11, "tensordot": 11, "scatter_add": 11, "g": [11, 17, 19, 27, 29, 33, 
34, 35], "intervent": 11, "mixtur": 11, "enable_auto_channels_last": [12, 36], "disable_auto_channels_last": [12, 36], "broad": 12, "bring": [12, 17, 29, 34], "concaten": [13, 29], "special": [13, 29], "basic": [13, 33], "empir": 13, "guarante": 13, "ideal": 13, "xe": [13, 29, 33], "algebra": [13, 29], "compute_eng": 13, "xpucomputeeng": 13, "x1": 13, "20": [13, 19, 22, 27], "x2": 13, "onednn_layout": 13, "highest": 13, "upsampl": [13, 19], "align_corn": 13, "step2": 13, "step3": 13, "step4": 13, "fall": 13, "back": [13, 19], "averagepool2d": 13, "concat": [13, 19, 29], "maxpool2d": 13, "maxpool3d": 13, "layernorm": [13, 14], "permutecontigu": 13, "softmax": [13, 33], "greater": [13, 27], "fp16": [13, 14, 24, 29, 30, 32, 33], "upsampleblinear2d": 13, "upsamplenearest": 13, "divis": 13, "integr": [14, 33, 35], "ecolog": 14, "worth": 14, "therefor": [14, 17], "NOT": [14, 19], "necessarili": 14, "common": 14, "being": [14, 22], "dequant": [14, 33], "geglu": 14, "residu": 14, "pre": [14, 29, 30, 35], "norm": [14, 33], "mlp": [14, 30], "moe": 14, "retak": 14, "bit": 15, "dnn": 15, "e4m3": 15, "sign": [15, 20, 30], "expon": 15, "mantissa": 15, "e5m2": 15, "FOR": 15, "onlin": 15, "decompress": 15, "delai": 15, "algorithm": [15, 19, 30, 33], "quantizaiton": 15, "showcas": [15, 30], "_fp8_convert": 15, "convert_fp8_model": 15, "fp8_autocas": 15, "input_id": [15, 30], "token_type_id": 15, "segment_id": 15, "attention_mask": 15, "input_mask": 15, "masked_lm_label": 15, "next_sentence_label": 15, "tensorflow": [16, 19], "kera": 16, "apach": [16, 28], "mxnet": 16, "goal": 16, "mpi": [16, 27], "concept": [16, 19], "broadcast": 16, "hvd": [16, 27], "pin": [16, 23], "server": [16, 27], "forth": 16, "devid": 16, "effect": [16, 17, 30], "compens": 16, "distributedoptim": 16, "deleg": [16, 34], "broadcast_paramet": 16, "root_rank": 16, "broadcast_optimizer_st": 16, "consist": [16, 29], "restor": 16, "corrupt": 16, "accomplish": 16, "guard": 16, "named_paramet": 16, "log_interv": 16, 
"overal": 17, "view": [17, 19, 20, 33], "conv_relu": 17, "deliv": 17, "modelimp": [17, 31], "quantwrapp": [17, 31], "perchannel": [17, 31], "prepar": [17, 31], "obtain": [17, 29, 30, 31], "calib_dataset": [17, 31], "inference_data": [17, 31], "stage": [17, 27, 38], "symmetr": 17, "asymmetr": [17, 33], "uint8": 17, "zero_point": 17, "swap": [17, 27], "Be": 17, "free": [17, 18], "scriptmodul": [17, 31], "example_input": [17, 31], "warmup": [17, 20, 31], "warmup_data": [17, 31], "graph_for": [17, 31], "inference_dta": [17, 31], "whole": [17, 18, 33], "conv_unari": 17, "conv_binari": 17, "linear_unari": 17, "conv_sum_relu": 17, "henc": [17, 33], "consider": 17, "dump": [17, 33], "analysi": 17, "attempt": [18, 34], "realloc": 18, "err": 18, "critic": [18, 33], "belong": 18, "syngraph": 18, "logutil": 18, "ipex_xxx_log": 18, "xxx": [18, 33], "There": [18, 20, 24, 29], "four": 18, "sub": [18, 33], "fmt": 18, "ab": [18, 33], "ipex_info_log": 18, "identifi": 18, "uniqu": [18, 20, 22], "event_id": 18, "step_id": 18, "ipex_xxx_event_end": 18, "ipex_event_log": 18, "record_avg_pool": 18, "prepare_data": 18, "data_prepare_finish": 18, "avg_pool": 18, "ipex_info_event_end": 18, "five": 18, "ipex_logging_level": 18, "integar": 18, "consol": [18, 20, 22], "mb": 18, "set_log_level": 18, "log_level": 18, "get_log_level": 18, "set_log_output_file_path": 18, "log_path": 18, "get_log_output_file_path": 18, "set_log_rotate_file_s": 18, "get_log_rotate_file_s": 18, "set_log_split_file_s": 18, "get_log_split_file_s": 18, "set_log_compon": 18, "log_compon": 18, "get_log_compon": 18, "previou": [18, 19], "represent": 19, "multidimension": 19, "arrai": [19, 36], "nd": 19, "space": [19, 30], "semant": 19, "dens": 19, "spars": 19, "coo": 19, "cnn": 19, "canon": 19, "assign": [19, 20], "2d": [19, 36], "height": 19, "width": [19, 29], "bmp": 19, "contiguous_format": 19, "reason": 19, "close": 19, "higher": [19, 29], "difficult": 19, "manipul": 19, "to_dens": 19, "upstream": 19, "Will": 19, 
"easier": [19, 23], "secret": 19, "ingredi": 19, "cover": [19, 29], "almost": 19, "foundat": 19, "upper": 19, "expens": 19, "sequenc": [19, 20, 29], "benefici": 19, "nb": 19, "aka": 19, "me": 19, "roughli": 19, "50": 19, "perf": 19, "mkldnn": 19, "mkldnn_util": 19, "to_mkldnn": 19, "explain": [19, 30], "diagram": 19, "conclus": 19, "minimum": 19, "But": 19, "usual": [19, 29, 30], "neglig": 19, "organ": 19, "question": 19, "reinterpret": 19, "w": [19, 30], "answer": 19, "chw": 19, "hw": [19, 35], "offset": [19, 29], "stride_n": 19, "stride_c": 19, "stride_h": 19, "stride_w": 19, "merit": 19, "express": 19, "noncontigu": 19, "big": 19, "n1": 19, "n2": 19, "mind": 19, "someth": 19, "rfc": 19, "hwc": 19, "wc": 19, "chwn": 19, "hwn": 19, "wn": 19, "outplac": 19, "_appli": 19, "spontan": 19, "tell": 19, "compris": 19, "guidelin": 19, "awar": [19, 30], "repeat": [19, 20], "my": 19, "recent": [19, 30], "pr": 19, "cudnn": 19, "accommod": 19, "hidden": [19, 29], "ideep": 19, "format_tag": 19, "src_md": 19, "desc": 19, "data_typ": 19, "f32": 19, "src_mem": 19, "src_data_ptr": 19, "hwio": 19, "gemm": [19, 29, 33], "avx512": [19, 33], "3d": 19, "batchnorm1d": 19, "maxpool1d": 19, "div": [19, 33], "nearest": [19, 30], "sycl_devic": 19, "sequenti": 19, "kernel_s": 19, "test_input": 19, "test_input_xpu": 19, "to_channels_last_1d": 19, "tenor": 19, "xpu_r": 19, "is_contiguous_channels_last_1d": 19, "input_xpu": 19, "meta": [19, 29], "expect": [19, 27], "invalid": [19, 27], "corrspond": 19, "prebuilt": [20, 23, 27, 33, 35], "wheel": [20, 23, 27, 33, 35], "affili": 20, "use_onetrac": 20, "onetrac": 20, "layer": [20, 29, 33, 34, 36], "profileract": 20, "input_tensor": 20, "prof": 20, "proper": 20, "output_tensor_1": 20, "nonzero": 20, "output_tensor_2": 20, "tabl": [20, 35], "key_averag": 20, "my_schedul": 20, "skip_first": 20, "trace_handl": 20, "p": [20, 32], "sort_bi": 20, "self_xpu_time_tot": 20, "row_limit": 20, "trace_": 20, "step_num": 20, "outsid": 20, "on_trace_readi": 20, 
"forget": 20, "record_shap": 20, "rememb": 20, "effort": [20, 30], "contextlib": 20, "profiler_setup": 20, "nullcontext": 20, "should_profil": 20, "profileact": 20, "unset": 20, "stop": [20, 27], "involv": 20, "Such": [20, 33], "a_0": 20, "a_1": 20, "b_0": 20, "b_1": 20, "export_chrome_trac": 20, "trace_example_on_multi_devic": 20, "exclud": 20, "children": 20, "percentag": 20, "propot": 20, "percentasg": 20, "avg": 20, "consumpt": 20, "sonsumpt": 20, "viewer": 20, "perfetto": 20, "ui": 20, "dev": 20, "trace_fil": 20, "examin": [20, 38], "failur": [20, 27, 33], "tracer": 20, "collector": 20, "workaround": [20, 27], "ze_enable_tracing_lay": 20, "soon": 21, "instead": [21, 31, 33, 38], "screen": 22, "turn": 22, "bracket": 22, "enable_simple_trac": 22, "disable_simple_trac": 22, "using_simple_trac": 22, "unintention": 22, "exmapl": 22, "262618": 22, "wrapper__empty_strid": 22, "atenipextypexpu": [22, 30], "empty_strid": 22, "wrapper__copy_": 22, "copy_": 22, "wrapper___unique2": 22, "_unique2": 22, "wrapper__clon": 22, "wrapper___reshape_alia": 22, "_reshape_alia": 22, "wrapper_memory_format_empti": 22, "11": 22, "wrapper__as_strid": 22, "as_strid": 22, "15": 22, "wrapper___local_scalar_dens": 22, "_local_scalar_dens": 22, "16": [22, 32], "wrapper__resize_": 22, "resize_": 22, "19": 22, "pid": 22, "tid": 22, "name1": 22, "name2": 22, "arrow": 22, "relationship": 22, "child": 22, "gdb": 22, "triton": [23, 33], "codegen": 23, "addition": [23, 30], "facilit": 23, "contribut": 23, "intens": 23, "ever": 23, "unlock": 23, "v2": [23, 33], "firstli": [23, 29, 30], "llvm": 23, "forc": 23, "cp310": 23, "manylinux_2_17_x86_64": 23, "manylinux2014_x86_64": 23, "triton_codegen_intel_xpu_backend": 23, "compiled_model": 23, "weight_decai": [23, 38], "loss_funct": 23, "demostr": 24, "cache_en": 24, "suppos": 24, "bash": [24, 27, 30], "problem": [27, 38], "unsupport": [27, 33], "graphic": [27, 30, 33, 35], "improp": 27, "unload": 27, "conda": [27, 33], "encount": [27, 34, 35], "ship": 
27, "libstdc": 27, "conflict": 27, "ld_preload": [27, 33], "symbol": [27, 33], "undefin": [27, 33], "_glibcxx_use_cxx11_abi": 27, "_znk5torch8autograd4node4nameb5cxx11ev": [27, 33], "appear": [27, 33], "glibcxx_use_cxx11_abi": 27, "bad": 27, "termin": 27, "rn50": 27, "friendli": 27, "ungracefulli": 27, "116312": 27, "997": 27, "170": [27, 33, 35], "progress": [27, 29], "wsl2": [27, 33], "ram": 27, "killer": 27, "dmesg": 27, "oom": 27, "had": 27, "kill": 27, "max_job": 27, "conserv": 27, "slow": 27, "thing": 27, "lot": [27, 29, 33, 34, 35], "cl_device_not_found": 27, "tdr": 27, "window": [27, 33], "tdrdelai": 27, "registri": 27, "reboot": 27, "converg": 27, "24": 27, "hour": 27, "divid": [27, 36], "phase": [27, 29], "instabl": 27, "fault": 27, "atom": 27, "violat": 27, "lt": [27, 32, 33], "803": 27, "29": 27, "investig": 27, "roll": 27, "775": 27, "usr": [27, 33], "ld": [27, 33], "lmkl_sycl": [27, 33], "lmkl_intel_ilp64": [27, 33], "lmkl_core": [27, 33], "lmkl_tbb_thread": [27, 33], "linker": [27, 33], "exit": [27, 33], "v": [27, 33], "occur": [27, 30, 33], "resolv": [27, 33], "mkl_dpcpp_root": [27, 33], "mkl_lapack_dspevd": 27, "fatal": [27, 33], "libmkl_vml_avx512": 27, "libmkl": [27, 33], "vml": [27, 33], "incorrectli": [27, 33], "oserror": [27, 33], "wrong": [27, 33], "preload": [27, 33], "libmkl_intel_ilp64": [27, 33], "suffix": [27, 33], "test_weight_norm": 27, "testnnmethod": 27, "test_weight_norm_differnt_typ": 27, "copyright": 28, "notic": 28, "subject": 28, "condit": [28, 36], "architectur": [29, 30], "decod": 29, "multiheadattent": 29, "feedforward": 29, "bound": [29, 38], "kv_cach": 29, "smoothquant": 29, "llama2": [29, 32], "qwen": [29, 30, 33], "bloom": [29, 32], "huggingfac": [29, 30], "hub": [29, 30], "7b": [29, 30, 32, 33], "hf": 29, "13b": [29, 32, 33], "70b": 29, "eleutherai": 29, "6b": [29, 30, 32, 33], "30b": 29, "3b": 29, "bigscienc": 29, "7b1": 29, "woq": 29, "codellama": 29, "indirect": 29, "rope": 29, "tpp": 29, "expand": 29, "brief": 29, 
"introduct": 29, "xelta": 29, "rotari": 29, "posit": [29, 30], "squar": [29, 33], "rmsnorm": 29, "beam": [29, 33], "idx": 29, "reorder_cach": 29, "bottleneck": [29, 30], "prompt": [29, 30], "kept": 29, "buffer": 29, "wast": 29, "prefil": 29, "influenc": 29, "histori": 29, "left": 29, "decid": 29, "timestamp": 29, "elimin": 29, "sdpa": 29, "shard": [29, 33], "lead": 29, "significantli": [29, 30], "heavier": 29, "becom": 29, "remark": [29, 30, 33], "deploi": [29, 30, 33], "resourc": [29, 30, 33], "challeng": [29, 30, 33], "overcom": [29, 30], "complex": [29, 30], "w8a8": [29, 30], "bandwidth": [29, 30], "preserv": [29, 30], "minim": [29, 30], "qualiti": [29, 30, 34, 35], "rtn": 30, "awq": 30, "teq": 30, "autoround": 30, "10004": 30, "stai": 30, "int4_fullrang": 30, "datatyp": [30, 32, 33], "procedur": 30, "constrain": 30, "round": [30, 33], "intuit": 30, "boast": 30, "simplic": 30, "easili": 30, "nf4": 30, "uniform": 30, "w4g32": 30, "w8": 30, "wors": 30, "explor": 30, "min": [30, 33], "handcraft": 30, "impos": 30, "broader": [30, 33], "knowledg": 30, "trainabl": 30, "ransform": 30, "summit": 30, "peer": 30, "review": 30, "202306": 30, "brain": 30, "surgeon": 30, "remain": [30, 38], "unquant": 30, "mitig": [30, 32], "occasion": 30, "semidefinit": 30, "necessit": 30, "hyperparamet": 30, "descent": 30, "minmax": 30, "200": 30, "impress": 30, "hypeparamet": 30, "compat": [30, 33], "relianc": 30, "backpropag": 30, "quit": [30, 33], "onnx": 30, "gunho": 30, "park": 30, "baeseong": 30, "se": 30, "jung": 30, "kwon": 30, "byeongwook": 30, "kim": 30, "youngjoo": 30, "lee": 30, "dongsoo": 30, "nuqmm": 30, "arxiv": 30, "preprint": 30, "2206": 30, "09557": 30, "lin": 30, "ji": 30, "et": 30, "al": 30, "2306": 30, "00978": 30, "cheng": 30, "cai": 30, "lv": 30, "shen": 30, "2310": 30, "10944": 30, "frantar": 30, "elia": 30, "2210": 30, "17323": 30, "zhang": 30, "he": 30, "2309": 30, "05516": 30, "easiest": 30, "load_in_4bit": 30, "hook": 30, "automodelforcausallm": [30, 31], 
"4bit": 30, "qmodel": 30, "model_nam": 30, "device_map": 30, "trust_remote_cod": 30, "use_llm_runtim": 30, "weightonlyquantconfig": 30, "woq_quantization_config": 30, "compute_dtyp": 30, "weight_dtyp": 30, "scale_dtyp": 30, "group_siz": 30, "inc_model": 30, "conf": 30, "calib_func": 30, "calib_dataload": 30, "export_compressed_model": 30, "compression_dtyp": 30, "compression_dim": 30, "use_optimum_format": 30, "convert_dtype_str2torch": 30, "weightonlyquantizedlinear": 30, "blocksiz": 30, "front": 30, "present": 30, "ipextransformerlinear": 30, "ipextransformerattnoptimizedint4": 30, "ipextransformermlpoptimizedint4": 30, "major": 30, "qkv": 30, "torch_ipex": 30, "mm_qkv_out_int4": 30, "mm_bias_int4": 30, "correspondingli": 30, "mm_silu_mul_int4": 30, "substitut": 30, "ipex_op_regist": 30, "hgemmxetla_int4": 30, "polici": [30, 33], "beforehand": [30, 34, 35], "later": 30, "has_2d_block_arrai": 30, "curdevid": 30, "suitabl": 30, "ordered_gemm_wint4_config_set_pvc": 30, "ordered_gemm_wint4_config_set_arc": 30, "hgemm_int4_common_dispatch": 30, "hgemm_bias_wint4_arc": 30, "intel_extension_for_transform": 30, "autotoken": 30, "token": [30, 32], "upon": 30, "littl": 30, "girl": 30, "return_tensor": 30, "save_pretrain": 30, "saved_dir": 30, "loaded_model": 30, "run_benchmark_woq": 30, "content": [30, 31, 33], "transpar": [31, 33], "undergo": 31, "overview": [31, 33], "model_name_or_path": 31, "amp_dtyp": 31, "topologi": [32, 38], "1550": 32, "2024": [32, 33], "736": 32, "v4": 32, "31": 32, "4fc181b0": 32, "ec33277": 32, "platinum": 32, "8480": 32, "node": 32, "socket": 32, "56": 32, "ucod": 32, "0x2b0004b1": 32, "hyper": 32, "turboboost": 32, "bio": 32, "se5c7411": 32, "86b": 32, "9525": 32, "d25": 32, "2304190630": 32, "ddr": 32, "slot": 32, "64gb": 32, "frequenc": 32, "4800": 32, "dcpmm": 32, "1024gb": 32, "ubuntu": [32, 33], "22": 32, "04": [32, 33], "1020": 32, "oem": 32, "spectr": 32, "meltdown": 32, "pvc": [32, 35], "oam": 32, "ifwi": 32, "b4": 32, "si": 32, 
"ww42": 32, "3_25mhzi_quad_dameni_oam600w_ifrv2332i_pscnull_ifwi": 32, "ecc": 32, "amc": 32, "sw": 32, "fw": 32, "v3": 33, "beta": 33, "hbm": 33, "kv": 33, "chines": 33, "baichuan2": 33, "chatglm3": 33, "5x": 33, "torch_llm_allreduc": 33, "xelink": 33, "webpag": 33, "uplift": 33, "3696": 33, "sdp": 33, "fallback": 33, "3706": 33, "3788": 33, "3841": 33, "workgroup": 33, "3796": 33, "3808": 33, "dockerfil": 33, "3829": 33, "3882": 33, "3887": 33, "3970": 33, "patch": 33, "fft": 33, "21": 33, "date": 33, "top": [33, 36], "reach": 33, "competit": 33, "a770": 33, "primari": 33, "verif": 33, "vehicl": 33, "emul": 33, "fsdp": 33, "merg": 33, "publicli": 33, "oct": 33, "focus": 33, "oob": 33, "v1": 33, "unaryop": 33, "sqrt": 33, "log_sigmoid": 33, "hardswish": 33, "hardsigmoid": 33, "silu": 33, "hardtanh": 33, "leaky_relu": 33, "binaryop": 33, "mul": 33, "ne": 33, "ge": 33, "gt": 33, "le": 33, "gelu": 33, "mish": 33, "concret": 33, "adamw": [33, 38], "permut": 33, "scalar": 33, "pixelshuffl": 33, "leaki": 33, "softplu": 33, "glibcxx": 33, "cxx11": 33, "gcc": 33, "path_to_your_onemkl": 33, "__release_lnx": 33, "lapack": 33, "dspevd": 33, "lp64": 33, "libmkl_sequenti": 33, "lifecycl": [34, 35], "benifit": [34, 35], "deliveri": [34, 35], "disadvantag": [34, 35], "500mb": [34, 35], "5gb": [34, 35], "dealloc": 34, "smallest": 34, "unabl": 34, "appropri": 34, "comma": 35, "delimit": 35, "ats": 35, "m150": 35, "togeth": 35, "seper": 35, "opencl": 35, "spir64_gen": 35, "flag": 36, "convtranspos": 36, "tri": 36, "connect": 36, "batchnorm": 36, "instanc": 36, "opportun": 36, "met": 36, "parameterwrapp": 36, "_parameter_wrapp": 36, "can_cast_train": 36, "ipex_weight_convert_module_xpu": 36, "bottom": 36, "simultan": 36, "referenc": 36, "ipex_fused_optimizer_list_xpu": 36, "_optimizer_util": 36, "_original_step": 36, "understand": 37, "adam": 38, "lamb": 38, "lar": 38, "grad": 38, "buf": 38, "momentum_buffer_list": 38, "detach": 38, "mul_": 38, "add_": 38, "dampen": 38, "nesterov": 
38, "claus": 38, "bottl": 38, "neck": 38, "solv": 38, "pseudo": 38, "sgd_fused_step": 38}, "objects": {"": [[1, 0, 1, "_CPPv4N3xpu14FP32_MATH_MODE4BF32E", "xpu::BF32"], [1, 0, 1, "_CPPv4N3xpu14FP32_MATH_MODE4FP32E", "xpu::FP32"], [1, 1, 1, "_CPPv4N3xpu14FP32_MATH_MODEE", "xpu::FP32_MATH_MODE"], [1, 0, 1, "_CPPv4N3xpu14FP32_MATH_MODE4BF32E", "xpu::FP32_MATH_MODE::BF32"], [1, 0, 1, "_CPPv4N3xpu14FP32_MATH_MODE4FP32E", "xpu::FP32_MATH_MODE::FP32"], [1, 0, 1, "_CPPv4N3xpu14FP32_MATH_MODE18FP32_MATH_MODE_MAXE", "xpu::FP32_MATH_MODE::FP32_MATH_MODE_MAX"], [1, 0, 1, "_CPPv4N3xpu14FP32_MATH_MODE18FP32_MATH_MODE_MINE", "xpu::FP32_MATH_MODE::FP32_MATH_MODE_MIN"], [1, 0, 1, "_CPPv4N3xpu14FP32_MATH_MODE4TF32E", "xpu::FP32_MATH_MODE::TF32"], [1, 0, 1, "_CPPv4N3xpu14FP32_MATH_MODE18FP32_MATH_MODE_MAXE", "xpu::FP32_MATH_MODE_MAX"], [1, 0, 1, "_CPPv4N3xpu14FP32_MATH_MODE18FP32_MATH_MODE_MINE", "xpu::FP32_MATH_MODE_MIN"], [1, 0, 1, "_CPPv4N3xpu14FP32_MATH_MODE4TF32E", "xpu::TF32"], [1, 2, 1, "_CPPv4N3xpu21get_queue_from_streamEN3c106StreamE", "xpu::get_queue_from_stream"], [1, 3, 1, "_CPPv4N3xpu21get_queue_from_streamEN3c106StreamE", "xpu::get_queue_from_stream::stream"], [1, 2, 1, "_CPPv4N3xpu18set_fp32_math_modeE14FP32_MATH_MODE", "xpu::set_fp32_math_mode"], [1, 3, 1, "_CPPv4N3xpu18set_fp32_math_modeE14FP32_MATH_MODE", "xpu::set_fp32_math_mode::mode"]], "intel_extension_for_pytorch": [[1, 4, 1, "", "get_fp32_math_mode"], [1, 4, 1, "", "optimize"], [1, 4, 1, "", "optimize_transformers"], [1, 4, 1, "", "set_fp32_math_mode"]], "intel_extension_for_pytorch.xpu": [[1, 5, 1, "", "Event"], [1, 5, 1, "", "Stream"], [1, 4, 1, "", "current_device"], [1, 4, 1, "", "current_stream"], [1, 5, 1, "", "device"], [1, 4, 1, "", "device_count"], [1, 5, 1, "", "device_of"], [1, 4, 1, "", "empty_cache"], [1, 4, 1, "", "get_device_name"], [1, 4, 1, "", "get_device_properties"], [1, 4, 1, "", "get_rng_state"], [1, 4, 1, "", "get_rng_state_all"], [1, 4, 1, "", "init"], [1, 4, 1, "", "initial_seed"], [1, 
4, 1, "", "is_available"], [1, 4, 1, "", "is_initialized"], [1, 4, 1, "", "manual_seed"], [1, 4, 1, "", "manual_seed_all"], [1, 4, 1, "", "max_memory_allocated"], [1, 4, 1, "", "max_memory_reserved"], [1, 4, 1, "", "memory_allocated"], [1, 4, 1, "", "memory_reserved"], [1, 4, 1, "", "memory_snapshot"], [1, 4, 1, "", "memory_stats"], [1, 4, 1, "", "memory_stats_as_nested_dict"], [1, 4, 1, "", "memory_summary"], [1, 4, 1, "", "reset_accumulated_memory_stats"], [1, 4, 1, "", "reset_peak_memory_stats"], [1, 4, 1, "", "seed"], [1, 4, 1, "", "seed_all"], [1, 4, 1, "", "set_device"], [1, 4, 1, "", "set_rng_state"], [1, 4, 1, "", "set_rng_state_all"], [1, 4, 1, "", "stream"], [1, 4, 1, "", "synchronize"]], "intel_extension_for_pytorch.xpu.Event": [[1, 6, 1, "", "elapsed_time"], [1, 6, 1, "", "query"], [1, 6, 1, "", "record"], [1, 6, 1, "", "synchronize"], [1, 6, 1, "", "wait"]], "intel_extension_for_pytorch.xpu.Stream": [[1, 6, 1, "", "record_event"], [1, 7, 1, "", "sycl_queue"], [1, 6, 1, "", "synchronize"], [1, 6, 1, "", "wait_event"], [1, 6, 1, "", "wait_stream"]], "intel_extension_for_pytorch.xpu.fp8.fp8": [[1, 4, 1, "", "fp8_autocast"]]}, "objtypes": {"0": "cpp:enumerator", "1": "cpp:enum", "2": "cpp:function", "3": "cpp:functionParam", "4": "py:function", "5": "py:class", "6": "py:method", "7": "py:property"}, "objnames": {"0": ["cpp", "enumerator", "C++ enumerator"], "1": ["cpp", "enum", "C++ enum"], "2": ["cpp", "function", "C++ function"], "3": ["cpp", "functionParam", "C++ function parameter"], "4": ["py", "function", "Python function"], "5": ["py", "class", "Python class"], "6": ["py", "method", "Python method"], "7": ["py", "property", "Python property"]}, "titleterms": {"intel": [0, 3, 4, 6, 14, 17, 30, 32], "extens": [0, 3, 5, 6, 8, 14, 17, 30], "pytorch": [0, 3, 6, 14, 16, 17, 19, 30], "architectur": 0, "support": [0, 5, 11, 14, 15, 19, 20, 30], "api": [1, 5, 6, 12, 19, 26, 31, 36], "document": [1, 3, 26], "gener": [1, 27], "miscellan": 1, "random": 1, 
"number": 1, "stream": [1, 8], "event": [1, 18], "memori": [1, 19, 34, 37], "manag": [1, 34, 37], "c": [1, 4, 18, 19], "blog": 2, "public": 2, "contribut": 3, "develop": 3, "xpu": [3, 4, 19, 20, 33], "tip": 3, "debug": [3, 5, 13], "unit": [3, 27], "test": [3, 27], "better": 3, "local": 3, "pytest": 3, "write": [3, 8, 19], "build": [3, 8, 10, 20], "exampl": [4, 6, 7, 8, 9, 15], "python": [4, 5, 18], "train": [4, 5, 11, 23], "singl": [4, 6], "instanc": 4, "float32": [4, 11], "bfloat16": [4, 11], "infer": [4, 11, 29, 30, 31], "imper": [4, 11, 17, 31], "mode": [4, 15, 17, 31], "resnet50": 4, "bert": 4, "torchscript": [4, 11, 17, 31], "float16": [4, 11], "int8": 4, "torch": [4, 5, 23], "optim": [4, 17, 29, 31, 34, 36, 38], "basic": 4, "usag": [4, 6, 9, 15, 16, 18, 27, 30, 31], "us": [4, 5, 7, 8, 11, 12, 13, 20, 22, 35], "sycl": [4, 8], "code": 4, "custom": 4, "dpc": [4, 5, 8], "kernel": [4, 14, 19], "ai": [4, 32], "refer": [4, 11, 30], "model": [4, 19, 20, 22, 29, 30], "featur": [5, 13, 30], "easi": 5, "channel": [5, 12, 19, 36], "last": [5, 12, 19, 36], "auto": [5, 11, 12], "mix": [5, 11], "precis": [5, 11, 29], "amp": [5, 11], "quantiz": [5, 15, 17, 29, 30], "distribut": [5, 29, 31], "dlpack": [5, 7], "solut": [5, 7], "advanc": [5, 10], "configur": [5, 10, 32], "fulli": [5, 9], "shard": [5, 9], "data": [5, 7, 9, 15, 29, 32], "parallel": [5, 9], "fsdp": [5, 9], "compil": [5, 8, 23, 34, 35], "gpu": [5, 6, 9, 11, 17, 23, 30, 33, 34, 38], "beta": [5, 23], "simpl": [5, 18, 22], "trace": [5, 20, 22], "tool": [5, 20, 21, 22], "prototyp": [5, 15, 16, 18, 20, 22, 30], "kineto": [5, 20], "profil": [5, 20, 21], "comput": [5, 13], "engin": [5, 13], "ipex_log": [5, 18], "distributeddataparallel": 6, "ddp": 6, "introduct": [6, 7, 8, 9, 11, 13, 14, 18, 20, 21, 22, 23, 26, 30, 35, 38], "instal": [6, 16, 25, 30], "oneccl": 6, "bind": 6, "recommend": 6, "from": 6, "prebuilt": 6, "wheel": 6, "sourc": 6, "runtim": [6, 10, 30], "dynam": 6, "link": 6, "mpi": 6, "launch": 6, "node": 6, 
"scale": 6, "onli": [6, 9, 29, 30], "case": [7, 11, 13, 20, 22, 35], "design": 7, "import": 7, "capsul": 7, "export": [7, 20], "dldevic": 7, "pointer": 7, "asynchron": 7, "program": 7, "motiv": 8, "setuptool": 8, "jit": 8, "cmake": 8, "request": 8, "current": 8, "c10": 8, "fetch": 8, "correspond": 8, "queue": 8, "op": [8, 11], "accessor": 8, "time": [10, 34, 35], "default": [11, 12, 19], "path": 11, "autocast": 11, "elig": 11, "specif": 11, "behavior": 11, "can": 11, "promot": 11, "widest": 11, "input": 11, "type": [11, 15, 29], "eas": 12, "enabl": [12, 22], "disabl": [12, 20, 22], "known": [12, 20, 33], "issu": [12, 20, 33], "experiment": 13, "select": 13, "polici": [13, 29], "multipl": 13, "implement": 13, "oper": [13, 15, 19, 29, 38], "deepspe": [14, 31], "platform": 14, "float8": 15, "fp8": 15, "run": [15, 30], "horovod": 16, "definit": 18, "log": 18, "level": 18, "compon": 18, "enviorn": 18, "set": [18, 20], "replac": 18, "ipex_simple_trac": 18, "ipex_verbos": 18, "what": 19, "i": 19, "format": 19, "all": 19, "That": 19, "matter": 19, "nchw": 19, "b": 19, "nhwc": 19, "block": 19, "nchw16c": 19, "cpu": 19, "stride": 19, "layout": 19, "tensor": 19, "creation": 19, "convers": 19, "d": 19, "coverag": 19, "regist": 19, "aten": 19, "nativ": 19, "manner": 19, "onednn": 19, "creat": 19, "convolut": 19, "primit": 19, "1d": 19, "determin": 19, "environ": [20, 30], "variabl": 20, "add": 20, "Into": 20, "script": [20, 30], "partli": 20, "backend": 20, "multi": 20, "devic": 20, "applic": 20, "result": [20, 22], "chrome": 20, "legaci": 21, "deprec": 21, "requir": [23, 35], "depend": [23, 27], "inferenec": 23, "quick": 24, "start": [24, 26], "execut": [24, 30], "get": 26, "troubleshoot": 27, "librari": 27, "licens": 28, "larg": 29, "languag": 29, "llm": [29, 30, 32], "overview": [29, 32], "methodologi": 29, "linear": [29, 30], "deep": 29, "fusion": [29, 38], "segment": 29, "kv": 29, "cach": 29, "low": 29, "weight": [29, 30], "int4": 29, "framework": 30, "matrix": 30, 
"initi": 30, "dispatch": 30, "For": 30, "setup": 30, "transform": [30, 31], "neural": 30, "compressor": 30, "save": 30, "load": 30, "option": 30, "woq": 30, "benchmark": 30, "frontend": [31, 36], "pseudocod": 31, "common": 31, "scenario": 31, "fp16": 31, "smoothquant": 31, "perform": 32, "center": 32, "product": 32, "v2": 32, "1": [32, 33], "10": [32, 33], "softwar": 32, "version": 32, "hardwar": 32, "releas": 33, "2": 33, "30": 33, "highlight": 33, "20": 33, "0": 33, "110": 33, "13": 33, "120": 33, "200": 33, "technic": 34, "detail": 34, "ahead": [34, 35], "aot": [34, 35], "ipex": [34, 36], "automat": 36, "conv_bn_fold": 36, "linear_bn_fold": 36, "replace_dropout_with_ident": 36, "split_master_weight_for_bf16": 36, "fuse_update_step": 36}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx": 58}, "alltitles": {"Intel\u00ae Extension for PyTorch*": [[0, "intel-extension-for-pytorch"]], "Architecture": [[0, "architecture"]], "Support": [[0, "support"]], "API Documentation": [[1, "api-documentation"], [26, "api-documentation"]], "General": [[1, "general"]], "Miscellaneous": [[1, "miscellaneous"]], "Random Number Generator": [[1, "random-number-generator"]], "Streams and events": [[1, "streams-and-events"]], "Memory management": [[1, "memory-management"]], "C++ API": [[1, "c-api"]], "Blogs & Publications": [[2, "blogs-publications"]], "Contribution": [[3, "contribution"]], "Contributing to Intel\u00ae Extension for PyTorch*": [[3, "contributing-to-intel-extension-for-pytorch"]], "Developing Intel\u00ae Extension for PyTorch* on XPU": [[3, "developing-intel-extension-for-pytorch-on-xpu"]], "Tips and Debugging": [[3, "tips-and-debugging"]], "Unit testing": [[3, "unit-testing"]], "Better local unit tests with pytest": [[3, 
"better-local-unit-tests-with-pytest"]], "Writing documentation": [[3, "writing-documentation"]], "Building documentation": [[3, "building-documentation"]], "Tips": [[3, "tips"]], "Examples": [[4, "examples"]], "Python": [[4, "python"]], "Training": [[4, "training"]], "Single-Instance Training": [[4, "single-instance-training"]], "Float32": [[4, "float32"], [4, "id1"]], "BFloat16": [[4, "bfloat16"], [4, "id4"]], "Inference": [[4, "inference"]], "Imperative Mode": [[4, "imperative-mode"], [4, "id5"], [4, "id11"], [17, "imperative-mode"]], "Resnet50": [[4, "resnet50"], [4, "id2"], [4, "id6"], [4, "id9"], [4, "id12"], [4, "id15"]], "BERT": [[4, "bert"], [4, "id3"], [4, "id7"], [4, "id10"], [4, "id13"], [4, "id16"]], "TorchScript Mode": [[4, "torchscript-mode"], [4, "id8"], [4, "id14"], [17, "torchscript-mode"], [31, "torchscript-mode"]], "Float16": [[4, "float16"]], "INT8": [[4, "int8"]], "torch.xpu.optimize": [[4, "torch-xpu-optimize"]], "C++": [[4, "c"]], "Basic Usage": [[4, "basic-usage"]], "Use SYCL code": [[4, "use-sycl-code"]], "Customize DPC++ kernels": [[4, "customize-dpc-kernels"]], "Intel\u00ae AI Reference Models": [[4, "intel-ai-reference-models"]], "Features": [[5, "features"]], "Easy-to-use Python API": [[5, "easy-to-use-python-api"]], "Channels Last": [[5, "channels-last"], [19, "channels-last"]], "Auto Mixed Precision (AMP)": [[5, "auto-mixed-precision-amp"]], "Quantization": [[5, "quantization"]], "Distributed Training": [[5, "distributed-training"]], "DLPack Solution": [[5, "dlpack-solution"], [7, "dlpack-solution"]], "DPC++ Extension": [[5, "dpc-extension"], [8, "dpc-extension"]], "Advanced Configuration": [[5, "advanced-configuration"], [10, "advanced-configuration"]], "Fully Sharded Data Parallel (FSDP)": [[5, "fully-sharded-data-parallel-fsdp"], [9, "fully-sharded-data-parallel-fsdp"]], "torch.compile for GPU (Beta)": [[5, "torch-compile-for-gpu-beta"], [23, "torch-compile-for-gpu-beta"]], "Simple Trace Tool (Prototype)": [[5, 
"simple-trace-tool-prototype"], [22, "simple-trace-tool-prototype"]], "Kineto Supported Profiler Tool (Prototype)": [[5, "kineto-supported-profiler-tool-prototype"], [20, "kineto-supported-profiler-tool-prototype"]], "Compute Engine (Prototype feature for debug)": [[5, "compute-engine-prototype-feature-for-debug"]], "IPEX_LOGGING (Prototype feature for debug)": [[5, "ipex-logging-prototype-feature-for-debug"]], "DistributedDataParallel (DDP)": [[6, "distributeddataparallel-ddp"]], "Introduction": [[6, "introduction"], [7, "introduction"], [8, "introduction"], [9, "introduction"], [11, "introduction"], [13, "introduction"], [14, "introduction"], [18, "introduction"], [20, "introduction"], [21, "introduction"], [22, "introduction"], [23, "introduction"], [26, "introduction"], [30, "introduction"], [35, "introduction"], [38, "introduction"]], "Installation of Intel\u00ae oneCCL Bindings for Pytorch*": [[6, "installation-of-intel-oneccl-bindings-for-pytorch"]], "Install PyTorch and Intel\u00ae Extension for PyTorch*": [[6, "install-pytorch-and-intel-extension-for-pytorch"]], "Install Intel\u00ae oneCCL Bindings for Pytorch*": [[6, "install-intel-oneccl-bindings-for-pytorch"]], "[Recommended] Install from prebuilt wheels": [[6, "recommended-install-from-prebuilt-wheels"]], "Install from source": [[6, "install-from-source"]], "Runtime Dynamic Linking": [[6, "runtime-dynamic-linking"]], "DDP Usage": [[6, "ddp-usage"]], "Example Usage (MPI launch for single node):": [[6, "example-usage-mpi-launch-for-single-node"]], "DDP scaling API (GPU Only)": [[6, "ddp-scaling-api-gpu-only"]], "Usage of DDP scaling API": [[6, "usage-of-ddp-scaling-api"]], "Use Case": [[7, "use-case"], [11, "use-case"], [13, "use-case"], [20, "use-case"], [22, "use-case"]], "Design": [[7, "design"]], "Import DLPack Capsule": [[7, "import-dlpack-capsule"]], "Export DLPack Capsule": [[7, "export-dlpack-capsule"]], "DLDevice and data pointer": [[7, "dldevice-and-data-pointer"]], "Asynchronous Programming": 
[[7, "asynchronous-programming"]], "Example Case": [[7, "example-case"]], "Motivation and Example": [[8, "motivation-and-example"]], "Writing a DPC++ Extension": [[8, "writing-a-dpc-extension"]], "Building with setuptools": [[8, "building-with-setuptools"]], "JIT Compiling Extensions": [[8, "jit-compiling-extensions"]], "Building with CMake": [[8, "building-with-cmake"]], "Requesting the current c10::Stream": [[8, "requesting-the-current-c10-stream"]], "Fetching the corresponding sycl::queue": [[8, "fetching-the-corresponding-sycl-queue"]], "Writing the DPC++ Op": [[8, "writing-the-dpc-op"]], "Using accessors": [[8, "using-accessors"]], "FSDP Usage (GPU only)": [[9, "fsdp-usage-gpu-only"]], "Example": [[9, "example"]], "Build Time Configuration": [[10, "build-time-configuration"]], "Runtime Configuration": [[10, "runtime-configuration"]], "Auto Mixed Precision (AMP) on GPU": [[11, "auto-mixed-precision-amp-on-gpu"]], "Default Precision": [[11, "default-precision"]], "Inference with Imperative Path": [[11, "inference-with-imperative-path"]], "Inference with TorchScript Path": [[11, "inference-with-torchscript-path"]], "Training Support": [[11, "training-support"]], "Autocast Op Reference": [[11, "autocast-op-reference"]], "Op Eligibility": [[11, "op-eligibility"]], "Op-Specific Behavior": [[11, "op-specific-behavior"]], "Ops that can autocast to bfloat16": [[11, "ops-that-can-autocast-to-bfloat16"]], "Ops that can autocast to float16": [[11, "ops-that-can-autocast-to-float16"]], "Ops that can autocast to float32": [[11, "ops-that-can-autocast-to-float32"]], "Ops that promote to the widest input type": [[11, "ops-that-promote-to-the-widest-input-type"]], "Auto Channels Last": [[12, "auto-channels-last"]], "Ease-of-use auto channels last API": [[12, "ease-of-use-auto-channels-last-api"]], "default": [[12, "default"]], "enable": [[12, "enable"]], "disable": [[12, "disable"]], "Known issue": [[12, "known-issue"]], "Compute Engine (Experimental feature for debug)": [[13, 
"compute-engine-experimental-feature-for-debug"]], "Engine Selection Policy": [[13, "engine-selection-policy"]], "Multiple Implementations Operators and Engines": [[13, "multiple-implementations-operators-and-engines"]], "Intel\u00ae Extension for PyTorch* - DeepSpeed* Kernels": [[14, "intel-extension-for-pytorch-deepspeed-kernels"]], "Supported Platform": [[14, "supported-platform"]], "Float8 Data Type Support (Prototype)": [[15, "float8-data-type-support-prototype"]], "Float8 Data Type": [[15, "float8-data-type"]], "FP8 Quantization": [[15, "fp8-quantization"]], "Supported running mode": [[15, "supported-running-mode"]], "Supported operators": [[15, "supported-operators"]], "FP8 usage example": [[15, "fp8-usage-example"]], "Horovod with PyTorch (Prototype)": [[16, "horovod-with-pytorch-prototype"]], "Install Horovod with PyTorch": [[16, "install-horovod-with-pytorch"]], "Horovod with PyTorch Usage": [[16, "horovod-with-pytorch-usage"]], "Intel\u00ae Extension for PyTorch* Optimizations for Quantization [GPU]": [[17, "intel-extension-for-pytorch-optimizations-for-quantization-gpu"]], "IPEX_LOGGING (Prototype)": [[18, "ipex-logging-prototype"]], "IPEX_LOGGING Definition": [[18, "ipex-logging-definition"]], "Log Level": [[18, "log-level"]], "Log Component": [[18, "log-component"]], "Usage in C++": [[18, "usage-in-c"]], "Simple Log": [[18, "simple-log"]], "Event Log": [[18, "event-log"]], "Enviornment settings": [[18, "enviornment-settings"]], "Usage in python": [[18, "usage-in-python"]], "Replace IPEX_SIMPLE_TRACE": [[18, "replace-ipex-simple-trace"]], "Replace IPEX_VERBOSE": [[18, "replace-ipex-verbose"]], "What is Channels Last": [[19, "what-is-channels-last"]], "Memory Format Is All That Matters": [[19, "memory-format-is-all-that-matters"]], "a. NCHW (default)": [[19, "a-nchw-default"]], "b. NHWC": [[19, "b-nhwc"]], "c. 
Blocked (nChw16c, on CPU)": [[19, "c-blocked-nchw16c-on-cpu"]], "PyTorch Strided Layout": [[19, "pytorch-strided-layout"]], "Channels Last Memory Format APIs": [[19, "channels-last-memory-format-apis"]], "a. tensor creation": [[19, "a-tensor-creation"]], "b. tensor conversion": [[19, "b-tensor-conversion"]], "c. model conversion": [[19, "c-model-conversion"]], "d. operator coverage in PyTorch": [[19, "d-operator-coverage-in-pytorch"]], "Writing Channels Last Kernels on CPU": [[19, "writing-channels-last-kernels-on-cpu"]], "a. Register Channels Last Kernel in ATen Native Manner": [[19, "a-register-channels-last-kernel-in-aten-native-manner"]], "b. Register oneDNN Kernel on Channels Last": [[19, "b-register-onednn-kernel-on-channels-last"]], "oneDNN NHWC APIs": [[19, "onednn-nhwc-apis"]], "a. Create NHWC Memory": [[19, "a-create-nhwc-memory"]], "b. Create Convolution Primitive": [[19, "b-create-convolution-primitive"]], "Channels Last 1D support on XPU": [[19, "channels-last-1d-support-on-xpu"]], "a. tensor conversion with Channels Last 1D": [[19, "a-tensor-conversion-with-channels-last-1d"]], "b. model conversion with Channels Last 1D": [[19, "b-model-conversion-with-channels-last-1d"]], "c. 
determine if in Channels Last 1D memory format": [[19, "c-determine-if-in-channels-last-1d-memory-format"]], "Build Tool": [[20, "build-tool"]], "Use Tool": [[20, "use-tool"]], "Set Environment Variable": [[20, "set-environment-variable"]], "Add Profiler Into Script": [[20, "add-profiler-into-script"]], "Disable Tool in Model Script": [[20, "disable-tool-in-model-script"]], "Disable Tool Partly for XPU Backend": [[20, "disable-tool-partly-for-xpu-backend"]], "Profile on Multi-device Application": [[20, "profile-on-multi-device-application"]], "Result": [[20, "result"]], "Export to Chrome Trace": [[20, "export-to-chrome-trace"]], "Known issues": [[20, "known-issues"]], "Legacy Profiler Tool (Deprecated)": [[21, "legacy-profiler-tool-deprecated"]], "Enable and Disable Tool": [[22, "enable-and-disable-tool"]], "Use Simple Trace in Model": [[22, "use-simple-trace-in-model"]], "Results": [[22, "results"]], "Required Dependencies": [[23, "required-dependencies"]], "Inferenece with torch.compile": [[23, "inferenece-with-torch-compile"]], "Training with torch.compile": [[23, "training-with-torch-compile"]], "Quick Start": [[24, "quick-start"]], "Execution": [[24, "execution"]], "Installation": [[25, "installation"]], "Get Started": [[26, "get-started"]], "Troubleshooting": [[27, "troubleshooting"]], "General Usage": [[27, "general-usage"]], "Library Dependencies": [[27, "library-dependencies"]], "Unit Test": [[27, "unit-test"]], "License": [[28, "license"]], "Large Language Models (LLM) Optimizations Overview": [[29, "large-language-models-llm-optimizations-overview"]], "Optimized Models": [[29, "optimized-models"]], "Optimization Methodologies": [[29, "optimization-methodologies"]], "Linear Operator Optimization": [[29, "linear-operator-optimization"]], "Deep Fusion Policy": [[29, "deep-fusion-policy"]], "Segment KV Cache": [[29, "segment-kv-cache"]], "Distributed Inference": [[29, "distributed-inference"]], "Low Precision Data Types": [[29, "low-precision-data-types"]], 
"Weight Only Quantization INT4": [[29, "weight-only-quantization-int4"]], "Weight-Only Quantization (Prototype)": [[30, "weight-only-quantization-prototype"]], "Supported Framework Model Matrix": [[30, "supported-framework-model-matrix"]], "References": [[30, "references"]], "Weight-Only Quantization LLM features in Intel\u00ae Extension for PyTorch*": [[30, "weight-only-quantization-llm-features-in-intel-extension-for-pytorch"]], "Weight-Only Quantization Initialization": [[30, "weight-only-quantization-initialization"]], "Weight-Only Quantization Runtime": [[30, "weight-only-quantization-runtime"]], "Weight-Only Quantization Linear Dispatch": [[30, "weight-only-quantization-linear-dispatch"]], "Usage of running Weight-Only Quantization LLM For Intel\u00ae GPU": [[30, "usage-of-running-weight-only-quantization-llm-for-intel-gpu"]], "Environment Setup": [[30, "environment-setup"]], "Run Weight-Only Quantization LLM on Intel\u00ae GPU": [[30, "run-weight-only-quantization-llm-on-intel-gpu"]], "Install Intel-extension-for-transformers and Neural-compressor": [[30, "install-intel-extension-for-transformers-and-neural-compressor"]], "Quantize Model and Inference": [[30, "quantize-model-and-inference"]], "Save and Load Quantized Model (Optional)": [[30, "save-and-load-quantized-model-optional"]], "Execute WOQ benchmark script": [[30, "execute-woq-benchmark-script"]], "Transformers Optimization Frontend API": [[31, "transformers-optimization-frontend-api"]], "Pseudocode of Common Usage Scenarios": [[31, "pseudocode-of-common-usage-scenarios"]], "FP16": [[31, "fp16"]], "SmoothQuant": [[31, "smoothquant"]], "Imperative mode": [[31, "imperative-mode"]], "Distributed Inference with DeepSpeed": [[31, "distributed-inference-with-deepspeed"]], "Performance": [[32, "performance"]], "Overview": [[32, "overview"]], "Performance Data for Intel\u00ae AI Data Center Products": [[32, "performance-data-for-intel-ai-data-center-products"]], "LLM Performance v2.1.10": [[32, 
"llm-performance-v2-1-10"]], "Configuration": [[32, "configuration"]], "Software Version": [[32, "software-version"]], "Hardware Configuration": [[32, "hardware-configuration"]], "Releases": [[33, "releases"]], "2.1.30+xpu": [[33, "xpu"]], "Highlights": [[33, "highlights"], [33, "id2"], [33, "id5"], [33, "id8"], [33, "id11"], [33, "id14"], [33, "id16"]], "Known Issues": [[33, "known-issues"], [33, "id3"], [33, "id6"], [33, "id9"], [33, "id12"], [33, "id15"], [33, "id17"]], "2.1.20+xpu": [[33, "id1"]], "2.1.10+xpu": [[33, "id4"]], "2.0.110+xpu": [[33, "id7"]], "1.13.120+xpu": [[33, "id10"]], "1.13.10+xpu": [[33, "id13"]], "1.10.200+gpu": [[33, "gpu"]], "Technical Details": [[34, "technical-details"]], "Optimizer Optimization [GPU]": [[34, "optimizer-optimization-gpu"]], "Ahead of Time Compilation (AOT) [GPU]": [[34, "ahead-of-time-compilation-aot-gpu"]], "Memory Management [GPU]": [[34, "memory-management-gpu"]], "ipex.optimize [GPU]": [[34, "ipex-optimize-gpu"]], "Ahead of Time (AOT) Compilation": [[35, "ahead-of-time-aot-compilation"]], "Use case": [[35, "use-case"]], "Requirement": [[35, "requirement"]], "ipex.optimize Frontend API": [[36, "ipex-optimize-frontend-api"]], "Automatic Channels Last": [[36, "automatic-channels-last"]], "conv_bn_folding": [[36, "conv-bn-folding"]], "linear_bn_folding": [[36, "linear-bn-folding"]], "replace_dropout_with_identity": [[36, "replace-dropout-with-identity"]], "split_master_weight_for_bf16": [[36, "split-master-weight-for-bf16"]], "fuse_update_step": [[36, "fuse-update-step"]], "Memory Management": [[37, "memory-management"]], "Optimizer Fusion on GPU": [[38, "optimizer-fusion-on-gpu"]], "Operation Fusion": [[38, "operation-fusion"]]}, "indexentries": {"event (class in intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.Event"]], "stream (class in intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.Stream"]], "current_device() (in module intel_extension_for_pytorch.xpu)": [[1, 
"intel_extension_for_pytorch.xpu.current_device"]], "current_stream() (in module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.current_stream"]], "device (class in intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.device"]], "device_count() (in module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.device_count"]], "device_of (class in intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.device_of"]], "elapsed_time() (intel_extension_for_pytorch.xpu.event method)": [[1, "intel_extension_for_pytorch.xpu.Event.elapsed_time"]], "empty_cache() (in module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.empty_cache"]], "fp8_autocast() (in module intel_extension_for_pytorch.xpu.fp8.fp8)": [[1, "intel_extension_for_pytorch.xpu.fp8.fp8.fp8_autocast"]], "get_device_name() (in module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.get_device_name"]], "get_device_properties() (in module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.get_device_properties"]], "get_fp32_math_mode() (in module intel_extension_for_pytorch)": [[1, "intel_extension_for_pytorch.get_fp32_math_mode"]], "get_rng_state() (in module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.get_rng_state"]], "get_rng_state_all() (in module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.get_rng_state_all"]], "init() (in module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.init"]], "initial_seed() (in module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.initial_seed"]], "is_available() (in module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.is_available"]], "is_initialized() (in module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.is_initialized"]], "manual_seed() (in module 
intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.manual_seed"]], "manual_seed_all() (in module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.manual_seed_all"]], "max_memory_allocated() (in module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.max_memory_allocated"]], "max_memory_reserved() (in module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.max_memory_reserved"]], "memory_allocated() (in module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.memory_allocated"]], "memory_reserved() (in module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.memory_reserved"]], "memory_snapshot() (in module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.memory_snapshot"]], "memory_stats() (in module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.memory_stats"]], "memory_stats_as_nested_dict() (in module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.memory_stats_as_nested_dict"]], "memory_summary() (in module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.memory_summary"]], "optimize() (in module intel_extension_for_pytorch)": [[1, "intel_extension_for_pytorch.optimize"]], "optimize_transformers() (in module intel_extension_for_pytorch)": [[1, "intel_extension_for_pytorch.optimize_transformers"]], "query() (intel_extension_for_pytorch.xpu.event method)": [[1, "intel_extension_for_pytorch.xpu.Event.query"]], "record() (intel_extension_for_pytorch.xpu.event method)": [[1, "intel_extension_for_pytorch.xpu.Event.record"]], "record_event() (intel_extension_for_pytorch.xpu.stream method)": [[1, "intel_extension_for_pytorch.xpu.Stream.record_event"]], "reset_accumulated_memory_stats() (in module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.reset_accumulated_memory_stats"]], "reset_peak_memory_stats() (in 
module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.reset_peak_memory_stats"]], "seed() (in module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.seed"]], "seed_all() (in module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.seed_all"]], "set_device() (in module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.set_device"]], "set_fp32_math_mode() (in module intel_extension_for_pytorch)": [[1, "intel_extension_for_pytorch.set_fp32_math_mode"]], "set_rng_state() (in module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.set_rng_state"]], "set_rng_state_all() (in module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.set_rng_state_all"]], "stream() (in module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.stream"]], "sycl_queue (intel_extension_for_pytorch.xpu.stream property)": [[1, "intel_extension_for_pytorch.xpu.Stream.sycl_queue"]], "synchronize() (in module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.synchronize"]], "synchronize() (intel_extension_for_pytorch.xpu.event method)": [[1, "intel_extension_for_pytorch.xpu.Event.synchronize"]], "synchronize() (intel_extension_for_pytorch.xpu.stream method)": [[1, "intel_extension_for_pytorch.xpu.Stream.synchronize"]], "wait() (intel_extension_for_pytorch.xpu.event method)": [[1, "intel_extension_for_pytorch.xpu.Event.wait"]], "wait_event() (intel_extension_for_pytorch.xpu.stream method)": [[1, "intel_extension_for_pytorch.xpu.Stream.wait_event"]], "wait_stream() (intel_extension_for_pytorch.xpu.stream method)": [[1, "intel_extension_for_pytorch.xpu.Stream.wait_stream"]], "xpu::fp32_math_mode (c++ enum)": [[1, "_CPPv4N3xpu14FP32_MATH_MODEE"]], "xpu::fp32_math_mode::bf32 (c++ enumerator)": [[1, "_CPPv4N3xpu14FP32_MATH_MODE4BF32E"]], "xpu::fp32_math_mode::fp32 (c++ enumerator)": [[1, 
"_CPPv4N3xpu14FP32_MATH_MODE4FP32E"]], "xpu::fp32_math_mode::fp32_math_mode_max (c++ enumerator)": [[1, "_CPPv4N3xpu14FP32_MATH_MODE18FP32_MATH_MODE_MAXE"]], "xpu::fp32_math_mode::fp32_math_mode_min (c++ enumerator)": [[1, "_CPPv4N3xpu14FP32_MATH_MODE18FP32_MATH_MODE_MINE"]], "xpu::fp32_math_mode::tf32 (c++ enumerator)": [[1, "_CPPv4N3xpu14FP32_MATH_MODE4TF32E"]], "xpu::get_queue_from_stream (c++ function)": [[1, "_CPPv4N3xpu21get_queue_from_streamEN3c106StreamE"]], "xpu::set_fp32_math_mode (c++ function)": [[1, "_CPPv4N3xpu18set_fp32_math_modeE14FP32_MATH_MODE"]]}}) \ No newline at end of file +Search.setIndex({"docnames": ["index", "tutorials/api_doc", "tutorials/blogs_publications", "tutorials/contribution", "tutorials/examples", "tutorials/features", "tutorials/features/DDP", "tutorials/features/DLPack", "tutorials/features/DPC++_Extension", "tutorials/features/FSDP", "tutorials/features/advanced_configuration", "tutorials/features/amp_gpu", "tutorials/features/auto_channels_last", "tutorials/features/compute_engine", "tutorials/features/deepspeed_kernels", "tutorials/features/float8", "tutorials/features/horovod", "tutorials/features/int8_overview_xpu", "tutorials/features/ipex_log", "tutorials/features/nhwc", "tutorials/features/profiler_kineto", "tutorials/features/profiler_legacy", "tutorials/features/simple_trace", "tutorials/features/torch_compile_gpu", "tutorials/getting_started", "tutorials/installation", "tutorials/introduction", "tutorials/known_issues", "tutorials/license", "tutorials/llm", "tutorials/llm/int4_weight_only_quantization", "tutorials/llm/llm_optimize_transformers", "tutorials/performance", "tutorials/releases", "tutorials/technical_details", "tutorials/technical_details/AOT", "tutorials/technical_details/ipex_optimize", "tutorials/technical_details/memory_management", "tutorials/technical_details/optimizer_fusion_gpu"], "filenames": ["index.rst", "tutorials/api_doc.rst", "tutorials/blogs_publications.md", "tutorials/contribution.md", 
"tutorials/examples.md", "tutorials/features.rst", "tutorials/features/DDP.md", "tutorials/features/DLPack.md", "tutorials/features/DPC++_Extension.md", "tutorials/features/FSDP.md", "tutorials/features/advanced_configuration.md", "tutorials/features/amp_gpu.md", "tutorials/features/auto_channels_last.md", "tutorials/features/compute_engine.md", "tutorials/features/deepspeed_kernels.md", "tutorials/features/float8.md", "tutorials/features/horovod.md", "tutorials/features/int8_overview_xpu.md", "tutorials/features/ipex_log.md", "tutorials/features/nhwc.md", "tutorials/features/profiler_kineto.md", "tutorials/features/profiler_legacy.md", "tutorials/features/simple_trace.md", "tutorials/features/torch_compile_gpu.md", "tutorials/getting_started.md", "tutorials/installation.rst", "tutorials/introduction.rst", "tutorials/known_issues.md", "tutorials/license.md", "tutorials/llm.rst", "tutorials/llm/int4_weight_only_quantization.md", "tutorials/llm/llm_optimize_transformers.md", "tutorials/performance.md", "tutorials/releases.md", "tutorials/technical_details.rst", "tutorials/technical_details/AOT.md", "tutorials/technical_details/ipex_optimize.md", "tutorials/technical_details/memory_management.rst", "tutorials/technical_details/optimizer_fusion_gpu.md"], "titles": ["Intel\u00ae Extension for PyTorch*", "API Documentation", "Blogs & Publications", "Contribution", "Examples", "Features", "DistributedDataParallel (DDP)", "DLPack Solution", "DPC++ Extension", "Fully Sharded Data Parallel (FSDP)", "Advanced Configuration", "Auto Mixed Precision (AMP) on GPU", "Auto Channels Last", "Compute Engine (Experimental feature for debug)", "Intel\u00ae Extension for PyTorch* - DeepSpeed* Kernels", "Float8 Data Type Support (Prototype)", "Horovod with PyTorch (Prototype)", "Intel\u00ae Extension for PyTorch* Optimizations for Quantization [GPU]", "IPEX_LOGGING (Prototype)", "Channels Last", "Kineto Supported Profiler Tool (Prototype)", "Legacy Profiler Tool (Deprecated)", "Simple 
Trace Tool (Prototype)", "torch.compile for GPU (Beta)", "Quick Start", "Installation", "Introduction", "Troubleshooting", "License", "Large Language Models (LLM) Optimizations Overview", "Weight-Only Quantization (Prototype)", "Transformers Optimization Frontend API", "Performance", "Releases", "Technical Details", "Ahead of Time (AOT) Compilation", "ipex.optimize Frontend API", "Memory Management", "Optimizer Fusion on GPU"], "terms": {"intel optim": 0, "intel\u00ae extension for pytorch*": 0, "gpu": [0, 1, 2, 3, 4, 8, 10, 13, 14, 15, 16, 20, 21, 24, 26, 27, 29, 32, 35, 36, 37], "discrete gpu": 0, "intel discrete gpu": 0, "extend": [0, 5, 7, 23, 26, 29, 30, 33], "latest": [0, 6, 7, 16, 24, 26, 27, 29, 32], "perform": [0, 1, 2, 4, 5, 8, 11, 12, 13, 14, 17, 19, 23, 26, 29, 30, 31, 33, 36, 38], "optim": [0, 1, 2, 5, 6, 9, 11, 12, 13, 14, 16, 19, 23, 24, 26, 27, 30, 33], "hardwar": [0, 2, 5, 26, 29, 33], "take": [0, 1, 8, 11, 19, 26, 30, 33], "advantag": [0, 1, 12, 19, 26, 33], "advanc": [0, 8, 24, 30, 33, 37], "vector": [0, 1, 4, 8, 14, 19, 33], "512": [0, 4, 33], "avx": [0, 33], "neural": [0, 2, 5, 13, 15, 33], "network": [0, 2, 5, 11, 13, 15, 33], "instruct": [0, 3, 4, 24, 25, 26, 27, 29, 30, 33], "vnni": [0, 33], "matrix": [0, 23, 26, 33], "amx": [0, 2, 33], "cpu": [0, 1, 2, 4, 5, 6, 20, 24, 27, 32, 33, 34, 36], "well": [0, 1, 3, 4, 29, 30, 33], "x": [0, 4, 8, 9, 11, 19, 26, 30, 35], "e": [0, 1, 4, 8, 11, 17, 19, 26, 27, 29, 33, 34, 35], "xmx": [0, 26, 33], "ai": [0, 2, 10, 26, 27, 29, 33], "engin": [0, 4, 19, 26, 30, 33], "discret": [0, 26, 33], "moreov": [0, 29, 33], "provid": [0, 1, 3, 4, 5, 6, 8, 9, 11, 13, 14, 18, 25, 26, 27, 29, 30, 31, 33, 35, 36, 38], "easi": [0, 2, 16, 26, 33], "acceler": [0, 1, 2, 5, 15, 23, 26, 30, 33], "through": [0, 1, 4, 5, 8, 11, 23, 26, 30, 33], "xpu": [0, 1, 2, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 22, 23, 24, 26, 27, 30, 31, 32, 36], "devic": [0, 1, 4, 5, 6, 7, 8, 9, 10, 13, 14, 16, 18, 19, 21, 23, 26, 27, 29, 30, 31, 
33, 34, 35, 36], "In": [0, 1, 4, 5, 8, 11, 13, 19, 20, 22, 23, 29, 30, 33], "current": [0, 1, 3, 5, 7, 9, 13, 17, 18, 20, 22, 27, 29, 30, 31, 35, 36, 38], "technolog": [0, 29], "landscap": [0, 29], "gener": [0, 3, 4, 6, 7, 8, 13, 17, 18, 19, 29, 30, 31, 33, 35, 36], "genai": [0, 29], "workload": [0, 4, 5, 11, 17, 27, 29, 31, 33, 34], "model": [0, 1, 2, 5, 6, 9, 10, 11, 12, 13, 15, 16, 17, 23, 24, 27, 31, 32, 33, 36], "have": [0, 1, 3, 4, 6, 7, 8, 12, 13, 17, 19, 20, 22, 24, 27, 28, 29, 30, 35], "gain": [0, 29], "widespread": [0, 29], "attent": [0, 29, 30], "popular": [0, 7, 29, 30, 32], "larg": [0, 1, 5, 9, 18, 27, 30, 31, 33, 38], "languag": [0, 8, 30, 31, 33], "llm": [0, 2, 27, 31, 33], "emerg": [0, 29], "domin": [0, 29], "drive": [0, 29], "applic": [0, 1, 4, 29, 30, 34, 35, 36, 37], "start": [0, 1, 2, 3, 4, 6, 16, 18, 20, 22, 25, 27], "from": [0, 1, 2, 3, 4, 5, 7, 8, 9, 11, 15, 16, 17, 18, 19, 20, 21, 22, 23, 27, 29, 30, 31, 33, 34, 35, 36, 37], "2": [0, 2, 4, 6, 7, 8, 9, 11, 18, 19, 20, 22, 23, 27, 28, 29, 30, 32, 34, 35, 36], "1": [0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 13, 18, 19, 20, 22, 23, 27, 29, 30, 36, 38], "0": [0, 1, 3, 4, 6, 8, 9, 10, 11, 16, 18, 20, 22, 23, 27, 28, 30, 32, 38], "specif": [0, 1, 4, 5, 7, 10, 13, 14, 16, 18, 19, 29, 33, 34, 36], "certain": [0, 1, 30, 31], "ar": [0, 1, 2, 3, 4, 5, 7, 8, 10, 11, 14, 15, 16, 17, 18, 19, 20, 22, 24, 27, 29, 30, 31, 33, 34, 35, 36, 38], "introduc": [0, 2, 8, 19, 33], "For": [0, 1, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 18, 19, 20, 24, 26, 29, 33, 34, 35, 36, 37, 38], "more": [0, 1, 3, 5, 6, 8, 9, 10, 11, 17, 20, 22, 23, 24, 27, 29, 30, 33, 34, 35, 37, 38], "inform": [0, 1, 3, 5, 6, 7, 8, 9, 17, 18, 19, 20, 23, 24, 29, 30, 33, 34], "refer": [0, 1, 6, 8, 9, 10, 12, 13, 19, 20, 24, 25, 26, 33, 35, 36], "section": [0, 4, 5, 11, 17, 23, 25, 26, 30, 31, 36], "The": [0, 1, 3, 4, 5, 6, 7, 8, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 26, 27, 29, 30, 31, 32, 33, 34, 35, 36, 38], "can": [0, 1, 3, 4, 5, 6, 7, 8, 
10, 13, 16, 17, 18, 19, 20, 22, 24, 27, 29, 30, 31, 33, 34, 35, 36, 37, 38], "load": [0, 1, 4, 8, 20, 27, 33, 36], "python": [0, 1, 3, 6, 7, 8, 9, 10, 16, 20, 23, 24, 29, 30, 31, 33, 34, 36], "modul": [0, 1, 4, 5, 6, 8, 9, 11, 14, 17, 19, 30, 31, 33, 34, 36], "program": [0, 1], "link": [0, 4], "c": [0, 5, 6, 8, 11, 22], "librari": [0, 4, 5, 6, 7, 8, 9, 10, 13, 14, 18, 20, 22, 33], "script": [0, 1, 2, 3, 4, 5, 6, 8, 9, 11, 16, 17, 22, 24, 27, 29, 31, 36], "user": [0, 1, 4, 5, 6, 10, 12, 13, 19, 20, 23, 27, 33, 34, 35, 36, 37], "enabl": [0, 1, 2, 4, 5, 6, 10, 11, 15, 17, 19, 20, 23, 24, 29, 30, 33, 34, 35, 36], "dynam": [0, 4, 15, 27], "import": [0, 1, 3, 4, 6, 8, 9, 15, 16, 17, 19, 20, 22, 23, 24, 27, 29, 30, 31, 33], "intel_extension_for_pytorch": [0, 1, 3, 4, 6, 7, 8, 9, 15, 16, 17, 19, 20, 22, 23, 24, 27, 30, 31, 33], "featur": [0, 2, 3, 4, 11, 14, 19, 23, 26, 27, 33, 34, 35, 36], "includ": [0, 1, 4, 8, 10, 14, 20, 24, 27, 28, 29, 30, 32, 33, 34, 36], "onli": [0, 1, 3, 4, 7, 8, 10, 11, 13, 14, 16, 17, 18, 19, 20, 24, 33, 36], "packag": [0, 4, 6, 8, 9, 23, 24, 33], "mai": [0, 1, 2, 3, 7, 8, 11, 12, 13, 17, 19, 20, 23, 27, 30, 33], "newer": 0, "code": [0, 1, 3, 5, 8, 9, 10, 13, 16, 19, 20, 22, 24, 25, 27, 28, 31, 33, 34, 35, 37, 38], "base": [0, 1, 2, 3, 4, 6, 8, 9, 10, 13, 16, 20, 24, 29, 30, 31, 32, 33], "due": [0, 11, 17, 20, 27, 29, 30, 33], "differ": [0, 1, 4, 6, 7, 19, 20, 29, 30], "develop": [0, 2, 4, 8, 27, 34, 35], "schedul": [0, 9, 20], "ha": [0, 1, 4, 5, 7, 8, 13, 19, 23, 27, 30, 33, 35], "been": [0, 1, 4, 5, 8, 19, 23, 27, 33], "releas": [0, 1, 6, 12, 14, 19, 23, 27, 35, 37], "an": [0, 1, 3, 4, 5, 6, 7, 8, 9, 11, 16, 17, 18, 19, 20, 24, 27, 29, 30, 33, 38], "open": [0, 27, 33], "sourc": [0, 3, 8, 10, 16, 20, 22, 23, 24, 27, 28, 35], "project": [0, 4, 8], "github": [0, 3, 6, 7, 9, 11], "you": [0, 1, 3, 4, 5, 6, 8, 9, 10, 11, 16, 18, 19, 20, 22, 23, 24, 27, 29, 30, 31, 33, 34, 35, 37], "find": [0, 4, 7, 8, 20, 27, 32, 33, 34], "how": [0, 4, 5, 6, 7, 8, 
18, 19, 24], "get": [0, 1, 2, 4, 5, 6, 7, 8, 9, 18, 20, 27, 29, 33], "main": [0, 3, 4, 9, 23, 29, 30], "branch": 0, "quick": [0, 25, 26], "about": [0, 1, 3, 6, 8, 9, 23], "product": [0, 29, 33, 34, 35], "i": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 22, 23, 24, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 38], "structur": [0, 5, 7], "shown": [0, 1, 4, 6, 19, 20, 22, 29, 30], "follow": [0, 1, 3, 4, 6, 7, 8, 9, 10, 11, 13, 15, 16, 17, 18, 19, 20, 22, 23, 24, 25, 27, 28, 29, 31, 33, 36], "figur": [0, 7, 29], "eager": [0, 17, 33], "mode": [0, 1, 3, 5, 10, 19, 24, 33], "frontend": [0, 1, 5, 29, 30, 33], "custom": [0, 1, 5, 8, 10, 13, 14, 20, 29, 33], "fusion": [0, 1, 4, 17, 23, 31, 33, 34, 36], "int8": [0, 2, 5, 17, 30, 31, 33], "quantiz": [0, 1, 2, 4, 14, 31, 33], "api": [0, 2, 4, 8, 9, 10, 17, 20, 23, 29, 30, 33, 34], "further": [0, 1, 4, 5, 18, 19, 29, 30], "improv": [0, 2, 11, 15, 29, 30, 33, 36], "achiev": [0, 1, 4, 33], "convert": [0, 1, 4, 5, 7, 11, 12, 15, 17, 19, 30, 31, 33, 36], "graph": [0, 1, 5, 11, 17, 23, 33, 36], "us": [0, 1, 2, 3, 6, 9, 10, 14, 15, 16, 17, 18, 19, 21, 23, 24, 25, 27, 28, 29, 30, 33, 34, 36, 37], "pass": [0, 3, 4, 8, 20, 27, 30], "reduc": [0, 1, 5, 9, 15, 20, 29, 30, 33, 38], "oper": [0, 1, 4, 5, 7, 8, 10, 11, 17, 18, 20, 21, 22, 23, 30, 33, 34], "kernel": [0, 1, 5, 8, 10, 13, 18, 20, 23, 27, 29, 30, 32, 33, 36], "invoc": [0, 27, 33], "overhead": [0, 1, 8, 29, 33, 34, 38], "result": [0, 1, 8, 19], "compar": [0, 1, 5, 19, 30, 33, 38], "normal": [0, 4, 9, 16, 20, 29, 30, 34, 36], "yield": 0, "better": [0, 1, 13, 17, 19, 29, 30, 33, 36, 38], "techniqu": [0, 1, 8], "like": [0, 1, 2, 3, 8, 10, 17, 18, 20, 27, 29, 30, 33], "amplifi": 0, "them": [0, 3, 10, 16, 19, 20, 27, 29, 30, 33, 36, 38], "comprehens": [0, 37], "both": [0, 1, 4, 5, 7, 15, 17, 19, 29, 30, 31, 33, 35, 36, 38], "torchscript": [0, 1, 5, 24, 38], "torchdynamo": 0, "With": [0, 1, 5, 6, 8, 16, 17, 20, 22], "we": [0, 1, 3, 4, 6, 7, 8, 9, 11, 12, 13, 17, 19, 
23, 27, 29, 30, 32, 33, 37, 38], "recommend": [0, 4, 12, 13, 17, 23, 24, 27, 30, 33], "torch": [0, 1, 6, 7, 8, 9, 11, 13, 16, 17, 18, 19, 20, 22, 24, 27, 30, 31, 32, 33, 36, 38], "jit": [0, 1, 4, 11, 17, 19, 24, 31, 33, 34, 35], "trace": [0, 4, 10, 11, 17, 18, 24, 31], "your": [0, 3, 4, 6, 8, 9, 11, 16, 20, 22, 23, 24, 25, 27, 28, 30, 34, 35, 37], "prefer": [0, 13, 25], "option": [0, 1, 10, 20, 23, 34, 35, 36], "wider": [0, 4], "rang": [0, 6, 8, 9, 15, 16, 17, 20, 31], "ipex": [0, 1, 2, 4, 5, 8, 12, 18, 22, 24, 27, 29, 30, 31, 33], "backend": [0, 1, 2, 5, 6, 7, 8, 9, 13, 16, 23, 27, 29, 30, 33, 35], "avail": [0, 1, 4, 5, 6, 8, 10, 13, 23, 24, 30, 31, 33, 37], "good": [0, 1, 3, 19, 29, 38], "On": [0, 5, 15, 19, 29, 30], "automat": [0, 1, 4, 5, 12, 15, 17, 19, 20, 22, 29, 30, 33, 34, 35], "dispatch": [0, 8, 33], "underli": [0, 5, 29, 37], "detect": [0, 4], "set": [0, 1, 3, 4, 5, 6, 8, 9, 10, 13, 16, 22, 23, 24, 27, 30, 32, 33, 35, 36], "isa": 0, "leverag": [0, 23], "unit": [0, 8], "runtim": [0, 1, 5, 7, 8, 9, 11, 18, 20, 24, 27, 33, 35], "offer": [0, 3, 20, 30, 37], "finer": 0, "grain": [0, 2], "thread": [0, 1, 4, 8, 22, 32], "control": [0, 5, 20, 22], "weight": [0, 1, 4, 6, 8, 15, 16, 17, 19, 31, 33, 34, 36], "share": [0, 3, 5, 6, 7, 8, 27, 33], "increas": [0, 1, 2, 16, 20, 27, 29, 30, 33, 34, 35, 37], "effici": [0, 6, 8, 9, 15, 23, 29, 30, 33, 38], "implement": [0, 3, 4, 5, 6, 7, 8, 9, 19, 29, 30, 33, 38], "regist": [0, 10, 30, 33], "mechan": [0, 8, 33], "These": [0, 4, 11, 15, 29, 30, 33], "nativ": [0, 5, 11, 27, 33, 38], "calcul": [0, 1, 8, 11, 18, 20, 30, 33], "util": [0, 4, 5, 6, 7, 8, 9, 13, 15, 16, 18, 19, 27, 30, 35, 36], "dpc": [0, 7, 10, 27, 33], "compil": [0, 3, 4, 10, 20, 24, 27, 33], "sycl": [0, 1, 5, 7, 10, 13, 18, 33, 34, 36], "standard": [0, 8, 29], "also": [0, 1, 4, 5, 7, 8, 10, 17, 18, 19, 20, 27, 29, 30, 31, 33, 34, 35, 37, 38], "number": [0, 3, 4, 6, 8, 9, 16, 18, 20, 22, 27, 32, 33, 38], "which": [0, 1, 4, 5, 6, 7, 8, 10, 11, 15, 18, 19, 20, 22, 
27, 29, 30, 33, 35, 37], "found": [0, 4, 17, 19, 31, 33], "doc": [0, 3, 17, 31, 36], "directori": [0, 3, 8, 10, 24, 27, 31, 33], "team": [0, 3], "track": [0, 1], "bug": [0, 3, 33, 34, 35], "enhanc": [0, 2, 23, 29, 30, 33], "request": [0, 1, 3, 34], "issu": [0, 3, 11, 27, 29, 30], "befor": [0, 1, 3, 4, 5, 10, 17, 19, 20, 22, 27, 30, 34, 35, 36], "submit": [0, 1, 3, 8], "suggest": [0, 19, 20], "report": [0, 18, 27], "search": [0, 3, 29, 33], "exist": [0, 3, 17, 20, 27, 30, 33], "see": [0, 1, 3, 8, 11, 15, 19, 20, 22, 27, 33, 35], "alreadi": [0, 3, 4, 16, 19, 29, 34], "dtype": [1, 4, 11, 13, 17, 20, 23, 24, 30, 31, 33, 36], "none": [1, 6, 9, 38], "level": [1, 5, 8, 10, 19, 20, 29, 30, 33, 35], "o1": 1, "inplac": [1, 17, 19, 30, 31], "fals": [1, 4, 9, 11, 17, 19, 20, 22, 24, 30, 31], "conv_bn_fold": 1, "linear_bn_fold": 1, "weights_prepack": 1, "replace_dropout_with_ident": 1, "optimize_lstm": 1, "split_master_weight_for_bf16": 1, "fuse_update_step": 1, "auto_kernel_select": 1, "sample_input": [1, 12], "graph_mod": 1, "concat_linear": 1, "appli": [1, 4, 5, 11, 16, 19, 24, 29, 30, 31, 33, 38], "given": [1, 29], "nn": [1, 4, 6, 9, 11, 13, 15, 19, 34, 36], "If": [1, 3, 4, 6, 7, 8, 10, 11, 12, 13, 17, 18, 19, 20, 27, 30, 33, 34, 35, 36], "train": [1, 2, 6, 9, 15, 16, 17, 19, 24, 27, 30, 31, 33, 34, 36], "otherwis": [1, 9, 10, 30], "infer": [1, 2, 5, 14, 15, 17, 19, 23, 24, 27, 33, 36], "conv": [1, 11, 18, 36], "bn": 1, "fold": 1, "prepack": [1, 19, 29], "so": [1, 4, 5, 7, 11, 16, 19, 22, 23, 27, 33, 37, 38], "onednn": [1, 2, 5, 10, 13, 18, 23, 29, 33], "order": [1, 7, 13, 15, 19, 22, 27], "cach": [1, 3, 8, 10, 14, 18, 33, 34, 37, 38], "reus": 1, "layout": [1, 5], "call": [1, 4, 5, 6, 8, 11, 18, 19, 20, 22, 30, 36, 37], "block": [1, 3, 30, 33, 34, 36], "although": 1, "itself": [1, 19, 20], "fast": [1, 8, 16, 30, 34], "enough": [1, 27], "usag": [1, 5, 7, 11, 13, 17, 19, 20, 24, 26, 33], "perspect": [1, 19], "drawback": 1, "run": [1, 3, 4, 5, 6, 8, 9, 10, 11, 20, 22, 24, 27, 
33, 34, 35, 36], "split": [1, 8, 10, 18, 34, 36], "one": [1, 3, 6, 7, 8, 10, 13, 16, 17, 18, 19, 20, 27, 31, 33, 38], "sever": [1, 10, 18, 20, 27, 32, 38], "dimens": [1, 8, 13, 19, 36], "data": [1, 4, 6, 8, 11, 12, 14, 16, 17, 19, 23, 24, 27, 30, 31, 33, 35, 38], "fix": [1, 3, 27, 33], "size": [1, 4, 6, 7, 8, 9, 10, 16, 18, 19, 27, 29, 30, 33, 34, 35], "each": [1, 5, 6, 8, 9, 10, 11, 13, 16, 18, 20, 22, 36], "time": [1, 3, 5, 6, 8, 9, 18, 19, 20, 21, 27, 29, 30, 38], "execut": [1, 4, 5, 7, 8, 10, 11, 13, 18, 20, 21, 27, 33, 34, 35, 36, 38], "detail": [1, 3, 4, 5, 6, 8, 10, 11, 12, 13, 17, 19, 23, 24, 26, 29, 30, 32, 33, 36], "mermori": 1, "format": [1, 3, 5, 6, 7, 9, 12, 13, 15, 16, 18, 20, 22, 30, 33, 36], "manual": [1, 5, 13, 19, 23, 36], "To": [1, 3, 4, 5, 6, 9, 16, 19, 20, 22, 24, 29, 30, 33], "thi": [1, 3, 4, 5, 6, 7, 8, 9, 12, 13, 14, 16, 17, 18, 19, 20, 22, 23, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 38], "predefin": 1, "shape": [1, 7, 13, 20, 29], "prior": [1, 24], "match": [1, 11, 36], "requir": [1, 3, 4, 6, 7, 8, 11, 13, 15, 17, 19, 24, 27, 29, 30, 31, 33, 36], "won": [1, 11, 20, 22], "t": [1, 3, 7, 8, 11, 19, 20, 22, 23, 33], "convers": [1, 4, 11, 17, 30, 33], "directli": [1, 8, 30], "go": [1, 3, 4, 8, 11, 34, 35], "methodologi": [1, 4, 5, 8, 38], "possibl": [1, 4, 7, 13], "avoid": [1, 3, 8, 27], "thu": [1, 8, 11, 13, 19, 33], "paramet": [1, 4, 5, 6, 9, 11, 16, 18, 20, 23, 29, 30, 33, 36, 38], "work": [1, 3, 4, 6, 8, 9, 10, 19, 20, 24, 27, 29, 31], "bfloat16": [1, 2, 5, 13, 23, 24, 33, 36], "half": 1, "k": [1, 30], "float16": [1, 5, 23, 24, 30, 31, 33], "cast": [1, 11], "accord": [1, 29, 30], "default": [1, 4, 5, 6, 7, 9, 10, 18, 20, 22, 23, 27, 33, 36], "valu": [1, 4, 10, 15, 27, 29, 30], "mean": [1, 10, 19, 22, 27, 29, 30, 33], "do": [1, 3, 5, 8, 11, 19, 20, 27, 29, 30], "noth": 1, "note": [1, 2, 3, 6, 7, 8, 9, 12, 14, 16, 19, 20, 23, 27, 29, 30, 33, 35], "type": [1, 3, 4, 5, 7, 8, 9, 17, 19, 24, 27, 30, 33, 34, 35, 36], "conv2d": [1, 9, 11, 19, 33], 
"linear": [1, 6, 9, 11, 13, 15, 19, 33, 34, 36], "convtranspose2d": 1, "case": [1, 4, 5, 8, 9, 10, 12, 19, 27, 30], "addit": [1, 4, 23, 30, 33, 34, 35, 36], "embed": [1, 29], "lstm": [1, 13], "sgd": [1, 4, 6, 11, 16, 23, 33, 34, 36, 38], "string": [1, 9, 18], "o0": 1, "No": [1, 19, 22, 27, 33], "function": [1, 3, 4, 5, 8, 9, 11, 17, 18, 20, 22, 23, 24, 29, 30, 31, 33, 38], "just": [1, 8, 30, 31, 33, 34, 35], "return": [1, 4, 6, 8, 9, 11, 19, 20], "origin": [1, 7, 15, 16, 30, 31, 36, 38], "dropout": [1, 9, 33, 34, 36], "remov": [1, 3, 10, 20, 33], "inferenc": 1, "master": [1, 6, 34, 36], "fuse": [1, 29, 30, 33, 34, 36, 38], "updat": [1, 3, 6, 9, 33, 34, 36, 38], "step": [1, 3, 4, 6, 8, 9, 11, 13, 16, 18, 20, 22, 23, 30, 34, 36], "overridden": 1, "explicitli": [1, 4, 8, 10, 11, 20], "bool": 1, "whether": [1, 11, 19, 20, 36], "conv_bn": 1, "It": [1, 5, 6, 7, 8, 9, 14, 18, 19, 22, 24, 27, 30, 31, 33, 34, 35], "knob": 1, "overwrit": 1, "configur": [1, 4, 8, 17, 18, 20, 24, 30, 33, 35], "linear_bn": 1, "convolut": [1, 5, 11, 23, 34, 36], "reorder": [1, 19, 29], "replac": [1, 3, 6, 9, 17, 30, 34, 36], "ident": [1, 4, 19, 34, 36], "aten": [1, 7, 8, 10], "opportunit": 1, "bf16": [1, 2, 24, 33, 38], "save": [1, 3, 4, 9, 15, 16, 19, 33, 36], "solut": [1, 2, 9, 27, 29, 33], "doesn": [1, 19, 20], "support": [1, 3, 4, 6, 7, 8, 10, 13, 17, 18, 21, 23, 26, 27, 29, 31, 33, 35, 36, 38], "all": [1, 3, 4, 6, 8, 9, 10, 11, 13, 16, 18, 20, 22, 29, 30, 31, 32, 33, 36, 37, 38], "param": [1, 38], "tupl": [1, 6], "tensor": [1, 4, 5, 7, 8, 11, 13, 17, 20, 29, 33, 37], "feed": [1, 12, 19], "sampl": [1, 6, 12], "input": [1, 4, 6, 8, 9, 12, 13, 17, 18, 19, 20, 23, 30, 32], "impact": 1, "pack": [1, 7], "intel": [1, 2, 5, 7, 8, 9, 10, 12, 13, 16, 18, 19, 20, 21, 22, 23, 24, 26, 27, 28, 29, 31, 33, 34, 35, 36, 38], "extens": [1, 2, 4, 7, 9, 10, 12, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 31, 32, 33, 34, 35, 36, 38], "pytorch": [1, 2, 4, 5, 7, 8, 9, 10, 11, 12, 18, 20, 21, 22, 23, 24, 26, 
27, 28, 29, 31, 32, 33, 34, 35, 36, 37, 38], "per": [1, 7, 8, 10, 16, 17, 32], "some": [1, 3, 5, 8, 9, 11, 13, 19, 20, 24, 27], "heurist": 1, "real": [1, 4, 6], "best": [1, 11, 17, 29], "try": [1, 3, 4, 6, 18, 20, 27], "select": [1, 4, 5, 25, 30, 33], "true": [1, 4, 6, 8, 9, 13, 15, 17, 23, 24, 30, 31, 36], "might": [1, 19, 27, 38], "cost": [1, 5, 8, 20, 21], "extra": [1, 6, 33], "auto": [1, 4, 8, 19, 29, 33], "prototyp": [1, 33, 35], "combin": [1, 17], "method": [1, 7, 8, 11, 20, 22, 29, 30], "multipl": [1, 3, 5, 6, 11, 19, 23, 27, 29, 33, 35], "subgraph": 1, "modifi": [1, 3, 16], "other": [1, 5, 7, 9, 11, 13, 14, 15, 16, 17, 18, 19, 20, 24, 27, 29, 30, 33, 37, 38], "place": [1, 6, 11, 18, 29], "scenario": [1, 7, 14, 17, 30], "convolutuon": 1, "counterpart": [1, 33], "pleas": [1, 3, 5, 6, 9, 18, 20, 21, 23, 24, 30, 33, 35, 36], "invok": [1, 4, 8, 11, 24, 27, 31, 33], "ddp": [1, 5, 9, 33], "distribut": [1, 2, 6, 9, 10, 16, 27, 33, 34, 35], "deepcopi": 1, "rather": [1, 18, 19], "than": [1, 10, 13, 14, 17, 18, 19, 20, 23, 27, 30, 33], "allreduc": [1, 6, 16, 33], "caus": [1, 27, 29, 33, 35], "unpredict": 1, "accuraci": [1, 2, 9, 11, 29, 30, 33], "loss": [1, 4, 6, 9, 11, 16, 19, 23, 29, 30], "exampl": [1, 3, 5, 10, 11, 13, 16, 18, 19, 20, 22, 24, 25, 26, 29, 30, 31, 38], "load_state_dict": 1, "path": [1, 4, 8, 9, 10, 18, 19, 27, 36], "eval": [1, 4, 9, 11, 17, 24, 31], "optimized_model": 1, "evalu": [1, 24, 33], "optimized_optim": 1, "altern": [1, 4, 5, 6, 19], "motiv": [1, 4], "ad": [1, 4, 6, 20, 23, 33], "alia": [1, 4, 8], "unifi": [1, 4], "style": [1, 3, 4, 8], "modular": [1, 4], "optimize_transform": [1, 29, 30, 31, 33], "float32": [1, 20, 24], "quantization_config": [1, 30], "qconfig_summary_fil": 1, "low_precision_checkpoint": 1, "deployment_mod": 1, "transform": [1, 2, 4, 9, 14, 19, 32, 33], "focu": [1, 19, 29, 30, 31, 33], "especi": [1, 3, 8], "task": [1, 29, 30], "famili": [1, 29], "llama": [1, 2, 29, 33], "gpt": [1, 29, 30, 32, 33], "j": [1, 29, 30, 32, 33], 
"neox": 1, "opt": [1, 6, 24, 29, 32, 33], "falcon": 1, "now": [1, 5, 8, 19, 23], "float": [1, 4, 5, 8, 9, 11, 15, 36], "when": [1, 3, 5, 6, 7, 8, 10, 11, 12, 13, 14, 16, 18, 19, 20, 22, 27, 29, 30, 33, 34, 35, 36, 38], "mix": [1, 4, 8, 33], "str": [1, 6, 20], "specifi": [1, 8, 9, 10, 18, 35, 36], "either": [1, 5, 6], "object": [1, 4, 27, 33], "defin": [1, 5, 7, 8, 9, 11, 15, 16, 17, 18, 19, 20, 31, 33, 36], "recip": [1, 15], "quant": 1, "static": [1, 5, 17, 29, 30, 36], "onc": [1, 3, 5, 13, 18, 19, 20, 30, 35, 36], "quantizat": 1, "config": [1, 4, 17], "json": [1, 20], "file": [1, 3, 4, 8, 10, 11, 18, 19, 20, 27, 33, 35], "under": [1, 7, 11, 14, 19, 27, 28, 33], "need": [1, 3, 4, 5, 6, 8, 9, 16, 17, 19, 20, 22, 23, 24, 31, 33, 34, 35, 36, 38], "calibr": [1, 4, 17, 31], "dict": 1, "int4": [1, 30, 33], "": [1, 2, 6, 8, 9, 11, 18, 19, 20, 27, 30, 38], "should": [1, 3, 8, 9, 11, 18, 20, 22, 27, 29, 30], "state_dict": [1, 4, 9, 16], "checkpoint": [1, 4, 16, 27], "pt": [1, 4, 9, 30], "gptq": [1, 30], "etc": [1, 3, 5, 14, 18], "where": [1, 3, 5, 6, 7, 9], "kei": [1, 5, 6, 29, 33, 34], "group": [1, 6, 8, 9], "chang": [1, 3, 4, 6, 8, 9, 11, 16, 19, 24, 27, 30, 31, 33], "make": [1, 3, 4, 5, 6, 8, 9, 16, 20, 23, 24, 29, 30, 33, 35], "n": [1, 4, 6, 8, 9, 19], "thei": [1, 8, 11, 19, 29, 30, 33], "uint4": 1, "compress": [1, 15, 30], "along": [1, 3, 22], "store": [1, 14, 19, 29, 38], "int32": [1, 33], "zero": [1, 5, 9, 10, 20, 27, 30, 35], "point": [1, 5, 7, 8, 11, 15, 30], "scale": [1, 2, 5, 9, 15, 16, 29, 30, 33], "bia": [1, 8, 11, 14, 19, 30, 33], "state": [1, 5, 8, 9, 16, 29, 37], "channel": [1, 2, 10, 17, 33, 34], "automaticlli": 1, "deploy": [1, 4, 30], "torchscirpt": 1, "workabl": 1, "forward": [1, 4, 6, 8, 9, 11, 13, 19, 36], "after": [1, 3, 4, 8, 17, 20, 22, 24, 25, 27, 30, 38], "deepspe": [1, 29, 32, 33], "parallel": [1, 6, 27, 29, 33], "get_fp32_math_mod": 1, "fpmath_mod": 1, "fpmath": 1, "fp32mathmod": 1, "fp32": [1, 10, 13, 14, 17, 24, 33, 38], "bf32": [1, 10], 
"tf32": [1, 10], "disabl": [1, 9, 10, 18, 36], "implicit": 1, "set_fp32_math_mod": 1, "current_devic": 1, "int": [1, 4, 6, 8, 9], "index": [1, 3, 6, 7, 8, 9, 19, 22, 29, 33], "current_stream": [1, 8], "class": [1, 6, 9, 11, 19], "ani": [1, 3, 11, 18, 19, 20, 27, 33, 34], "context": [1, 3, 5, 7, 8, 10, 11, 22, 29], "wrapper": [1, 6, 8, 22], "encapsul": [1, 8], "op": [1, 3, 9, 10, 13, 18, 22, 23, 29, 30], "argument": [1, 6, 9, 13, 20, 30], "neg": 1, "integ": [1, 30], "device_count": [1, 9, 20], "device_of": 1, "obj": 1, "storag": [1, 38], "alloc": [1, 7, 16, 18, 29, 34, 37], "get_device_nam": 1, "name": [1, 8, 13, 20, 22, 30], "get_device_properti": 1, "properti": [1, 4, 8], "_deviceproperti": 1, "init": [1, 3, 16], "initi": [1, 6, 9, 16], "lazi": 1, "until": [1, 3, 22], "first": [1, 2, 3, 4, 6, 8, 12, 16, 17, 20, 30, 33], "access": [1, 7, 8, 19, 27, 29, 33, 38], "veri": [1, 3, 8, 19, 20, 21, 27, 29], "rare": 1, "sinc": [1, 7, 8, 19, 30, 34, 35], "could": [1, 6, 10, 17, 18, 19, 20, 30, 31, 36], "demand": [1, 5], "doe": [1, 7, 8, 18, 19, 23, 27, 33, 36], "repeatedli": [1, 3], "is_avail": 1, "indic": [1, 19], "is_initi": 1, "set_devic": [1, 6, 9, 16], "discourag": 1, "favor": 1, "most": [1, 5, 10, 13, 17, 27, 29, 33], "ze_affinity_mask": 1, "environment": 1, "variabl": [1, 3, 5, 10, 16, 18, 24, 27], "restrict": 1, "visibl": [1, 33], "streamcontext": 1, "around": [1, 8], "synchron": [1, 7, 8, 10, 16, 20], "wait": [1, 7, 20, 30], "complet": [1, 3, 4, 18, 19, 20, 31, 37], "fp8": [1, 5, 33], "fp8_autocast": [1, 15], "fp8_recip": [1, 15], "delayedsc": [1, 15], "get_rng_stat": 1, "bytetensor": 1, "rng": 1, "eagerli": 1, "get_rng_state_al": 1, "list": [1, 3, 4, 10, 11, 19, 20, 26, 29, 31, 33, 35], "repres": [1, 3, 6, 18, 33], "set_rng_stat": 1, "new_stat": 1, "desir": [1, 4, 17], "set_rng_state_al": 1, "iter": [1, 6, 20, 29], "manual_se": [1, 6, 9], "seed": [1, 6, 9], "safe": [1, 7], "silent": 1, "ignor": [1, 36], "multi": [1, 6, 35], "insuffici": 1, "determin": [1, 30], 
"manual_seed_al": 1, "seed_al": 1, "initial_se": 1, "prioriti": [1, 13], "kwarg": [1, 20], "record_ev": 1, "record": [1, 9, 18, 20], "new": [1, 2, 3, 15, 19, 30, 31, 33, 34], "sycl_queu": [1, 4, 8], "pycapsul": [1, 8], "queue": [1, 4, 5, 10, 18], "correspond": [1, 6, 27, 30, 33], "void": [1, 8], "pointer": [1, 8, 27], "address": [1, 6, 19], "Its": [1, 34, 36], "capsul": 1, "self": [1, 6, 9, 11, 19, 20], "wait_ev": 1, "futur": [1, 3, 12], "wait_stream": 1, "anoth": [1, 30], "without": [1, 5, 7, 11, 20, 22, 27, 30, 33, 34], "enqueu": 1, "affect": 1, "elapsed_tim": [1, 9], "end_ev": 1, "elaps": [1, 9], "millisecond": 1, "wa": [1, 4, 7, 8, 22, 27, 33], "queri": [1, 19], "check": [1, 4, 5, 6, 8, 13, 19, 20, 24, 29, 31, 34, 36], "captur": [1, 37], "A": [1, 3, 4, 5, 17, 19, 27, 30, 33, 35], "boolean": 1, "prevent": [1, 16, 38], "proceed": 1, "empty_cach": [1, 37], "unoccupi": 1, "held": 1, "those": [1, 8, 16, 20, 29, 37], "sysman": 1, "toolkit": [1, 6, 24, 32, 33], "amount": [1, 37], "howev": [1, 3, 5, 8, 10, 11, 12, 19, 29, 30, 33, 37], "help": [1, 3, 4, 8, 9, 13, 29, 34, 35, 37], "fragment": 1, "memory_stat": [1, 37], "dictionari": 1, "statist": [1, 4, 17, 31], "non": [1, 3, 11, 19, 30], "core": [1, 16, 23, 27, 32, 33], "large_pool": 1, "small_pool": 1, "peak": 1, "freed": [1, 37], "receiv": [1, 27, 33], "allocated_byt": 1, "segment": [1, 5, 20, 33], "reserv": [1, 18, 34], "xpumalloc": 1, "reserved_byt": 1, "activ": [1, 4, 15, 16, 17, 20, 24, 27, 29, 30, 31], "active_byt": 1, "inactive_split": 1, "inact": 1, "inactive_split_byt": 1, "broken": 1, "down": [1, 27], "pool": [1, 34], "across": [1, 5, 6, 9], "octob": 1, "2019": [1, 2], "1mb": 1, "small": [1, 17], "metric": 1, "maximum": 1, "histor": 1, "total": [1, 8, 20, 32, 33, 37], "decreas": [1, 27], "simpl": [1, 8, 10, 11, 19, 23, 24, 30, 33], "counter": 1, "num_alloc_retri": 1, "fail": [1, 27, 33, 36], "flush": 1, "retri": 1, "num_oom": 1, "out": [1, 4, 5, 7, 8, 11, 20, 22, 27, 30, 38], "error": [1, 3, 4, 8, 9, 18, 19, 
27, 30, 33], "thrown": [1, 27], "memory_summari": 1, "abbrevi": 1, "human": 1, "readabl": [1, 8], "printout": 1, "displai": 1, "period": 1, "dure": [1, 3, 4, 27, 29, 30, 35, 36], "handl": [1, 4, 7, 19, 33], "except": [1, 6, 29], "summari": 1, "memory_snapshot": [1, 37], "snapshot": [1, 37], "interpret": [1, 8], "output": [1, 4, 9, 10, 11, 13, 15, 16, 18, 19, 20, 22, 23, 30], "familiar": [1, 8], "intern": [1, 10, 19], "memory_alloc": [1, 37], "occupi": [1, 37], "byte": 1, "less": [1, 5, 10, 11, 27, 33], "unus": [1, 37], "creat": [1, 3, 4, 7, 8, 10, 14, 17, 20, 23, 31, 33, 36], "max_memory_alloc": [1, 37], "By": [1, 5, 36], "begin": [1, 3, 8, 22], "reset_peak_stat": 1, "reset": [1, 14], "two": [1, 5, 7, 8, 15, 18, 29, 34], "measur": 1, "loop": [1, 3, 10, 20], "memory_reserv": [1, 37], "max_memory_reserv": [1, 37], "reset_peak_memory_stat": 1, "stat": 1, "individu": [1, 3], "memory_stats_as_nested_dict": 1, "nest": [1, 22], "reset_accumulated_memory_stat": 1, "accumul": 1, "enum": 1, "fp32_math_mod": 1, "dpccp": 1, "packet": 1, "enumer": [1, 4, 7, 9, 16], "math": [1, 5, 8, 10, 13], "fp32_math_mode_min": 1, "fp32_math_mode_max": 1, "comput": [1, 4, 15, 16, 19, 23, 29, 30, 33, 34, 35, 36], "primit": [1, 10], "attribut": [1, 19], "descript": [1, 5, 9, 10, 18, 19, 26, 36], "definit": [1, 8], "numer": [1, 11], "behavior": [1, 13, 22], "get_queue_from_stream": [1, 4, 8], "c10": [1, 4], "dpcpp": [1, 4, 8, 27, 33], "dec": 2, "2023": [2, 30, 32, 33], "softwar": [2, 28], "jul": 2, "deep": [2, 6, 7, 9, 11, 13, 15, 16], "learn": [2, 6, 7, 9, 11, 15, 16], "boost": [2, 4, 12, 32, 33], "dl": 2, "hug": 2, "face": 2, "bert": [2, 15], "googl": [2, 3], "cloud": 2, "platform": [2, 4, 7, 19, 23, 27, 30, 33, 35, 36], "gcp": 2, "technologi": 2, "guid": [2, 6], "apr": 2, "mar": 2, "x86": 2, "sapphir": 2, "rapid": 2, "part": [2, 4, 8, 11, 18, 19, 20, 34, 36], "jan": 2, "secur": 2, "torchserv": 2, "confer": 2, "2022": [2, 30, 33], "what": [2, 11, 20, 22, 33, 34, 35], "pyg": 2, "stabl": [2, 5, 
6, 7, 11, 27], "diffus": 2, "arc": [2, 27, 30, 33, 35], "nov": [2, 33], "13": [2, 22], "potenti": [2, 23, 36], "fine": [2, 8, 30], "fx": 2, "sep": 2, "empow": [2, 5, 23], "xeon": [2, 32], "scalabl": 2, "processor": [2, 38], "aug": 2, "vision": [2, 4], "last": [2, 8, 10, 33, 34], "One": [2, 7, 15, 19, 38], "click": 2, "compressor": 2, "throughput": [2, 29, 33], "4x": [2, 32], "jun": 2, "grokk": 2, "principl": [2, 16, 19], "kt": 2, "person": 2, "text": [2, 35], "speech": 2, "2021": [2, 6, 33], "tune": [2, 11, 30], "up": [2, 5, 7, 8, 9, 29, 33, 34], "modern": 2, "naver": 2, "low": [2, 5, 6, 8, 24, 33, 38], "latenc": [2, 29, 33], "machin": [2, 3, 6, 30], "feb": [2, 33], "dlrm": 2, "oneccl": [2, 5, 9, 33], "mention": [2, 6, 8, 18], "deprec": [2, 12], "facebook": [2, 29], "3rd": 2, "gen": 2, "capabl": [2, 5, 17, 18, 23, 30, 33, 37], "2020": [2, 7], "collabor": 2, "caff": 2, "2017": 2, "thank": 3, "interest": 3, "intent": 3, "want": [3, 8, 10, 19, 20, 24], "propos": [3, 19, 29, 30], "post": [3, 5, 17, 23, 29, 30], "intend": 3, "shall": [3, 19], "discuss": [3, 8, 19], "design": [3, 9, 11, 13, 14, 19, 30, 31, 33, 34, 36], "agre": 3, "plan": 3, "look": [3, 4, 8, 19, 30], "ahead": [3, 8, 20, 22], "outstand": 3, "pick": 3, "comment": 3, "d": [3, 4, 8, 11, 36], "particular": [3, 11, 27, 30, 31, 33], "ask": 3, "pull": 3, "http": [3, 6, 7, 8, 9, 27], "com": [3, 6, 9, 27], "full": [3, 8, 23], "instal": [3, 4, 9, 10, 20, 23, 24, 26, 27, 29, 33, 35], "here": [3, 6, 7, 8, 9, 11, 18, 19, 20, 22], "uninstal": 3, "pip": [3, 6, 16, 23, 30], "ll": [3, 8, 20, 22], "know": [3, 7, 8, 34, 35], "fulli": [3, 17, 31, 33], "warn": [3, 18], "skip": [3, 4, 19, 22, 30, 34, 35], "few": [3, 12, 19], "alwai": [3, 11, 13, 19, 27, 33], "timeout": 3, "ye": 3, "clone": [3, 22, 38], "copi": [3, 5, 7, 19], "git": 3, "b": [3, 11, 22], "cd": [3, 4], "rebas": 3, "submodul": 3, "sync": 3, "recurs": 3, "job": [3, 27], "setup": [3, 8, 9, 16, 20, 29], "py": [3, 6, 8, 9, 10, 20, 27], "symlink": 3, "tree": 3, 
"reinstal": [3, 23], "again": [3, 23, 38], "__init__": [3, 6, 9, 11, 19], "would": [3, 4, 8, 10, 13, 17, 18, 19, 27, 33], "interfac": [3, 4, 8, 19, 29, 30], "pyi": 3, "cpp": [3, 4, 8], "h": [3, 4, 8, 18, 19, 30], "sure": [3, 6, 9, 16, 20], "Then": [3, 6, 17, 23, 27, 30, 33], "clean": [3, 27, 33], "our": [3, 4, 8, 13, 17, 29, 30], "3": [3, 4, 6, 8, 9, 11, 13, 18, 19, 20, 22, 23, 27, 30, 32, 33, 36], "6": [3, 4, 22, 32], "binari": [3, 4, 11, 19, 34, 35], "folder": 3, "mani": [3, 5, 8, 20], "wai": [3, 8, 19, 29, 30, 38], "next": [3, 8, 13], "re": [3, 6, 8, 11], "rm": [3, 14], "rf": 3, "toplevel": 3, "over": [3, 5, 8, 11, 12, 19, 33], "made": [3, 7, 33], "edit": 3, "repo": [3, 6, 23], "commit": [3, 23, 32], "keep": [3, 19, 20], "command": [3, 4, 6, 8, 16, 23, 27, 33], "realli": [3, 8], "untrack": 3, "deinit": 3, "f": [3, 4, 9, 16], "xdf": 3, "within": [3, 18, 30, 31, 33, 34, 36], "experi": [3, 13, 19, 29, 30], "environ": [3, 5, 6, 9, 10, 16, 18, 24, 27, 29], "env_key1": 3, "env_val1": 3, "env_key2": 3, "env_val2": 3, "suit": 3, "locat": 3, "test_": 3, "sub_fold": 3, "filenam": 3, "contain": [3, 6, 7, 10, 13, 30], "wish": [3, 8, 19, 20], "experiment": 3, "port": [3, 6], "stock": [3, 7, 19, 33], "10": [3, 7, 9, 18, 19, 20, 22, 23], "regress": [3, 12], "don": [3, 8, 11, 20, 23], "offici": [3, 17, 20, 33], "via": [3, 5, 7, 8, 10, 17, 20, 23, 30, 35, 37], "read": [3, 8, 30, 38], "readm": 3, "md": [3, 19], "docstr": 3, "length": [3, 32], "line": [3, 8, 19, 20, 22], "insid": [3, 8, 18, 23, 34, 36], "must": [3, 8, 20, 35, 36, 38], "limit": [3, 11, 19, 29, 30, 33], "80": 3, "charact": 3, "fit": [3, 30, 34], "jupyt": 3, "popup": 3, "abov": [3, 6, 7, 9, 10, 18, 19, 20, 22, 29, 38], "prerequisit": [3, 4], "r": [3, 32], "txt": [3, 4, 8], "html": [3, 7, 8], "_build": 3, "rst": 3, "live": 3, "tutori": [3, 4, 6, 8, 9, 20], "autofunct": 3, "autoclass": 3, "direct": 3, "shorten": 3, "sphinx": 3, "produc": [3, 7, 8, 11, 37], "miss": 3, "still": [3, 5, 8, 11, 19, 20, 33, 35], "torchvis": 
[4, 9], "demonstr": [4, 7, 13, 19, 30], "box": 4, "benefit": [4, 5, 11, 17], "against": 4, "precis": [4, 6, 15, 24, 30, 32, 33], "amp": [4, 23, 24, 33], "criterion": [4, 11], "below": [4, 6, 8, 11, 13, 14, 18, 19, 20, 22, 24, 30, 33, 35, 38], "move": [4, 17, 19, 24], "dataload": [4, 6, 9, 16, 20], "target": [4, 8, 9, 16, 33, 34, 35], "zero_grad": [4, 9, 16, 23], "autocast": [4, 23, 24], "backward": [4, 6, 8, 9, 11, 16, 23, 36], "lr": [4, 6, 9, 11, 23, 38], "001": [4, 6, 11], "download": [4, 9, 23, 27], "dataset": [4, 6, 9, 16, 30], "cifar10": 4, "compos": [4, 9], "resiz": 4, "224": [4, 11, 23], "totensor": [4, 9], "5": [4, 6, 9, 13, 17, 18, 19, 22, 30, 31, 32], "train_dataset": [4, 6, 16], "root": [4, 6, 10, 27, 29, 33], "train_load": [4, 6, 9, 11, 16], "batch_siz": [4, 6, 8, 9, 16, 19], "128": [4, 9, 11], "crossentropyloss": 4, "momentum": [4, 23, 38], "9": [4, 22, 33], "batch_idx": [4, 9, 16], "print": [4, 5, 6, 9, 16, 17, 19, 20, 22, 31], "model_state_dict": 4, "optimizer_state_dict": 4, "pth": 4, "finish": [4, 13, 18, 27], "nlp": 4, "resnet50_weight": 4, "rand": [4, 11, 19, 23], "no_grad": [4, 9, 23, 24], "bertmodel": 4, "from_pretrain": [4, 30], "uncas": 4, "vocab_s": 4, "seq_length": 4, "randint": 4, "freez": [4, 11, 23, 24], "strict": 4, "becaus": [4, 11, 19, 29, 35], "insert": [4, 17, 31], "observ": [4, 12, 17, 31], "prepare_jit": [4, 17, 31], "convert_jit": [4, 17, 31], "separ": [4, 8, 10, 18, 27, 28], "process": [4, 5, 6, 8, 9, 13, 15, 16, 22, 27, 29, 30, 36], "collect": [4, 5, 6, 9, 17, 20, 33], "o": [4, 6, 9, 27, 32, 33], "_recurs": 4, "wrap_cpp_modul": 4, "quantize_jit": [4, 17, 31], "modeljit": [4, 17, 31], "qconfig": [4, 17, 31], "minmaxobserv": [4, 17, 31], "with_arg": [4, 17, 31], "qscheme": [4, 17, 31], "per_tensor_symmetr": [4, 17, 31], "reduce_rang": [4, 17, 31], "quint8": 4, "default_weight_observ": [4, 17, 31], "calibration_data_load": 4, "batch": [4, 8, 9, 16, 19, 33, 34, 36], "len": [4, 9, 16, 20], "memory_format": [4, 5, 19], 
"channels_last": [4, 5, 19], "libtorch": [4, 33], "its": [4, 6, 7, 10, 11, 13, 14, 20, 24, 30], "own": [4, 8], "servic": 4, "regular": 4, "unlik": [4, 5, 9, 29, 30], "cmake": [4, 5, 33], "cppsdk": 4, "ensur": [4, 16], "page": [4, 6, 9, 20, 23, 25, 27, 31, 32, 36], "version": [4, 7, 8, 23, 27, 28, 33, 36, 38], "app": 4, "iostream": 4, "memori": [4, 5, 7, 8, 9, 11, 12, 15, 18, 20, 27, 29, 30, 32, 33, 36, 38], "argc": 4, "const": [4, 8], "char": 4, "argv": 4, "catch": [4, 17, 18], "std": [4, 8], "cerr": 4, "kxpu": 4, "ivalu": 4, "push_back": 4, "cout": 4, "slice": [4, 8, 19], "dim": [4, 7, 8, 9, 13, 19], "end": [4, 18, 20, 22, 27, 30, 34, 35, 36], "endl": 4, "cmakelist": [4, 8], "cmake_minimum_requir": [4, 8], "fatal_error": [4, 8], "find_packag": [4, 8], "add_execut": 4, "target_link_librari": [4, 8], "torch_ipex_librari": [4, 8], "set_properti": [4, 8], "cxx_standard": [4, 8], "17": [4, 8, 22, 32], "mkdir": 4, "build": [4, 5, 16, 22, 23, 27, 33, 35], "cc": 4, "icx": [4, 8], "cxx": 4, "icpx": [4, 8], "dcmake_prefix_path": [4, 8], "libpytorch_path": 4, "libpytorch": 4, "_": [4, 6, 7, 8, 14, 17, 18, 19, 22, 27, 30, 33, 36], "absolut": 4, "verifi": [4, 23, 27, 29], "linux": [4, 8, 27, 33], "ldd": 4, "y": [4, 11, 30], "z": [4, 8], "log": [4, 5, 9, 10, 22, 33], "depend": [4, 19, 33], "choos": [4, 5, 11, 13, 20, 33], "workspac": [4, 14], "identif": 4, "intelllvm": 4, "202x": 4, "abi": [4, 33], "info": [4, 17, 18], "done": [4, 9], "oneapi": [4, 5, 6, 7, 9, 13, 16, 24, 27, 32, 33, 35], "bin": [4, 27, 32, 33], "pthread": 4, "test": [4, 9, 22, 23, 32, 33, 34, 35], "cmake_have_libc_pthread": 4, "success": [4, 25], "lib": [4, 10, 18, 27, 33], "libintel": 4, "ext": 4, "written": [4, 33, 36], "0x00007fd5bb927000": 4, "libc10": 4, "0x00007fd5bb895000": 4, "libtorch_cpu": 4, "0x00007fd5a44d8000": 4, "0x00007fd5a1a1b000": 4, "0x00007fd5862b0000": 4, "libmkl_intel_lp64": [4, 27, 33], "mkl": [4, 24, 27, 33], "intel64": [4, 27, 33], "0x00007fd584ab0000": 4, "libmkl_cor": [4, 27, 33], 
"0x00007fd5806cc000": 4, "libmkl_gnu_thread": [4, 27], "0x00007fd57eb1d000": 4, "libmkl_sycl": [4, 27, 33], "0x00007fd55512c000": 4, "libopencl": 4, "0x00007fd55511d000": 4, "libsvml": 4, "intel64_lin": 4, "0x00007fd553b11000": 4, "libirng": 4, "0x00007fd553600000": 4, "libimf": 4, "0x00007fd55321b000": 4, "libintlc": 4, "0x00007fd553a9c000": 4, "libsycl": 4, "0x00007fd552f36000": 4, "show": [4, 6, 7, 8, 11, 20, 22, 29, 30, 31, 32, 33], "fsycl": [4, 8, 35], "cmake_cxx_flag": 4, "usm": [4, 7], "cl": 4, "hpp": 4, "namespac": [4, 11], "fetch": 4, "stream": [4, 10], "device_typ": [4, 8], "devicetyp": [4, 8], "impl": [4, 8, 33], "virtualguardimpl": [4, 8], "xpu_stream": 4, "getstream": [4, 8], "input_ptr": 4, "malloc_devic": 4, "fromusm": 4, "scalartyp": 4, "nullopt": 4, "output_tensor": 4, "append": 4, "allow": [4, 11, 27, 30, 33, 34, 35], "former": [4, 8], "zoo": 4, "benchmark": [4, 32, 37], "mark": [4, 18, 20, 30], "document": [4, 5, 8, 10, 31, 33], "column": [4, 8, 20], "simpli": [4, 8], "guidanc": 5, "nchw": 5, "nhwc": [5, 33], "anymor": 5, "center": [5, 14, 23, 27, 30, 33, 35], "flex": [5, 27, 33, 35], "seri": [5, 14, 23, 27, 30, 33, 35], "typic": [5, 7, 16, 20, 30, 33], "speed": [5, 8, 29, 33, 34, 38], "side": [5, 7], "imper": 5, "illustr": [5, 6, 9, 17, 19], "workflow": [5, 17], "meet": [5, 15, 20, 36], "commun": [5, 6, 7, 9, 33], "bind": [5, 8, 9, 33], "formerli": [5, 6, 9], "known": [5, 6, 9, 29], "torch_ccl": [5, 6], "horovod": [5, 27, 33], "among": [5, 7, 16], "framework": [5, 7, 10, 16], "interopar": 5, "particularli": [5, 7], "describ": [5, 6, 11, 19, 27, 30], "write": [5, 20], "practic": [5, 8, 29], "setuptool": 5, "suffici": [5, 10, 30], "driver": [5, 27, 32, 35], "ze_flat_device_hierarchi": [5, 10], "hierarchi": 5, "expos": [5, 11], "tile": [5, 6, 10, 29, 32], "industri": [5, 9, 33], "grade": [5, 9, 33], "worker": [5, 6, 9, 16], "maintain": [5, 6, 8, 9, 11], "replica": [5, 6, 9], "gradient": [5, 9, 15, 16, 30], "rank": [5, 6, 9, 16], "footprint": [5, 9, 
15, 29, 33], "feasibl": [5, 9], "seamlessli": [5, 23], "har": [5, 23], "flagship": [5, 23], "inductor": [5, 23, 33], "torchinductor": [5, 23], "built": [5, 8, 22, 27, 33, 35], "let": [5, 8, 19, 22, 38], "stack": [5, 11, 18, 22], "piec": [5, 22], "verbos": [5, 8, 10, 18, 22], "messag": [5, 8, 10, 18, 19, 22, 27, 33], "indent": [5, 20, 22], "distinguish": [5, 22], "field": [5, 20, 22], "statement": [5, 20], "capac": [5, 13, 32], "continu": [5, 13, 18, 22, 27, 33], "macro": [5, 18], "torch_check": [5, 8, 18], "torch_error": [5, 18], "replic": 6, "everi": [6, 22, 29], "fed": 6, "c10d": [6, 9], "ccl": [6, 9, 16, 32], "processgroup": [6, 9], "hold": [6, 9, 19], "allgath": [6, 9, 16, 33], "alltoal": [6, 16], "successfulli": [6, 20], "apt": 6, "yum": 6, "dnf": 6, "sudo": 6, "devel": 6, "12": [6, 22, 33], "309": 6, "oneccl_bindings_for_pytorch": [6, 9], "repo_url": 6, "whl": [6, 23, 27], "u": [6, 8], "repositori": 6, "holder": [6, 18], "url": 6, "m": [6, 8, 9, 16, 23], "oneccl_bind_pt": 6, "basekit": [6, 16], "oneapi_root": 6, "env": [6, 16, 24], "var": [6, 16, 24], "sh": [6, 16, 24, 30], "manag": [6, 8, 11, 29], "modif": [6, 9, 16, 17], "necessari": [6, 9, 16, 19, 20, 22], "dist": [6, 9, 11, 27], "init_process_group": [6, 9], "exclus": [6, 9, 10], "id": [6, 7, 9, 22, 29], "local": [6, 9, 16], "arg": [6, 9, 16, 18, 20, 38], "local_rank": [6, 9, 16], "wrap": [6, 9, 16, 36], "device_id": [6, 7, 9, 20], "exactli": [6, 8], "resid": 6, "seed_numb": 6, "same": [6, 8, 9, 18, 19, 27, 29, 33], "launcher": 6, "cwd": 6, "setvar": 6, "Or": 6, "example_ddp": 6, "def": [6, 8, 9, 11, 19, 20, 27, 33], "super": [6, 9, 11, 19], "4": [6, 8, 18, 19, 22, 30, 33], "__name__": [6, 9], "__main__": [6, 9], "123": 6, "mpi_world_s": 6, "pmi_siz": 6, "mpi_rank": 6, "pmi_rank": 6, "world_siz": [6, 9], "els": [6, 19, 20, 38], "world": 6, "master_addr": [6, 9], "127": 6, "master_port": [6, 9], "29500": 6, "global": [6, 20], "get_rank": 6, "get_world_s": 6, "loss_fn": 6, "mseloss": 6, "rune": 6, "randn": 
[6, 13, 19, 20, 22], "label": [6, 11, 15], "l": 6, "mpirun": 6, "card": [6, 19, 27, 29, 33], "regard": [6, 19], "explicit": [6, 22], "minor": [6, 33], "single_card": 6, "single_card_dist": 6, "importerror": [6, 27, 33], "rais": 6, "spawn": [6, 9], "multiprocess": [6, 9], "multi_process_spawn": 6, "main_work": 6, "put": [6, 7, 9, 20], "train_sampl": [6, 16], "epoch": [6, 9, 16], "set_epoch": [6, 9], "adjust": [6, 30], "warp": 6, "sampler": [6, 9, 16], "loader": 6, "shuffl": [6, 9], "num_work": [6, 9], "pin_memori": [6, 9], "wide": [7, 30], "adopt": [7, 29, 30, 33], "numpi": 7, "domain": [7, 15], "interoper": 7, "v0": [7, 33], "7": [7, 9, 22], "relat": [7, 9, 17, 18, 20], "extern": 7, "from_dlpack": 7, "t2": 7, "empti": [7, 18, 19, 22], "capsule2": 7, "to_dlpack": 7, "dlmanagedtensor": 7, "stride": [7, 11], "pars": [7, 9], "extract": 7, "data_ptr": 7, "respons": [7, 17, 22, 29], "atendlmtensor": 7, "ndim": 7, "dmlc": 7, "io": 7, "spec": 7, "dldevicetyp": 7, "kdloneapi": 7, "between": [7, 11, 22, 29, 30], "kdlsycl": 7, "reli": [7, 19], "filter": 7, "selector": 7, "actual": [7, 8, 19, 27, 33], "parent": 7, "get_devic": 7, "consum": [7, 20], "valid": [7, 9, 10, 14], "three": [7, 29], "host": [7, 20, 30, 32], "far": [7, 23], "recogn": 7, "situat": 7, "probabl": [7, 9, 27], "hard": [7, 19], "variou": [7, 20, 23, 29, 30, 33], "monitor": [7, 37], "flow": 7, "readi": 7, "highli": [8, 13, 17, 24, 29, 30, 33], "org": 8, "walk": 8, "come": 8, "flavor": 8, "aot": [8, 10], "cpp_extens": 8, "approach": [8, 27, 29], "latter": 8, "afterward": 8, "besid": [8, 20, 29, 30, 33], "long": [8, 19, 29], "term": [8, 28], "lltm": 8, "dpcppextens": 8, "dpcppbuildextens": 8, "ext_modul": 8, "lltm_xpu": 8, "lltm_xpu_kernel": 8, "cmdclass": 8, "build_ext": 8, "conveni": [8, 11], "correct": [8, 9, 19], "equival": [8, 30, 33, 38], "vanilla": 8, "include_dir": 8, "include_path": 8, "And": [8, 33], "goe": 8, "plug": 8, "previous": 8, "were": 8, "elabor": 8, "fly": 8, "background": 8, "temporari": 8, 
"tmp": [8, 20], "torch_extens": 8, "ver": 8, "_xpu": 8, "emit": 8, "ninja": 8, "fact": [8, 19], "home": [8, 16, 27], "user_nam": 8, "ones": [8, 17, 30], "complic": [8, 22], "power": [8, 15], "system": [8, 27, 33], "increment": 8, "reload": 8, "second": [8, 16, 18, 20, 27], "18": [8, 22, 33], "compon": [8, 10, 28, 29], "set_source_files_properti": 8, "compile_flag": 8, "add_librari": 8, "torch_librari": 8, "target_include_directori": 8, "public": [8, 33], "python_include_dir": 8, "torch_ipex_include_dir": 8, "prefix": 8, "cmake_prefix_path": 8, "dcmake_c_compil": 8, "dcmake_cxx_compil": 8, "aval": 8, "c10_stream": 8, "associ": [8, 34], "subsequ": [8, 19], "yourself": 8, "strategi": 8, "pybind11": 8, "ultim": 8, "care": [8, 22], "consid": 8, "cuda": [8, 9, 20, 33], "declar": 8, "lltm_xpu_forward": 8, "old_h": 8, "old_cel": 8, "lltm_xpu_backward": 8, "grad_h": 8, "grad_cel": 8, "new_cel": 8, "input_g": 8, "output_g": 8, "candidate_cel": 8, "gate_weight": 8, "check_xpu": 8, "is_xpu": 8, "check_contigu": 8, "is_contigu": [8, 19], "contigu": [8, 19, 29, 33], "check_input": 8, "lltm_forward": 8, "lltm_backward": 8, "pybind11_modul": 8, "torch_extension_nam": 8, "bridg": 8, "natur": [8, 19, 29, 30], "templat": [8, 13, 18, 29], "typenam": 8, "scalar_t": 8, "sigmoid": [8, 33], "0f": 8, "exp": [8, 33], "At": [8, 29], "header": 8, "essenti": 8, "helper": 8, "d_sigmoid": 8, "d_tanh": 8, "tanh": [8, 33], "elu": [8, 33], "alpha": [8, 38], "fmax": 8, "fmin": 8, "d_elu": 8, "d_relu": 8, "hand": 8, "cat": [8, 11, 13], "gate": 8, "addmm": [8, 11, 33], "transpos": [8, 33], "state_s": 8, "new_h": 8, "zeros_lik": 8, "at_dispatch_floating_typ": 8, "lltm_forward_xpu": 8, "lltm_xpu_forward_kernel": 8, "purpos": 8, "lambda": 8, "As": [8, 17, 27, 30, 38], "instanti": 8, "retriev": 8, "doubl": 8, "at_dispatch_all_typ": 8, "size_t": 8, "1024": [8, 20, 32], "work_group": 8, "cgf": 8, "handler": [8, 15, 20], "cgh": 8, "kfn": 8, "nd_item": 8, "item": [8, 9, 16], "get_group": 8, "get_group_rang": 
8, "get_local_id": 8, "gates_row": 8, "parallel_for": 8, "nd_rang": 8, "entir": [8, 29], "grid": 8, "fill": 8, "matric": [8, 30], "2048": 8, "launch": [8, 10, 18, 29, 33], "8": [8, 15, 22], "introductori": 8, "underlai": 8, "right": [8, 24, 29], "inde": [8, 27], "high": [8, 29, 30, 33, 38], "agnost": 8, "ineffici": 8, "eas": [8, 19], "dimension": 8, "abstract": 8, "much": [8, 19, 38], "pattern": [8, 17, 19, 31, 33, 37], "packedtensoraccessor32": 8, "lltm_xpu_backward_kernel": 8, "d_old_cel": 8, "d_gate": 8, "d_gates_": 8, "d_old_cell_": 8, "d_output_g": 8, "d_tanh_new_cel": 8, "d_new_cel": 8, "d_candidate_cel": 8, "d_input_g": 8, "lltm_backward_xpu": 8, "packed_accessor32": 8, "d_gate_weight": 8, "reshap": 8, "d_weight": 8, "mm": [8, 11], "d_bia": 8, "sum": [8, 9, 19, 33], "keepdim": [8, 9], "d_x": 8, "d_old_h": 8, "d_input": 8, "similar": [9, 20, 27, 33], "reducescatt": [9, 33], "align": [9, 18, 20, 33], "convent": 9, "fullyshardeddataparallel": 9, "singl": [9, 16, 18, 29, 32, 38], "trigger": [9, 17, 27, 31, 33, 35], "throw": 9, "switch": [9, 20], "argpars": 9, "functool": 9, "lr_schedul": 9, "steplr": 9, "mp": 9, "distributeddataparallel": [9, 33], "distributedsampl": [9, 16], "fully_sharded_data_parallel": 9, "cpuoffload": 9, "backwardprefetch": 9, "size_based_auto_wrap_polici": 9, "enable_wrap": 9, "localhost": 9, "12355": 9, "cleanup": [9, 27], "destroy_process_group": [9, 27], "toi": 9, "handwritten": 9, "digit": 9, "classif": 9, "net": 9, "conv1": 9, "32": [9, 19], "conv2": 9, "64": [9, 11, 23, 30], "dropout1": 9, "25": [9, 32], "dropout2": 9, "fc1": 9, "9216": 9, "fc2": 9, "relu": [9, 19, 33], "max_pool2d": 9, "flatten": 9, "log_softmax": [9, 11], "logic": [9, 19, 22, 27], "ddp_loss": 9, "nll_loss": [9, 11, 16], "reduct": 9, "all_reduc": 9, "reduceop": 9, "tloss": [9, 16], "6f": 9, "test_load": 9, "pred": 9, "argmax": 9, "max": [9, 14, 23, 27, 30, 32, 33, 35], "eq": [9, 33], "view_a": 9, "test_loss": 9, "averag": [9, 16, 20], "4f": 9, "2f": 9, "100": [9, 
16, 20, 22, 32, 33], "fsdp_main": 9, "1307": 9, "3081": 9, "dataset1": 9, "mnist": 9, "dataset2": 9, "sampler1": 9, "num_replica": [9, 16], "sampler2": 9, "train_kwarg": 9, "test_kwarg": 9, "test_batch_s": 9, "xpu_kwarg": 9, "my_auto_wrap_polici": 9, "partial": 9, "min_num_param": 9, "init_start_ev": 9, "event": 9, "enable_tim": 9, "init_end_ev": 9, "adadelta": 9, "step_siz": 9, "gamma": 9, "1000": 9, "sec": 9, "save_model": 9, "barrier": 9, "mnist_cnn": 9, "final": [9, 34, 35], "parser": 9, "argumentpars": 9, "add_argu": 9, "metavar": 9, "14": [9, 22], "rate": [9, 16], "action": [9, 18], "store_tru": 9, "random": [9, 16, 27], "parse_arg": 9, "nproc": 9, "join": 9, "snippet": [9, 13, 20, 31], "fsdp_mnist_xpu": 9, "who": [10, 33, 35], "overrid": 10, "ON": [10, 20, 22, 32], "off": [10, 11, 20, 22, 27, 29, 30, 33], "defaultvalu": 10, "use_onemkl": [10, 27, 33], "onemkl": [10, 13, 18, 27, 33], "bla": 10, "use_channels_last_1d": 10, "1d": 10, "use_persist_stream": 10, "persist": 10, "use_scratchpad_mod": 10, "scratchpad": 10, "use_primitive_cach": 10, "use_queue_barri": 10, "submit_barri": 10, "dummi": 10, "use_multi_context": 10, "use_profil": 10, "legaci": 10, "profil": [10, 33], "use_kineto": [10, 20], "kineto": [10, 21, 33], "use_sycl_assert": 10, "assert": [10, 20], "use_itt_annot": 10, "itt": 10, "annot": 10, "use_split_fp64_loop": 10, "fp64": [10, 27, 33], "element": [10, 19, 38], "wise": [10, 30, 31, 33, 38], "use_xetla": 10, "xetla": [10, 13, 33], "build_by_per_kernel": 10, "per_kernel": 10, "use_aot_devlist": [10, 35], "build_internal_debug": 10, "debug": [10, 17, 18, 22, 31], "build_separate_op": 10, "build_simple_trac": 10, "use_onednn_dir": 10, "use_xetla_src": 10, "ipex_gpu_root_dir": 10, "dir": 10, "build_opt_level": 10, "add": [10, 11, 14, 16, 18, 19, 22, 27, 33, 38], "ox": 10, "accept": [10, 18], "while": [10, 11, 17, 19, 20, 29, 30, 33], "equal": [10, 27], "optioncpu": 10, "ipex_fp32_math_mod": 10, "optiongpu": 10, "ipex_verbos": 10, 
"ipex_xpu_sync_mod": 10, "enforc": 10, "ipex_tile_as_devic": 10, "partit": [10, 16], "map": [10, 19], "composit": 10, "ipex_log_level": 10, "ipex_log_compon": [10, 18], "pl": 10, "sepreat": 10, "sub_compon": 10, "ipex_log_rotate_s": [10, 18], "rotat": [10, 18], "ipex_log": 10, "ipex_log_split_s": [10, 18], "ipex_log_output": [10, 18], "null": [10, 18], "optionexperiment": 10, "ipex_simple_trac": [10, 22], "ipex_ze_trac": [10, 20], "export": [10, 18, 22, 27, 33], "resnet50": [10, 20], "lower": [11, 17, 29, 30, 33], "lighter": 11, "smaller": [11, 33], "sacrif": 11, "trade": [11, 29, 30, 33], "slower": 11, "accur": [11, 29, 30], "faster": 11, "autom": 11, "speedup": [11, 13, 29, 33], "simplenet": [11, 23], "pad": [11, 19, 33], "scope": 11, "chosen": [11, 13], "categori": [11, 14], "circumst": 11, "imag": [11, 19], "float64": 11, "variant": 11, "suppli": [11, 19], "region": 11, "addmm_": 11, "cannot": [11, 19, 20, 27, 33], "stabil": 11, "regardless": 11, "unlist": 11, "downstream": 11, "assum": [11, 24], "believ": [11, 19], "unstabl": 11, "conv1d": [11, 19], "conv3d": [11, 33], "_convolut": 11, "conv_tbc": 11, "conv_transpose1d": 11, "conv_transpose3d": 11, "prelu": 11, "addmv": 11, "addr": 11, "matmul": [11, 14, 30, 33], "mv": 11, "bmm": 11, "baddbmm": 11, "addbmm": 11, "chain_matmul": 11, "linalg_multi_dot": 11, "_thnn_fused_gru_cel": 11, "gru_cel": 11, "scaled_dot_product_attent": 11, "binary_cross_entropi": 11, "binary_cross_entropy_with_logit": 11, "nll_loss2d": 11, "nll_loss_nd": 11, "cross_entropy_loss": 11, "fft_fft": 11, "fft_ifft": 11, "fft_fft2": 11, "fft_ifft2": 11, "fft_fftn": 11, "fft_ifftn": 11, "fft_rfft": 11, "fft_irfft": 11, "fft_rfft2": 11, "fft_irfft2": 11, "fft_rfftn": 11, "fft_irfftn": 11, "fft_hfft": 11, "fft_ihfft": 11, "reciproc": 11, "pow": [11, 33], "frobenius_norm": 11, "nuclear_norm": 11, "cosine_similar": 11, "poisson_nll_loss": 11, "cosine_embedding_loss": 11, "hinge_embedding_loss": 11, "kl_div": 11, "l1_loss": 11, "smooth_l1_loss": 11, 
"huber_loss": 11, "mse_loss": 11, "margin_ranking_loss": 11, "multilabel_margin_loss": 11, "soft_margin_loss": 11, "triplet_margin_loss": 11, "multi_margin_loss": 11, "pdist": 11, "cdist": 11, "renorm": 11, "addcdiv": 11, "addcmul": 11, "atan2": 11, "bilinear": 11, "cross": [11, 33], "dot": [11, 19, 29], "grid_sampl": 11, "index_put": 11, "tensordot": 11, "scatter_add": 11, "g": [11, 17, 19, 27, 29, 33, 34, 35], "intervent": 11, "mixtur": 11, "enable_auto_channels_last": [12, 36], "disable_auto_channels_last": [12, 36], "broad": 12, "bring": [12, 17, 29, 34], "concaten": [13, 29], "special": [13, 29], "basic": [13, 33], "empir": 13, "guarante": 13, "ideal": 13, "xe": [13, 29, 33], "algebra": [13, 29], "compute_eng": 13, "xpucomputeeng": 13, "x1": 13, "20": [13, 19, 22, 27], "x2": 13, "onednn_layout": 13, "highest": 13, "upsampl": [13, 19], "align_corn": 13, "step2": 13, "step3": 13, "step4": 13, "fall": 13, "back": [13, 19], "averagepool2d": 13, "concat": [13, 19, 29], "maxpool2d": 13, "maxpool3d": 13, "layernorm": [13, 14], "permutecontigu": 13, "softmax": [13, 33], "greater": [13, 27], "fp16": [13, 14, 24, 29, 30, 32, 33], "upsampleblinear2d": 13, "upsamplenearest": 13, "divis": 13, "integr": [14, 33, 35], "ecolog": 14, "worth": 14, "therefor": [14, 17], "NOT": [14, 19], "necessarili": 14, "common": 14, "being": [14, 22], "dequant": [14, 33], "geglu": 14, "residu": 14, "pre": [14, 29, 30, 35], "norm": [14, 33], "mlp": [14, 30], "moe": 14, "retak": 14, "bit": 15, "dnn": 15, "e4m3": 15, "sign": [15, 20, 30], "expon": 15, "mantissa": 15, "e5m2": 15, "FOR": 15, "onlin": 15, "decompress": 15, "delai": 15, "algorithm": [15, 19, 30, 33], "quantizaiton": 15, "showcas": [15, 30], "_fp8_convert": 15, "convert_fp8_model": 15, "fp8_autocas": 15, "input_id": [15, 30], "token_type_id": 15, "segment_id": 15, "attention_mask": 15, "input_mask": 15, "masked_lm_label": 15, "next_sentence_label": 15, "tensorflow": [16, 19], "kera": 16, "apach": [16, 28], "mxnet": 16, "goal": 16, 
"mpi": [16, 27], "concept": [16, 19], "broadcast": 16, "hvd": [16, 27], "pin": [16, 23], "server": [16, 27], "forth": 16, "devid": 16, "effect": [16, 17, 30], "compens": 16, "distributedoptim": 16, "deleg": [16, 34], "broadcast_paramet": 16, "root_rank": 16, "broadcast_optimizer_st": 16, "consist": [16, 29], "restor": 16, "corrupt": 16, "accomplish": 16, "guard": 16, "named_paramet": 16, "log_interv": 16, "overal": 17, "view": [17, 19, 20, 33], "conv_relu": 17, "deliv": 17, "modelimp": [17, 31], "quantwrapp": [17, 31], "perchannel": [17, 31], "prepar": [17, 31], "obtain": [17, 29, 30, 31], "calib_dataset": [17, 31], "inference_data": [17, 31], "stage": [17, 27, 38], "symmetr": 17, "asymmetr": [17, 33], "uint8": 17, "zero_point": 17, "swap": [17, 27], "Be": 17, "free": [17, 18], "scriptmodul": [17, 31], "example_input": [17, 31], "warmup": [17, 20, 31], "warmup_data": [17, 31], "graph_for": [17, 31], "inference_dta": [17, 31], "whole": [17, 18, 33], "conv_unari": 17, "conv_binari": 17, "linear_unari": 17, "conv_sum_relu": 17, "henc": [17, 33], "consider": 17, "dump": [17, 33], "analysi": 17, "attempt": [18, 34], "realloc": 18, "err": 18, "critic": [18, 33], "belong": 18, "syngraph": 18, "logutil": 18, "ipex_xxx_log": 18, "xxx": [18, 33], "There": [18, 20, 24, 29], "four": 18, "sub": [18, 33], "fmt": 18, "ab": [18, 33], "ipex_info_log": 18, "identifi": 18, "uniqu": [18, 20, 22], "event_id": 18, "step_id": 18, "ipex_xxx_event_end": 18, "ipex_event_log": 18, "record_avg_pool": 18, "prepare_data": 18, "data_prepare_finish": 18, "avg_pool": 18, "ipex_info_event_end": 18, "five": 18, "ipex_logging_level": 18, "integar": 18, "consol": [18, 20, 22], "mb": 18, "set_log_level": 18, "log_level": 18, "get_log_level": 18, "set_log_output_file_path": 18, "log_path": 18, "get_log_output_file_path": 18, "set_log_rotate_file_s": 18, "get_log_rotate_file_s": 18, "set_log_split_file_s": 18, "get_log_split_file_s": 18, "set_log_compon": 18, "log_compon": 18, "get_log_compon": 18, 
"previou": [18, 19], "represent": 19, "multidimension": 19, "arrai": [19, 36], "nd": 19, "space": [19, 30], "semant": 19, "dens": 19, "spars": 19, "coo": 19, "cnn": 19, "canon": 19, "assign": [19, 20], "2d": [19, 36], "height": 19, "width": [19, 29], "bmp": 19, "contiguous_format": 19, "reason": 19, "close": 19, "higher": [19, 29], "difficult": 19, "manipul": 19, "to_dens": 19, "upstream": 19, "Will": 19, "easier": [19, 23], "secret": 19, "ingredi": 19, "cover": [19, 29], "almost": 19, "foundat": 19, "upper": 19, "expens": 19, "sequenc": [19, 20, 29], "benefici": 19, "nb": 19, "aka": 19, "me": 19, "roughli": 19, "50": 19, "perf": 19, "mkldnn": 19, "mkldnn_util": 19, "to_mkldnn": 19, "explain": [19, 30], "diagram": 19, "conclus": 19, "minimum": 19, "But": 19, "usual": [19, 29, 30], "neglig": 19, "organ": 19, "question": 19, "reinterpret": 19, "w": [19, 30], "answer": 19, "chw": 19, "hw": [19, 35], "offset": [19, 29], "stride_n": 19, "stride_c": 19, "stride_h": 19, "stride_w": 19, "merit": 19, "express": 19, "noncontigu": 19, "big": 19, "n1": 19, "n2": 19, "mind": 19, "someth": 19, "rfc": 19, "hwc": 19, "wc": 19, "chwn": 19, "hwn": 19, "wn": 19, "outplac": 19, "_appli": 19, "spontan": 19, "tell": 19, "compris": 19, "guidelin": 19, "awar": [19, 30], "repeat": [19, 20], "my": 19, "recent": [19, 30], "pr": 19, "cudnn": 19, "accommod": 19, "hidden": [19, 29], "ideep": 19, "format_tag": 19, "src_md": 19, "desc": 19, "data_typ": 19, "f32": 19, "src_mem": 19, "src_data_ptr": 19, "hwio": 19, "gemm": [19, 29, 33], "avx512": [19, 33], "3d": 19, "batchnorm1d": 19, "maxpool1d": 19, "div": [19, 33], "nearest": [19, 30], "sycl_devic": 19, "sequenti": 19, "kernel_s": 19, "test_input": 19, "test_input_xpu": 19, "to_channels_last_1d": 19, "tenor": 19, "xpu_r": 19, "is_contiguous_channels_last_1d": 19, "input_xpu": 19, "meta": [19, 29], "expect": [19, 27], "invalid": [19, 27], "corrspond": 19, "prebuilt": [20, 23, 27, 33, 35], "wheel": [20, 23, 27, 33, 35], "affili": 20, 
"use_onetrac": 20, "onetrac": 20, "layer": [20, 29, 33, 34, 36], "profileract": 20, "input_tensor": 20, "prof": 20, "proper": 20, "output_tensor_1": 20, "nonzero": 20, "output_tensor_2": 20, "tabl": [20, 35], "key_averag": 20, "my_schedul": 20, "skip_first": 20, "trace_handl": 20, "p": [20, 32], "sort_bi": 20, "self_xpu_time_tot": 20, "row_limit": 20, "trace_": 20, "step_num": 20, "outsid": 20, "on_trace_readi": 20, "forget": 20, "record_shap": 20, "rememb": 20, "effort": [20, 30], "contextlib": 20, "profiler_setup": 20, "nullcontext": 20, "should_profil": 20, "profileact": 20, "unset": 20, "stop": [20, 27], "involv": 20, "Such": [20, 33], "a_0": 20, "a_1": 20, "b_0": 20, "b_1": 20, "export_chrome_trac": 20, "trace_example_on_multi_devic": 20, "exclud": 20, "children": 20, "percentag": 20, "propot": 20, "percentasg": 20, "avg": 20, "consumpt": 20, "sonsumpt": 20, "viewer": 20, "perfetto": 20, "ui": 20, "dev": 20, "trace_fil": 20, "examin": [20, 38], "failur": [20, 27, 33], "tracer": 20, "collector": 20, "workaround": [20, 27], "ze_enable_tracing_lay": 20, "soon": 21, "instead": [21, 31, 33, 38], "screen": 22, "turn": 22, "bracket": 22, "enable_simple_trac": 22, "disable_simple_trac": 22, "using_simple_trac": 22, "unintention": 22, "exmapl": 22, "262618": 22, "wrapper__empty_strid": 22, "atenipextypexpu": [22, 30], "empty_strid": 22, "wrapper__copy_": 22, "copy_": 22, "wrapper___unique2": 22, "_unique2": 22, "wrapper__clon": 22, "wrapper___reshape_alia": 22, "_reshape_alia": 22, "wrapper_memory_format_empti": 22, "11": 22, "wrapper__as_strid": 22, "as_strid": 22, "15": 22, "wrapper___local_scalar_dens": 22, "_local_scalar_dens": 22, "16": [22, 32], "wrapper__resize_": 22, "resize_": 22, "19": 22, "pid": 22, "tid": 22, "name1": 22, "name2": 22, "arrow": 22, "relationship": 22, "child": 22, "gdb": 22, "triton": [23, 33], "codegen": 23, "addition": [23, 30], "facilit": 23, "contribut": 23, "intens": 23, "ever": 23, "unlock": 23, "v2": [23, 33], "firstli": [23, 29, 30], 
"llvm": 23, "forc": 23, "cp310": 23, "manylinux_2_17_x86_64": 23, "manylinux2014_x86_64": 23, "triton_codegen_intel_xpu_backend": 23, "compiled_model": 23, "weight_decai": [23, 38], "loss_funct": 23, "demostr": 24, "cache_en": 24, "suppos": 24, "bash": [24, 27, 30], "problem": [27, 38], "unsupport": [27, 33], "graphic": [27, 30, 33, 35], "improp": 27, "unload": 27, "conda": [27, 33], "encount": [27, 34, 35], "ship": 27, "libstdc": 27, "conflict": 27, "ld_preload": [27, 33], "symbol": [27, 33], "undefin": [27, 33], "_glibcxx_use_cxx11_abi": 27, "_znk5torch8autograd4node4nameb5cxx11ev": [27, 33], "appear": [27, 33], "glibcxx_use_cxx11_abi": 27, "bad": 27, "termin": 27, "rn50": 27, "friendli": 27, "ungracefulli": 27, "116312": 27, "997": 27, "170": [27, 33, 35], "progress": [27, 29], "wsl2": [27, 33], "ram": 27, "killer": 27, "dmesg": 27, "oom": 27, "had": 27, "kill": 27, "max_job": 27, "conserv": 27, "slow": 27, "thing": 27, "lot": [27, 29, 33, 34, 35], "cl_device_not_found": 27, "tdr": 27, "window": [27, 33], "tdrdelai": 27, "registri": 27, "reboot": 27, "converg": 27, "24": 27, "hour": 27, "divid": [27, 36], "phase": [27, 29], "instabl": 27, "fault": 27, "atom": 27, "violat": 27, "lt": [27, 32, 33], "803": 27, "29": 27, "investig": 27, "roll": 27, "775": 27, "usr": [27, 33], "ld": [27, 33], "lmkl_sycl": [27, 33], "lmkl_intel_ilp64": [27, 33], "lmkl_core": [27, 33], "lmkl_tbb_thread": [27, 33], "linker": [27, 33], "exit": [27, 33], "v": [27, 33], "occur": [27, 30, 33], "resolv": [27, 33], "mkl_dpcpp_root": [27, 33], "mkl_lapack_dspevd": 27, "fatal": [27, 33], "libmkl_vml_avx512": 27, "libmkl": [27, 33], "vml": [27, 33], "incorrectli": [27, 33], "oserror": [27, 33], "wrong": [27, 33], "preload": [27, 33], "libmkl_intel_ilp64": [27, 33], "suffix": [27, 33], "test_weight_norm": 27, "testnnmethod": 27, "test_weight_norm_differnt_typ": 27, "copyright": 28, "notic": 28, "subject": 28, "condit": [28, 36], "architectur": [29, 30], "decod": 29, "multiheadattent": 29, 
"feedforward": 29, "bound": [29, 38], "kv_cach": 29, "smoothquant": 29, "huggingfac": [29, 30], "hub": [29, 30], "woq": 29, "llama2": [29, 32], "7b": [29, 30, 32, 33], "hf": 29, "13b": [29, 32, 33], "70b": 29, "eleutherai": 29, "6b": [29, 30, 32, 33], "qwen": [29, 30, 33], "30b": 29, "3b": 29, "bloom": [29, 32], "bigscienc": 29, "7b1": 29, "chatglm3": [29, 33], "thudm": 29, "baichuan2": [29, 33], "baichuan": 29, "inc": 29, "chat": 29, "codellama": 29, "indirect": 29, "rope": 29, "tpp": 29, "expand": 29, "brief": 29, "introduct": 29, "xelta": 29, "rotari": 29, "posit": [29, 30], "squar": [29, 33], "rmsnorm": 29, "beam": [29, 33], "idx": 29, "reorder_cach": 29, "bottleneck": [29, 30], "prompt": [29, 30], "kept": 29, "buffer": 29, "wast": 29, "prefil": 29, "influenc": 29, "histori": 29, "left": 29, "decid": 29, "timestamp": 29, "elimin": 29, "sdpa": 29, "shard": [29, 33], "lead": 29, "significantli": [29, 30], "heavier": 29, "becom": 29, "remark": [29, 30, 33], "deploi": [29, 30, 33], "resourc": [29, 30, 33], "challeng": [29, 30, 33], "overcom": [29, 30], "complex": [29, 30], "w8a8": [29, 30], "bandwidth": [29, 30], "preserv": [29, 30], "minim": [29, 30], "qualiti": [29, 30, 34, 35], "rtn": 30, "awq": 30, "teq": 30, "autoround": 30, "10004": 30, "stai": 30, "int4_fullrang": 30, "datatyp": [30, 32, 33], "procedur": 30, "constrain": 30, "round": [30, 33], "intuit": 30, "boast": 30, "simplic": 30, "easili": 30, "nf4": 30, "uniform": 30, "w4g32": 30, "w8": 30, "wors": 30, "explor": 30, "min": [30, 33], "handcraft": 30, "impos": 30, "broader": [30, 33], "knowledg": 30, "trainabl": 30, "ransform": 30, "summit": 30, "peer": 30, "review": 30, "202306": 30, "brain": 30, "surgeon": 30, "remain": [30, 38], "unquant": 30, "mitig": [30, 32], "occasion": 30, "semidefinit": 30, "necessit": 30, "hyperparamet": 30, "descent": 30, "minmax": 30, "200": 30, "impress": 30, "hypeparamet": 30, "compat": [30, 33], "relianc": 30, "backpropag": 30, "quit": [30, 33], "onnx": 30, "gunho": 30, 
"park": 30, "baeseong": 30, "se": 30, "jung": 30, "kwon": 30, "byeongwook": 30, "kim": 30, "youngjoo": 30, "lee": 30, "dongsoo": 30, "nuqmm": 30, "arxiv": 30, "preprint": 30, "2206": 30, "09557": 30, "lin": 30, "ji": 30, "et": 30, "al": 30, "2306": 30, "00978": 30, "cheng": 30, "cai": 30, "lv": 30, "shen": 30, "2310": 30, "10944": 30, "frantar": 30, "elia": 30, "2210": 30, "17323": 30, "zhang": 30, "he": 30, "2309": 30, "05516": 30, "easiest": 30, "load_in_4bit": 30, "hook": 30, "automodelforcausallm": [30, 31], "4bit": 30, "qmodel": 30, "model_nam": 30, "device_map": 30, "trust_remote_cod": 30, "use_llm_runtim": 30, "weightonlyquantconfig": 30, "woq_quantization_config": 30, "compute_dtyp": 30, "weight_dtyp": 30, "scale_dtyp": 30, "group_siz": 30, "inc_model": 30, "conf": 30, "calib_func": 30, "calib_dataload": 30, "export_compressed_model": 30, "compression_dtyp": 30, "compression_dim": 30, "use_optimum_format": 30, "convert_dtype_str2torch": 30, "weightonlyquantizedlinear": 30, "blocksiz": 30, "front": 30, "present": 30, "ipextransformerlinear": 30, "ipextransformerattnoptimizedint4": 30, "ipextransformermlpoptimizedint4": 30, "major": 30, "qkv": 30, "torch_ipex": 30, "mm_qkv_out_int4": 30, "mm_bias_int4": 30, "correspondingli": 30, "mm_silu_mul_int4": 30, "substitut": 30, "ipex_op_regist": 30, "hgemmxetla_int4": 30, "polici": [30, 33], "beforehand": [30, 34, 35], "later": 30, "has_2d_block_arrai": 30, "curdevid": 30, "suitabl": 30, "ordered_gemm_wint4_config_set_pvc": 30, "ordered_gemm_wint4_config_set_arc": 30, "hgemm_int4_common_dispatch": 30, "hgemm_bias_wint4_arc": 30, "intel_extension_for_transform": 30, "autotoken": 30, "token": [30, 32], "upon": 30, "littl": 30, "girl": 30, "return_tensor": 30, "save_pretrain": 30, "saved_dir": 30, "loaded_model": 30, "run_benchmark_woq": 30, "content": [30, 31, 33], "transpar": [31, 33], "undergo": 31, "overview": [31, 33], "model_name_or_path": 31, "amp_dtyp": 31, "topologi": [32, 38], "1550": 32, "2024": [32, 33], 
"736": 32, "v4": 32, "31": 32, "4fc181b0": 32, "ec33277": 32, "platinum": 32, "8480": 32, "node": 32, "socket": 32, "56": 32, "ucod": 32, "0x2b0004b1": 32, "hyper": 32, "turboboost": 32, "bio": 32, "se5c7411": 32, "86b": 32, "9525": 32, "d25": 32, "2304190630": 32, "ddr": 32, "slot": 32, "64gb": 32, "frequenc": 32, "4800": 32, "dcpmm": 32, "1024gb": 32, "ubuntu": [32, 33], "22": 32, "04": [32, 33], "1020": 32, "oem": 32, "spectr": 32, "meltdown": 32, "pvc": [32, 35], "oam": 32, "ifwi": 32, "b4": 32, "si": 32, "ww42": 32, "3_25mhzi_quad_dameni_oam600w_ifrv2332i_pscnull_ifwi": 32, "ecc": 32, "amc": 32, "sw": 32, "fw": 32, "v3": 33, "beta": 33, "hbm": 33, "kv": 33, "chines": 33, "5x": 33, "torch_llm_allreduc": 33, "xelink": 33, "webpag": 33, "uplift": 33, "3696": 33, "sdp": 33, "fallback": 33, "3706": 33, "3788": 33, "3841": 33, "workgroup": 33, "3796": 33, "3808": 33, "dockerfil": 33, "3829": 33, "3882": 33, "3887": 33, "3970": 33, "patch": 33, "fft": 33, "21": 33, "date": 33, "top": [33, 36], "reach": 33, "competit": 33, "a770": 33, "primari": 33, "verif": 33, "vehicl": 33, "emul": 33, "fsdp": 33, "merg": 33, "publicli": 33, "oct": 33, "focus": 33, "oob": 33, "v1": 33, "unaryop": 33, "sqrt": 33, "log_sigmoid": 33, "hardswish": 33, "hardsigmoid": 33, "silu": 33, "hardtanh": 33, "leaky_relu": 33, "binaryop": 33, "mul": 33, "ne": 33, "ge": 33, "gt": 33, "le": 33, "gelu": 33, "mish": 33, "concret": 33, "adamw": [33, 38], "permut": 33, "scalar": 33, "pixelshuffl": 33, "leaki": 33, "softplu": 33, "glibcxx": 33, "cxx11": 33, "gcc": 33, "path_to_your_onemkl": 33, "__release_lnx": 33, "lapack": 33, "dspevd": 33, "lp64": 33, "libmkl_sequenti": 33, "lifecycl": [34, 35], "benifit": [34, 35], "deliveri": [34, 35], "disadvantag": [34, 35], "500mb": [34, 35], "5gb": [34, 35], "dealloc": 34, "smallest": 34, "unabl": 34, "appropri": 34, "comma": 35, "delimit": 35, "ats": 35, "m150": 35, "togeth": 35, "seper": 35, "opencl": 35, "spir64_gen": 35, "flag": 36, "convtranspos": 36, "tri": 
36, "connect": 36, "batchnorm": 36, "instanc": 36, "opportun": 36, "met": 36, "parameterwrapp": 36, "_parameter_wrapp": 36, "can_cast_train": 36, "ipex_weight_convert_module_xpu": 36, "bottom": 36, "simultan": 36, "referenc": 36, "ipex_fused_optimizer_list_xpu": 36, "_optimizer_util": 36, "_original_step": 36, "understand": 37, "adam": 38, "lamb": 38, "lar": 38, "grad": 38, "buf": 38, "momentum_buffer_list": 38, "detach": 38, "mul_": 38, "add_": 38, "dampen": 38, "nesterov": 38, "claus": 38, "bottl": 38, "neck": 38, "solv": 38, "pseudo": 38, "sgd_fused_step": 38}, "objects": {"": [[1, 0, 1, "_CPPv4N3xpu14FP32_MATH_MODE4BF32E", "xpu::BF32"], [1, 0, 1, "_CPPv4N3xpu14FP32_MATH_MODE4FP32E", "xpu::FP32"], [1, 1, 1, "_CPPv4N3xpu14FP32_MATH_MODEE", "xpu::FP32_MATH_MODE"], [1, 0, 1, "_CPPv4N3xpu14FP32_MATH_MODE4BF32E", "xpu::FP32_MATH_MODE::BF32"], [1, 0, 1, "_CPPv4N3xpu14FP32_MATH_MODE4FP32E", "xpu::FP32_MATH_MODE::FP32"], [1, 0, 1, "_CPPv4N3xpu14FP32_MATH_MODE18FP32_MATH_MODE_MAXE", "xpu::FP32_MATH_MODE::FP32_MATH_MODE_MAX"], [1, 0, 1, "_CPPv4N3xpu14FP32_MATH_MODE18FP32_MATH_MODE_MINE", "xpu::FP32_MATH_MODE::FP32_MATH_MODE_MIN"], [1, 0, 1, "_CPPv4N3xpu14FP32_MATH_MODE4TF32E", "xpu::FP32_MATH_MODE::TF32"], [1, 0, 1, "_CPPv4N3xpu14FP32_MATH_MODE18FP32_MATH_MODE_MAXE", "xpu::FP32_MATH_MODE_MAX"], [1, 0, 1, "_CPPv4N3xpu14FP32_MATH_MODE18FP32_MATH_MODE_MINE", "xpu::FP32_MATH_MODE_MIN"], [1, 0, 1, "_CPPv4N3xpu14FP32_MATH_MODE4TF32E", "xpu::TF32"], [1, 2, 1, "_CPPv4N3xpu21get_queue_from_streamEN3c106StreamE", "xpu::get_queue_from_stream"], [1, 3, 1, "_CPPv4N3xpu21get_queue_from_streamEN3c106StreamE", "xpu::get_queue_from_stream::stream"], [1, 2, 1, "_CPPv4N3xpu18set_fp32_math_modeE14FP32_MATH_MODE", "xpu::set_fp32_math_mode"], [1, 3, 1, "_CPPv4N3xpu18set_fp32_math_modeE14FP32_MATH_MODE", "xpu::set_fp32_math_mode::mode"]], "intel_extension_for_pytorch": [[1, 4, 1, "", "get_fp32_math_mode"], [1, 4, 1, "", "optimize"], [1, 4, 1, "", "optimize_transformers"], [1, 4, 1, "", 
"set_fp32_math_mode"]], "intel_extension_for_pytorch.xpu": [[1, 5, 1, "", "Event"], [1, 5, 1, "", "Stream"], [1, 4, 1, "", "current_device"], [1, 4, 1, "", "current_stream"], [1, 5, 1, "", "device"], [1, 4, 1, "", "device_count"], [1, 5, 1, "", "device_of"], [1, 4, 1, "", "empty_cache"], [1, 4, 1, "", "get_device_name"], [1, 4, 1, "", "get_device_properties"], [1, 4, 1, "", "get_rng_state"], [1, 4, 1, "", "get_rng_state_all"], [1, 4, 1, "", "init"], [1, 4, 1, "", "initial_seed"], [1, 4, 1, "", "is_available"], [1, 4, 1, "", "is_initialized"], [1, 4, 1, "", "manual_seed"], [1, 4, 1, "", "manual_seed_all"], [1, 4, 1, "", "max_memory_allocated"], [1, 4, 1, "", "max_memory_reserved"], [1, 4, 1, "", "memory_allocated"], [1, 4, 1, "", "memory_reserved"], [1, 4, 1, "", "memory_snapshot"], [1, 4, 1, "", "memory_stats"], [1, 4, 1, "", "memory_stats_as_nested_dict"], [1, 4, 1, "", "memory_summary"], [1, 4, 1, "", "reset_accumulated_memory_stats"], [1, 4, 1, "", "reset_peak_memory_stats"], [1, 4, 1, "", "seed"], [1, 4, 1, "", "seed_all"], [1, 4, 1, "", "set_device"], [1, 4, 1, "", "set_rng_state"], [1, 4, 1, "", "set_rng_state_all"], [1, 4, 1, "", "stream"], [1, 4, 1, "", "synchronize"]], "intel_extension_for_pytorch.xpu.Event": [[1, 6, 1, "", "elapsed_time"], [1, 6, 1, "", "query"], [1, 6, 1, "", "record"], [1, 6, 1, "", "synchronize"], [1, 6, 1, "", "wait"]], "intel_extension_for_pytorch.xpu.Stream": [[1, 6, 1, "", "record_event"], [1, 7, 1, "", "sycl_queue"], [1, 6, 1, "", "synchronize"], [1, 6, 1, "", "wait_event"], [1, 6, 1, "", "wait_stream"]], "intel_extension_for_pytorch.xpu.fp8.fp8": [[1, 4, 1, "", "fp8_autocast"]]}, "objtypes": {"0": "cpp:enumerator", "1": "cpp:enum", "2": "cpp:function", "3": "cpp:functionParam", "4": "py:function", "5": "py:class", "6": "py:method", "7": "py:property"}, "objnames": {"0": ["cpp", "enumerator", "C++ enumerator"], "1": ["cpp", "enum", "C++ enum"], "2": ["cpp", "function", "C++ function"], "3": ["cpp", "functionParam", "C++ function 
parameter"], "4": ["py", "function", "Python function"], "5": ["py", "class", "Python class"], "6": ["py", "method", "Python method"], "7": ["py", "property", "Python property"]}, "titleterms": {"intel": [0, 3, 4, 6, 14, 17, 30, 32], "extens": [0, 3, 5, 6, 8, 14, 17, 30], "pytorch": [0, 3, 6, 14, 16, 17, 19, 30], "architectur": 0, "support": [0, 5, 11, 14, 15, 19, 20, 30], "api": [1, 5, 6, 12, 19, 26, 31, 36], "document": [1, 3, 26], "gener": [1, 27], "miscellan": 1, "random": 1, "number": 1, "stream": [1, 8], "event": [1, 18], "memori": [1, 19, 34, 37], "manag": [1, 34, 37], "c": [1, 4, 18, 19], "blog": 2, "public": 2, "contribut": 3, "develop": 3, "xpu": [3, 4, 19, 20, 33], "tip": 3, "debug": [3, 5, 13], "unit": [3, 27], "test": [3, 27], "better": 3, "local": 3, "pytest": 3, "write": [3, 8, 19], "build": [3, 8, 10, 20], "exampl": [4, 6, 7, 8, 9, 15], "python": [4, 5, 18], "train": [4, 5, 11, 23], "singl": [4, 6], "instanc": 4, "float32": [4, 11], "bfloat16": [4, 11], "infer": [4, 11, 29, 30, 31], "imper": [4, 11, 17, 31], "mode": [4, 15, 17, 31], "resnet50": 4, "bert": 4, "torchscript": [4, 11, 17, 31], "float16": [4, 11], "int8": 4, "torch": [4, 5, 23], "optim": [4, 17, 29, 31, 34, 36, 38], "basic": 4, "usag": [4, 6, 9, 15, 16, 18, 27, 30, 31], "us": [4, 5, 7, 8, 11, 12, 13, 20, 22, 35], "sycl": [4, 8], "code": 4, "custom": 4, "dpc": [4, 5, 8], "kernel": [4, 14, 19], "ai": [4, 32], "refer": [4, 11, 30], "model": [4, 19, 20, 22, 29, 30], "featur": [5, 13, 30], "easi": 5, "channel": [5, 12, 19, 36], "last": [5, 12, 19, 36], "auto": [5, 11, 12], "mix": [5, 11], "precis": [5, 11, 29], "amp": [5, 11], "quantiz": [5, 15, 17, 29, 30], "distribut": [5, 29, 31], "dlpack": [5, 7], "solut": [5, 7], "advanc": [5, 10], "configur": [5, 10, 32], "fulli": [5, 9], "shard": [5, 9], "data": [5, 7, 9, 15, 29, 32], "parallel": [5, 9], "fsdp": [5, 9], "compil": [5, 8, 23, 34, 35], "gpu": [5, 6, 9, 11, 17, 23, 30, 33, 34, 38], "beta": [5, 23], "simpl": [5, 18, 22], "trace": [5, 20, 
22], "tool": [5, 20, 21, 22], "prototyp": [5, 15, 16, 18, 20, 22, 30], "kineto": [5, 20], "profil": [5, 20, 21], "comput": [5, 13], "engin": [5, 13], "ipex_log": [5, 18], "distributeddataparallel": 6, "ddp": 6, "introduct": [6, 7, 8, 9, 11, 13, 14, 18, 20, 21, 22, 23, 26, 30, 35, 38], "instal": [6, 16, 25, 30], "oneccl": 6, "bind": 6, "recommend": 6, "from": 6, "prebuilt": 6, "wheel": 6, "sourc": 6, "runtim": [6, 10, 30], "dynam": 6, "link": 6, "mpi": 6, "launch": 6, "node": 6, "scale": 6, "onli": [6, 9, 29, 30], "case": [7, 11, 13, 20, 22, 35], "design": 7, "import": 7, "capsul": 7, "export": [7, 20], "dldevic": 7, "pointer": 7, "asynchron": 7, "program": 7, "motiv": 8, "setuptool": 8, "jit": 8, "cmake": 8, "request": 8, "current": 8, "c10": 8, "fetch": 8, "correspond": 8, "queue": 8, "op": [8, 11], "accessor": 8, "time": [10, 34, 35], "default": [11, 12, 19], "path": 11, "autocast": 11, "elig": 11, "specif": 11, "behavior": 11, "can": 11, "promot": 11, "widest": 11, "input": 11, "type": [11, 15, 29], "eas": 12, "enabl": [12, 22], "disabl": [12, 20, 22], "known": [12, 20, 33], "issu": [12, 20, 33], "experiment": 13, "select": 13, "polici": [13, 29], "multipl": 13, "implement": 13, "oper": [13, 15, 19, 29, 38], "deepspe": [14, 31], "platform": 14, "float8": 15, "fp8": 15, "run": [15, 30], "horovod": 16, "definit": 18, "log": 18, "level": 18, "compon": 18, "enviorn": 18, "set": [18, 20], "replac": 18, "ipex_simple_trac": 18, "ipex_verbos": 18, "what": 19, "i": 19, "format": 19, "all": 19, "That": 19, "matter": 19, "nchw": 19, "b": 19, "nhwc": 19, "block": 19, "nchw16c": 19, "cpu": 19, "stride": 19, "layout": 19, "tensor": 19, "creation": 19, "convers": 19, "d": 19, "coverag": 19, "regist": 19, "aten": 19, "nativ": 19, "manner": 19, "onednn": 19, "creat": 19, "convolut": 19, "primit": 19, "1d": 19, "determin": 19, "environ": [20, 30], "variabl": 20, "add": 20, "Into": 20, "script": [20, 30], "partli": 20, "backend": 20, "multi": 20, "devic": 20, "applic": 20, 
"result": [20, 22], "chrome": 20, "legaci": 21, "deprec": 21, "requir": [23, 35], "depend": [23, 27], "inferenec": 23, "quick": 24, "start": [24, 26], "execut": [24, 30], "get": 26, "troubleshoot": 27, "librari": 27, "licens": 28, "larg": 29, "languag": 29, "llm": [29, 30, 32], "overview": [29, 32], "methodologi": 29, "linear": [29, 30], "deep": 29, "fusion": [29, 38], "segment": 29, "kv": 29, "cach": 29, "low": 29, "weight": [29, 30], "int4": 29, "framework": 30, "matrix": 30, "initi": 30, "dispatch": 30, "For": 30, "setup": 30, "transform": [30, 31], "neural": 30, "compressor": 30, "save": 30, "load": 30, "option": 30, "woq": 30, "benchmark": 30, "frontend": [31, 36], "pseudocod": 31, "common": 31, "scenario": 31, "fp16": 31, "smoothquant": 31, "perform": 32, "center": 32, "product": 32, "v2": 32, "1": [32, 33], "10": [32, 33], "softwar": 32, "version": 32, "hardwar": 32, "releas": 33, "2": 33, "30": 33, "highlight": 33, "20": 33, "0": 33, "110": 33, "13": 33, "120": 33, "200": 33, "technic": 34, "detail": 34, "ahead": [34, 35], "aot": [34, 35], "ipex": [34, 36], "automat": 36, "conv_bn_fold": 36, "linear_bn_fold": 36, "replace_dropout_with_ident": 36, "split_master_weight_for_bf16": 36, "fuse_update_step": 36}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx": 58}, "alltitles": {"Intel\u00ae Extension for PyTorch*": [[0, "intel-extension-for-pytorch"]], "Architecture": [[0, "architecture"]], "Support": [[0, "support"]], "API Documentation": [[1, "api-documentation"], [26, "api-documentation"]], "General": [[1, "general"]], "Miscellaneous": [[1, "miscellaneous"]], "Random Number Generator": [[1, "random-number-generator"]], "Streams and events": [[1, "streams-and-events"]], "Memory management": [[1, 
"memory-management"]], "C++ API": [[1, "c-api"]], "Blogs & Publications": [[2, "blogs-publications"]], "Contribution": [[3, "contribution"]], "Contributing to Intel\u00ae Extension for PyTorch*": [[3, "contributing-to-intel-extension-for-pytorch"]], "Developing Intel\u00ae Extension for PyTorch* on XPU": [[3, "developing-intel-extension-for-pytorch-on-xpu"]], "Tips and Debugging": [[3, "tips-and-debugging"]], "Unit testing": [[3, "unit-testing"]], "Better local unit tests with pytest": [[3, "better-local-unit-tests-with-pytest"]], "Writing documentation": [[3, "writing-documentation"]], "Building documentation": [[3, "building-documentation"]], "Tips": [[3, "tips"]], "Examples": [[4, "examples"]], "Python": [[4, "python"]], "Training": [[4, "training"]], "Single-Instance Training": [[4, "single-instance-training"]], "Float32": [[4, "float32"], [4, "id1"]], "BFloat16": [[4, "bfloat16"], [4, "id4"]], "Inference": [[4, "inference"]], "Imperative Mode": [[4, "imperative-mode"], [4, "id5"], [4, "id11"], [17, "imperative-mode"]], "Resnet50": [[4, "resnet50"], [4, "id2"], [4, "id6"], [4, "id9"], [4, "id12"], [4, "id15"]], "BERT": [[4, "bert"], [4, "id3"], [4, "id7"], [4, "id10"], [4, "id13"], [4, "id16"]], "TorchScript Mode": [[4, "torchscript-mode"], [4, "id8"], [4, "id14"], [17, "torchscript-mode"], [31, "torchscript-mode"]], "Float16": [[4, "float16"]], "INT8": [[4, "int8"]], "torch.xpu.optimize": [[4, "torch-xpu-optimize"]], "C++": [[4, "c"]], "Basic Usage": [[4, "basic-usage"]], "Use SYCL code": [[4, "use-sycl-code"]], "Customize DPC++ kernels": [[4, "customize-dpc-kernels"]], "Intel\u00ae AI Reference Models": [[4, "intel-ai-reference-models"]], "Features": [[5, "features"]], "Easy-to-use Python API": [[5, "easy-to-use-python-api"]], "Channels Last": [[5, "channels-last"], [19, "channels-last"]], "Auto Mixed Precision (AMP)": [[5, "auto-mixed-precision-amp"]], "Quantization": [[5, "quantization"]], "Distributed Training": [[5, "distributed-training"]], "DLPack 
Solution": [[5, "dlpack-solution"], [7, "dlpack-solution"]], "DPC++ Extension": [[5, "dpc-extension"], [8, "dpc-extension"]], "Advanced Configuration": [[5, "advanced-configuration"], [10, "advanced-configuration"]], "Fully Sharded Data Parallel (FSDP)": [[5, "fully-sharded-data-parallel-fsdp"], [9, "fully-sharded-data-parallel-fsdp"]], "torch.compile for GPU (Beta)": [[5, "torch-compile-for-gpu-beta"], [23, "torch-compile-for-gpu-beta"]], "Simple Trace Tool (Prototype)": [[5, "simple-trace-tool-prototype"], [22, "simple-trace-tool-prototype"]], "Kineto Supported Profiler Tool (Prototype)": [[5, "kineto-supported-profiler-tool-prototype"], [20, "kineto-supported-profiler-tool-prototype"]], "Compute Engine (Prototype feature for debug)": [[5, "compute-engine-prototype-feature-for-debug"]], "IPEX_LOGGING (Prototype feature for debug)": [[5, "ipex-logging-prototype-feature-for-debug"]], "DistributedDataParallel (DDP)": [[6, "distributeddataparallel-ddp"]], "Introduction": [[6, "introduction"], [7, "introduction"], [8, "introduction"], [9, "introduction"], [11, "introduction"], [13, "introduction"], [14, "introduction"], [18, "introduction"], [20, "introduction"], [21, "introduction"], [22, "introduction"], [23, "introduction"], [26, "introduction"], [30, "introduction"], [35, "introduction"], [38, "introduction"]], "Installation of Intel\u00ae oneCCL Bindings for Pytorch*": [[6, "installation-of-intel-oneccl-bindings-for-pytorch"]], "Install PyTorch and Intel\u00ae Extension for PyTorch*": [[6, "install-pytorch-and-intel-extension-for-pytorch"]], "Install Intel\u00ae oneCCL Bindings for Pytorch*": [[6, "install-intel-oneccl-bindings-for-pytorch"]], "[Recommended] Install from prebuilt wheels": [[6, "recommended-install-from-prebuilt-wheels"]], "Install from source": [[6, "install-from-source"]], "Runtime Dynamic Linking": [[6, "runtime-dynamic-linking"]], "DDP Usage": [[6, "ddp-usage"]], "Example Usage (MPI launch for single node):": [[6, 
"example-usage-mpi-launch-for-single-node"]], "DDP scaling API (GPU Only)": [[6, "ddp-scaling-api-gpu-only"]], "Usage of DDP scaling API": [[6, "usage-of-ddp-scaling-api"]], "Use Case": [[7, "use-case"], [11, "use-case"], [13, "use-case"], [20, "use-case"], [22, "use-case"]], "Design": [[7, "design"]], "Import DLPack Capsule": [[7, "import-dlpack-capsule"]], "Export DLPack Capsule": [[7, "export-dlpack-capsule"]], "DLDevice and data pointer": [[7, "dldevice-and-data-pointer"]], "Asynchronous Programming": [[7, "asynchronous-programming"]], "Example Case": [[7, "example-case"]], "Motivation and Example": [[8, "motivation-and-example"]], "Writing a DPC++ Extension": [[8, "writing-a-dpc-extension"]], "Building with setuptools": [[8, "building-with-setuptools"]], "JIT Compiling Extensions": [[8, "jit-compiling-extensions"]], "Building with CMake": [[8, "building-with-cmake"]], "Requesting the current c10::Stream": [[8, "requesting-the-current-c10-stream"]], "Fetching the corresponding sycl::queue": [[8, "fetching-the-corresponding-sycl-queue"]], "Writing the DPC++ Op": [[8, "writing-the-dpc-op"]], "Using accessors": [[8, "using-accessors"]], "FSDP Usage (GPU only)": [[9, "fsdp-usage-gpu-only"]], "Example": [[9, "example"]], "Build Time Configuration": [[10, "build-time-configuration"]], "Runtime Configuration": [[10, "runtime-configuration"]], "Auto Mixed Precision (AMP) on GPU": [[11, "auto-mixed-precision-amp-on-gpu"]], "Default Precision": [[11, "default-precision"]], "Inference with Imperative Path": [[11, "inference-with-imperative-path"]], "Inference with TorchScript Path": [[11, "inference-with-torchscript-path"]], "Training Support": [[11, "training-support"]], "Autocast Op Reference": [[11, "autocast-op-reference"]], "Op Eligibility": [[11, "op-eligibility"]], "Op-Specific Behavior": [[11, "op-specific-behavior"]], "Ops that can autocast to bfloat16": [[11, "ops-that-can-autocast-to-bfloat16"]], "Ops that can autocast to float16": [[11, 
"ops-that-can-autocast-to-float16"]], "Ops that can autocast to float32": [[11, "ops-that-can-autocast-to-float32"]], "Ops that promote to the widest input type": [[11, "ops-that-promote-to-the-widest-input-type"]], "Auto Channels Last": [[12, "auto-channels-last"]], "Ease-of-use auto channels last API": [[12, "ease-of-use-auto-channels-last-api"]], "default": [[12, "default"]], "enable": [[12, "enable"]], "disable": [[12, "disable"]], "Known issue": [[12, "known-issue"]], "Compute Engine (Experimental feature for debug)": [[13, "compute-engine-experimental-feature-for-debug"]], "Engine Selection Policy": [[13, "engine-selection-policy"]], "Multiple Implementations Operators and Engines": [[13, "multiple-implementations-operators-and-engines"]], "Intel\u00ae Extension for PyTorch* - DeepSpeed* Kernels": [[14, "intel-extension-for-pytorch-deepspeed-kernels"]], "Supported Platform": [[14, "supported-platform"]], "Float8 Data Type Support (Prototype)": [[15, "float8-data-type-support-prototype"]], "Float8 Data Type": [[15, "float8-data-type"]], "FP8 Quantization": [[15, "fp8-quantization"]], "Supported running mode": [[15, "supported-running-mode"]], "Supported operators": [[15, "supported-operators"]], "FP8 usage example": [[15, "fp8-usage-example"]], "Horovod with PyTorch (Prototype)": [[16, "horovod-with-pytorch-prototype"]], "Install Horovod with PyTorch": [[16, "install-horovod-with-pytorch"]], "Horovod with PyTorch Usage": [[16, "horovod-with-pytorch-usage"]], "Intel\u00ae Extension for PyTorch* Optimizations for Quantization [GPU]": [[17, "intel-extension-for-pytorch-optimizations-for-quantization-gpu"]], "IPEX_LOGGING (Prototype)": [[18, "ipex-logging-prototype"]], "IPEX_LOGGING Definition": [[18, "ipex-logging-definition"]], "Log Level": [[18, "log-level"]], "Log Component": [[18, "log-component"]], "Usage in C++": [[18, "usage-in-c"]], "Simple Log": [[18, "simple-log"]], "Event Log": [[18, "event-log"]], "Enviornment settings": [[18, 
"enviornment-settings"]], "Usage in python": [[18, "usage-in-python"]], "Replace IPEX_SIMPLE_TRACE": [[18, "replace-ipex-simple-trace"]], "Replace IPEX_VERBOSE": [[18, "replace-ipex-verbose"]], "What is Channels Last": [[19, "what-is-channels-last"]], "Memory Format Is All That Matters": [[19, "memory-format-is-all-that-matters"]], "a. NCHW (default)": [[19, "a-nchw-default"]], "b. NHWC": [[19, "b-nhwc"]], "c. Blocked (nChw16c, on CPU)": [[19, "c-blocked-nchw16c-on-cpu"]], "PyTorch Strided Layout": [[19, "pytorch-strided-layout"]], "Channels Last Memory Format APIs": [[19, "channels-last-memory-format-apis"]], "a. tensor creation": [[19, "a-tensor-creation"]], "b. tensor conversion": [[19, "b-tensor-conversion"]], "c. model conversion": [[19, "c-model-conversion"]], "d. operator coverage in PyTorch": [[19, "d-operator-coverage-in-pytorch"]], "Writing Channels Last Kernels on CPU": [[19, "writing-channels-last-kernels-on-cpu"]], "a. Register Channels Last Kernel in ATen Native Manner": [[19, "a-register-channels-last-kernel-in-aten-native-manner"]], "b. Register oneDNN Kernel on Channels Last": [[19, "b-register-onednn-kernel-on-channels-last"]], "oneDNN NHWC APIs": [[19, "onednn-nhwc-apis"]], "a. Create NHWC Memory": [[19, "a-create-nhwc-memory"]], "b. Create Convolution Primitive": [[19, "b-create-convolution-primitive"]], "Channels Last 1D support on XPU": [[19, "channels-last-1d-support-on-xpu"]], "a. tensor conversion with Channels Last 1D": [[19, "a-tensor-conversion-with-channels-last-1d"]], "b. model conversion with Channels Last 1D": [[19, "b-model-conversion-with-channels-last-1d"]], "c. 
determine if in Channels Last 1D memory format": [[19, "c-determine-if-in-channels-last-1d-memory-format"]], "Build Tool": [[20, "build-tool"]], "Use Tool": [[20, "use-tool"]], "Set Environment Variable": [[20, "set-environment-variable"]], "Add Profiler Into Script": [[20, "add-profiler-into-script"]], "Disable Tool in Model Script": [[20, "disable-tool-in-model-script"]], "Disable Tool Partly for XPU Backend": [[20, "disable-tool-partly-for-xpu-backend"]], "Profile on Multi-device Application": [[20, "profile-on-multi-device-application"]], "Result": [[20, "result"]], "Export to Chrome Trace": [[20, "export-to-chrome-trace"]], "Known issues": [[20, "known-issues"]], "Legacy Profiler Tool (Deprecated)": [[21, "legacy-profiler-tool-deprecated"]], "Enable and Disable Tool": [[22, "enable-and-disable-tool"]], "Use Simple Trace in Model": [[22, "use-simple-trace-in-model"]], "Results": [[22, "results"]], "Required Dependencies": [[23, "required-dependencies"]], "Inferenece with torch.compile": [[23, "inferenece-with-torch-compile"]], "Training with torch.compile": [[23, "training-with-torch-compile"]], "Quick Start": [[24, "quick-start"]], "Execution": [[24, "execution"]], "Installation": [[25, "installation"]], "Get Started": [[26, "get-started"]], "Troubleshooting": [[27, "troubleshooting"]], "General Usage": [[27, "general-usage"]], "Library Dependencies": [[27, "library-dependencies"]], "Unit Test": [[27, "unit-test"]], "License": [[28, "license"]], "Large Language Models (LLM) Optimizations Overview": [[29, "large-language-models-llm-optimizations-overview"]], "Optimized Models": [[29, "optimized-models"]], "Optimization Methodologies": [[29, "optimization-methodologies"]], "Linear Operator Optimization": [[29, "linear-operator-optimization"]], "Deep Fusion Policy": [[29, "deep-fusion-policy"]], "Segment KV Cache": [[29, "segment-kv-cache"]], "Distributed Inference": [[29, "distributed-inference"]], "Low Precision Data Types": [[29, "low-precision-data-types"]], 
"Weight Only Quantization INT4": [[29, "weight-only-quantization-int4"]], "Weight-Only Quantization (Prototype)": [[30, "weight-only-quantization-prototype"]], "Supported Framework Model Matrix": [[30, "supported-framework-model-matrix"]], "References": [[30, "references"]], "Weight-Only Quantization LLM features in Intel\u00ae Extension for PyTorch*": [[30, "weight-only-quantization-llm-features-in-intel-extension-for-pytorch"]], "Weight-Only Quantization Initialization": [[30, "weight-only-quantization-initialization"]], "Weight-Only Quantization Runtime": [[30, "weight-only-quantization-runtime"]], "Weight-Only Quantization Linear Dispatch": [[30, "weight-only-quantization-linear-dispatch"]], "Usage of running Weight-Only Quantization LLM For Intel\u00ae GPU": [[30, "usage-of-running-weight-only-quantization-llm-for-intel-gpu"]], "Environment Setup": [[30, "environment-setup"]], "Run Weight-Only Quantization LLM on Intel\u00ae GPU": [[30, "run-weight-only-quantization-llm-on-intel-gpu"]], "Install Intel-extension-for-transformers and Neural-compressor": [[30, "install-intel-extension-for-transformers-and-neural-compressor"]], "Quantize Model and Inference": [[30, "quantize-model-and-inference"]], "Save and Load Quantized Model (Optional)": [[30, "save-and-load-quantized-model-optional"]], "Execute WOQ benchmark script": [[30, "execute-woq-benchmark-script"]], "Transformers Optimization Frontend API": [[31, "transformers-optimization-frontend-api"]], "Pseudocode of Common Usage Scenarios": [[31, "pseudocode-of-common-usage-scenarios"]], "FP16": [[31, "fp16"]], "SmoothQuant": [[31, "smoothquant"]], "Imperative mode": [[31, "imperative-mode"]], "Distributed Inference with DeepSpeed": [[31, "distributed-inference-with-deepspeed"]], "Performance": [[32, "performance"]], "Overview": [[32, "overview"]], "Performance Data for Intel\u00ae AI Data Center Products": [[32, "performance-data-for-intel-ai-data-center-products"]], "LLM Performance v2.1.10": [[32, 
"llm-performance-v2-1-10"]], "Configuration": [[32, "configuration"]], "Software Version": [[32, "software-version"]], "Hardware Configuration": [[32, "hardware-configuration"]], "Releases": [[33, "releases"]], "2.1.30+xpu": [[33, "xpu"]], "Highlights": [[33, "highlights"], [33, "id2"], [33, "id5"], [33, "id8"], [33, "id11"], [33, "id14"], [33, "id16"]], "Known Issues": [[33, "known-issues"], [33, "id3"], [33, "id6"], [33, "id9"], [33, "id12"], [33, "id15"], [33, "id17"]], "2.1.20+xpu": [[33, "id1"]], "2.1.10+xpu": [[33, "id4"]], "2.0.110+xpu": [[33, "id7"]], "1.13.120+xpu": [[33, "id10"]], "1.13.10+xpu": [[33, "id13"]], "1.10.200+gpu": [[33, "gpu"]], "Technical Details": [[34, "technical-details"]], "Optimizer Optimization [GPU]": [[34, "optimizer-optimization-gpu"]], "Ahead of Time Compilation (AOT) [GPU]": [[34, "ahead-of-time-compilation-aot-gpu"]], "Memory Management [GPU]": [[34, "memory-management-gpu"]], "ipex.optimize [GPU]": [[34, "ipex-optimize-gpu"]], "Ahead of Time (AOT) Compilation": [[35, "ahead-of-time-aot-compilation"]], "Use case": [[35, "use-case"]], "Requirement": [[35, "requirement"]], "ipex.optimize Frontend API": [[36, "ipex-optimize-frontend-api"]], "Automatic Channels Last": [[36, "automatic-channels-last"]], "conv_bn_folding": [[36, "conv-bn-folding"]], "linear_bn_folding": [[36, "linear-bn-folding"]], "replace_dropout_with_identity": [[36, "replace-dropout-with-identity"]], "split_master_weight_for_bf16": [[36, "split-master-weight-for-bf16"]], "fuse_update_step": [[36, "fuse-update-step"]], "Memory Management": [[37, "memory-management"]], "Optimizer Fusion on GPU": [[38, "optimizer-fusion-on-gpu"]], "Operation Fusion": [[38, "operation-fusion"]]}, "indexentries": {"event (class in intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.Event"]], "stream (class in intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.Stream"]], "current_device() (in module intel_extension_for_pytorch.xpu)": [[1, 
"intel_extension_for_pytorch.xpu.current_device"]], "current_stream() (in module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.current_stream"]], "device (class in intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.device"]], "device_count() (in module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.device_count"]], "device_of (class in intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.device_of"]], "elapsed_time() (intel_extension_for_pytorch.xpu.event method)": [[1, "intel_extension_for_pytorch.xpu.Event.elapsed_time"]], "empty_cache() (in module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.empty_cache"]], "fp8_autocast() (in module intel_extension_for_pytorch.xpu.fp8.fp8)": [[1, "intel_extension_for_pytorch.xpu.fp8.fp8.fp8_autocast"]], "get_device_name() (in module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.get_device_name"]], "get_device_properties() (in module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.get_device_properties"]], "get_fp32_math_mode() (in module intel_extension_for_pytorch)": [[1, "intel_extension_for_pytorch.get_fp32_math_mode"]], "get_rng_state() (in module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.get_rng_state"]], "get_rng_state_all() (in module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.get_rng_state_all"]], "init() (in module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.init"]], "initial_seed() (in module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.initial_seed"]], "is_available() (in module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.is_available"]], "is_initialized() (in module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.is_initialized"]], "manual_seed() (in module 
intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.manual_seed"]], "manual_seed_all() (in module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.manual_seed_all"]], "max_memory_allocated() (in module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.max_memory_allocated"]], "max_memory_reserved() (in module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.max_memory_reserved"]], "memory_allocated() (in module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.memory_allocated"]], "memory_reserved() (in module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.memory_reserved"]], "memory_snapshot() (in module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.memory_snapshot"]], "memory_stats() (in module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.memory_stats"]], "memory_stats_as_nested_dict() (in module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.memory_stats_as_nested_dict"]], "memory_summary() (in module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.memory_summary"]], "optimize() (in module intel_extension_for_pytorch)": [[1, "intel_extension_for_pytorch.optimize"]], "optimize_transformers() (in module intel_extension_for_pytorch)": [[1, "intel_extension_for_pytorch.optimize_transformers"]], "query() (intel_extension_for_pytorch.xpu.event method)": [[1, "intel_extension_for_pytorch.xpu.Event.query"]], "record() (intel_extension_for_pytorch.xpu.event method)": [[1, "intel_extension_for_pytorch.xpu.Event.record"]], "record_event() (intel_extension_for_pytorch.xpu.stream method)": [[1, "intel_extension_for_pytorch.xpu.Stream.record_event"]], "reset_accumulated_memory_stats() (in module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.reset_accumulated_memory_stats"]], "reset_peak_memory_stats() (in 
module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.reset_peak_memory_stats"]], "seed() (in module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.seed"]], "seed_all() (in module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.seed_all"]], "set_device() (in module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.set_device"]], "set_fp32_math_mode() (in module intel_extension_for_pytorch)": [[1, "intel_extension_for_pytorch.set_fp32_math_mode"]], "set_rng_state() (in module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.set_rng_state"]], "set_rng_state_all() (in module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.set_rng_state_all"]], "stream() (in module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.stream"]], "sycl_queue (intel_extension_for_pytorch.xpu.stream property)": [[1, "intel_extension_for_pytorch.xpu.Stream.sycl_queue"]], "synchronize() (in module intel_extension_for_pytorch.xpu)": [[1, "intel_extension_for_pytorch.xpu.synchronize"]], "synchronize() (intel_extension_for_pytorch.xpu.event method)": [[1, "intel_extension_for_pytorch.xpu.Event.synchronize"]], "synchronize() (intel_extension_for_pytorch.xpu.stream method)": [[1, "intel_extension_for_pytorch.xpu.Stream.synchronize"]], "wait() (intel_extension_for_pytorch.xpu.event method)": [[1, "intel_extension_for_pytorch.xpu.Event.wait"]], "wait_event() (intel_extension_for_pytorch.xpu.stream method)": [[1, "intel_extension_for_pytorch.xpu.Stream.wait_event"]], "wait_stream() (intel_extension_for_pytorch.xpu.stream method)": [[1, "intel_extension_for_pytorch.xpu.Stream.wait_stream"]], "xpu::fp32_math_mode (c++ enum)": [[1, "_CPPv4N3xpu14FP32_MATH_MODEE"]], "xpu::fp32_math_mode::bf32 (c++ enumerator)": [[1, "_CPPv4N3xpu14FP32_MATH_MODE4BF32E"]], "xpu::fp32_math_mode::fp32 (c++ enumerator)": [[1, 
"_CPPv4N3xpu14FP32_MATH_MODE4FP32E"]], "xpu::fp32_math_mode::fp32_math_mode_max (c++ enumerator)": [[1, "_CPPv4N3xpu14FP32_MATH_MODE18FP32_MATH_MODE_MAXE"]], "xpu::fp32_math_mode::fp32_math_mode_min (c++ enumerator)": [[1, "_CPPv4N3xpu14FP32_MATH_MODE18FP32_MATH_MODE_MINE"]], "xpu::fp32_math_mode::tf32 (c++ enumerator)": [[1, "_CPPv4N3xpu14FP32_MATH_MODE4TF32E"]], "xpu::get_queue_from_stream (c++ function)": [[1, "_CPPv4N3xpu21get_queue_from_streamEN3c106StreamE"]], "xpu::set_fp32_math_mode (c++ function)": [[1, "_CPPv4N3xpu18set_fp32_math_modeE14FP32_MATH_MODE"]]}}) \ No newline at end of file diff --git a/xpu/2.1.30+xpu/tutorials/api_doc.html b/xpu/2.1.30+xpu/tutorials/api_doc.html index c6cab16e1..e82ff17a0 100644 --- a/xpu/2.1.30+xpu/tutorials/api_doc.html +++ b/xpu/2.1.30+xpu/tutorials/api_doc.html @@ -1181,7 +1181,7 @@

C++ APISphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/xpu/2.1.30+xpu/tutorials/blogs_publications.html b/xpu/2.1.30+xpu/tutorials/blogs_publications.html index a7c99b69d..872de784c 100644 --- a/xpu/2.1.30+xpu/tutorials/blogs_publications.html +++ b/xpu/2.1.30+xpu/tutorials/blogs_publications.html @@ -162,7 +162,7 @@

Blogs & PublicationsSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/xpu/2.1.30+xpu/tutorials/contribution.html b/xpu/2.1.30+xpu/tutorials/contribution.html index 4189887dd..2ecf5f115 100644 --- a/xpu/2.1.30+xpu/tutorials/contribution.html +++ b/xpu/2.1.30+xpu/tutorials/contribution.html @@ -268,7 +268,7 @@

Tips Built with Sphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/xpu/2.1.30+xpu/tutorials/examples.html b/xpu/2.1.30+xpu/tutorials/examples.html index 8fa9ee073..62bcd10cd 100644 --- a/xpu/2.1.30+xpu/tutorials/examples.html +++ b/xpu/2.1.30+xpu/tutorials/examples.html @@ -1112,7 +1112,7 @@

Intel® AI Reference ModelsSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/xpu/2.1.30+xpu/tutorials/features.html b/xpu/2.1.30+xpu/tutorials/features.html index 74719ae56..f0f555816 100644 --- a/xpu/2.1.30+xpu/tutorials/features.html +++ b/xpu/2.1.30+xpu/tutorials/features.html @@ -280,7 +280,7 @@

IPEX_LOGGINGSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/xpu/2.1.30+xpu/tutorials/features/DDP.html b/xpu/2.1.30+xpu/tutorials/features/DDP.html index 5f0d453a9..586ba3aa6 100644 --- a/xpu/2.1.30+xpu/tutorials/features/DDP.html +++ b/xpu/2.1.30+xpu/tutorials/features/DDP.html @@ -400,7 +400,7 @@

Usage of DDP scaling APISphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/xpu/2.1.30+xpu/tutorials/features/DLPack.html b/xpu/2.1.30+xpu/tutorials/features/DLPack.html index 22450f194..15f4d1b1c 100644 --- a/xpu/2.1.30+xpu/tutorials/features/DLPack.html +++ b/xpu/2.1.30+xpu/tutorials/features/DLPack.html @@ -221,7 +221,7 @@

Example CaseSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/xpu/2.1.30+xpu/tutorials/features/DPC++_Extension.html b/xpu/2.1.30+xpu/tutorials/features/DPC++_Extension.html index c7486a8ca..56d2cdbde 100644 --- a/xpu/2.1.30+xpu/tutorials/features/DPC++_Extension.html +++ b/xpu/2.1.30+xpu/tutorials/features/DPC++_Extension.html @@ -645,7 +645,7 @@

Using accessorsSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/xpu/2.1.30+xpu/tutorials/features/FSDP.html b/xpu/2.1.30+xpu/tutorials/features/FSDP.html index 5e80ba770..7865c84d3 100644 --- a/xpu/2.1.30+xpu/tutorials/features/FSDP.html +++ b/xpu/2.1.30+xpu/tutorials/features/FSDP.html @@ -447,7 +447,7 @@

ExampleSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/xpu/2.1.30+xpu/tutorials/features/advanced_configuration.html b/xpu/2.1.30+xpu/tutorials/features/advanced_configuration.html index 32d15558f..64a3a30b6 100644 --- a/xpu/2.1.30+xpu/tutorials/features/advanced_configuration.html +++ b/xpu/2.1.30+xpu/tutorials/features/advanced_configuration.html @@ -381,7 +381,7 @@

Runtime ConfigurationSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/xpu/2.1.30+xpu/tutorials/features/amp_gpu.html b/xpu/2.1.30+xpu/tutorials/features/amp_gpu.html index 75c1d7f76..21635d864 100644 --- a/xpu/2.1.30+xpu/tutorials/features/amp_gpu.html +++ b/xpu/2.1.30+xpu/tutorials/features/amp_gpu.html @@ -259,7 +259,7 @@

Ops that promote to the widest input typeSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/xpu/2.1.30+xpu/tutorials/features/auto_channels_last.html b/xpu/2.1.30+xpu/tutorials/features/auto_channels_last.html index a4e8e367c..b339e3ca3 100644 --- a/xpu/2.1.30+xpu/tutorials/features/auto_channels_last.html +++ b/xpu/2.1.30+xpu/tutorials/features/auto_channels_last.html @@ -186,7 +186,7 @@

Known issueSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/xpu/2.1.30+xpu/tutorials/features/compute_engine.html b/xpu/2.1.30+xpu/tutorials/features/compute_engine.html index c37573d55..eba996d55 100644 --- a/xpu/2.1.30+xpu/tutorials/features/compute_engine.html +++ b/xpu/2.1.30+xpu/tutorials/features/compute_engine.html @@ -197,7 +197,7 @@

Multiple Implementations Operators and EnginesSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/xpu/2.1.30+xpu/tutorials/features/deepspeed_kernels.html b/xpu/2.1.30+xpu/tutorials/features/deepspeed_kernels.html index 25f2983f9..b34e4fc85 100644 --- a/xpu/2.1.30+xpu/tutorials/features/deepspeed_kernels.html +++ b/xpu/2.1.30+xpu/tutorials/features/deepspeed_kernels.html @@ -132,7 +132,7 @@

Supported PlatformSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/xpu/2.1.30+xpu/tutorials/features/float8.html b/xpu/2.1.30+xpu/tutorials/features/float8.html index 6d5010c8a..8e22ca338 100644 --- a/xpu/2.1.30+xpu/tutorials/features/float8.html +++ b/xpu/2.1.30+xpu/tutorials/features/float8.html @@ -187,7 +187,7 @@

FP8 usage exampleSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/xpu/2.1.30+xpu/tutorials/features/horovod.html b/xpu/2.1.30+xpu/tutorials/features/horovod.html index f34f41e48..c8837fc0c 100644 --- a/xpu/2.1.30+xpu/tutorials/features/horovod.html +++ b/xpu/2.1.30+xpu/tutorials/features/horovod.html @@ -245,7 +245,7 @@

Horovod with PyTorch UsageSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/xpu/2.1.30+xpu/tutorials/features/int8_overview_xpu.html b/xpu/2.1.30+xpu/tutorials/features/int8_overview_xpu.html index 1afdfe45e..38c81f6de 100644 --- a/xpu/2.1.30+xpu/tutorials/features/int8_overview_xpu.html +++ b/xpu/2.1.30+xpu/tutorials/features/int8_overview_xpu.html @@ -237,7 +237,7 @@

TorchScript ModeSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/xpu/2.1.30+xpu/tutorials/features/ipex_log.html b/xpu/2.1.30+xpu/tutorials/features/ipex_log.html index 29e951066..928df25fa 100644 --- a/xpu/2.1.30+xpu/tutorials/features/ipex_log.html +++ b/xpu/2.1.30+xpu/tutorials/features/ipex_log.html @@ -308,7 +308,7 @@

Replace IPEX_VE Built with Sphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/xpu/2.1.30+xpu/tutorials/features/nhwc.html b/xpu/2.1.30+xpu/tutorials/features/nhwc.html index 8a310ad81..03c119a88 100644 --- a/xpu/2.1.30+xpu/tutorials/features/nhwc.html +++ b/xpu/2.1.30+xpu/tutorials/features/nhwc.html @@ -423,7 +423,7 @@

c. determine if in Channels Last 1D memory formatSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/xpu/2.1.30+xpu/tutorials/features/profiler_kineto.html b/xpu/2.1.30+xpu/tutorials/features/profiler_kineto.html index 4904a4ec4..877b3faf4 100644 --- a/xpu/2.1.30+xpu/tutorials/features/profiler_kineto.html +++ b/xpu/2.1.30+xpu/tutorials/features/profiler_kineto.html @@ -326,7 +326,7 @@

Known issuesSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/xpu/2.1.30+xpu/tutorials/features/profiler_legacy.html b/xpu/2.1.30+xpu/tutorials/features/profiler_legacy.html index fa108e50d..b81b69f6a 100644 --- a/xpu/2.1.30+xpu/tutorials/features/profiler_legacy.html +++ b/xpu/2.1.30+xpu/tutorials/features/profiler_legacy.html @@ -122,7 +122,7 @@

IntroductionSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/xpu/2.1.30+xpu/tutorials/features/simple_trace.html b/xpu/2.1.30+xpu/tutorials/features/simple_trace.html index b218d1df3..0d32a1623 100644 --- a/xpu/2.1.30+xpu/tutorials/features/simple_trace.html +++ b/xpu/2.1.30+xpu/tutorials/features/simple_trace.html @@ -248,7 +248,7 @@

ResultsSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/xpu/2.1.30+xpu/tutorials/features/torch_compile_gpu.html b/xpu/2.1.30+xpu/tutorials/features/torch_compile_gpu.html index 5dc6c69e9..f6b42c7a9 100644 --- a/xpu/2.1.30+xpu/tutorials/features/torch_compile_gpu.html +++ b/xpu/2.1.30+xpu/tutorials/features/torch_compile_gpu.html @@ -218,7 +218,7 @@

Training with torch.compileSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/xpu/2.1.30+xpu/tutorials/getting_started.html b/xpu/2.1.30+xpu/tutorials/getting_started.html index 04f00abe3..0d78ab396 100644 --- a/xpu/2.1.30+xpu/tutorials/getting_started.html +++ b/xpu/2.1.30+xpu/tutorials/getting_started.html @@ -177,7 +177,7 @@

ExecutionSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/xpu/2.1.30+xpu/tutorials/installation.html b/xpu/2.1.30+xpu/tutorials/installation.html index f0bcc81b0..40352ad1b 100644 --- a/xpu/2.1.30+xpu/tutorials/installation.html +++ b/xpu/2.1.30+xpu/tutorials/installation.html @@ -125,7 +125,7 @@

InstallationSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/xpu/2.1.30+xpu/tutorials/introduction.html b/xpu/2.1.30+xpu/tutorials/introduction.html index d174d10bf..f8504d6a5 100644 --- a/xpu/2.1.30+xpu/tutorials/introduction.html +++ b/xpu/2.1.30+xpu/tutorials/introduction.html @@ -143,7 +143,7 @@

API DocumentationSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/xpu/2.1.30+xpu/tutorials/known_issues.html b/xpu/2.1.30+xpu/tutorials/known_issues.html index 0aac58577..9306c78b7 100644 --- a/xpu/2.1.30+xpu/tutorials/known_issues.html +++ b/xpu/2.1.30+xpu/tutorials/known_issues.html @@ -261,7 +261,7 @@

Unit TestSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/xpu/2.1.30+xpu/tutorials/license.html b/xpu/2.1.30+xpu/tutorials/license.html index 7abe1b65e..bab9fe943 100644 --- a/xpu/2.1.30+xpu/tutorials/license.html +++ b/xpu/2.1.30+xpu/tutorials/license.html @@ -126,7 +126,7 @@

LicenseSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/xpu/2.1.30+xpu/tutorials/llm.html b/xpu/2.1.30+xpu/tutorials/llm.html index 27866c623..96d2ce305 100644 --- a/xpu/2.1.30+xpu/tutorials/llm.html +++ b/xpu/2.1.30+xpu/tutorials/llm.html @@ -131,33 +131,45 @@

Optimized Models

Model Family

-

LLAMA2

-

GPT-J

-

Qwen

-

OPT

-

BLOOM

+

Verified < MODEL ID > (Huggingface hub)

+

FP16

+

INT4 WOQ

-

Verified < MODEL ID > (Huggingface hub)

+

Llama2

“meta-llama/Llama-2-7b-hf”, “meta-llama/Llama-2-13b-hf”, “meta-llama/Llama-2-70b-hf”

-

“EleutherAI/gpt-j-6b”

-

“Qwen/Qwen-7B”

-

“facebook/opt-30b”, “facebook/opt-1.3b”

-

“bigscience/bloom-7b1”, “bigscience/bloom”

- -

FP16

+ +

GPT-J

+

“EleutherAI/gpt-j-6b”

+ +

Qwen

+

“Qwen/Qwen-7B”

+

-

INT4 WOQ

+

OPT

+

“facebook/opt-30b”, “facebook/opt-1.3b”

+

+ +

Bloom

+

“bigscience/bloom-7b1”, “bigscience/bloom”

+

+ +

ChatGLM3-6B

+

“THUDM/chatglm3-6b”

+ +

Baichuan2-13B

+

“baichuan-inc/Baichuan2-13B-Chat”

+

@@ -244,7 +256,7 @@

Weight Only Quantization INT4Sphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
@@ -260,4 +272,4 @@

Weight Only Quantization INT4Sphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/xpu/2.1.30+xpu/tutorials/llm/llm_optimize_transformers.html b/xpu/2.1.30+xpu/tutorials/llm/llm_optimize_transformers.html index a98b15c93..e95b1817c 100644 --- a/xpu/2.1.30+xpu/tutorials/llm/llm_optimize_transformers.html +++ b/xpu/2.1.30+xpu/tutorials/llm/llm_optimize_transformers.html @@ -255,7 +255,7 @@

Distributed Inference with DeepSpeedSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/xpu/2.1.30+xpu/tutorials/performance.html b/xpu/2.1.30+xpu/tutorials/performance.html index a40a6e4f6..eef119c01 100644 --- a/xpu/2.1.30+xpu/tutorials/performance.html +++ b/xpu/2.1.30+xpu/tutorials/performance.html @@ -297,7 +297,7 @@

Hardware ConfigurationSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/xpu/2.1.30+xpu/tutorials/releases.html b/xpu/2.1.30+xpu/tutorials/releases.html index 6723df8f3..0a9e99d07 100644 --- a/xpu/2.1.30+xpu/tutorials/releases.html +++ b/xpu/2.1.30+xpu/tutorials/releases.html @@ -469,7 +469,7 @@

Known IssuesSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/xpu/2.1.30+xpu/tutorials/technical_details.html b/xpu/2.1.30+xpu/tutorials/technical_details.html index e5809e7d4..46a34a31d 100644 --- a/xpu/2.1.30+xpu/tutorials/technical_details.html +++ b/xpu/2.1.30+xpu/tutorials/technical_details.html @@ -183,7 +183,7 @@

ipex.optimizeSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/xpu/2.1.30+xpu/tutorials/technical_details/AOT.html b/xpu/2.1.30+xpu/tutorials/technical_details/AOT.html index b96fbf0be..68e8328b2 100644 --- a/xpu/2.1.30+xpu/tutorials/technical_details/AOT.html +++ b/xpu/2.1.30+xpu/tutorials/technical_details/AOT.html @@ -174,7 +174,7 @@

RequirementSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/xpu/2.1.30+xpu/tutorials/technical_details/ipex_optimize.html b/xpu/2.1.30+xpu/tutorials/technical_details/ipex_optimize.html index 8405ffb5f..5d307c410 100644 --- a/xpu/2.1.30+xpu/tutorials/technical_details/ipex_optimize.html +++ b/xpu/2.1.30+xpu/tutorials/technical_details/ipex_optimize.html @@ -186,7 +186,7 @@

fuse_update_stepSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (0BSD), http://opensource.org/licenses/0BSD.
diff --git a/xpu/2.1.30+xpu/tutorials/technical_details/memory_management.html b/xpu/2.1.30+xpu/tutorials/technical_details/memory_management.html index 5fade111b..da882e35d 100644 --- a/xpu/2.1.30+xpu/tutorials/technical_details/memory_management.html +++ b/xpu/2.1.30+xpu/tutorials/technical_details/memory_management.html @@ -146,7 +146,7 @@

Memory ManagementSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (0BSD), http://opensource.org/licenses/0BSD.
diff --git a/xpu/2.1.30+xpu/tutorials/technical_details/optimizer_fusion_gpu.html b/xpu/2.1.30+xpu/tutorials/technical_details/optimizer_fusion_gpu.html index 2d0c9b347..192e64ce4 100644 --- a/xpu/2.1.30+xpu/tutorials/technical_details/optimizer_fusion_gpu.html +++ b/xpu/2.1.30+xpu/tutorials/technical_details/optimizer_fusion_gpu.html @@ -171,7 +171,7 @@

Operation FusionSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (0BSD), http://opensource.org/licenses/0BSD.