Merge pull request #82 from fabio-sim/feat/torch-export
feat: Dynamic batch
fabio-sim authored Jul 17, 2024
2 parents fc1d67a + a40fb4c commit 9ebf215
Showing 23 changed files with 2,105 additions and 197 deletions.
83 changes: 73 additions & 10 deletions .gitignore
@@ -1,15 +1,78 @@
*.egg-info
*.pyc
/.idea/
/data/
/outputs/
__pycache__
/lightglue/weights/
lightglue/_flash/
*-checkpoint.ipynb
# Models / data
*.pth
*.pt2
*.onnx
*.engine
*.profile
.vscode
*.timing
data
megadepth_test_1500

# VSCode
.vscode/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
2 changes: 1 addition & 1 deletion LICENSE
@@ -187,7 +187,7 @@
identification within third-party archives.

Copyright 2023 ETH Zurich
Copyright 2023 Fabio Milentiansen Sim
Copyright 2023-2024 Fabio Milentiansen Sim

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
29 changes: 28 additions & 1 deletion README.md
@@ -4,19 +4,22 @@
[![TensorRT](https://img.shields.io/badge/TensorRT-76B900)](https://developer.nvidia.com/tensorrt)
[![GitHub Repo stars](https://img.shields.io/github/stars/fabio-sim/LightGlue-ONNX)](https://github.com/fabio-sim/LightGlue-ONNX/stargazers)
[![GitHub all releases](https://img.shields.io/github/downloads/fabio-sim/LightGlue-ONNX/total)](https://github.com/fabio-sim/LightGlue-ONNX/releases)
[![Blog](https://img.shields.io/badge/Blog-blue)](https://fabio-sim.github.io/blog/accelerating-lightglue-inference-onnx-runtime-tensorrt/)

# LightGlue ONNX

Open Neural Network Exchange (ONNX) compatible implementation of [LightGlue: Local Feature Matching at Light Speed](https://github.com/cvg/LightGlue). The ONNX model format allows for interoperability across different platforms with support for multiple execution providers, and removes Python-specific dependencies such as PyTorch. Supports TensorRT and OpenVINO.

> ***What's New - 04 October 2023:*** Fused LightGlue ONNX Models with support for FlashAttention-2 via `onnxruntime>=1.16.0`, up to 80% faster inference on long sequence lengths (number of keypoints).
> ***What's New***: End-to-end parallel dynamic batch size support. Read more in this [blog post](https://fabio-sim.github.io/blog/accelerating-lightglue-inference-onnx-runtime-tensorrt/).
<p align="center"><a href="https://arxiv.org/abs/2306.13643"><img src="assets/easy_hard.jpg" alt="LightGlue figure" width=80%></a>

<details>
<summary>Changelog</summary>

- **17 July 2024**: End-to-end parallel dynamic batch size support. Revamp script UX. Add [blog post](https://fabio-sim.github.io/blog/accelerating-lightglue-inference-onnx-runtime-tensorrt/).
- **02 November 2023**: Introduce TopK-trick to optimize out ArgMax for about 30% speedup.
- **04 October 2023:** Fused LightGlue ONNX Models with support for FlashAttention-2 via `onnxruntime>=1.16.0`, up to 80% faster inference on long sequence lengths (number of keypoints).
- **27 October 2023**: LightGlue-ONNX added to [Kornia](https://kornia.readthedocs.io/en/latest/feature.html#kornia.feature.OnnxLightGlue)!
- **04 October 2023**: Multihead-attention fusion optimization.
- **19 July 2023**: Add support for TensorRT.
@@ -28,6 +31,30 @@ Open Neural Network Exchange (ONNX) compatible implementation of [LightGlue: Loc
- **28 June 2023**: Add end-to-end SuperPoint+LightGlue export & inference pipeline.
</details>

## ⭐ ONNX Export & Inference

We provide a [typer](https://github.com/tiangolo/typer) CLI [`dynamo.py`](dynamo.py) to easily export LightGlue to ONNX and perform inference using ONNX Runtime. If you would like to try out inference right away, you can download ONNX models that have already been exported [here](https://github.com/fabio-sim/LightGlue-ONNX/releases).

```shell
$ python dynamo.py --help

Usage: dynamo.py [OPTIONS] COMMAND [ARGS]...

LightGlue Dynamo CLI

╭─ Commands ─────────────────────────────────────────╮
│ export  Export LightGlue to ONNX.                 │
│ infer   Run inference for LightGlue ONNX model.   │
╰─────────────────────────────────────────────────────╯
```

Pass `--help` to see the available options for each command. The CLI will export the full extractor-matcher pipeline so that you don't have to worry about orchestrating intermediate steps.
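
For instance, a fully dynamic export followed by CUDA inference might look like the sketch below (the image paths and output figure name are placeholders; passing `0` marks an axis as dynamic, and all flags are defined in [`dynamo.py`](dynamo.py)):

```shell
# Export the SuperPoint + LightGlue pipeline with dynamic batch, height, and width.
python dynamo.py export superpoint -b 0 -h 0 -w 0 -o weights/superpoint_lightglue_pipeline.onnx

# Match a pair of images using the CUDA execution provider and save the visualization.
python dynamo.py infer weights/superpoint_lightglue_pipeline.onnx left.jpg right.jpg superpoint -d cuda -o matches.png
```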

---

> [!NOTE]
> *The following sections use the legacy scripts.*
## 🔥 ONNX Export

Prior to exporting the ONNX models, please install the [export requirements](/requirements-export.txt).
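
For a typical pip-based setup, that amounts to something like:

```shell
pip install -r requirements-export.txt
```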
203 changes: 203 additions & 0 deletions dynamo.py
@@ -0,0 +1,203 @@
from pathlib import Path
from typing import Annotated, Optional

import cv2
import typer

from lightglue_dynamo.cli_utils import multiple_of
from lightglue_dynamo.config import Extractor, InferenceDevice

app = typer.Typer()


@app.callback()
def callback():
"""LightGlue Dynamo CLI"""


@app.command()
def export(
extractor_type: Annotated[Extractor, typer.Argument()] = Extractor.superpoint,
output: Annotated[
Optional[Path], # typer does not support Path | None # noqa: UP007
typer.Option("-o", "--output", dir_okay=False, writable=True, help="Path to save exported model."),
] = None,
batch_size: Annotated[
int,
typer.Option(
"-b",
"--batch-size",
min=0,
help="Batch size of exported ONNX model. Set to 0 to mark as dynamic.",
callback=multiple_of(2),
),
] = 0,
height: Annotated[
int,
typer.Option(
"-h", "--height", min=0, help="Height of input image. Set to 0 to mark as dynamic.", callback=multiple_of(8)
),
] = 0,
width: Annotated[
int,
typer.Option(
"-w", "--width", min=0, help="Width of input image. Set to 0 to mark as dynamic.", callback=multiple_of(8)
),
] = 0,
num_keypoints: Annotated[
int, typer.Option(min=128, help="Number of keypoints outputted by feature extractor.")
] = 1024,
fuse_multi_head_attention: Annotated[
bool,
typer.Option(
"--fuse-multi-head-attention",
help="Fuse multi-head attention subgraph into one optimized operation. (ONNX Runtime-only).",
),
] = False,
opset: Annotated[int, typer.Option(min=16, max=20, help="ONNX opset version of exported model.")] = 17,
fp16: Annotated[bool, typer.Option("--fp16", help="Whether to also convert to FP16.")] = False,
):
"""Export LightGlue to ONNX."""
import onnx
import torch
from onnxruntime.tools.symbolic_shape_infer import SymbolicShapeInference
from onnxruntime.transformers.float16 import convert_float_to_float16

from lightglue_dynamo.models import LightGlue, Pipeline, SuperPoint
from lightglue_dynamo.ops import use_fused_multi_head_attention

match extractor_type:
case Extractor.superpoint:
extractor = SuperPoint(num_keypoints=num_keypoints).eval()
matcher = LightGlue(extractor_type).eval()
pipeline = Pipeline(extractor, matcher)

if output is None:
output = Path(f"weights/{extractor_type}_lightglue_pipeline.onnx")

if height > 0 and width > 0 and num_keypoints > height * width:
raise typer.BadParameter("num_keypoints cannot be greater than height * width.")

if fuse_multi_head_attention:
typer.echo(
"Warning: Multi-head attention nodes will be fused. Exported model will only work with ONNX Runtime CPU & CUDA execution providers."
)
if torch.__version__ < "2.4":
raise typer.Abort("Fused multi-head attention requires PyTorch 2.4 or later.")
use_fused_multi_head_attention()

dynamic_axes = {"images": {}, "keypoints": {}}
if batch_size == 0:
dynamic_axes["images"][0] = "batch_size"
dynamic_axes["keypoints"][0] = "batch_size"
if height == 0:
dynamic_axes["images"][2] = "height"
if width == 0:
dynamic_axes["images"][3] = "width"
dynamic_axes |= {"matches": {0: "num_matches"}, "mscores": {0: "num_matches"}}
torch.onnx.export(
pipeline,
torch.zeros(batch_size or 2, 1, height or 256, width or 256),
str(output),
input_names=["images"],
output_names=["keypoints", "matches", "mscores"],
opset_version=opset,
dynamic_axes=dynamic_axes,
)
onnx.checker.check_model(output)
onnx.save_model(SymbolicShapeInference.infer_shapes(onnx.load_model(output), auto_merge=True), output) # type: ignore
if fp16:
typer.echo(
"Converting to FP16. Warning: This FP16 model should NOT be used for TensorRT. TRT provides its own fp16 option."
)
onnx.save_model(convert_float_to_float16(onnx.load_model(output)), output.with_suffix(".fp16.onnx"))


@app.command()
def infer(
model_path: Annotated[Path, typer.Argument(exists=True, dir_okay=False, readable=True, help="Path to ONNX model.")],
left_image_path: Annotated[
Path, typer.Argument(exists=True, dir_okay=False, readable=True, help="Path to first image.")
],
right_image_path: Annotated[
Path, typer.Argument(exists=True, dir_okay=False, readable=True, help="Path to second image.")
],
extractor_type: Annotated[Extractor, typer.Argument()] = Extractor.superpoint,
output_path: Annotated[
Optional[Path], # noqa: UP007
typer.Option(
"-o",
"--output",
dir_okay=False,
writable=True,
help="Path to save output matches figure. If not given, show visualization.",
),
] = None,
height: Annotated[
int,
typer.Option("-h", "--height", min=1, help="Height of input image at which to perform inference."),
] = 1024,
width: Annotated[
int,
typer.Option("-w", "--width", min=1, help="Width of input image at which to perform inference."),
] = 1024,
device: Annotated[
InferenceDevice, typer.Option("-d", "--device", help="Device to run inference on.")
] = InferenceDevice.cpu,
fp16: Annotated[bool, typer.Option("--fp16", help="Whether model uses FP16 precision.")] = False,
profile: Annotated[bool, typer.Option("--profile", help="Whether to profile model execution.")] = False,
):
"""Run inference for LightGlue ONNX model."""
import numpy as np
import onnxruntime as ort

from lightglue_dynamo import viz
from lightglue_dynamo.preprocessors import SuperPointPreprocessor

raw_images = [left_image_path, right_image_path]
raw_images = [cv2.resize(cv2.imread(str(i)), (width, height)) for i in raw_images]
images = np.stack(raw_images)
match extractor_type:
case Extractor.superpoint:
images = SuperPointPreprocessor.preprocess(images).astype(
np.float16 if fp16 and device != InferenceDevice.tensorrt else np.float32
)

session_options = ort.SessionOptions()
session_options.enable_profiling = profile
# session_options.optimized_model_filepath = "weights/ort_optimized.onnx"

providers = [("CPUExecutionProvider", {})]
if device == InferenceDevice.cuda:
providers.insert(0, ("CUDAExecutionProvider", {}))
elif device == InferenceDevice.tensorrt:
providers.insert(0, ("CUDAExecutionProvider", {}))
providers.insert(
0,
(
"TensorrtExecutionProvider",
{
"trt_engine_cache_enable": True,
"trt_engine_cache_path": "weights/.trtcache_engines",
"trt_timing_cache_enable": True,
"trt_timing_cache_path": "weights/.trtcache_timings",
"trt_fp16_enable": fp16,
},
),
)

session = ort.InferenceSession(model_path, session_options, providers)

for _ in range(100 if profile else 1):
keypoints, matches, mscores = session.run(None, {"images": images})

viz.plot_images(raw_images)
viz.plot_matches(keypoints[0][matches[..., 1]], keypoints[1][matches[..., 2]], color="lime", lw=0.2)
if output_path is None:
viz.plt.show()
else:
viz.save_plot(output_path)


if __name__ == "__main__":
app()
4 changes: 2 additions & 2 deletions lightglue/__init__.py
@@ -1,4 +1,4 @@
from .disk import DISK
from .lightglue import LightGlue
from .superpoint import SuperPoint
from .disk import DISK
from .utils import match_pair
from .utils import match_pair